#!/usr/bin/env python2
# vim: set fdm=marker:

# imports {{{1
from sys import version_info
from subprocess import check_output
import json
import re
try:
    from vim_pandoc.bib.collator import SourceCollator
    from vim_pandoc.bib.util import flatten
except:
    from collator import SourceCollator
    from util import flatten

# _bib_extensions {{{1
# Filetypes that citeproc.py will attempt to parse.
_bib_extensions = ["bib",\
                   "bibtex",\
                   "ris",\
                   "mods",\
                   "json",\
                   "enl",\
                   "wos",\
                   "medline",\
                   "copac",\
                   "xml"]

# _significant_tags {{{1
# Tags that citeproc.py will search in, together with scaling
# factors for relative importance. These are currently non-functional.
_significant_tags = {"id": 0.5,
                     "author": 1.0,
                     "issued": 1.0,
                     "title": 1.0,
                     "publisher": 1.0,
                     "abstract": 0.1}

# _variable_type {{{1
# Map of tags -> types.
_variable_type = {
        "abstract": "plain",
        "annote": "plain",
        "archive": "plain",
        "archive_location": "plain",
        "archive-place": "plain",
        "authority": "plain",
        "call-number": "plain",
        "citation-label": "plain",
        "citation-number": "plain",
        "collection-title": "plain",
        "container-title": "plain",
        "container-title-short": "plain",
        "dimensions": "plain",
        "doi": "plain",
        "event": "plain",
        "event-place": "plain",
        "first-reference-note-number": "plain",
        "genre": "plain",
        "isbn": "plain",
        "issn": "plain",
        "jurisdiction": "plain",
        "keyword": "plain",
        "locator": "plain",
        "medium": "plain",
        "note": "plain",
        "original-publisher": "plain",
        "original-publisher-place": "plain",
        "original-title": "plain",
        "page": "plain",
        "page-first": "plain",
        "pmcid": "plain",
        "pmid": "plain",
        "publisher": "plain",
        "publisher-place": "plain",
        "references": "plain",
        "reviewed-title": "plain",
        "scale": "plain",
        "section": "plain",
        "source": "plain",
        "status": "plain",
        "title": "plain",
        "title-short": "plain",
        "url": "plain",
        "version": "plain",
        "year-suffix": "plain",

        "chapter-number": "number",
        "collection-number": "number",
        "edition": "number",
        "issue": "number",
        "number": "number",
        "number-of-pages": "number",
        "number-of-volumes": "number",
        "volume": "number",

        "accessed": "date",
        "container": "date",
        "event-date": "date",
        "issued": "date",
        "original-date": "date",
        "submitted": "date",

        "author": "name",
        "collection-editor": "name",
        "composer": "name",
        "container-author": "name",
        "director": "name",
        "editor": "name",
        "editorial-director": "name",
        "illustrator": "name",
        "interviewer": "name",
        "original-author": "name",
        "recipient": "name",
        "reviewed-author": "name",
        "translator": "name"
        }

class CSLItem: #{{{1
    """Helper methods for a single CSL-JSON formatted bibliography entry.

    The raw entry (a dict decoded from CSL-JSON) is kept in ``self.data``.
    """

    def __init__(self, entry): #{{{2
        """Wrap *entry*, the decoded CSL-JSON dict of one bibliography item."""
        self.data = entry

    @staticmethod
    def _text(value): #{{{2
        """Return *value* as text (``unicode`` on Python 2, ``str`` on 3)."""
        if version_info.major == 2:
            return unicode(value)
        return str(value)

    @classmethod
    def _plain(cls, variable_contents): #{{{2
        """Split the contents of a 'plain' variable into a list of lines.

        The value is converted to text first, so non-string values (for
        example a numeric ``id``) behave the same on Python 2 and 3.
        """
        return cls._text(variable_contents).split('\n')

    @classmethod
    def _number(cls, variable_contents): #{{{2
        """Return the contents of a 'number' variable as a one-item list of text."""
        return [cls._text(variable_contents)]

    @staticmethod
    def _name(variable_contents): #{{{2
        """Parse a 'name' variable into a list of names.

        Each name is itself a list: ``[literal]`` when the author has a
        "literal" field, otherwise ``[surname, given]`` where the surname is
        the dropping particle, non-dropping particle and family name joined
        by single spaces.
        """
        names = []
        for author in variable_contents:
            if "literal" in author:
                # There is presumably a reason for the author being a
                # literal, so don't try to split it into tokens.
                names.append([author.get("literal", "").strip()])
                continue
            parts = [author.get(key, "").strip()
                     for key in ("dropping-particle",
                                 "non-dropping-particle",
                                 "family")]
            # Skip absent particles so the surname never contains doubled
            # spaces, which would defeat substring/regex matching.
            surname = " ".join(part for part in parts if part)
            names.append([surname, author.get("given", "").strip()])
        return names

    @classmethod
    def _date(cls, variable_contents): #{{{2
        """Parse a 'date' variable (a dict) into a list of date strings.

        "date-parts" entries are rendered as their parts joined with "-"
        (yyyy-mm-dd order as supplied); "literal" and "raw" are passed
        through unchanged. "season" and "circa" are not interpreted yet and
        contribute nothing, and neither does any unrecognised key.
        """
        def date_parts(date_parts_contents):
            return ["-".join(cls._text(part) for part in raw_date)
                    for raw_date in date_parts_contents]

        def literal(date_string):
            return [date_string]

        def unsupported(_contents):
            # Placeholder for fields that are not searchable (yet).
            return []

        handlers = {"date-parts": date_parts,
                    "season": unsupported,
                    "circa": unsupported,
                    "literal": literal,
                    "raw": literal}

        response = []
        for element in variable_contents:
            handler = handlers.get(element, unsupported)
            response.extend(handler(variable_contents[element]))
        return response

    def as_array(self, variable_name): #{{{2
        """Return the searchable tokens of the variable *variable_name*.

        The parser is chosen from ``_variable_type`` (defaulting to
        "plain"). Plain, number and date variables yield a list of strings;
        name variables yield a list of lists of strings. A missing or empty
        variable yields ``[]``.
        """
        variable_contents = self.data.get(variable_name, False)
        if not variable_contents:
            return []

        # Explicit dispatch table instead of eval() on the type name.
        parsers = {"plain": self._plain,
                   "number": self._number,
                   "name": self._name,
                   "date": self._date}
        parser = parsers[_variable_type.get(variable_name, "plain")]
        return parser(variable_contents)

    def match(self, query): #{{{2
        """Return 1 if *query* matches any significant tag, 0 otherwise.

        Expects *query* to be a compiled regexp. Numbers are returned
        rather than booleans so the result could later become a match
        count used for ranking.
        """
        for variable in _significant_tags:
            for token in self.as_array(variable):
                if query.search(flatten(token)):
                    # Stop at the first hit; the remaining tags cannot
                    # change the answer.
                    return 1
        return 0

    def matches(self, query): #{{{2
        """Return True if the compiled regexp *query* matches this item."""
        return self.match(query) != 0

    def relevance(self, query): #{{{2
        """Return the relevance of this item for *query* as a float.

        *query* is a regexp pattern string, compiled case-insensitively
        here. The result is the sum of the ``_significant_tags`` weights of
        every tag with at least one matching token (0.0 if none match).
        """
        pattern = re.compile(query, re.I)
        relevance = 0.0
        for tag in _significant_tags:
            if any(pattern.search(flatten(token))
                   for token in self.as_array(tag)):
                relevance += _significant_tags[tag]
        return relevance

class CiteprocSource: #{{{1
    """Iterable of the CSLItem entries that pandoc-citeproc reads from one file."""

    def __init__(self, bib): #{{{2
        """Convert the bibliography file *bib* with ``pandoc-citeproc -j``.

        If running pandoc-citeproc fails for any reason, a message is
        printed and the source is left empty instead of raising.
        """
        try:
            citeproc_output = check_output(["pandoc-citeproc", "-j", bib])
        except Exception:
            # Deliberately best-effort, but unlike a bare "except:" this
            # lets KeyboardInterrupt and SystemExit propagate.
            print('pandoc: pandoc-citeproc had an error. maybe the bibliography file is corrupted?')
            self.data = []
            return
        if version_info.major == 2:
            raw_bib = json.loads(citeproc_output)
        else:
            # check_output returns bytes on Python 3; decode before parsing.
            raw_bib = json.loads(citeproc_output.decode("utf-8"))
        self.data = [CSLItem(entry) for entry in raw_bib]

    def __iter__(self): #{{{2
        """Iterate over the parsed entries."""
        return iter(self.data)

class CiteprocCollator(SourceCollator): #{{{1
    """Collator that gathers matching entries through pandoc-citeproc."""

    def collate(self): #{{{2
        """Return the raw data of every entry matching ``self.query``.

        Each file from ``self.find_bibfiles()`` is read through
        CiteprocSource; entries matching the query (case-insensitively) are
        collected and returned ordered from most to least relevant.
        """
        hits = []

        for bibfile in self.find_bibfiles():
            for entry in CiteprocSource(bibfile):
                pattern = re.compile(self.query, re.I)
                if not entry.matches(pattern):
                    continue
                if entry in hits:
                    continue
                hits.append(entry)

        ranked = sorted(hits,
                        key=lambda entry: entry.relevance(self.query),
                        reverse=True)

        return [entry.data for entry in ranked]