citeproc.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. #!/usr/bin/env python2
  2. # vim: set fdm=marker:
  3. # imports {{{1
  4. from sys import version_info
  5. from subprocess import check_output
  6. import json
  7. import re
  8. try:
  9. from vim_pandoc.bib.collator import SourceCollator
  10. from vim_pandoc.bib.util import flatten
  11. except:
  12. from collator import SourceCollator
  13. from util import flatten
  14. # _bib_extensions {{{1
  15. # Filetypes that citeproc.py will attempt to parse.
  16. _bib_extensions = ["bib",\
  17. "bibtex",\
  18. "ris",\
  19. "mods",\
  20. "json",\
  21. "enl",\
  22. "wos",\
  23. "medline",\
  24. "copac",\
  25. "xml"]
  26. # _significant_tags {{{1
  27. # Tags that citeproc.py will search in, together with scaling
  28. # factors for relative importance. These are currently non-functional.
  29. _significant_tags = {"id": 0.5,
  30. "author": 1.0,
  31. "issued": 1.0,
  32. "title": 1.0,
  33. "publisher": 1.0,
  34. "abstract": 0.1}
  35. # _variable_type {{{1
  36. # Map of tags -> types.
  37. _variable_type = {
  38. "abstract": "plain",
  39. "annote": "plain",
  40. "archive": "plain",
  41. "archive_location": "plain",
  42. "archive-place": "plain",
  43. "authority": "plain",
  44. "call-number": "plain",
  45. "citation-label": "plain",
  46. "citation-number": "plain",
  47. "collection-title": "plain",
  48. "container-title": "plain",
  49. "container-title-short": "plain",
  50. "dimensions": "plain",
  51. "doi": "plain",
  52. "event": "plain",
  53. "event-place": "plain",
  54. "first-reference-note-number": "plain",
  55. "genre": "plain",
  56. "isbn": "plain",
  57. "issn": "plain",
  58. "jurisdiction": "plain",
  59. "keyword": "plain",
  60. "locator": "plain",
  61. "medium": "plain",
  62. "note": "plain",
  63. "original-publisher": "plain",
  64. "original-publisher-place": "plain",
  65. "original-title": "plain",
  66. "page": "plain",
  67. "page-first": "plain",
  68. "pmcid": "plain",
  69. "pmid": "plain",
  70. "publisher": "plain",
  71. "publisher-place": "plain",
  72. "references": "plain",
  73. "reviewed-title": "plain",
  74. "scale": "plain",
  75. "section": "plain",
  76. "source": "plain",
  77. "status": "plain",
  78. "title": "plain",
  79. "title-short": "plain",
  80. "url": "plain",
  81. "version": "plain",
  82. "year-suffix": "plain",
  83. "chapter-number": "number",
  84. "collection-number": "number",
  85. "edition": "number",
  86. "issue": "number",
  87. "number": "number",
  88. "number-of-pages": "number",
  89. "number-of-volumes": "number",
  90. "volume": "number",
  91. "accessed": "date",
  92. "container": "date",
  93. "event-date": "date",
  94. "issued": "date",
  95. "original-date": "date",
  96. "submitted": "date",
  97. "author": "name",
  98. "collection-editor": "name",
  99. "composer": "name",
  100. "container-author": "name",
  101. "director": "name",
  102. "editor": "name",
  103. "editorial-director": "name",
  104. "illustrator": "name",
  105. "interviewer": "name",
  106. "original-author": "name",
  107. "recipient": "name",
  108. "reviewed-author": "name",
  109. "translator": "name"
  110. }
  111. class CSLItem: #{{{1
  112. # This class implements various helper methods for CSL-JSON formatted bibliography
  113. # entries.
  114. def __init__(self, entry): #{{{2
  115. self.data = entry
  116. def as_array(self, variable_name): #{{{2
  117. def plain(variable_contents): #{{{3
  118. # Takes the contents of a 'plain' variable and splits it into an array.
  119. if version_info.major == 2:
  120. return unicode(variable_contents).split('\n')
  121. else:
  122. return variable_contents.split('\n')
  123. def number(variable_contents): #{{{3
  124. if version_info.major == 2:
  125. return [unicode(variable_contents)]
  126. else:
  127. return [variable_contents]
  128. def name(variable_contents): #{{{3
  129. # Parses "name" CSL Variables and returns an array of names.
  130. def surname(author):
  131. # Concat dropping particle and non-dropping particle with family name.
  132. return [" ".join((author.get("dropping-particle", ""),
  133. author.get("non-dropping-particle", ""),
  134. author.get("family", ""))).strip()]
  135. def given_names(author):
  136. return [author.get("given", "").strip()]
  137. def literal_name(author):
  138. # It seems likely there is some particular reason for the author being
  139. # a literal, so don't try and do clever stuff like splitting into tokens...
  140. return [author.get("literal", "").strip()]
  141. names = []
  142. for author in variable_contents:
  143. name = ""
  144. if "literal" in author:
  145. name = literal_name(author)
  146. else:
  147. name = surname(author) + given_names(author)
  148. names.append(name)
  149. return names
  150. def date(variable_contents): #{{{3
  151. # Currently a placeholder. Will parse 'date' CSL variables and return an array of
  152. # strings for matches.
  153. def date_parse(raw_date_array):
  154. # Presently, this function returns the date in yyyy-mm-dd format. In future, it
  155. # will provide a variety of alternative forms.
  156. if version_info.major == 2:
  157. date = [unicode(x) for x in raw_date_array]
  158. else:
  159. date = [str(x) for x in raw_date_array]
  160. return ["-".join(date)]
  161. def date_parts(date_parts_contents):
  162. # Call date_parts for each element.
  163. response = []
  164. for date in date_parts_contents:
  165. response.extend(date_parse(date))
  166. return response
  167. def season(season_type):
  168. # Not actually clear from the spec what is meant to go in here. Zotero doesn't
  169. # 'do' seasons, and I can't work it out from the pandoc-citeproc source. Will
  170. # try and make this work when I have useful internet
  171. season_lookup = {1: "spring",
  172. 2: "summer",
  173. 3: "autumn",
  174. 4: "winter"}
  175. return []
  176. def circa(circa_boolean):
  177. return []
  178. def literal(date_string):
  179. return [date_string]
  180. date_function_lookup = {"date-parts": date_parts,
  181. "season": season,
  182. "circa": circa,
  183. "literal": literal,
  184. "raw": literal}
  185. response = []
  186. for element in variable_contents:
  187. response.extend(date_function_lookup[element](variable_contents[element]))
  188. return response
  189. # }}}3
  190. variable_contents = self.data.get(variable_name, False)
  191. if variable_contents:
  192. return eval(_variable_type.get(variable_name, "plain"))(variable_contents)
  193. else:
  194. return []
  195. def match(self, query): #{{{2
  196. # Matching engine. Returns 1 if match found, 0 otherwise.
  197. # Expects query to be a compiled regexp.
  198. # Very simple, just searches for substrings. Could be updated
  199. # to provide a 'matches' value for ranking? Using numbers here
  200. # so as to permit this future application.
  201. matched = False
  202. for variable in _significant_tags:
  203. for token in self.as_array(variable):
  204. matched = matched or query.search(flatten(token))
  205. if matched:
  206. break
  207. if matched:
  208. return 1
  209. else:
  210. return 0
  211. def matches(self, query): #{{{2
  212. # Provides a boolean match response to query.
  213. # Expects query to be a compiled regexp.
  214. if self.match(query) == 0:
  215. return False
  216. else:
  217. return True
  218. def relevance(self, query): #{{{2
  219. # Returns the relevance of an item for a query
  220. query = re.compile(query, re.I)
  221. relevance = float(0.0)
  222. tags_matched = []
  223. for tag in _significant_tags:
  224. for token in self.as_array(tag):
  225. if query.search(flatten(token)):
  226. tags_matched.append(tag)
  227. break
  228. if tags_matched != []:
  229. relevance = sum([_significant_tags[t] for t in tags_matched])
  230. return relevance
  231. class CiteprocSource: #{{{1
  232. def __init__(self, bib): #{{{2
  233. try:
  234. citeproc_output = check_output(["pandoc-citeproc", "-j", bib])
  235. except:
  236. print('pandoc: pandoc-citeproc had an error. maybe the bibliography file is corrupted?')
  237. self.data = []
  238. return
  239. if version_info.major == 2:
  240. raw_bib = json.loads(citeproc_output)
  241. elif version_info.major == 3:
  242. raw_bib = json.loads(citeproc_output.decode("utf-8"))
  243. self.data = [CSLItem(entry) for entry in raw_bib]
  244. def __iter__(self): #{{{2
  245. for a in self.data:
  246. yield a
  247. class CiteprocCollator(SourceCollator): #{{{1
  248. def collate(self): #{{{2
  249. data = []
  250. for bib in self.find_bibfiles():
  251. for item in CiteprocSource(bib):
  252. if item.matches(re.compile(self.query, re.I)) and item not in data:
  253. data.append(item)
  254. data.sort(key=lambda i: i.relevance(self.query), reverse=True)
  255. return [item.data for item in data]