-
Notifications
You must be signed in to change notification settings - Fork 2
/
fix_bibtex.py
326 lines (286 loc) · 16.4 KB
/
fix_bibtex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
import biblib.bib as bbl
from biblib.messages import InputError
import dateutil.parser as dp
import sce_utils.utils as scu
import numpy as np
from habanero import Crossref
from argparse import ArgumentParser
from collections import OrderedDict, Counter
from urllib.request import Request, urlopen, HTTPError
import os, sys, re
####
# clean up bibtex file produced by anystyle *or* a bib file provided by an author
# try to find dois where they are missing
# get nice citations from doi.org when we do have dois
# neaten up all bibtex entries (date/year, pages, url)
# if docx/odt parsing, create standardized entry keys for matching
# if from tex template, use --keepkeys or -k flag to keep the initial entry keys
# fallback on all queries (crossref and doi.org) is to keep the input entry
# Usage:
# python3 fix_bibtex.py -i path/to/input.bib -o path_to_output.bib [-k]
# TODO format terminal printing/inputs better (\033[1m and \033[0m for bold)
# TODO actually un-tex/html-escape special characters from doi.org bibtex entries?
# like < and {\'{e}} or whatever (this tends to come up with authors and titles)
# for now this is taken care of by option to keep old entry title and/or authors
# TODO do we really need to keep page number info? tends to be poorly curated/mixed bag
# TODO query limit 5 or so, options for scrolling back and forth before choosing a ref (or not)
####
parser = ArgumentParser()
parser.add_argument('--ifile','-i',type=str,help='path to input bib file')
parser.add_argument('--ofile','-o',type=str,help='path to output bib file')
parser.add_argument('--keepkeys','-k',action='store_true',help='flag to retain input bib entry keys for output')
parser.add_argument('--nosearch','-n',action='store_true',help='flag to not do any crossref querying')
args = parser.parse_args()
in_bib = args.ifile
if in_bib == None:
in_bib = input('Enter path to input file: ') or 'refs.bib'
assert os.path.isfile(in_bib),'input bibfile does not exist'
out_bib = args.ofile
if out_bib == None:
of1 = in_bib[:-4]+'_corr.bib'
out_bib = input('Enter path to output file, or skip to use %s: ' % of1) or of1
if os.path.isfile(out_bib):
iq = input('warning: %s already exists. Overwrite? [y]/n: ' % out_bib) or 'y'
if iq == 'n':
sys.exit()
########################################################################
# Part 0: load entries, and check for trailing apostrophes
########################################################################
parser = bbl.Parser() # parser for bibtex
parsed = parser.parse(open(in_bib,'r')) # parse the input file with biblib
# get entries
bib_OD = parsed.get_entries() # returns collections.OrderedDict
# check if all the titles end with an apostrophe; if so, probably an error and they need to
# be removed (anystyle is not great with titles in quotes)
last_char = np.array([bib_OD[key]['title'][-1] for key in bib_OD])
strp_last = False
lc = Counter(last_char)
if max(lc.values()) >= len(last_char)/2: # this is a red flag, most entries have the same end char
print('title trailing characters:')
for i,k in enumerate(lc.keys()):
print('[%i] %i instances of %s' % (i,lc[k],k))
irem = int(input('enter index to remove: ') or -1)
if irem >= 0:
strp_last = True # and we'll remove the one with that index in the keys list
char_out = list(lc.keys())[irem]
if strp_last: # remove trailing character if set to do so
for key in bib_OD:
entry = bib_OD[key]
if entry['title'].endswith(char_out):
entry['title'] = entry['title'][:-1] # remove trailing apostrophe (probably)
########################################################################
# Part 1: try to add DOIs (and nice metadata) to entries from input file
########################################################################
if not args.nosearch:
# connection for querying, being polite for server priority
cr = Crossref(mailto="[email protected]",ua_string='Seismica SCE, seismica.org')
bib_new = OrderedDict() # to save entries with DOIs added
for key in bib_OD: # loop entry keys
entry = bib_OD[key]
doi = None # to start, assume no doi
print('\nWorking on ', entry)
if 'doi' in entry.keys(): # get provided doi if present, check to make sure it will work
if entry['doi'][-1] == '.': # check if doi ends with . and if it does, get rid of the .
entry['doi'] = entry['doi'][:-1] # (this is sometimes an anystyle problem)
doi = entry['doi']
ourl = scu.make_doi_url(doi)
req = Request(ourl, headers=dict(Accept='application/x-bibtex'))
try:
bibtext = urlopen(req).read().decode('utf-8')
except HTTPError:
doi = None # authors made a DOI mistake or anystyle messed up; try a search
# if url looks like a doi link and we don't have the doi already, try the url
if 'url' in entry.keys() and re.search(r"doi\.org",entry['url']) and not doi:
req = Request(entry['url'], headers=dict(Accept='application/x-bibtex'))
try:
bibtext = urlopen(req).read().decode('utf-8')
except HTTPError:
doi = None # url is not actually a good DOI link
if not doi: # try querying crossref to get a doi
print('no DOI, querying crossref to try and find one')
try:
q = cr.works(query_bibliographic=entry['title'],query_author=entry['author'],\
limit=2,select='DOI,title,author,score,type,published',sort='score')
except HTTPError: # shouldn't happen, but if it does anyway we give up on this one
continue
except bbl.FieldError: # probably no 'author' - might be 'editor'
# in this (pretty edge) case, just keep the entry as is and move on
entry_new = entry # keep whatever the initial entry was
entry_new.key = key # keep the input key for now
bib_new[key] = entry_new # as that should be unique, even from anystyle (a/b etc)
continue
if q['message']['total-results'] > 0: # TODO deal with case where exactly 1 result
q0 = scu.format_crossref_query(q, i=0)
print('\nqueried for:\ntitle: %s\nby: %s' % (entry['title'],entry['author']))
if q['message']['total-results'] == 1:
iok = scu.print_query_options(q0, i=1)
else:
iok = scu.print_query_options(q0, i=0)
if iok.lower() == 'y': # if it matches, save the doi
doi = q0['doi']
elif iok.lower() == 'n':
doi = None
elif iok.lower() == 'p':
q1 = scu.format_crossref_query(q, i=1)
iok = scu.print_query_options(q1, i=1)
if iok.lower() == 'y': # if it matches, save the doi
doi = q1['doi']
elif iok.lower() == 'n':
doi = None
elif iok.lower() == 'p':
doi = q0['doi']
if doi: # if doi not none, use to query for a clean citation
print('DOI provided/obtained, checking for cleaner citation into')
ourl = scu.make_doi_url(doi)
req = Request(ourl, headers=dict(Accept='application/x-bibtex'))
try:
bibtext = urlopen(req).read().decode('utf-8')
# parse bibtex to an Entry and check to make sure all the main pieces are there
# we need this bc some metadata deposited with DOIs is imperfect in weird ways
parser = bbl.Parser() # need a new parser every time which is annoying
parsed = parser.parse(bibtext)
key0 = list(parsed.get_entries().keys())[0]
entry_new = parsed.get_entries()[key0] # keyed based on doi.org convention
# see if we need to fix/un-replace authors or titles
# this is a kind of patch for dealing with the odd formatting (tex escaping,
# html escaping) that occastionally shows up in the doi.org bibtex entries
rereparse = False
# first check if there are author list inconsistencies, see if we want old or new list
if 'author' in entry_new.keys() and 'author' in entry.keys():
if len(entry.authors()) != len(entry_new.authors()): #or \
#re.search(r'[^\.a-zA-Z, -]',entry_new['author']): # TODO this needs work - say which criterion was hit
print('\nchecking '+entry.key+' author list: ')
print('old: %s' % entry['author'])
print('new: %s' % entry_new['author'])
ika = input('keep (o)ld or [n]ew? >> ') or 'n'
if ika == 'o':
entry_new['author'] = entry['author']
rereparse = True
elif 'author' not in entry_new.keys() and 'author' in entry.keys(): # avoid losing info
entry_new['author'] = entry['author']
rereparse = True
# next check if there are title inconsistencies (easier to check tbh)
if 'title' in entry_new.keys() and 'title' in entry.keys():
if entry['title'].lower() != entry_new['title'].lower(): # case is not as important
print('\nchecking '+entry.key+' title: ')
print('old: %s' % entry['title'])
print('new: %s' % entry_new['title'])
ikt = input('keep (o)ld or [n]ew? >> ') or 'n'
if ikt == 'o':
entry_new['title'] = entry['title']
rereparse = True
elif 'title' not in entry_new.keys() and 'title' in entry.keys():
entry_new['title'] = entry['title']
rereparse = True
if rereparse: # info has been re-added, re-parse to reset field_pos
parser = bbl.Parser()
parsed = parser.parse(entry_new.to_bib())
entry_new = parsed.get_entries()[key0]
except InputError: # crossref returned text that biblib doesn't like to parse
entry_new = entry # keep whatever the initial entry was
except HTTPError: # this is some kind of crossref issue, maybe a bad DOI
entry_new = entry
else: # no doi, from authors or from crossref
entry_new = entry # keep whatever the initial entry was
entry_new.key = key # keep the input key for now
bib_new[key] = entry_new # as that should be unique, even from anystyle (a/b etc)
else:
bib_new = bib_OD
########################################################################
# Part 2: go through the new OrderedDict with "cleaner" entries and fix up as needed
########################################################################
# open final outfile for writing
fout = open(out_bib,'w') # will overwrite if file exists
newkey_list = [] # for tracking keys used in case we need 'a' and 'b', when not keeping keys
n_with_doi = 0
n_without_doi = 0
for key in bib_new: # loop entry keys
entry = bib_new[key]
# for each, if 'date' is a key and 'year' is not, make 'year' based on 'date'
if 'date' in entry.keys() and 'year' not in entry.keys():
if len(entry['date']) == 4: # assume this means it's a year, which is probably true
entry['year'] = entry['date']
else: # some other format, try to parse
try:
try_year = str(dp.parse(entry['date']).year)
except: # unspecified error with dateutil parsing
try_year = entry['date']
print('date is %s, year set to %s' % (entry['date'],try_year))
iy = input('enter corrected year if needed: ') or None
if iy != None:
entry['year'] = iy
else:
entry['year'] = try_year
if 'url' in entry.keys():
if re.search(r"doi\.org",entry['url']): # check if url is just a doi link
_ = entry.pop('url') # if it is, we should have used it earlier and can clean now
if 'pages' in entry.keys():
# _ = entry.pop('pages')
entry['pages'] = entry['pages'].rstrip(',') # anystyle often leaves trailing commas
if "n/a" in entry['pages']: # sometimes get "n/a -- n/a" or similar from crossref
_ = entry.pop('pages')
if 'note' in entry.keys():
entry['note'] = re.sub(r'Available at','',entry['note'])
if len(entry['note']) <= 3: # if we've only got 3 characters left or less, probably nothing
_ = entry.pop('note') # should take care of case with : at the end
if 'title' in entry.keys() and entry['title'].endswith("'"):
entry['title'] = entry['title'][:-1] # remove trailing apostrophe if present
if 'title' not in entry.keys():
entry['title'] = " " # TODO is this a good workaround?
if 'journal' in entry.keys(): # check for ampersands, regex them
entry['journal'] = re.sub(r" & ",r" \& ",entry['journal']) # misplaced xml
entry['journal'] = re.sub(r" & ",r" \& ",entry['journal']) # and any others not escaped
if not args.keepkeys: # if we want to make new entry keys (docx/odt input file)
if 'author' not in entry.keys(): # our key convention is based on authors and years
if 'editor' in entry.keys():
entry['author'] = entry['editor']
else:
print('no author found! available keys are:')
print(entry.keys())
akey = input('enter a key to use for author: ') or entry.keys()[0]
if akey not in entry.keys():
akey = entry.keys()[0] # fallback for typos
entry['author'] = entry[akey]
parser = bbl.Parser() # need to re-parse to get field_pos right in Entry
parsed = parser.parse(entry.to_bib())
entry = parsed.get_entries()[key]
# clean up first author last name: no {}, no spaces
auth0 = re.sub(r'[ ]',r'',entry.authors()[0].last.lstrip('{').rstrip('}'))
# for each, reformat key to be what we will look for in inline citations
# first, count authors: if >2, key is firstauthorEAYYYY, if 2 or less is author(author)YYYY
try:
if len(entry.authors()) > 2:
newkey = auth0 + 'EA' + entry['year']
elif len(entry.authors()) == 2:
auth1 = re.sub(r'[ ]',r'',entry.authors()[1].last.lstrip('{').rstrip('}'))
newkey = auth0 + auth1 + entry['year']
elif len(entry.authors()) == 1:
newkey = auth0 + entry['year']
except:
newkey = auth0+"MessyKey"
print('problem with key: ',newkey)
newkey = newkey.replace(" ","") # get rid of spaces if there are any (like van Keken or something)
newkey = newkey.replace("-","") # get rid of hyphens, which tex2jats doesn't like
nextletter = 'a'
if newkey + nextletter in newkey_list: # already have 'a', try the next letter
thisauthor = [e for e in newkey_list if e.startswith(newkey)]
alphabet = [e[-1] for e in thisauthor]
nextletter = chr(ord(alphabet[-1]) + 1)
newkey = newkey + nextletter # this is what we'll use for bibtex
newkey_list.append(newkey)
entry.key = newkey
# check if doi field is present
if 'doi' in entry.keys() or 'DOI' in entry.keys():
n_with_doi += 1
# escape any underscores that might be in DOIs for the sake of tex
# (only do this just before writing bc needs to be clean for queries)
entry['doi'] = re.sub(r'_',r'\_',entry['doi'])
else:
n_without_doi += 1
# write entry to new bib file
fout.write(entry.to_bib())
fout.write('\n')
fout.close()
print('%i items have doi, %i do not have doi' % (n_with_doi, n_without_doi))
print('%.2f percent lack doi' % (100*n_without_doi/(n_with_doi + n_without_doi)))
#print('if many dois are missing, try searching the plain text reference list with %s' % ("https://apps.crossref.org/SimpleTextQuery"))