-
Notifications
You must be signed in to change notification settings - Fork 5
/
bibtex2json.py
176 lines (141 loc) · 6.06 KB
/
bibtex2json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# -*- coding: utf-8 -*-
"""
script for transfering a BibTex library (.bib-file) into a bipartite graph
that is save into a JavaScript Object Notation file (.json), which can be read
by JavaScript to create an interactive visualisation.
By Florian Klimm, March 2018
"""
## some options
# BibTeX library to read; this name is handed to the pybtex parser
inputBibFileName = 'publicationsRosalindFranklin.bib'
# JSON file the graph is written to; an empty value skips the writing step
outputJSONFileName = 'collaborationNetwork.json'
#
# CSV file with per-author extras; it is read with the columns 'name', 'url'
# and 'image', and a missing file is tolerated
authorInformationFile = 'authorinfoRosalindFranklin.csv' # optional co-author information
# if True, the author with the most entries is treated as the ego node and
# removed from the graph
deleteEgoNode = False
##
# import necessary libraries
import csv # for loading comma seperated values
import json # for writing the json
import re
import unicodedata # for composing accented characters
from collections import Counter # for counting papers per author
from urllib.parse import quote_plus # for building search URLs

from pybtex.database.input import bibtex # for reading the bib files
# some auxiliary functions
# you might have to add further replacement rules
# Combining Unicode marks for the LaTeX accent commands that are understood.
_LATEX_ACCENTS = {
    '"': '\u0308',  # diaeresis / umlaut
    "'": '\u0301',  # acute
    '`': '\u0300',  # grave
    '^': '\u0302',  # circumflex
    '~': '\u0303',  # tilde
}
# Matches a braced accent escape over a single letter, e.g. {\"u} or {\'o}.
_LATEX_ACCENT_PATTERN = re.compile(r"""\{\\(["'`^~])([A-Za-z])\}""")


def latex2unicode(latexString):
    """Return ``latexString`` with braced LaTeX accent escapes as Unicode.

    Every occurrence of ``{\\<accent><letter>}`` where the accent is one of
    ``"``, ``'``, `````, ``^`` or ``~`` is replaced by the corresponding
    accented character, e.g. ``M{\\"u}ller`` becomes ``Müller``. Any other
    text, including LaTeX commands that are not recognised, is returned
    unchanged.

    Args:
        latexString: text (typically an author name) that may contain LaTeX
            accent escapes.

    Returns:
        The text with all recognised escapes replaced.
    """
    def _compose(match):
        accent, letter = match.groups()
        # NFC folds "letter + combining mark" into the single precomposed
        # character whenever Unicode defines one (e.g. u + U+0308 -> ü).
        return unicodedata.normalize('NFC', letter + _LATEX_ACCENTS[accent])

    return _LATEX_ACCENT_PATTERN.sub(_compose, latexString)
# parse the BibTeX file
parser = bibtex.Parser()
bib_data = parser.parse_file(inputBibFileName)

# Collect one list item per (paper, author) pair. The 'author' field is
# expected to be a single string in which the authors are separated by
# " and "; an entry without that field raises a KeyError here.
listOfAuthors = []
for paperEntry in bib_data.entries.values():
    listOfAuthors.extend(paperEntry.fields['author'].split(" and "))

# Count the papers per author in a single pass. A Counter keeps its keys in
# first-seen order, so the unique author list (and with it the node ids
# derived from list positions) is the same on every run for the same input.
authorCounts = Counter(listOfAuthors)
listOfAuthors = list(authorCounts)  # unique list of authors

# deleting ego
if deleteEgoNode and authorCounts:
    # we assume that the author with the most entries is the ego node and
    # delete it; on a tie the author that was encountered first is chosen
    egoNode = authorCounts.most_common(1)[0][0]
    listOfAuthors.remove(egoNode)
    print("Removing the ego node: %s " %egoNode )

nAuthors = len(listOfAuthors)  # number of author nodes
nPapers = len(bib_data.entries.keys())  # number of paper nodes
# Read the optional per-author information from the CSV file. Each row is
# accessed through the columns 'name', 'url' and 'image'; both dictionaries
# are keyed by the value of the 'name' column.
authorLinks_dict = {}  # author name -> url
authorImage_dict = {}  # author name -> image
try:
    # The with-block closes the file even if a row cannot be processed;
    # newline='' is what the csv module asks for when it is given a file.
    with open(authorInformationFile, newline='') as authorInfoFile:
        authorInfo_reader = csv.DictReader(authorInfoFile)
        for row in authorInfo_reader:
            authorLinks_dict[row['name']] = row['url']
            authorImage_dict[row['name']] = row['image']
except FileNotFoundError:
    # the file is optional, so its absence is not an error
    print("no optional co-author information available")
# Create the list of graph nodes. It starts with one node per author; each
# node is a dictionary with the keys "id", "group", "name", "url" and "image".
node_list = []
# create author nodes
for i, authorEntry in enumerate(listOfAuthors):
    node_dict = {}  # create an empty dictionary for this node
    node_dict["id"] = "A" + str(i)  # the position in listOfAuthors is the id
    node_dict["group"] = 0

    # Invert the name such that the given name is before the last name.
    # Three comma-separated shapes are handled:
    #   "Last"             -> "Last"   (no comma, nothing to invert)
    #   "Last, First"      -> "First Last"
    #   "Last, Jr, First"  -> "First Last Jr"
    nameParts = [part.strip() for part in authorEntry.split(",")]
    if len(nameParts) == 1:
        nameThisAuthor = nameParts[0]
    elif len(nameParts) == 2:
        nameThisAuthor = nameParts[1] + ' ' + nameParts[0]
    else:
        nameThisAuthor = nameParts[-1] + ' ' + nameParts[0] + ' ' + nameParts[1]

    nameThisAuthorUnicode = latex2unicode(nameThisAuthor)
    print(nameThisAuthorUnicode)
    node_dict["name"] = nameThisAuthorUnicode
    node_list.append(node_dict)

    # Use the url from the author information if there is a non-empty one;
    # otherwise fall back to a web search for the readable (Unicode) name,
    # percent-encoded so that spaces and special characters form a valid URL.
    searchUrl = "https://www.google.com/search?q=" + quote_plus(nameThisAuthorUnicode)
    node_dict["url"] = authorLinks_dict.get(nameThisAuthor) or searchUrl

    # set an image for this author; if there is no entry just leave blank
    node_dict["image"] = authorImage_dict.get(nameThisAuthor, [])
## create paper nodes
#for i in range(nPapers):
# node_dict = {} # create an empty dictionary for this node
# node_dict["id"] = "P" + str(i)
# node_dict["group"] = 2
# node_list.append(node_dict)
# create the paper nodes and the links between papers and authors
def _strip_outer_braces(text):
    """Return text without a single pair of braces that encloses all of it."""
    if len(text) < 2 or text[0] != '{' or text[-1] != '}':
        return text
    depth = 0
    for position, character in enumerate(text):
        if character == '{':
            depth += 1
        elif character == '}':
            depth -= 1
            if depth == 0 and position < len(text) - 1:
                # the leading brace is closed before the end, as in
                # "{A} study of {B}", so it does not wrap the whole text
                return text
    return text[1:-1]


link_list = []
# position of every author in listOfAuthors, built once so that each link
# needs a dictionary lookup instead of a scan through the list
authorPositions = {author: position for position, author in enumerate(listOfAuthors)}

for i, paperKeys in enumerate(bib_data.entries.keys()):  # go over every paper
    paperEntry = bib_data.entries[paperKeys]
    paperId = "P" + str(i)

    # create the paper node
    node_dict = {}  # create an empty dictionary for this node
    node_dict["id"] = paperId
    node_dict["group"] = 1
    # remove the curly brackets around the paper name, if there are any
    thisPaperName = _strip_outer_braces(paperEntry.fields['title'])
    node_dict["name"] = thisPaperName
    node_list.append(node_dict)

    # set image for this paper
    try:
        node_dict["image"] = paperEntry.fields['image']
    except KeyError:  # if no image just leave blank
        node_dict["image"] = []

    # if the paper has a url, add it
    try:
        node_dict["url"] = paperEntry.fields['url']
    except KeyError:
        # otherwise refer to a web search for the title, percent-encoded so
        # that characters such as '&' or '#' do not cut the query short
        node_dict["url"] = "https://www.google.com/search?q=" + quote_plus(thisPaperName)

    # find the authors for this paper and link the paper to each of them
    authorsThisPaper = paperEntry.fields['author'].split(" and ")
    for authors in authorsThisPaper:
        authorPosition = authorPositions.get(authors)
        if authorPosition is None:
            # author not in the list (probably the removed ego node): no link
            continue
        link_dict = {}  # empty dictionary for this edge
        link_dict["source"] = paperId  # attached to this paper
        link_dict["target"] = "A" + str(authorPosition)  # and attached to co-author
        link_list.append(link_dict)  # save it into the list
# bundle nodes and links into the graph dictionary
graph_dict = {
    "nodes" : node_list,
    "links" : link_list,
}
# the output step is skipped when no output file name is configured
if outputJSONFileName:
    # serialise first, then write the JSON text to the file in one go
    serialisedGraph = json.dumps(graph_dict)
    with open(outputJSONFileName, 'w') as jsonFile:
        jsonFile.write(serialisedGraph)