parser.py
import json
import os
import pathlib
import time
from datetime import date

from dateutil import parser

from outbreak_parser_tools import safe_request as requests
from outbreak_parser_tools.addendum import Addendum
from outbreak_parser_tools.logger import get_logger

logger = get_logger('biorxiv')
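

# Convert a single record from the bioRxiv COVID-19 API into an
# outbreak.info Publication document (schema: https://discovery.biothings.io/view/outbreak/).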
def parse_item(rec):
    publication = {
        "@context": {
            "schema": "http://schema.org/",
            "outbreak": "https://discovery.biothings.io/view/outbreak/",
        },
        "@type": "Publication",
        "keywords": [],
        "author": [],
        "funding": [],
        "isBasedOn": []
    }
    publication["_id"] = rec['rel_doi'].split('/', 1)[-1]
    publication["doi"] = rec.get("rel_doi", None)
    publication["url"] = rec.get("rel_link", None)
    website = {"@type": "schema:WebSite", "curationDate": date.today().strftime("%Y-%m-%d")}
    website['name'] = rec.get("rel_site", "")
    website['url'] = rec.get("rel_link", "")
    publication["curatedBy"] = website
    publication["publicationType"] = ["Preprint"]
    publication["name"] = rec.get("rel_title", None)
    publication["journalName"] = rec.get("rel_site", None)
    publication["journalNameAbbreviation"] = rec.get("rel_site", None)
    publication["abstract"] = rec.get("rel_abs", None)
    publication["identifier"] = rec['rel_doi'].split('/', 1)[-1]
    dp = rec.get("rel_date", None)
    if dp:
        d = parser.parse(dp)
        dp = d.strftime("%Y-%m-%d")
    publication["datePublished"] = dp
    authors = rec.get("rel_authors")
    if authors:
        for auth in authors:
            author = {"@type": "outbreak:Person"}
            full_name = auth.get("author_name", None)
            if full_name is not None:
                author["name"] = full_name
                author["givenName"] = full_name.split(' ', 1)[0]
                try:
                    author["familyName"] = full_name.split(' ', 1)[1]
                except IndexError:
                    logger.info("No familyName for: '%s'" % rec['rel_doi'])
            institutions = auth.get("author_inst", None)
            if institutions is not None:
                organization = {"@type": "outbreak:Organization", "name": institutions}
                author["affiliation"] = [organization]
            for key in list(author):
                if author[key] is None:
                    del author[key]
            publication["author"].append(author)
    # clean up the document: drop keys with empty values
    for key in list(publication):
        if not publication.get(key):
            del publication[key]
    return publication
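

# Page through the bioRxiv COVID-19 API 30 records at a time, yielding each
# record once (deduplicated by DOI).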
def fetch_data():
    data_url = "https://api.biorxiv.org/covid19/{cursor}/json"
    cursor = 0
    collected_dois = {None}
    data_request = requests.get(data_url.format(cursor=cursor))
    try:
        data_json = data_request.json()
        total = data_json['messages'][0]['total']
        collection = data_json['collection']
    except Exception:
        logger.warning("Biorxiv API down")
        return
    new_total = total
    while len(collection) > 0:
        logger.info(f"getting {cursor}")
        for result in collection:
            if result.get('rel_doi') not in collected_dois:
                yield result
        collected_dois |= set(r.get('rel_doi') for r in collection)
        cursor += 30
        data_request = requests.get(data_url.format(cursor=cursor))
        data_json = data_request.json()
        collection = data_json['collection']
        new_total = data_json['messages'][0].get('total') or new_total
    collected_dois.remove(None)
    logger.info(f"initial total {total}, latest total {new_total}, actually collected {len(collected_dois)}")
def load_annotations():
    pubs = [parse_item(rec) for rec in fetch_data()]
    Addendum.biorxiv_corrector().update(pubs)
    Addendum.topic_adder().update(pubs)
    Addendum.altmetric_adder().update(pubs)
    for publication in pubs:
        yield publication