# orcidNameSearch.py
# Looks up the researchers listed in researchers.xlsx through the public ORCID
# API and writes the matching ORCID records to researchers.csv.
import csv
import io
import re
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

import pandas as pd

from orcidData import *
# XML namespace prefixes used in the ElementTree path expressions of this
# script (passed as the `namespaces` argument to find/findall).
ns = dict(
    o='http://www.orcid.org/ns/orcid',
    s='http://www.orcid.org/ns/search',
    h='http://www.orcid.org/ns/history',
    p='http://www.orcid.org/ns/person',
    pd='http://www.orcid.org/ns/personal-details',
    a='http://www.orcid.org/ns/activities',
    e='http://www.orcid.org/ns/employment',
    c='http://www.orcid.org/ns/common',
)
def getData( firstName , lastName , maxResults=3 ):
    """Search ORCID for a person and return CSV rows describing the hits.

    A fielded search (family-name AND given-names) is tried first. When it
    yields no results, a free-text search on "firstName lastName" is used
    instead. For each of the first `maxResults` hits the full record is
    fetched and one CSV row with eleven cells is produced:

        lastName, firstName (exactly as passed in), ORCID iD,
        last name, first name, creation date and number of works read
        from the record, then organisation/department for up to two
        affiliations (empty cells when fewer are available).

    Args:
        firstName: Given name to search for; also echoed in each row.
        lastName: Family name to search for; also echoed in each row.
        maxResults: Maximum number of search hits to look at (default 3).
            Pass None to process every hit.

    Returns:
        A string containing zero or more '\\n'-terminated CSV rows. Cells
        containing commas, quotes or newlines are quoted, and None values
        are written as empty cells.
    """
    # NOTE(review): the fielded search targets API v2.1 while the fallback
    # search and the record lookup use v3.0 — confirm this mix is intended.
    # NOTE(review): the caller visible in this file already applies
    # urllib.parse.quote to both names; verify that urlEncode does not
    # encode them a second time.
    query = (
        'https://pub.orcid.org/v2.1/search?q='
        f'family-name:{ urlEncode(lastName) }+AND+given-names:{ urlEncode(firstName) }'
    )
    hits = getTree( query ).findall( 's:result' , ns )
    if not hits:
        # Nothing matched the fielded query: retry with a free-text search.
        query = "https://pub.orcid.org/v3.0/search?q=" + urlEncode( firstName + ' ' + lastName )
        hits = getTree( query ).findall( 's:result' , ns )

    # Let the csv module do the quoting so that values containing commas
    # (organisation names, for instance) cannot shift the columns.
    buffer = io.StringIO()
    writer = csv.writer( buffer , lineterminator='\n' )

    for result in hits[:maxResults]:
        orcidId = result.findtext( 'c:orcid-identifier/c:path' , namespaces=ns )
        if not orcidId:
            # A hit without an identifier cannot be looked up; skip it.
            continue

        record = getTree( "https://pub.orcid.org/v3.0/" + orcidId + "/record" )
        row = [
            lastName ,
            firstName ,
            orcidId ,
            getLastName( record ) ,
            getFirstName( record ) ,
            getCreationDate( record ) ,
            getNumberOfWorks( record ) ,
        ]

        # Each affiliation contributes two cells: entry[0] and entry[1]
        # (organisation and department, judging by the output columns).
        # Pad with empty cells so every row has four affiliation cells.
        affCells = []
        for entry in getAffiliations( record )[:2]:
            affCells.extend( [ entry[0] , entry[1] ] )
        affCells.extend( [''] * ( 4 - len(affCells) ) )

        writer.writerow( row + affCells )

    return buffer.getvalue()
# Driver: read researcher names from 'Sheet1' of researchers.xlsx, look each
# one up with getData() and collect the resulting rows in researchers.csv.

# Load the workbook before touching the output file, so that a missing or
# unreadable workbook does not truncate an existing researchers.csv.
df = pd.read_excel( 'researchers.xlsx' , sheet_name='Sheet1' )

# utf-8 is set explicitly because names returned by ORCID may contain
# characters that the platform's default encoding cannot represent.
with open( 'researchers.csv' , 'w' , encoding='utf-8' ) as out:
    out.write( 'lastName,firstName,orcid,OrcidlastName,OrcidfirstName,creationDate,nrWorks,organisation1,department1,organisation2,department2\n' )

    for _ , row in df.iterrows():
        # Empty cells arrive as NaN, which urllib.parse.quote cannot handle;
        # skip rows that lack either part of the name.
        if pd.isnull( row['lastName'] ) or pd.isnull( row['firstName'] ):
            continue

        # str() guards against cells that Excel stored as numbers.
        firstName = str( row['firstName'] )
        lastName = str( row['lastName'] )
        print( firstName , lastName )

        # NOTE(review): getData() passes these values through urlEncode
        # again — confirm the names are not percent-encoded twice.
        out.write( getData( urllib.parse.quote( firstName ) , urllib.parse.quote( lastName ) ) )