-
Notifications
You must be signed in to change notification settings - Fork 2
/
cssd.py
105 lines (87 loc) · 2.88 KB
/
cssd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from SPARQLWrapper import SPARQLWrapper, JSON
# Result accumulators, filled while the alumni are processed and printed at
# the end of the script. `matched` receives Wikidata item ids;
# `unmatched_wikipedia` and `unmatched_other` receive alumni names and
# `unmatched_imdb` receives IMDb ids for which no item was found.
matched = []
unmatched_wikipedia, unmatched_imdb, unmatched_other = [], [], []
# Get what is already in Wikidata: every human (P31 = Q5) whose P69 statement
# points at Q981195 (presumably the CSSD item -- verify), together with the
# optional reference source of that statement, the English Wikipedia sitelink
# and the P345 (IMDb) value.
endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
query = """
SELECT ?item ?itemLabel ?imdb ?source ?sitelink WHERE {
?item wdt:P31 wd:Q5 .
?item p:P69 ?statement .
?statement ps:P69 wd:Q981195 .
OPTIONAL {
?statement prov:wasDerivedFrom ?ref .
?ref (pr:P854|pr:P143) ?source .
}
OPTIONAL { ?sitelink schema:about ?item ;
schema:isPartOf <https://en.wikipedia.org/> . }
OPTIONAL { ?item wdt:P345 ?imdb . }
SERVICE wikibase:label {
bd:serviceParam wikibase:language "en" .
}
}
"""
sparql = SPARQLWrapper(endpoint)
sparql.setReturnFormat(JSON)
sparql.setQuery(query)
# Keep only the list of result rows; each row maps a variable name to a
# dict whose 'value' entry holds the bound value.
results = sparql.query().convert()['results']['bindings']
# Get content from the CSSD website
alumni_url = "http://www.cssd.ac.uk/content/high-profile-alumni"
# requests never times out by default, so give it an explicit limit rather
# than letting the script hang on an unresponsive server.
response = requests.get(alumni_url, timeout=30)
# Abort on an HTTP error instead of parsing an error page and silently
# reporting that no alumni were found.
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")
# Every <p> element of the page is a candidate alumnus entry.
alumni = soup.select("p")
def _imdb_id(url):
    """Return the IMDb identifier contained in *url*.

    Prefers a path segment made of ``nm`` followed by digits (the shape of
    IMDb person identifiers, e.g. ``nm0000123``), so a URL without a trailing
    slash still yields the identifier. Any other URL shape falls back to the
    second-to-last ``/``-separated segment, and a URL with fewer than two
    segments gives ``""``.
    """
    segments = url.split('/')
    for segment in segments:
        if segment.startswith('nm') and segment[2:].isdigit():
            return segment
    return segments[-2] if len(segments) > 1 else ""


# Match every alumnus paragraph against the rows returned by the query.
for alumnus in alumni:
    # Only paragraphs with a <strong> element among their direct children
    # are treated as alumni entries.
    if alumnus.strong not in alumnus:
        continue

    # Drop the last three characters of the bold text (presumably a trailing
    # separator after the name -- confirm against the page markup).
    name = alumnus.strong.text[:-3]

    # Start from "no link" for each alumnus and only overwrite when a link of
    # that kind is found. This keeps a Wikipedia link from being wiped by a
    # later IMDb (or other) link, and stops an alumnus without links from
    # reusing the previous alumnus' values.
    wikipedia = ""
    imdb = ""
    for anchor in alumnus.select("a"):
        # Anchors without an href attribute are ignored rather than fatal.
        url = anchor.get('href', '')
        if "wikipedia" in url:
            # Rewrite the link into the form it is compared in below:
            # https scheme, "%20" for underscores, percent-encoded brackets.
            # NOTE(review): assumes the endpoint returns sitelink URLs in
            # this encoding -- confirm.
            wikipedia = (url.replace("_", "%20")
                         .replace("(", "%28")
                         .replace(")", "%29")
                         .replace("http:", "https:"))
        if "imdb.com" in url:
            imdb = _imdb_id(url)

    # Scan every row; as before, the last row matching either the sitelink
    # or the IMDb id decides the item id.
    qid = ""
    for row in results:
        sitelink_hit = (wikipedia and 'sitelink' in row
                        and wikipedia == row['sitelink']['value'])
        imdb_hit = imdb and 'imdb' in row and imdb == row['imdb']['value']
        if sitelink_hit or imdb_hit:
            # The item value is an entity URI; its last segment is the id.
            qid = row['item']['value'].split('/')[-1]

    if qid:
        matched.append(qid)
        continue

    # No match: record the alumnus under every kind of link they had.
    if wikipedia:
        unmatched_wikipedia.append(name)
    if imdb:
        unmatched_imdb.append(imdb)
    if not wikipedia and not imdb:
        unmatched_other.append(name)
# Summary of the matching run: one heading line followed by the raw list.
for heading, entries in (
        ("Matched", matched),
        ("Unmatched Wikipedia", unmatched_wikipedia),
        ("Unmatched IMDb", unmatched_imdb),
        ("Unmatched Other", unmatched_other),
):
    print(heading)
    print(entries)

# One tab-separated line per matched item: item id, P69, Q981195, and the
# alumni page as the S854 value.
print('QS')
statement_tail = '\t'.join([
    'P69',
    'Q981195',
    'S854',
    '"http://www.cssd.ac.uk/content/high-profile-alumni"',
])
for item_id in matched:
    print(item_id + '\t' + statement_tail)