forked from endrikacupaj/CARTON
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrap.py
79 lines (57 loc) · 2.44 KB
/
scrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from unidecode import unidecode
from time import perf_counter
from knowledge_graph.KnowledgeGraphs import KGZODB, MiniKGWikidataJSON
from elasticsearch import Elasticsearch
from utils import rapidfuzz_query
from args import parse_and_get_args
# Parsed once at import time; the __main__ section below reads the
# Elasticsearch connection settings (host, certs, user, password) from it.
args = parse_and_get_args()
def time_query(fun):
    """Wrap ``fun`` so that every call prints its duration and its result.

    Args:
        fun: The callable to time.

    Returns:
        A wrapper accepting the same positional and keyword arguments as
        ``fun``. Each call prints ``t: <elapsed seconds> | <result>`` and
        then returns whatever ``fun`` returned.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from functools import wraps

    @wraps(fun)  # preserve the wrapped callable's name and docstring
    def helper(*args, **kwargs):
        tic = perf_counter()
        res = fun(*args, **kwargs)
        print(f"t: {perf_counter() - tic} | {res}")
        # Hand the result back: previously the wrapper dropped it and every
        # timed call evaluated to None.
        return res

    return helper
def es_query(query, client):
    """Run a fuzzy label match against the ``csqa_wikidata`` index and print the hits.

    Args:
        query: Text to look up; it is passed through ``unidecode`` before
            being sent.
        client: Object exposing ``search(index=..., size=..., query=...)``;
            the script passes an ``Elasticsearch`` client.

    Returns:
        None. Up to 50 hits are printed, one per line, as
        ``<id> - <label> - <score>``.
    """
    match_clause = {
        'match': {
            'label': {
                'query': unidecode(query),
                'fuzziness': 'AUTO',
            }
        }
    }
    response = client.search(index='csqa_wikidata', size=50, query=match_clause)

    for hit in response['hits']['hits']:
        print(f'{hit["_source"]["id"]} - {hit["_source"]["label"]} - {hit["_score"]}')
if __name__ == '__main__':
    # Ad-hoc benchmark: runs the same misspelled labels through Elasticsearch,
    # the in-memory JSON graph and the ZODB-backed graph, printing per-query
    # and per-backend timings.
    kg = KGZODB('./knowledge_graph/Wikidata.fs', run_adapter=True)
    kg_memory = MiniKGWikidataJSON()
    client = Elasticsearch(
        args.elastic_host,
        ca_certs=args.elastic_certs,
        basic_auth=(args.elastic_user, args.elastic_password),
    )  # for inverse index search
    print(client.info())

    timed_es_query = time_query(es_query)
    timed_rp_query = time_query(rapidfuzz_query)

    # Labels searched against every backend; the first one doubles as the
    # untimed warmup call.
    labels = ('Albret Enstein', 'Stargate', 'Borat', 'Boat', 'Prety littl liers')
    # Second positional argument handed to rapidfuzz_query on every call.
    # NOTE(review): looks like a Wikidata item id - confirm its role in utils.
    wikidata_id = 'Q1417412'

    # --- Elasticsearch ---
    timed_es_query(labels[0], client)  # warmup
    start = perf_counter()
    for label in labels:
        timed_es_query(label, client)
    print(f'ElasticSearch performance: {perf_counter() - start}')

    # --- In-memory JSON graph ---
    timed_rp_query(labels[0], wikidata_id, kg_memory)  # warmup
    start = perf_counter()
    for label in labels:
        timed_rp_query(label, wikidata_id, kg_memory)
    print(f'Memory performance: {perf_counter() - start}')

    # --- ZODB-backed graph ---
    timed_rp_query(labels[0], wikidata_id, kg)  # warmup
    start = perf_counter()
    for label in labels:
        timed_rp_query(label, wikidata_id, kg)
    print(f'ZODB performance: {perf_counter() - start}')