support.py
# ISC License (ISC)
# ==================================
# Copyright 2021 Arcangelo Massari, Cristian Santini, Ricarda Boente, Deniz Tural
# Permission to use, copy, modify, and/or distribute this software for any purpose with or
# without fee is hereby granted, provided that the above copyright notice and this permission
# notice appear in all copies.
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
# SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
# THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
from typing import List, Optional, Tuple

import csv
import os
import re

import ijson
import requests
import requests_cache
from requests import Session
from requests.adapters import HTTPAdapter
from tqdm import tqdm
from urllib3.util import Retry
class Support(object):
    """Support utilities for reading, caching, and querying DOI datasets."""

    @staticmethod
    def process_csv_input(path: str) -> list:
        print(f"[Support:INFO Processing csv at path {path}]")
        with open(path, 'r', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            return list(reader)
    @staticmethod
    def read_cache(autosave_path: str) -> Tuple[int, list]:
        # Return the number of rows already processed and their contents,
        # so that an interrupted run can resume where it stopped.
        num = 0
        data = list()
        if not os.path.exists(autosave_path):
            return num, data
        with open(autosave_path, 'r', encoding='utf8') as read_obj:
            dict_reader = csv.DictReader(read_obj)
            for row in dict_reader:
                row_data = {
                    "Valid_citing_DOI": row.get("Valid_citing_DOI"),
                    "Invalid_cited_DOI": row.get("Invalid_cited_DOI"),
                    "Valid_DOI": row.get("Valid_DOI"),
                    "Already_valid": row.get("Already_valid"),
                    "Prefix_error": row.get("Prefix_error"),
                    "Suffix_error": row.get("Suffix_error"),
                    "Other-type_error": row.get("Other-type_error")
                }
                data.append(row_data)
                num += 1
        return num, data
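    # A minimal usage sketch (the autosave path below is hypothetical):
    # resume an interrupted run by skipping the rows already processed.
    #   done, cached = Support.read_cache("./autosave/results.csv")
    #   remaining_rows = all_rows[done:]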
    @staticmethod
    def dump_csv(data: list, path: str) -> None:
        print(f"[Support:INFO Writing csv at path {path}]")
        with open(path, 'w', newline='', encoding='utf8') as output_file:
            # The header is derived from the keys of the first row,
            # so data is expected to be a non-empty list of dicts.
            keys = data[0].keys()
            dict_writer = csv.DictWriter(output_file, keys)
            dict_writer.writeheader()
            dict_writer.writerows(data)
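    # A round-trip sketch with process_csv_input (file names are hypothetical):
    #   rows = Support.process_csv_input("./dataset/input.csv")
    #   Support.dump_csv(rows, "./output/copy.csv")  # rows must be non-empty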
    def _requests_retry_session(
        self,
        tries=1,
        status_forcelist=(500, 502, 504, 520, 521),
        session=None
    ) -> Session:
        # Build a session that retries failed requests on the given
        # HTTP status codes instead of failing at the first error.
        session = session or requests.Session()
        retry = Retry(
            total=tries,
            read=tries,
            connect=tries,
            status_forcelist=status_forcelist,
        )
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        return session
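    # Usage sketch: the returned session is a drop-in requests.Session, e.g.
    #   response = Support()._requests_retry_session(tries=3).get(url, timeout=5)
    # (tries=3 is an illustrative value, not the module's default).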
    def handle_request(self, url: str, cache_path: str = "", error_log_dict: Optional[dict] = None) -> Optional[dict]:
        # A mutable default argument would be shared between calls, so the
        # error log is created per call unless the caller supplies one.
        if error_log_dict is None:
            error_log_dict = dict()
        if cache_path != "":
            requests_cache.install_cache(cache_path)
        try:
            data = self._requests_retry_session().get(url, timeout=5)
            if data.status_code == 200:
                return data.json()
            error_log_dict[url] = data.status_code
        except Exception as e:
            error_log_dict[url] = str(e)
        return None
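    # Usage sketch against the Crossref REST API (the endpoint is real; the
    # DOI is only an illustrative placeholder, and failures are recorded in
    # the error log rather than raised):
    #   errors = dict()
    #   record = Support().handle_request(
    #       "https://api.crossref.org/works/10.1162/qss_a_00023",
    #       error_log_dict=errors)
    #   # record is None on failure; see errors[url] for the reason.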
    @staticmethod
    def get_all_crossref_dois(folder_path: str = "./dataset/crossref/") -> list:
        json_files = [pos_json for pos_json in os.listdir(folder_path) if pos_json.endswith('.json')]
        dois = list()
        pbar = tqdm(total=len(json_files))
        for json_file in json_files:
            # Stream-parse each dump file with ijson so that large
            # Crossref dumps are never loaded into memory all at once.
            with open(os.path.join(folder_path, json_file)) as file_obj:
                parser = ijson.parse(file_obj)
                for prefix, event, value in parser:
                    if (prefix, event) == ('items.item.DOI', 'string'):
                        dois.append({"crossref_doi": value})
            pbar.update(1)
        pbar.close()
        return dois
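    # Usage sketch (assumes this project's default folder layout, with the
    # Crossref dump stored as ./dataset/crossref/*.json; output path is
    # hypothetical):
    #   dois = Support.get_all_crossref_dois()
    #   Support.dump_csv(dois, "./output/crossref_dois.csv")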
    @staticmethod
    def get_number_of_citations(data: List[list], regex: str = "") -> List[int]:
        # For each dataset, count the distinct recovered citations,
        # optionally restricted to invalid DOIs matching a regex.
        output: List[int] = list()
        for dataset in data:
            citations = set()
            for row in dataset:
                if row["Valid_DOI"]:
                    if row["Already_valid"] == "0":
                        if regex:
                            match = re.search(regex, row["Invalid_cited_DOI"].upper())
                            if match:
                                citations.add(row["Valid_citing_DOI"] + row["Valid_DOI"])
                        else:
                            citations.add(row["Valid_citing_DOI"] + row["Valid_DOI"])
            output.append(len(citations))
        return output
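    # Usage sketch: count recovered citations per run, then only those whose
    # invalid cited DOI matches a pattern (the regex below is hypothetical):
    #   totals = Support.get_number_of_citations([run_a, run_b])
    #   prefixed = Support.get_number_of_citations([run_a, run_b], regex=r"^10\.\d{4,9}")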
    @staticmethod
    def compare_number_of_matches(new_procedure_matches: dict, reference_procedure_matches: dict) -> list:
        # Compute, per regex, how many more (or fewer) matches the new
        # procedure found compared with the reference procedure, sorted
        # from the largest gain to the largest loss.
        diff = dict()
        for error_type, expressions in reference_procedure_matches.items():
            for regex, matches in expressions.items():
                diff[regex] = new_procedure_matches[error_type][regex] - matches
        return sorted(diff.items(), key=lambda item: item[1], reverse=True)
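
# A minimal end-to-end sketch, not part of the original module: fetch one
# work from the Crossref REST API and report any error. The DOI below is
# only an example; a failed request ends up in the error log, not raised.
if __name__ == "__main__":
    support = Support()
    errors = dict()
    doi = "10.1162/qss_a_00023"  # example DOI, swap in any DOI of interest
    record = support.handle_request(f"https://api.crossref.org/works/{doi}", error_log_dict=errors)
    if record is not None:
        print(record.get("message", {}).get("title"))
    else:
        print(f"Request failed: {errors}")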