-
Notifications
You must be signed in to change notification settings - Fork 1
/
create_elife_dataset.py
86 lines (64 loc) · 2.28 KB
/
create_elife_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import requests
import os
import Levenshtein
def calculate_nb_chars(original_sentence, simple_sentence):
    """Return the simplified-to-original character length ratio.

    The ratio is ``len(simple_sentence) / len(original_sentence)`` rounded to
    one decimal place.

    Raises:
        ZeroDivisionError: If ``original_sentence`` is empty.
    """
    simple_length = len(simple_sentence)
    original_length = len(original_sentence)
    ratio = simple_length / original_length
    return round(ratio, 1)
def get_levenshtein_similarity(complex_sentence, simple_sentence):
    """Return the Levenshtein ratio of the two sentences, rounded to one decimal.

    The value is whatever ``Levenshtein.ratio`` reports for the pair
    ``(complex_sentence, simple_sentence)``, rounded with ``round(..., 1)``.
    """
    similarity = Levenshtein.ratio(complex_sentence, simple_sentence)
    return round(similarity, 1)
def get_digest_original(article_id):
    """Fetch one eLife article and extract its digest and original text.

    Args:
        article_id: Article identifier interpolated into the gateway URL.

    Returns:
        A ``(digest, abstract_discussion)`` tuple of strings, where ``digest``
        is the concatenated ``text`` of every digest content block and
        ``abstract_discussion`` is the first abstract content block followed
        by the first content block of the discussion section.

        The integer sentinel ``1`` is returned when the article cannot be
        used: it has no digest, no readable abstract, or no readable section
        whose title contains "discussion". Callers compare against ``1`` to
        skip such articles.
    """
    url = "https://prod--gateway.elifesciences.org/articles/{}".format(
        article_id)
    article = requests.get(url).json()

    try:
        contents = article['digest']['content']
    except (KeyError, TypeError):
        return 1

    # Best effort: digest blocks without a textual 'text' field (e.g. images)
    # are skipped rather than treated as an error.
    digest = "".join(
        block['text']
        for block in contents
        if isinstance(block, dict) and isinstance(block.get('text'), str)
    )

    try:
        abstract = article['abstract']['content'][0]['text']
    except (KeyError, IndexError, TypeError):
        return 1

    # Stays None when no section matches, so the missing-discussion case is
    # reported through the sentinel instead of an unbound-variable crash.
    discussion = None
    for section in article.get('body', []):
        title = section.get('title') or ""
        if 'discussion' in title.lower():
            try:
                # If several sections match, the last one wins.
                discussion = section['content'][0]['text']
            except (KeyError, IndexError, TypeError):
                return 1
    if discussion is None:
        return 1

    abstract_discussion = abstract + discussion
    return (digest, abstract_discussion)
def main():
    """Build the eLife source/target dataset files.

    Requests one page of 100 articles from the eLife gateway. For every
    article for which ``get_digest_original`` returns a usable pair, one
    record is written to ``elife/source.txt`` (the ``<NbChars_…>``,
    ``<LevSim_…>`` and ``<Discussion>`` control tokens followed by the
    original text) and the matching digest is written to
    ``elife/target.txt``. Each record is preceded by a newline. Both files
    are rewritten from scratch on every run.
    """
    source_file = "elife/source.txt"
    target_file = "elife/target.txt"
    total_url = "https://prod--gateway.elifesciences.org/articles?per-page=100"

    # Fetch the listing first so a failed request does not truncate
    # previously generated files.
    items = requests.get(total_url).json()['items']

    for path in (source_file, target_file):
        os.makedirs(os.path.dirname(path), exist_ok=True)

    # Mode 'w' replaces any previous output; the context manager guarantees
    # both files are flushed and closed even if an article fails mid-run.
    # UTF-8 is explicit because article text routinely contains non-ASCII.
    with open(source_file, 'w', encoding='utf-8') as source, \
            open(target_file, 'w', encoding='utf-8') as target:
        for item in items:
            digest_original = get_digest_original(item['id'])
            if digest_original == 1:
                continue
            digest, original = digest_original
            if not original:
                # An empty original would divide by zero in the length ratio.
                continue

            nb_chars = calculate_nb_chars(original, digest)
            lev_sim = get_levenshtein_similarity(original, digest)
            source.write(
                "\n<NbChars_{}><LevSim_{}><Discussion>{}".format(
                    nb_chars, lev_sim, original))
            target.write("\n" + digest)
# Build the dataset only when this file is executed as a script, so importing
# the module for its helper functions triggers no network or file activity.
if __name__ == '__main__':
    main()