file_reading.py
import sys
import subprocess


def run_corenlp(corenlp_path, filename):
    """
    Run CoreNLP on a file with raw text.
    """
    with open(filename, 'r') as infile:
        line1 = infile.readline().split()
        line2 = infile.readline().split()
    # Guard against empty lines before indexing the first token of each line
    if line1 and line2 and line1[0] == '1' and line2[0] == '2':
        print("ERROR: File seems to be parsed already")
        sys.exit(1)
    classpath = corenlp_path + "/*"
    args = ['java', '-cp', classpath, '-Xmx1800m',
            'edu.stanford.nlp.pipeline.StanfordCoreNLP',
            '-annotators', 'tokenize,ssplit,pos,lemma,depparse',
            '-file', filename, '-outputFormat', 'conll']
    subprocess.call(args)
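# Usage sketch (the CoreNLP directory is an assumed example path, not part of
# this tool). CoreNLP typically writes its CoNLL output to <input filename>.conll
# in the current working directory:
#
#     run_corenlp('/opt/stanford-corenlp', 'story.txt')
#     # -> story.txt.conll, readable with read_parsed_data() below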
def read_parsed_data(filename, mode):
    """
    Read an input file in CoNLL-X format and create sentence dictionaries.
    If the file was parsed with CoreNLP, there will be 7 columns, not 8 as
    for CoNLL-X.
    """
    with open(filename, 'r') as infile:
        sentence = {}
        counter = 0
        instances = []
        for line in infile:
            token_dict = {}
            tokens = line.split()
            # A blank line marks the end of a sentence; the file is assumed
            # to end with one, so the final sentence is also flushed here
            if len(tokens) == 0:
                for key in sentence:
                    # Record the POS tag of each token's head; roots keep
                    # their own POS tag
                    head_index = int(sentence[key]['head']) - 1
                    if head_index > -1:
                        sentence[key]['head-pos'] = sentence[head_index][5]
                    else:
                        sentence[key]['head-pos'] = sentence[key][5]
                instances.append(sentence)
                sentence = {}
                counter = 0
                continue
            if counter == 0 and tokens[0] != '1':
                print("ERROR: Trying to read file that is not (properly) parsed")
                sys.exit(1)
            # Copy index, word form and lemma; the offset of 2 makes the
            # keys match the original CD dicts
            for i in range(3):
                if tokens[i] != "_":
                    token_dict[i + 2] = tokens[i]
            if mode == 'raw':
                token_dict[5] = tokens[3]
                token_dict['head'] = tokens[5]
                token_dict['deprel'] = tokens[6]
            else:
                token_dict[5] = tokens[4]
                token_dict['head'] = tokens[6]
                token_dict['deprel'] = tokens[7]
            sentence[counter] = token_dict
            counter += 1
    return instances
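# For illustration: a 7-column CoreNLP ('raw' mode) line such as
#
#     2    sat    sit    VBD    O    0    root
#
# becomes {2: '2', 3: 'sat', 4: 'sit', 5: 'VBD', 'head': '0', 'deprel': 'root'};
# 'head-pos' is filled in once the whole sentence has been read (this token is
# the root, so it keeps its own tag, 'VBD').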
def read_cuepredicted_data(filename, mode):
    """
    Read a file with predicted cues and create sentence dictionaries for scopes.
    """
    if mode == 'raw':
        lower_limit = 4  # end of columns with data that can be copied directly
        upper_limit = 6  # end of original data: the next index starts the cue columns
    else:
        lower_limit = 3
        upper_limit = 7
    cue_offset = upper_limit - 5  # to simplify the modulo calculation below
    with open(filename, 'r') as infile:
        sentence = {}
        cues = []
        mw_cues = []
        line_counter = 0
        counter = 0
        prev_cue_column = -1
        instances = []
        for line in infile:
            token_dict = {}
            tokens = line.split()
            # Check for sentence end (blank line)
            if len(tokens) == 0:
                for key in sentence:
                    # Record the head index and POS tag for each token in the sentence
                    head_index = int(sentence[key]['head']) - 1
                    if head_index > -1:
                        sentence[key]['head-pos'] = sentence[head_index][5]
                    else:
                        sentence[key]['head-pos'] = sentence[key][5]
                sentence['cues'] = cues
                sentence['mw_cues'] = mw_cues
                sentence['neg'] = len(cues) > 0
                instances.append(sentence)
                sentence = {}
                counter = 0
                prev_cue_column = -1
                cues = []
                mw_cues = []
                line_counter += 1
                continue
            for i in range(len(tokens)):
                if tokens[i] != "_" and i < lower_limit:
                    # Add an offset of 2 to make token dicts match the original CD dicts
                    token_dict[i + 2] = tokens[i]
                # Cue columns appear every third column after the original data
                elif tokens[i] != "***" and tokens[i] != "_" and i > upper_limit and (i - cue_offset) % 3 == 0:
                    if i == prev_cue_column:
                        # The same column has another cue token: the cue must be multiword
                        cues[-1][2] = 'm'
                        prev_cue_column = i
                        mw_cues.append([cues[-1][0], cues[-1][1]])
                        mw_cues.append([tokens[i], counter])
                    elif tokens[i] != tokens[1]:
                        # The cue does not match the token form: it must be an affixal cue
                        cues.append([tokens[i], counter, 'a'])
                        prev_cue_column = i
                    else:
                        # Single-word cue
                        cues.append([tokens[i], counter, 's'])
                        prev_cue_column = i
            if mode == 'raw':
                token_dict['head'] = tokens[5]
                token_dict['deprel'] = tokens[6]
            else:
                token_dict[5] = tokens[4]  # for CoNLL-X data: record only the POS tag, not the CPOS tag
                token_dict['head'] = tokens[6]
                token_dict['deprel'] = tokens[7]
            sentence[counter] = token_dict
            counter += 1
            line_counter += 1
    return instances
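# Minimal driver sketch, not part of the original tool; the CoreNLP path is an
# assumed example:
if __name__ == '__main__':
    raw_file = sys.argv[1]
    run_corenlp('/opt/stanford-corenlp', raw_file)
    sentences = read_parsed_data(raw_file + '.conll', 'raw')
    print("Read %d sentences from %s.conll" % (len(sentences), raw_file))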