forked from rubenIzquierdo/opinion_miner_deluxePP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_sequences.py
executable file
·97 lines (81 loc) · 2.93 KB
/
extract_sequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python
'''
Extract sequences from the output of CRF
'''
import sys
def _is_later(numseq, other):
    '''
    Return True if sequence number numseq is strictly greater than other.

    None (no '#' header line seen yet) ranks below every number, which is
    how Python 2 ordered None against integers; spelling it out keeps the
    comparison valid on Python 3 as well.
    '''
    if numseq is None:
        return False
    if other is None:
        return True
    return numseq > other


def _collect_tagged_runs(lines):
    '''
    Parse CRF output lines into runs of consecutive non-'O' tokens.

    Returns a pair (runs, word_for_id):
      - runs is a list of (num_sequence, [token ids]) in input order
      - word_for_id maps every token id seen to its token (last one wins)
    '''
    runs = []
    word_for_id = {}
    current = []
    num_sequence = None
    for line in lines:
        line = line.strip()
        if line.startswith('#'):
            # Header line such as "# 1 0.025510": the second field is the
            # sequence number. A header does not close the open run.
            num_sequence = int(line.split()[1])
            continue
        if line:
            # Token line: id <TAB> token <TAB> ... <TAB> class
            fields = line.split('\t')
            this_id = fields[0]
            word_for_id[this_id] = fields[1]
            if fields[-1] != 'O':
                current.append(this_id)
                continue
        # Blank line or an 'O' token: either one closes the open run
        if current:
            runs.append((num_sequence, current))
            current = []
    # The input may end without a trailing blank line
    if current:
        runs.append((num_sequence, current))
    return runs, word_for_id


def _drop_overlapping_runs(runs):
    '''
    Return the id lists of the runs that survive overlap filtering.

    A run is dropped when it shares at least one token id with a run that
    has a strictly lower sequence number (partial overlap is enough, the
    run does not need to be fully contained in the other one).
    '''
    # Build each id set once instead of once per compared pair
    id_sets = [set(ids) for _, ids in runs]
    kept = []
    for n1, (numseq1, ids1) in enumerate(runs):
        overlapped = any(
            _is_later(numseq1, numseq2)
            and not id_sets[n1].isdisjoint(id_sets[n2])
            for n2, (numseq2, _) in enumerate(runs)
        )
        if not overlapped:
            kept.append(ids1)
    return kept


def extract_sequences(input, this_type):
    '''
    Extract the sequences of tagged tokens from the output of CRF.

    input is either a filename (str), which is opened and read, or an
    iterable of lines (typically a list). Three kinds of lines are
    recognised:
      - lines starting with '#': header whose second whitespace-separated
        field is the integer sequence number (e.g. "# 1 0.025510")
      - empty lines: end of the current run of tagged tokens
      - any other line: tab-separated fields where the first is the token
        id, the second the token and the last the assigned class; the
        class 'O' marks a token outside any sequence

    this_type is not used by this function; it is kept so that existing
    callers keep working.

    Returns a list of (ids, words) tuples, one per surviving run, where ids
    is the list of token ids and words the list of the matching tokens.
    Runs overlapping with a run of a lower sequence number are discarded;
    identical runs are not de-duplicated.

    Raises IOError/OSError if the file cannot be opened, ValueError or
    IndexError on malformed lines, and TypeError if input is not iterable.
    '''
    if isinstance(input, str):
        # 'with' closes the file even when parsing raises
        with open(input, 'r') as handle:
            runs, word_for_id = _collect_tagged_runs(handle)
    else:
        runs, word_for_id = _collect_tagged_runs(input)

    final_sequences = []
    for ids in _drop_overlapping_runs(runs):
        words = [word_for_id[this_id] for this_id in ids]
        final_sequences.append((ids, words))
    return final_sequences
if __name__ == '__main__':
    # Command line usage: extract_sequences.py <crf_output_file> <type_label>
    if len(sys.argv) < 3:
        # Explicit usage message instead of a bare IndexError traceback
        sys.exit('Usage: %s <crf_output_file> <type_label>' % sys.argv[0])
    filename = sys.argv[1]
    this_type = sys.argv[2]
    sequences = extract_sequences(filename, this_type)
    for ids, words in sequences:
        # One sequence per line: type <TAB> words <TAB> token ids.
        # A single parenthesized argument prints the same text under both
        # Python 2 and Python 3, so the module stays importable on either.
        print('%s\t%s\t%s' % (this_type, ' '.join(words), ' '.join(ids)))