util.py
import re, os, json, operator, functools
import numpy as np
from collections import Counter, deque
from nltk.stem.snowball import FrenchStemmer, SnowballStemmer
from nltk.tokenize import WordPunctTokenizer

# sentenceTokenizer = nltk.data.load('tokenizers/punkt/PY3/french.pickle')
stemmer = SnowballStemmer("french")
tokenizer = WordPunctTokenizer()
# stemmer = FrenchStemmer()


# Takes a string and returns a list of words
def clean_text(text, stem=False):
    # Remove curly braces; those are metadata in the corpus
    text = re.sub(r'\{.*}', '', text)
    # Remove x2, x3 etc. (repeated-verse annotations)
    text = re.sub(r'[xX]\d+', '', text)
    # Replace purely stylistic chars with their plain equivalents
    text = re.sub(r'æ', 'ae', text)
    text = re.sub(r'œ', 'oe', text)
    text = re.sub(r'[ìíîï]', 'i', text)
    text = re.sub(r'[ýÿ]', 'y', text)
    text = re.sub(r'[òóôõö]', 'o', text)
    text = re.sub(r'[áâãä]', 'a', text)
    text = re.sub(r'ë', 'e', text)
    text = re.sub(r'ñ', 'n', text)
    text = re.sub(r'[ûü]', 'u', text)
    text = re.sub(r'[«“”»]', '"', text)
    text = re.sub(r'[…]', '...', text)
    # Character whitelist to avoid any unknown chars
    text = re.sub(r'[^a-zA-Z0-9 àáâãäçèéêëìíîïñòóôõöùúûüýÿ\'"\.,?;:!-]', '', text)
    tokens = tokenizer.tokenize(text)
    if stem:
        cleaned_tokens = [stemmer.stem(w) for w in tokens]
    else:
        cleaned_tokens = [w.lower() for w in tokens]
    return cleaned_tokens
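
# A minimal usage sketch (hypothetical lyric line, not taken from the corpus):
#   clean_text('Voilà {bruit} x3 … cœur')
# removes the braced metadata and the repeat marker, normalises œ and …,
# and returns something like ['voilà', '...', 'coeur'].
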
def clean_textfile(fullpath, stem=False):
    EOP = False
    with open(fullpath, 'r') as f:
        corpus = f.readlines()
    cleaned_corpus = []
    for line in corpus:
        cleaned_line = clean_text(line, stem)
        if len(cleaned_line) != 0:
            cleaned_line += ['<EOL>']
            EOP = False
            cleaned_corpus += cleaned_line
        else:
            # If EOP is already True we are inside a run of empty lines,
            # so only one <EOP> marker is emitted per paragraph break
            if EOP is False:
                cleaned_corpus += ['<EOP>']
                EOP = True
    if EOP is False:
        cleaned_corpus += ['<EOP>']
    return cleaned_corpus
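
# Sketch of the marker format this produces, assuming a file with two
# one-line verses separated by a blank line (hypothetical content):
#   Je te vois
#
#   Tu me vois
# becomes:
#   ['je', 'te', 'vois', '<EOL>', '<EOP>', 'tu', 'me', 'vois', '<EOL>', '<EOP>']
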
def get_regions_from_corpus(corpus):
    regions = []
    region = []
    for word in corpus:
        region.append(word)
        if word == '<EOP>':
            regions.append(region)
            region = []
    return regions
# Expects a list of token lists (e.g. the output of get_regions_from_corpus),
# written one sublist per line
def dump_corpus(corpus, fullpath):
    with open(fullpath, 'w') as f:
        new_text = [' '.join(sublist) for sublist in corpus]
        new_text = '\n'.join(new_text)
        f.write(new_text)
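
# Hypothetical round trip re-serialising a cleaned corpus with one
# paragraph per line (the file names are illustrative):
#   regions = get_regions_from_corpus(clean_textfile('lyrics.txt'))
#   dump_corpus(regions, 'lyrics_cleaned.txt')
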
# Falls back to the <UNK> id for out-of-vocabulary words
def word_to_id(wti_dict, word):
    if word in wti_dict:
        return wti_dict[word]
    else:
        return wti_dict['<UNK>']
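
# The wti_dict mapping is built elsewhere; a minimal compatible sketch
# (assumed format: one id per token plus an '<UNK>' fallback entry):
#   vocab = Counter(corpus)
#   wti_dict = {w: i for i, (w, _) in enumerate(vocab.most_common())}
#   wti_dict['<UNK>'] = len(wti_dict)
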
def make_sets(corpus, wti_dict, dev_test_size=0.1):
    counter = Counter()
    counter.update(corpus)
    nb_para = counter['<EOP>']
    nb_para_dev_test = max(int(nb_para * dev_test_size), 1)
    nb_para_train = nb_para - 2 * nb_para_dev_test
    para_indexes = [i for i, word in enumerate(corpus) if word == '<EOP>']
    # Slice just past the closing <EOP> so each set ends on a paragraph boundary
    end_train_set_index = para_indexes[nb_para_train - 1] + 1
    end_dev_set_index = para_indexes[nb_para_train + nb_para_dev_test - 1] + 1
    train_set = corpus[:end_train_set_index]
    train_set = [word_to_id(wti_dict, word) for word in train_set]
    dev_set = corpus[end_train_set_index:end_dev_set_index]
    dev_set = [word_to_id(wti_dict, word) for word in dev_set]
    test_set = corpus[end_dev_set_index:]
    test_set = [word_to_id(wti_dict, word) for word in test_set]
    return train_set, dev_set, test_set
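
# Worked example of the split arithmetic, with 20 paragraphs and the default
# dev_test_size of 0.1: nb_para_dev_test = max(int(20 * 0.1), 1) = 2, so
# 16 paragraphs go to train and 2 each to dev and test.
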
def load_corpus_as_sets(fullpath, wti_dict):
    corpus = clean_textfile(fullpath)
    return make_sets(corpus, wti_dict)
def print_learningconfig():
    for subdir, dirs, files in os.walk('results'):
        for file in files:
            if file == 'config.json':
                path = os.path.join(subdir, file)
                print(path)
                with open(path) as jsonData:
                    data = json.load(jsonData)
                    print(data['config'])
def get_nb_parameters(tf_var_list):
    nb_params = 0
    for t_var in tf_var_list:
        shape = t_var.get_shape().as_list()
        # An unknown first dimension (None) is usually the batch size: skip it
        if shape and not isinstance(shape[0], int):
            shape.pop(0)
        nb_params += get_nb_elements_from_shape(shape)
    return nb_params
def get_nb_elements_from_shape(shape):
    if len(shape) == 0:
        return 0
    return functools.reduce(operator.mul, shape)
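
# Sanity-check sketch for the shape helpers (the shapes are made up):
#   get_nb_elements_from_shape([128, 512])  # -> 65536
#   get_nb_elements_from_shape([])          # -> 0 (scalars count as 0 here)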