-
Notifications
You must be signed in to change notification settings - Fork 0
/
util.py
41 lines (29 loc) · 864 Bytes
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from os import path
import numpy as np
import re
import mojimoji
def get_kws(text, mt):
    """Split ``text`` into keywords using the tagger ``mt``.

    ``mt.parse(text)`` is expected to return a single string; that string is
    split on whitespace and the resulting list of tokens is returned.
    """
    parsed = mt.parse(text)
    tokens = parsed.split()
    return tokens
def get_vector_from_kws(kws, wv, w_by_kw, dim=None):
    """Return the weighted mean of the word vectors of ``kws``.

    Each keyword's vector ``wv[kw]`` is scaled by its weight ``w_by_kw[kw]``
    and the scaled vectors are averaged element-wise. A keyword for which
    either lookup raises ``KeyError`` is skipped.

    Args:
        kws: Iterable of keywords.
        wv: Keyword -> vector lookup, indexed as ``wv[kw]``.
        w_by_kw: Keyword -> weight lookup, indexed as ``w_by_kw[kw]``.
            NOTE: a stored weight equal to 0 is overwritten with 1 in place,
            so the caller's object is mutated.
        dim: Length of the zero vector returned when no keyword could be
            resolved. When omitted, ``wv.vector_size`` is used if ``wv``
            exposes that attribute (presumably a gensim-style model --
            confirm against callers); otherwise 200, the historical default.

    Returns:
        The element-wise mean of the weighted vectors as a NumPy array, or
        ``np.zeros(dim)`` when no keyword contributed a vector.
    """
    if dim is None:
        # Keep the fallback vector the same shape as the embedding instead
        # of assuming every model is 200-dimensional.
        dim = getattr(wv, "vector_size", 200)

    vectors = []
    for kw in kws:
        try:
            # A zero weight would erase the word entirely; treat it as 1.
            if w_by_kw[kw] == 0:
                w_by_kw[kw] = 1
            weighted = wv[kw] * w_by_kw[kw]
        except KeyError:
            # Unknown to the weight table or to the embedding: ignore it.
            continue
        vectors.append(weighted)

    if not vectors:
        return np.zeros(dim)
    return np.mean(np.array(vectors), axis=0)
def get_vector_from_text(text, mt, wv, w_by_kw):
    """Tokenize ``text`` with ``mt`` and return the vector of its keywords.

    Thin composition of ``get_kws`` (tokenization) and
    ``get_vector_from_kws`` (weighted averaging of word vectors).
    """
    return get_vector_from_kws(get_kws(text, mt), wv, w_by_kw)
def normalize_text(text):
    """Strip horizontal whitespace from ``text`` and normalize character width.

    Runs of spaces, tabs, form feeds and vertical tabs are removed (newlines
    are not matched by the pattern). The result is passed through
    ``mojimoji.zen_to_han`` and then ``mojimoji.han_to_zen`` with
    ``digit=False``.
    """
    stripped = re.compile(r'[ \t\f\v]+').sub('', text)
    narrowed = mojimoji.zen_to_han(stripped)
    return mojimoji.han_to_zen(narrowed, digit=False)