tools.py
from collections import defaultdict

import spacy


class SpacyTokenizer:
    """Tokenizer tool powered by the spaCy module."""

    def __init__(self, lang: str, disable=('parser', 'tagger', 'ner')):
        """Initialize the spaCy pipeline used for tokenization.

        Args:
            lang (str): name of the spaCy model to load, e.g. 'en_core_web_sm'
        """
        self._nlp = spacy.load(lang, disable=list(disable))

    def tokenize(self, text: str) -> list:
        """Split the text into sentences and tokenize each sentence."""
        # split on '.' so that newline characters are not kept as tokens
        lines = text.split('.')
        doc = [[token.text for token in self._nlp.tokenizer(line.strip())]
               for line in lines]
        return doc


class Dictionary:
    """Tool to build a word-to-index mapping (word2idx) and convert documents to indices.

    Args:
        doc (list): list of documents, each a list of word tokens
    """

    def __init__(self, doc=None):
        self.vocab_size = 0
        self.word2idx = defaultdict(int)
        self.update(doc)

    def update(self, doc: list):
        """Update word2idx with the words found in doc.

        Args:
            doc (list): list of documents, each a list of word tokens
        """
        if doc is None:
            return
        vocab_size, word2idx = self.vocab_size, self.word2idx
        # collect the unique tokens, preserving their order of first appearance
        tokens = dict()
        for line in doc:
            for t in line:
                tokens[t] = 1
        # assign a new index to every token not seen before
        for token in tokens:
            if token not in word2idx:
                word2idx[token] = vocab_size
                vocab_size += 1
        self.vocab_size = vocab_size

    def corpus(self, doc: list) -> list:
        """Convert documents of words to documents of indices.

        Args:
            doc (list): list of documents, each a list of word tokens
        Returns:
            list: documents with each word replaced by its index
        """
        word2idx = self.word2idx
        corpus = [[word2idx[word] for word in line if word in word2idx]
                  for line in doc]
        return corpus

if __name__ == '__main__':
tokenizer = SpacyTokenizer('en_core_web_sm')
text = "This is an apple. \n This is a tea."
doc = tokenizer.tokenize(text)
print(doc)
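    # A minimal sketch of how Dictionary might be combined with the tokenizer
    # output above; the variable name `dictionary` and the printed calls are
    # illustrative additions, not part of the original demo.
    dictionary = Dictionary(doc)
    print(dictionary.word2idx)     # mapping such as {'This': 0, 'is': 1, ...}
    print(dictionary.corpus(doc))  # the same sentences as lists of indices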