-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data.py
86 lines (65 loc) · 2.28 KB
/
Data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import PyPDF2
import re
import numpy as np
import pandas as pd
def tokenize(data):
    """Return the word-like tokens found in the string *data*, in order.

    A token is a run of regex word characters, carets (``^``) and
    apostrophes that contains at least one ASCII letter, so ``"world's"``
    and ``"a1"`` are kept while a purely numeric run such as ``"123"`` is
    dropped. Inside the character classes the ``^`` is a literal caret,
    not a negation, because it is not the first character of the class.
    """
    word_regex = r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*'
    # The pattern has no capture groups, so each match's group(0) is
    # exactly what ``findall`` would report.
    return [match.group(0) for match in re.finditer(word_regex, data)]
def read_pdf_file(name):
    """Collect the word tokens of every page of a PDF file, in page order.

    Args:
        name: Location of the PDF file; handed unchanged to
            ``PyPDF2.PdfReader``.

    Returns:
        A flat list with the tokens ``tokenize`` produces for the text of
        each page, pages visited in document order.
    """
    reader = PyPDF2.PdfReader(name)
    words = []
    for page in reader.pages:
        # Use the snake_case ``extract_text`` that belongs to the same API
        # generation as ``PdfReader``; the legacy camelCase ``extractText``
        # spelling was removed in PyPDF2 3.0.
        words.extend(tokenize(page.extract_text()))
    return words
def read_txt_file(name):
    """Collect the word tokens of a text file, line by line.

    The file is opened in text mode with the platform's default encoding
    and each line is passed through ``tokenize``; the per-line results are
    concatenated into one flat list that is returned.
    """
    words = []
    with open(name, 'r') as handle:
        for row in handle:
            words.extend(tokenize(row))
    return words
class Data:
    """Tokenized text corpus with word-transition statistics.

    Attributes set by ``__init__``:
        data: the corpus tokens in reading order, lower-cased.
        first_words: the tokens that started with an upper-case character,
            in their original casing (collected before lower-casing).
        tokens: dict mapping each distinct lower-cased word to an integer id.
    """

    def __init__(self, name):
        """Load and index the corpus stored in the file *name*.

        Args:
            name: Path string ending in ``.pdf`` or ``.txt`` (the extension
                check ignores case).

        Raises:
            ValueError: If the extension is neither ``.pdf`` nor ``.txt``.
        """
        lowered = name.lower()
        if lowered.endswith('.pdf'):
            self.data = read_pdf_file(name)
        elif lowered.endswith('.txt'):
            self.data = read_txt_file(name)
        else:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError("Incorrect file format. Supported file types: pdf, txt")
        # Capitalised words must be collected before normalize() lower-cases
        # the data. NOTE: the attribute keeps its historical name and so
        # shadows the first_words() method on the instance from here on.
        self.first_words = self.first_words()
        self.normalize()
        self.tokens = self.mapping()

    def transitions(self, tokens, k):
        """Build the row-normalised matrix of word-to-following-word counts.

        For every position ``i`` in ``self.data`` except the last, each of
        the up to *k* words that follow it adds one to the cell
        ``[tokens[word_i], tokens[following_word]]``.

        Args:
            tokens: Mapping from word to row/column index; every word of
                ``self.data`` that is counted must be a key.
            k: How many following positions are counted for each word.

        Returns:
            A ``(len(tokens), len(tokens))`` float array in which each row
            is divided by its sum; rows without any count stay all zero.
        """
        size = len(tokens)
        matrix = np.zeros((size, size))
        total = len(self.data)
        for i in range(total - 1):
            # Clamp the window to the end of the corpus instead of catching
            # IndexError, which would also hide genuine indexing mistakes.
            last = min(i + k, total - 1)
            for j in range(i + 1, last + 1):
                matrix[tokens[self.data[i]], tokens[self.data[j]]] += 1
        row_sums = matrix.sum(axis=1, keepdims=True)
        # Divide only where a row has counts; the zero-initialised ``out``
        # supplies the result for empty rows, so no NaN is ever produced.
        return np.divide(
            matrix, row_sums, out=np.zeros_like(matrix), where=row_sums != 0
        )

    def first_words_dist(self):
        """Return the relative frequency of each word among ``first_words``.

        Returns:
            A 1-D float array of length ``len(self.tokens)`` indexed by
            token id. It is all zeros when there are no first words, which
            avoids a 0/0 division.
        """
        counts = np.zeros(len(self.tokens))
        if not self.first_words:
            return counts
        for word in self.first_words:
            counts[self.tokens[word.lower()]] += 1
        return counts / len(self.first_words)

    def tokens_ids(self):
        """Return a dict mapping integer id -> lower-cased word.

        Ids are assigned in order of first appearance in ``self.data``,
        the same ordering ``mapping`` uses.
        """
        return {
            i: word.lower() for i, word in enumerate(dict.fromkeys(self.data))
        }

    def mapping(self):
        """Return a dict mapping each distinct word of ``self.data`` -> id.

        ``dict.fromkeys`` de-duplicates while keeping first-appearance
        order, so ids are reproducible from run to run (set iteration
        order of strings is not).
        """
        return {word: i for i, word in enumerate(dict.fromkeys(self.data))}

    def normalize(self):
        """Lower-case every token of ``self.data`` (rebinding the list)."""
        self.data = [word.lower() for word in self.data]

    def first_words(self):
        """Return the tokens of ``self.data`` whose first character is upper-case.

        Original casing and order are kept; the ``[:1]`` slice makes an
        empty token count as not capitalised instead of raising.
        """
        return [word for word in self.data if word[:1].isupper()]