-
Notifications
You must be signed in to change notification settings - Fork 0
/
tfidf.py
56 lines (48 loc) · 1.38 KB
/
tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import numpy as np
import math
import svr
def _iter_review_words(train_file):
    """Yield the whitespace-split word list of each review line in the file.

    A review line is any line containing the ``'review/text: '`` marker; the
    words are taken from the text that follows the marker.
    """
    marker = 'review/text: '
    with open(train_file, 'r') as f:
        for line in f:
            # partition() cuts right after the marker wherever it occurs,
            # instead of assuming the marker starts at column 0.
            _, found, text = line.partition(marker)
            if found:
                yield text.split()


def tfidf(train_file):
    """Build a dense TF-IDF feature matrix from the reviews in a file.

    The file is read twice: once to count, for every word, the number of
    reviews it occurs in, and once to fill in the per-review weights.

    Args:
        train_file: Path to a text file in which each review is a line
            containing ``'review/text: '`` followed by the review text.
            Lines without that marker are ignored.

    Returns:
        A ``numpy`` array of shape ``(num_reviews, vocabulary_size)``.
        Entry ``[d, w]`` is ``(count of w in review d / words in review d)
        * ln(num_reviews / reviews containing w)``. Rows follow the order of
        the reviews in the file; columns follow the iteration order of the
        vocabulary dict. A file with no reviews yields a ``(0, 0)`` array.

    Raises:
        IOError/OSError: If ``train_file`` cannot be opened.
    """
    # Pass 1: document frequency (number of reviews containing each word).
    doc_freq = {}
    num_docs = 0
    for words in _iter_review_words(train_file):
        num_docs += 1
        seen_in_review = set()
        for word in words:
            # Count each word at most once per review.
            if word not in seen_in_review:
                seen_in_review.add(word)
                doc_freq[word] = doc_freq.get(word, 0) + 1

    # Turn counts into IDF weights and assign every word a column index.
    idf = {}
    word_index_lookup = {}
    for column, (word, count) in enumerate(doc_freq.items()):
        # float() forces true division; under Python 2 the plain `/` on two
        # ints would floor the ratio before taking the log.
        idf[word] = math.log(float(num_docs) / count)
        word_index_lookup[word] = column

    # Pass 2: accumulate idf / review_length once per word occurrence,
    # which sums to term_frequency * idf for each (review, word) cell.
    tfidf_features = np.zeros((num_docs, len(idf)))
    for doc_i, words in enumerate(_iter_review_words(train_file)):
        for word in words:
            tfidf_features[doc_i, word_index_lookup[word]] += (
                idf[word] / len(words))
    return tfidf_features
if __name__ == "__main__":
    # For unit testing purposes only: run the feature extraction once on the
    # small training set.
    features = tfidf('dataset/small_train.txt')