tf_idf.py
#!/bin/python
import re
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
import numpy as np
import operator
import hashlib
from nltk import stem
# Not checking this data file into GitHub because the data/ folder is 20 GB. Working on travel.stackexchange.com for now
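# Overview: this script reads one Stack Exchange question post per line from the
# "questions" data file, builds term-frequency counts over the stemmed body and
# title words of each post, computes inverse document frequencies across all
# posts, and treats the NO_OF_TAGS_TO_CONSIDER highest-scoring tf-idf words of a
# post as its predicted tags. It writes the stemmed questions to
# questions_stemmed / questions_id_stemmed, the actual tags to
# test_data_labels.csv, and the predicted words to predicted_labels.csv, all
# keyed by a SHA-256 hash of the title and question text.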
filename = "questions"
NO_OF_TAGS_TO_CONSIDER = 4
num_of_posts = 0
post_properties = []
all_posts_words = []
idf = {}
questions = []
titles = []
tags_collection = []
test_data_file_name = "test_data_labels.csv"
predicted_file_name = "predicted_labels.csv"
test_file = open(test_data_file_name, "w")
pred_file = open(predicted_file_name, "w")
stopset = set(stopwords.words('english'))
snowball = stem.snowball.EnglishStemmer()
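# The regexes below assume each input line carries XML-style attributes of the
# form Body="..." Title="..." Tags="<tag1><tag2>", as in a row of a Stack
# Exchange data-dump Posts file; the exact dump format is an assumption here.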
with open(filename, "r") as file:
    # Each line is actually a question post from Stack Exchange
    for line in file:
        num_of_posts = num_of_posts + 1
        post_property = {}
        post_property['tf'] = {}
        post_property['idf'] = {}  # Redundant, not needed
        question_match = re.search(r'Body=\"(.*?)\"', line)
        question_body = question_match.group(1)
        question_body = question_body.replace('<p>', '').replace('</p>', '')
        title_match = re.search(r'Title=\"(.*?)\"', line)
        title = title_match.group(1)
        # words = question_body.split(" ")
        for c in string.punctuation:
            question_body = question_body.replace(c, "")
        words = word_tokenize(question_body)
        # Stemming and stop word removal
        words = [w for w in words if w not in stopset]
        words = [(snowball.stem(w.decode('utf-8-sig'))).encode('utf-8-sig').lower() for w in words]
        this_post_words = words
        for word in words:
            if word in post_property['tf']:
                post_property['tf'][word] = post_property['tf'][word] + 1
            else:
                post_property['tf'][word] = 1
        # words = title.split(" ")
        for c in string.punctuation:
            title = title.replace(c, "")
        words = word_tokenize(title)
        # Stemming and stop word removal
        words = [w for w in words if w not in stopset]
        words = [(snowball.stem(w.decode('utf-8-sig'))).encode('utf-8-sig').lower() for w in words]
        this_post_words += words
        for word in words:
            if word in post_property['tf']:
                post_property['tf'][word] = post_property['tf'][word] + 1
            else:
                post_property['tf'][word] = 1
        all_posts_words.append(this_post_words)
        tags_match = re.search(r'Tags=\"(.*?)\"', line)
        tags = tags_match.group(1)
        tags = tags.replace('><', ',').replace('<', '').replace('>', '')
        tags = tags.split(',')
        post_properties.append(post_property)
        questions.append(question_body)
        titles.append(title)
        tags_collection.append(tags)
        # print "\nTitle = %s \nQuestion = %s\nTags=%s"%(title, question_body, tags)
for post_number, post_words in enumerate(all_posts_words):
    for word in post_words:
        if word not in idf:
            idf[word] = set()
        idf[word].add(post_number)
for word in idf.keys():
    idf[word] = np.log(float(num_of_posts) / len(idf[word]))
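# idf(w) = ln(N / df(w)), where N is the number of posts and df(w) is the
# number of posts containing w; the tf-idf score used below for a word w in
# post d is tf(w, d) * idf(w).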
questions_only_file = open('questions_stemmed', 'w')
questions_with_id_file = open('questions_id_stemmed', 'w')
for post_number, post_words in enumerate(all_posts_words):
    word_score = {}
    doc_hash = hashlib.sha256(titles[post_number] + " " + questions[post_number]).hexdigest()
    questions_only_file_line = ''
    questions_id_file_line = doc_hash + ': '
    for word in post_words:
        questions_only_file_line += snowball.stem(word.decode('utf-8-sig')).encode('utf-8-sig').lower() + ' '
        questions_id_file_line += snowball.stem(word.decode('utf-8-sig')).encode('utf-8-sig').lower() + ' '
        word_score[word] = post_properties[post_number]['tf'][word] * idf[word]
    # The lines are already byte strings, so write them as-is
    questions_only_file.write(questions_only_file_line + '\n')
    questions_with_id_file.write(questions_id_file_line + '\n')
    sorted_word_scores = sorted(word_score.items(), key=operator.itemgetter(1))[::-1]
    # print titles[post_number]+" "+questions[post_number]
    test_file_line = doc_hash
    tags = tags_collection[post_number]
    for tag in tags:
        test_file_line += "," + (snowball.stem(tag.decode('utf-8-sig'))).encode('utf-8-sig').lower()
    test_file.write(test_file_line + "\n")
    # print "Title: %s"%titles[post_number]
    # print "Question: %s"%questions[post_number]
    # print "Actual Tags: %s"%tags_collection[post_number]
    pred_file_line = doc_hash
    # print "Predicted Top %d tags with scores :"%NO_OF_TAGS_TO_CONSIDER
    for i in range(min(NO_OF_TAGS_TO_CONSIDER, len(sorted_word_scores))):
        print sorted_word_scores[i][0],
        pred_file_line += "," + sorted_word_scores[i][0]
    print("\n" + "*" * 160 + "\n")
    pred_file.write(pred_file_line + "\n")
pred_file.close()
test_file.close()
questions_only_file.close()
questions_with_id_file.close()
# Report corpus-level counts: distinct tags seen, and number of posts processed
tag_set = set()
for tag_words in tags_collection:
    for tag in tag_words:
        tag_set.add(tag)
print len(tag_set)
print len(tags_collection)
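# Usage note (assumptions: Python 2, NLTK with the 'punkt' and 'stopwords' data
# downloaded, and the "questions" file present in the working directory):
#   python tf_idf.py
# The script prints the top-scoring stemmed words for each post as it runs, and
# finally the number of distinct tags and the number of posts processed.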