-
Notifications
You must be signed in to change notification settings - Fork 0
/
query_processor.py
160 lines (110 loc) · 4.92 KB
/
query_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import pandas as pd
import numpy as np
import string
import math
import unicodedata as ud
from indexer import preprocessing, termFrequency, TF_Process, IDF_Process
# Function for query preprocessing
def query_Preprocessing(query):
    """Normalize a raw query string into a list of search terms.

    The query is lowercased, stripped of the characters in
    ``string.punctuation``, split on whitespace, stripped of acute accents
    and finally filtered against the stopwords stored in ``stopwords.txt``
    (a relative path, so it is resolved against the current working
    directory).

    Args:
        query: The raw query string.

    Returns:
        The surviving terms as a list, in their original order.

    Raises:
        OSError: If ``stopwords.txt`` cannot be opened.
    """
    # Make all letters lowercase and remove punctuation.
    query = query.lower()
    query = query.translate(str.maketrans('', '', string.punctuation))

    # Remove diacritics: NFD splits an accented letter into the base letter
    # plus a combining mark, and the translate table deletes the acute accent.
    acute_accent = {ord('\N{COMBINING ACUTE ACCENT}'): None}
    words = [
        ud.normalize('NFD', word).translate(acute_accent)
        for word in query.split()
    ]

    # Load the stopwords as a set of whole words. Testing membership against
    # the raw file text would be a substring test and would wrongly discard
    # any term that merely occurs inside a stopword (e.g. "he" in "the").
    # NOTE(review): assumes the entries are separated by whitespace and/or
    # commas -- confirm against the actual stopwords.txt.
    with open("stopwords.txt", "r") as f:
        stopwords = set(f.read().replace(",", " ").split())

    # A token made only of accents is empty by now; drop it with the stopwords.
    return [word for word in words if word and word not in stopwords]
# Function to calculate TF for every term in the query
def query_TF_Process(query, data, globalDict):
    """Register the query as a pseudo-document and compute its term frequencies.

    Args:
        query: List of preprocessed query terms (duplicates allowed).
        data: Unused; kept so existing callers keep working.
        globalDict: Mapping of term -> list of the documents containing it.
            It is not modified.

    Returns:
        A tuple ``(queryGlobalDict, TF)`` where ``queryGlobalDict`` is a copy
        of ``globalDict`` in which every distinct query term additionally
        lists the marker ``"query"`` exactly once, and ``TF`` maps each
        distinct query term to the value returned by ``termFrequency``.
    """
    queryGlobalDict = globalDict.copy()

    # Count each term once up front instead of calling query.count() per term.
    counts = {}
    for term in query:
        counts[term] = counts.get(term, 0) + 1

    for term in counts:
        # Build a new posting list rather than appending in place: the copy
        # above is shallow, so appending would also change the caller's
        # globalDict. Iterating the distinct terms guarantees "query" is
        # added only once even when the term is repeated in the query.
        queryGlobalDict[term] = list(queryGlobalDict.get(term, [])) + ["query"]

    # Highest number of occurrences of any single term in the query.
    max_frequency = max(counts.values(), default=0)

    # Calculate the TF for each distinct term.
    TF = {term: termFrequency(term, query, max_frequency) for term in counts}
    return queryGlobalDict, TF
# Function to calculate IDF for every term
def query_IDF_Process(query, queryGlobalDict, dictionaryIDF, total_docs):
    """Return the IDF table extended (or overridden) with the query terms.

    Args:
        query: List of preprocessed query terms.
        queryGlobalDict: Mapping of term -> list of documents containing it;
            must contain every query term.
        dictionaryIDF: Existing term -> IDF mapping. It is copied, not
            modified.
        total_docs: Number of documents used as the numerator of the ratio.

    Returns:
        A new mapping in which every query term has the value
        ``1 + ln(total_docs / len(queryGlobalDict[term]))``.
    """
    idf_by_term = dictionaryIDF.copy()
    idf_by_term.update({
        term: 1 + math.log(total_docs / len(queryGlobalDict[term]))
        for term in query
    })
    return idf_by_term
# Function to create the tf-idf vector of a document
def create_vector(single_doc, queryDictionaryIDF, queryGlobalDict):
    """Build the tf-idf weights of one document over the whole vocabulary.

    Args:
        single_doc: Mapping of term -> term frequency for the document.
        queryDictionaryIDF: Mapping of term -> IDF.
        queryGlobalDict: Mapping whose keys define the vocabulary and the
            order of the resulting entries.

    Returns:
        A dict with one entry per key of ``queryGlobalDict``: ``tf * idf``
        when the term appears in ``single_doc``, otherwise ``0``.
    """
    return {
        term: single_doc[term] * queryDictionaryIDF[term] if term in single_doc else 0
        for term in queryGlobalDict
    }
# Function to perform cosine similarity
def cosine_formula(query, doc):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        query: Sequence of numbers (the query's weights).
        doc: Sequence of numbers (the document's weights), same length.

    Returns:
        ``dot(query, doc) / (|query| * |doc|)``, or ``0.0`` when either
        vector has zero length, since the ratio is undefined there.
    """
    query_norm = np.linalg.norm(query)
    doc_norm = np.linalg.norm(doc)
    # A zero vector has no direction; report "no similarity" instead of
    # dividing by zero and returning nan.
    if query_norm == 0 or doc_norm == 0:
        return 0.0
    # The parentheses matter: dot / a * b would multiply by the second norm
    # instead of dividing by it.
    return np.dot(query, doc) / (query_norm * doc_norm)
# This is the function that builds the similarity matrix between the query and the documents
def cosine_similarity(query_vector_list, tfidf_list):
    """Score every document vector against the query vector.

    Args:
        query_vector_list: The query's weights as a list.
        tfidf_list: List of dicts (term -> weight), one per document; the
            values of each dict are used in the dict's own iteration order.

    Returns:
        A list holding the ``cosine_formula`` result for each document, in
        the order of ``tfidf_list``.
    """
    scores = []
    for doc_weights in tfidf_list:
        doc_vector = list(doc_weights.values())
        scores.append(cosine_formula(query_vector_list, doc_vector))
    return scores
# This is the main query search function
def query_search(query):
    """Rank the documents of ``parliament.csv`` against a free-text query.

    The CSV is loaded and indexed from scratch on every call, the query is
    preprocessed, both sides are turned into tf-idf vectors over the same
    vocabulary, and every document is scored with ``cosine_similarity``.

    Args:
        query: The raw query string.

    Returns:
        A tuple ``(query_data, queryTFIDF)``: ``query_data`` is a copy of the
        preprocessed data with an added ``Score`` column, sorted by
        descending score; ``queryTFIDF`` maps every vocabulary term to the
        query's tf-idf weight (``0`` for terms absent from the query).
    """
    # Load and preprocess the data.
    data = preprocessing(pd.read_csv("parliament.csv"))

    # globalDict: all the terms (keys) and the documents where they can be
    #             found (values).
    # documentsTF: list of dictionaries with the terms of a document and
    #              their frequency in it.
    globalDict, documentsTF = TF_Process(data)
    total_docs = data.shape[0]

    # dictionaryIDF: all the terms (keys) and their respective IDF (values).
    dictionaryIDF = IDF_Process(globalDict, total_docs)

    # Preprocess the query, then build the same dictionaries as above with
    # the query terms added to them.
    query = query_Preprocessing(query)
    queryGlobalDict, queryTF = query_TF_Process(query, data, globalDict)
    queryDictionaryIDF = query_IDF_Process(
        query, queryGlobalDict, dictionaryIDF, total_docs
    )

    # Give every document an explicit zero tf for each term it lacks.
    for doc_tf in documentsTF:
        for term in queryGlobalDict:
            doc_tf.setdefault(term, 0)

    # One tf-idf vector per document. (tfidf_list could also be attached to
    # the result as an extra column if it is ever wanted there.)
    tfidf_list = [
        create_vector(doc_tf, queryDictionaryIDF, queryGlobalDict)
        for doc_tf in documentsTF
    ]

    # tf-idf of the query over the whole vocabulary: zero everywhere except
    # for the terms the query actually contains.
    queryTFIDF = dict.fromkeys(queryGlobalDict, 0)
    for term in query:
        queryTFIDF[term] = queryDictionaryIDF[term] * queryTF[term]

    # Turn the query tf-idf dictionary into a list and score every document.
    query_vector_list = list(queryTFIDF.values())
    similarity_matrix = cosine_similarity(query_vector_list, tfidf_list)

    # Attach the scores to a copy of the data, best match first.
    query_data = data.copy()
    query_data['Score'] = np.array(similarity_matrix)
    query_data = query_data.sort_values(by=['Score'], ascending=False)
    return query_data, queryTFIDF