import ast
import json
import pickle
import pprint
from collections import Counter

import fire
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics.pairwise import linear_kernel

from preprocess import preprocess_text


def recommendation(query, tfidf_model, tfidf_matrix, paperTitles):
    """
    Takes the query, performs basic preprocessing, and converts it into
    a TF-IDF vector. Returns the paper titles most similar to the query
    based on cosine similarity.

    Parameters
    ----------
    query : np.ndarray of str
        Search query (or queries) for the paper.
    tfidf_model : sklearn.feature_extraction.text.TfidfVectorizer
        Fitted TfidfVectorizer used to vectorize the query.
    tfidf_matrix : array-like
        Precomputed TF-IDF vectors of the paper summaries, one row per paper.
    paperTitles : list of str
        Titles of the papers, aligned with the rows of tfidf_matrix.

    Returns
    -------
    result : list of str
        Titles of the 10 most similar papers, best match first.
    """
    # Preprocess the query and vectorize it with the fitted model
    processedQuery = np.array(list(map(preprocess_text, query)))
    tfidfQuery = tfidf_model.transform(processedQuery)
    # On L2-normalized TF-IDF vectors, the linear kernel (dot product)
    # equals cosine similarity
    cosine_similarities = linear_kernel(tfidf_matrix, tfidfQuery)
    # Indices of the 10 highest-scoring papers, most similar first
    related_docs_indices = np.argsort(
        cosine_similarities, axis=0)[-10:].reshape((-1,))[::-1]
    return [paperTitles[idx] for idx in related_docs_indices]
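
# A minimal usage sketch (hypothetical query string; `vectorizer`, `vectSum`,
# and `paperTitles` are the artifacts loaded in model() below):
#
#     query = np.array(["generative adversarial networks"], dtype=object)
#     titles = recommendation(query, vectorizer, vectSum, paperTitles)
#     # titles -> the 10 closest paper titles, best match first
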
def get_related_topic(query, nmf_model, tfidf_model, topic_dict):
    """
    Preprocesses and vectorizes the query, then returns the index of the
    NMF topic it is most strongly associated with, together with that
    topic's entry in topic_dict.
    """
    processedQuery = np.array(list(map(preprocess_text, query)))
    vectQuery = tfidf_model.transform(processedQuery)
    # nmf_model.transform returns a (no_of_queries, no_of_topics) matrix
    # of non-negative topic weights for the query
    topic_probability_scores = nmf_model.transform(vectQuery)
    # Sum the weights across query documents and pick the strongest topic
    query_topic = np.argmax(np.sum(topic_probability_scores, axis=0))
    return query_topic, topic_dict[query_topic]
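
# Sketch of mapping a query to its dominant topic (same artifacts as above):
#
#     topic_no, topic_entry = get_related_topic(query, nmf_model,
#                                               vectorizer, topic_dict)
#     # topic_no -> integer topic index; topic_entry -> topic_dict[topic_no]
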
def plot_count_dict(cdict, title, sort='key'):
    """
    Plots a horizontal bar chart of a {label: count} dict, sorted by
    either its keys or its values.
    """
    if sort == 'key':
        items = sorted(cdict.items(), key=lambda x: x[0])
    elif sort == 'value':
        items = sorted(cdict.items(), key=lambda x: x[1])
    else:
        raise ValueError("sort takes either 'key' or 'value'")
    labels = [i[0] for i in items]
    counts = [i[1] for i in items]
    plt.barh(labels, counts)
    plt.title(title)
    plt.show()
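
# Example with made-up counts, sorted by value so the largest count ends
# up as the top bar:
#
#     plot_count_dict({"2017": 12, "2018": 30, "2019": 21},
#                     "Papers per year (example)", sort='value')
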
def get_papers_per_year(data_dir, data, topic_no):
    """
    Returns a Counter mapping publication year to the number of papers
    labeled with topic topic_no.
    """
    with open(data_dir + "topic_labels.pk", "rb") as fp:
        topic_labels = pickle.load(fp)
    # Indices of the papers assigned to this topic
    idxs = np.where(topic_labels == topic_no)[0]
    years = [data[i]["year"] for i in idxs]
    return Counter(years)


def top_authors(data_dir, data, topic_no):
    """
    Returns a Counter mapping author name to the number of papers that
    author has under topic topic_no.
    """
    with open(data_dir + "topic_labels.pk", "rb") as fp:
        topic_labels = pickle.load(fp)
    idxs = np.where(topic_labels == topic_no)[0]
    authors = []
    for i in idxs:
        # The "author" field is a Python-literal string; parse it with
        # ast.literal_eval rather than eval, which would execute
        # arbitrary code from the data file
        temp_auth = ast.literal_eval(data[i]["author"])
        for auth in temp_auth:
            authors.append(auth["name"])
    return Counter(authors)
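
# Sketch of the two aggregations for a hypothetical topic index 3 (assumes
# data_dir holds arxivData.json and topic_labels.pk):
#
#     per_year = get_papers_per_year("./data/", data, 3)  # Counter({year: n})
#     authors = top_authors("./data/", data, 3)           # Counter({name: n})
#     authors.most_common(10)  # ten most prolific authors for the topic
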
def model(search_query, data_dir="./data/"):
    """
    Takes a query and the path to the data directory, prints the titles
    of the top 10 papers related to the query, and shows two horizontal
    bar plots for the query's topic.

    Parameters
    ----------
    search_query : string
        Query to be searched for in the dataset.
    data_dir : string
        Path to the directory storing the required data files.
    """
    with open(data_dir + "arxivData.json", "r") as fp:
        data = json.load(fp)
    paperTitles = list(map(lambda x: x["title"], data))
    # Load the fitted vectorizer, precomputed TF-IDF vectors,
    # NMF topic model, and topic dictionary
    with open(data_dir + "vectorizer.pk", "rb") as fp:
        vectorizer = pickle.load(fp)
    with open(data_dir + "tfidf-vectors-200.pk", "rb") as fp:
        vectSum = pickle.load(fp)
    with open(data_dir + "nmf_model.pk", "rb") as fp:
        nmf_model = pickle.load(fp)
    with open(data_dir + "topic_dict.pk", "rb") as fp:
        topic_dict = pickle.load(fp)

    query = np.array([search_query], dtype=object)
    recommendedList = recommendation(query, vectorizer, vectSum, paperTitles)
    topic_no, _ = get_related_topic(query, nmf_model, vectorizer, topic_dict)
    pprint.pprint(recommendedList)

    per_year_count = get_papers_per_year(data_dir, data, topic_no)
    plot_count_dict(
        per_year_count, "Papers published related to the topic per year")
    auth_count = top_authors(data_dir, data, topic_no)
    plot_count_dict(dict(auth_count.most_common(10)),
                    "Top authors and number of papers",
                    "value")

if __name__ == "__main__":
fire.Fire(model)
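
# Example CLI invocation via python-fire (assumes ./data/ holds arxivData.json
# and the pickled artifacts loaded in model() above):
#
#     python model.py --search_query "reinforcement learning" --data_dir ./data/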