-
Notifications
You must be signed in to change notification settings - Fork 0
/
diffOverIncReverse.py
124 lines (97 loc) · 3.28 KB
/
diffOverIncReverse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import nltk
import string
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
import scipy.sparse as sps
import nltk.data
import matplotlib.pyplot
from matplotlib.widgets import Slider
import pylab
import mpld3
# Pre-trained English Punkt sentence splitter shipped with NLTK.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Hard-coded input document path -- NOTE(review): machine-specific; consider making configurable.
path = '/Users/tuckerkirven/Desktop/Independent Study/levelDesign1.txt'
token_dict = {}  # one-entry corpus: text AFTER the current sentence (rebuilt each loop iteration)
token_dict_full = {}  # one-entry corpus: the full document
token_dict2= {}  # NOTE(review): appears unused in this file
sentence_list = []  # filled below with (index, score, sentence) tuples
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    """Return a list with every token in *tokens* reduced via *stemmer*.stem()."""
    return [stemmer.stem(token) for token in tokens]
def tokenize(text):
    """Split *text* into word tokens and Porter-stem each one.

    Used as the tokenizer callback for TfidfVectorizer below; relies on the
    module-level `stemmer` instance.
    """
    return stem_tokens(nltk.word_tokenize(text), stemmer)
file_path = path
# Read the whole document; `with` guarantees the handle is closed (the
# original left it open).
with open(file_path, 'r') as doc:
    text = doc.read()
# Strip non-ASCII bytes -- Punkt and the vectorizer choke on them here.
text = re.sub('[^\x00-\x7F]+', ' ', text)
sentences = tokenizer.tokenize(text)
lowers = text.lower()
no_punctuation = lowers.translate(None, string.punctuation)
# Key by the document path. The original used the builtin `file` type as the
# key (shadowed builtin); only .values() is ever read, so this is safe.
token_dict_full[file_path] = no_punctuation
#this can take some time
tfidf_full = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs_full = tfidf_full.fit_transform(token_dict_full.values())
maxNum = 0   # NOTE(review): never updated or read below -- kept for compatibility
maxSent = ""  # NOTE(review): never updated or read below -- kept for compatibility
count = 0
# Score each sentence (except the last) as the DIFFERENCE between:
#   val1: its tf-idf similarity to the full document, and
#   val2: its tf-idf similarity to only the text that FOLLOWS it.
# A fresh vectorizer is fit every iteration because the "remaining text"
# corpus shrinks as i advances.
for i in xrange(0, len(sentences) - 1):
    textAfter = " ".join(sentences[i + 1:])
    lowers = textAfter.lower()
    no_punctuation = lowers.translate(None, string.punctuation)
    # Key by the document path; the original used the builtin `file` type as
    # the key (shadowed builtin). Only .values() is read, so this is safe.
    token_dict[file_path] = no_punctuation
    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
    tfs = tfidf.fit_transform(token_dict.values())
    curSent = sentences[i]
    lowers = curSent.lower()
    no_punctuation = lowers.translate(None, string.punctuation)
    tfs2_full = tfidf_full.transform([no_punctuation])
    tfs2 = tfidf.transform([no_punctuation])
    val1 = tfs2_full * tfs_full.T  # similarity to the whole document (sparse 1x1)
    val2 = tfs2 * tfs.T            # similarity to the remaining text (sparse 1x1)
    val = val1 - val2
    sentence_list.append((count, val, curSent))
    count += 1
# Plot coordinates: sentence position on x, tf-idf difference score on y
# (the .data attribute of the sparse 1x1 score matrix).
x = list(range(len(sentence_list)))
y = [score.data for (_idx, score, _sent) in sentence_list]
count = len(sentence_list)  # preserve the loop counter's final value
# Axes placed for a Slider widget; NOTE(review): no Slider is ever attached,
# so this only reserves a strip at the bottom of the figure.
axframe = matplotlib.pyplot.axes([0.0, 0.1, 1, 0.05])
# Five highest-scoring sentences: sort ascending by score and take the tail.
# NOTE(review): the sort key is a sparse 1x1 matrix; comparison relies on
# scipy.sparse supporting ordering here -- fragile, verify on upgrade.
top5 = sorted(sentence_list,key=lambda x: x[1])[-5:]
# Sentence indices where paragraphs end.
parRanges = [3, 13, 19, 25, 31, 41, 46, 55, 63, 81, 85, 100, 109, 115, 124, 133, 137, 142, 151, 154, 159, 162, 165, 172, 175, 180, 188, 192, 196]
# Shade alternating paragraphs blue: on every even-numbered boundary, draw a
# span from the previous boundary to the current one.
last = -1
for idx, par in enumerate(parRanges):
    if idx % 2 == 0:
        matplotlib.pyplot.axvspan(last + .2, par + .2, facecolor='b', alpha=0.5, zorder=2)
    last = par
c = len(parRanges)  # preserve the counter's final value, as in the original
# Sentence indices where larger sections end.
secRanges = [31, 63, 115, 133, 142, 165, 180, 196]
# Shade alternating sections green, behind the paragraph shading (zorder=1).
last = 3
for idx, sec in enumerate(secRanges):
    if idx % 2 == 0:
        matplotlib.pyplot.axvspan(last + .2, sec + .2, facecolor='g', alpha=1, zorder=1)
    last = sec
c = len(secRanges)  # preserve the counter's final value, as in the original
# Fix the view window: sentences 0..200 on x, scores 0..1.2 on y.
matplotlib.pyplot.axis([-1, 200, 0, 1.2]) # Put in side your range [xmin,xmax,ymin,ymax], like ax.axis([-5,5,-5,200])
#matplotlib.pyplot.xlim([0,200])
# NOTE(review): the two lists below are never read in this file -- presumably
# hand-labelled "best" sentence indices kept for reference.
bestParSents = [1, 5, 18, 24, 29, 40, 43, 49, 56, 69, 85, 94, 101, 115, 118, 129, 136, 142, 145, 152, 157, 160, 164, 166, 174, 180, 186, 190, 196]
bestOveralSents = [97, 167, 81, 116, 80, 127, 0, 182, 102, 66, 193, 136, 166, 85, 31, 101, 145, 96, 26, 86, 29, 196, 25, 134, 186, 183, 30, 188, 143]
# Colour the top-5 sentences red, everything else blue.
z = ['r' if entry in top5 else 'b' for entry in sentence_list]
matplotlib.pyplot.locator_params(nbins=64)
matplotlib.pyplot.scatter(x, y, c=z, zorder=3)
matplotlib.pyplot.show()