marx_smith.py
__author__ = 'jimclifford'
import urllib2
from bs4 import BeautifulSoup
#This script requires the Natural Language Toolkit (NLTK) installed and the book data downloaded.
#You can install NLTK using pip: http://www.nltk.org/install.html
#Test the install from a Python command line by entering >>> import nltk
#Next enter >>> nltk.download()
#A graphical downloader will pop up and you can choose to download everything included in the NLTK book.
#With NLTK installed on your machine, the following two imports should work.
import nltk
from nltk.corpus import stopwords
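#(A hedged shortcut, not part of the original workshop: rather than downloading everything via
# the nltk.download() GUI, you could fetch just the data this script uses. These package names
# are assumptions based on standard NLTK identifiers:)
#nltk.download('punkt')      # sentence tokenizer model loaded below
#nltk.download('stopwords')  # English stopword list
#nltk.download('maxent_treebank_pos_tagger')  # tagger used by nltk.pos_tag in older NLTK releases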
# This is a regular expression pattern used below by the tokenizer.
# It results in a more nuanced split of the raw text into a list of tokens than simply splitting on whitespace.
pattern = r'''(?x) # set flag to allow verbose regexps
([A-Z]\.)+ # abbreviations, e.g. U.S.A.
| \w+(-\w+)* # words with optional internal hyphens
| \$?\d+(\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
| \.\.\. # ellipsis
| [][.,;"'?():-_`] # these are separate tokens
'''
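#A rough illustration (not part of the original script) of what the pattern produces, using the
#NLTK book's example sentence:
#nltk.regexp_tokenize("That U.S.A. poster-print costs $12.40...", pattern)
#should give something like ['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']
#(later NLTK releases handle the capturing groups in this pattern differently; the online NLTK
#book now writes the same pattern with non-capturing (?: ... ) groups).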
#This defines which sentence tokenizer I've chosen to use.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#I've also chosen to strip punctuation when removing stopwords. I add this list of punctuation marks to the end of the
#stopword list provided by NLTK:
punk = [".", ",", ";", "\"", "\'", "?", "(", ")", ":", "-", "_", "`"]
stopword = stopwords.words('english')
stopword.extend(punk)
#With those basics prepared, the script starts here:
#This print command simply prints "Karl Marx Capital"
print "Karl Marx Capital"
raw_input("Press Enter to continue...")
#This opens and reads the website, returning the HTML code
html = urllib2.urlopen("http://archive.org/stream/capitalcritiqueo00marx/capitalcritiqueo00marx_djvu.txt").read()
#I checked the source HTML code and found that the <pre> tag comes after all of the website preamble that we don't
#want to analyze. This command searches through the string of HTML data and finds the location of the <pre> tag and
#returns the string from that point forward.
be = html.find("<pre>")
html2 = html[be:]
#This is a crucial step when working with a page that has a lot of HTML markup. It strips out the HTML and returns
#raw plaintext.
raw = nltk.clean_html(html2).replace("- ","").replace("\xe2", "")
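#(A hedged alternative, not in the original: nltk.clean_html was removed in later NLTK releases;
# the BeautifulSoup import above could produce similar plaintext, e.g.
# raw = BeautifulSoup(html2).get_text().replace("- ", "").replace("\xe2", "") )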
#This tokenizer uses the regular expression pattern above to break the text into a list of word tokens.
tokens = nltk.regexp_tokenize(raw, pattern)
#This next step does two things. It creates a "Text" from the tokens, which allows us to perform other NLTK functions.
#And it normalizes all the words to lowercase, which makes it possible to count word frequencies.
print "WORDS:"
text = nltk.Text(tokens)
words = [w.lower() for w in text]
print words[100:200]
raw_input("Press Enter to continue...")
#Our word list, however, still contains common English words like "the", "of", "a", "to" and punctuation. This next step
#uses the stopword list created above to strip the list of words. I've also only kept words longer than one character,
#as I noticed some OCR noise showing up in the word frequencies. The final step prints a slice of the filtered
#words, from the 500th to the 600th in the list.
print "FILTERED WORDS"
filtered_words = [w for w in words if not w in stopword]
#filtered_words = [w for w in filtered_words if not w in punk]
filtered_words = [w for w in filtered_words if len(w) > 1]
print filtered_words[500:600]
raw_input("Press Enter to continue...")
#This step uses an NLTK function to determine the most frequent words in the filtered list:
print "WORD FREQUENCY:"
print nltk.FreqDist(filtered_words)
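#(A hedged aside, not in the original: depending on the NLTK version, printing the FreqDist object
# may only show a summary or the top samples; to list the most frequent filtered words explicitly
# you could do something like:)
#fdist = nltk.FreqDist(filtered_words)
#print fdist.keys()[:20]  # keys() is frequency-ordered in NLTK 2.x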
#It is also possible to tokenize the sentences, which is useful for further natural language analysis. This
#step prints ten sentences from the text.
print "SENTENCES:"
sents = sent_tokenizer.tokenize(raw)
print sents[171:181]
raw_input("Press Enter to continue...")
#This step produces a vocabulary of words used in the text by discarding all duplicates and sorting the results.
print "VOCAB:"
vocab = sorted(set(filtered_words))
print vocab[500:600]
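#(A hedged extra, not in the original: the vocabulary size and a rough type/token ratio can be
# read off these lists, e.g.)
#print len(vocab), len(filtered_words), len(vocab) / float(len(filtered_words))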
raw_input("Press Enter to continue...")
#The Collocations command finds the most common word pairs in the text. You'll notice we still have some OCR issues,
#as a few split words, such as "commod ities", show up in the collocations results:
print "COLLOCATIONS:"
text.collocations()  # collocations() prints its output directly, so no print is needed
raw_input("Press Enter to continue...")
#Concordance is a really powerful function that displays keywords in context. It gives us a quick way to check how the
#text uses words like freedom or nature. Feel free to add a third concordance:
print "CONCORDANCE:"
text.concordance('freedom')  # concordance() prints its output directly, so no print is needed
text.concordance('nature')
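#For instance, a hedged third keyword to try here:
#text.concordance('labour')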
raw_input("Press Enter to continue...")
#This function adds part-of-speech tags to 800 word tokens. This is a slow, computationally intensive task, so
#I limited it to a small sample for this workshop. This is an important step in an NLP pipeline, as more complicated
#tasks, such as Named Entity Recognition, rely on part-of-speech tags. We could also explore the most frequently used
#verbs or nouns in this text with a little more coding.
print "PART OF SPEECH TAGGED:"
tagged = nltk.pos_tag(tokens[200:1000])
print tagged
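#(A hedged sketch of the "little more coding" mentioned above, using the Penn Treebank tags that
# nltk.pos_tag returns:)
#nouns = [word for (word, tag) in tagged if tag.startswith('NN')]
#print nltk.FreqDist(nouns).keys()[:20]  # keys() is frequency-ordered in NLTK 2.x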
raw_input("Press Enter to continue...")
#This searches through the text and provides us with the words found before the word "man" in the text:
print "WORDS FOUND BEFORE _MAN_"
text.findall(r" (<.*>) <man>")  # findall() prints its matches directly
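#(A hedged extra, not in the original: the same token-regexp style can search other contexts,
# e.g. the words that follow "nature of":)
#text.findall(r"<nature> <of> (<.*>)")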
raw_input("Press Enter to continue...")
#This function creates a text dispersion plot that shows us where different words appear in the text:
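#(A hedged note: dispersion_plot draws with matplotlib, so matplotlib needs to be installed for this step.)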
text.dispersion_plot(["labour", "democracy", "freedom", "capital", "nature"])
#Same again for Adam Smith
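#(A hedged design note: the steps below repeat the Marx pipeline verbatim on a second URL; they
# could be wrapped in a single function taking the URL as a parameter, but the repetition keeps
# each step readable top to bottom for the workshop.)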
print "Adam Smith Wealth of Nations"
html = urllib2.urlopen("https://archive.org/stream/WealthOfNationsAdamSmith/Wealth%20of%20Nations_Adam%20Smith_djvu.txt").read()
raw_input("Press Enter to continue...")
#I checked the source HTML code and found that the <pre> tag comes after all of the website preamble that we don't
#want to analyze. This command searches through the string of HTML data and finds the location of the <pre> tag and
#returns the string from that point forward.
be = html.find("<pre>")
html2 = html[be:]
#This is a crucial step when working with a page that has a lot of HTML markup. It strips out the HTML and returns
#raw plaintext.
raw = nltk.clean_html(html2).replace("- ","").replace("\xe2", "")
#This tokenizer uses the regular expression pattern above to break the text into a list of word tokens.
tokens = nltk.regexp_tokenize(raw, pattern)
#This next step does two things. It creates a "Text" from the tokens, which allows us to perform other NLTK functions.
#And it normalizes all the words to lowercase, which makes it possible to count word frequencies.
print "WORDS:"
text = nltk.Text(tokens)
words = [w.lower() for w in text]
print words[100:200]
raw_input("Press Enter to continue...")
#Our word list, however, still contains common English words like "the", "of", "a", "to" and punctuation. This next step
#uses the stopword list created above to strip the list of words. I've also only kept words longer than one character,
#as I noticed some OCR noise showing up in the word frequencies. The final step prints a slice of the filtered
#words, from the 500th to the 600th in the list.
print "FILTERED WORDS"
filtered_words = [w for w in words if not w in stopword]
#filtered_words = [w for w in filtered_words if not w in punk]
filtered_words = [w for w in filtered_words if len(w) > 1]
print filtered_words[500:600]
raw_input("Press Enter to continue...")
#This step uses an NLTK function to determine the most frequent words in the filtered list:
print "WORD FREQUENCY:"
print nltk.FreqDist(filtered_words)
raw_input("Press Enter to continue...")
#It is also possible to tokenize the sentences, which is useful for further natural language analysis. This
#step prints ten sentences from the text.
print "SENTENCES:"
sents = sent_tokenizer.tokenize(raw)
print sents[171:181]
raw_input("Press Enter to continue...")
#This step produces a vocabulary of words used in the text by discarding all duplicates and sorting the results.
print "VOCAB:"
vocab = sorted(set(filtered_words))
print vocab[500:600]
raw_input("Press Enter to continue...")
#The Collocations command finds the most common word pairs in the text. You'll notice we still have some OCR issues,
#as a few split words, such as "commod ities", show up in the collocations results:
print "COLLOCATIONS:"
text.collocations()
raw_input("Press Enter to continue...")
#Concordance is a really powerful function that displays keywords in context. It gives us a quick way to check how the
#text uses words like freedom or nature. Feel free to add a third concordance:
print "CONCORDANCE:"
text.concordance('freedom')
text.concordance('nature')
raw_input("Press Enter to continue...")
#This function adds part-of-speech tags to 800 word tokens. This is a slow, computationally intensive task, so
#I limited it to a small sample for this workshop. This is an important step in an NLP pipeline, as more complicated
#tasks, such as Named Entity Recognition, rely on part-of-speech tags. We could also explore the most frequently used
#verbs or nouns in this text with a little more coding.
print "PART OF SPEECH TAGGED:"
tagged = nltk.pos_tag(tokens[200:1000])
print tagged
raw_input("Press Enter to continue...")
#This searches through the text and provides us with the words found before the word "man" in the text:
print "WORDS FOUND BEFORE _MAN_"
text.findall(r" (<.*>) <man>")
raw_input("Press Enter to continue...")
#This final function creates a text dispersion graph that shows us where different words appear in the text:
text.dispersion_plot(["labour", "democracy", "freedom", "capital", "nature"])