# BookINDEX_identifyFREQUENTtokens_NLTK.py
# Script to identify frequent tokens in a text for developing a book index, project-specific tag list or ontology
# further data processing (e.g. categorisation) is needed downstream
# based on a tutorial by Abder-Rahman Ali, 12 Dec 2016
# https://code.tutsplus.com/tutorials/preparing-a-book-index-using-python--cms-27556
# possible use cases: preparing a book index for publication, semi-automated tagging, developing ontologies
import collections
import csv
import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# define stop words
my_stopwords = stopwords.words('en_fr_de')  # load custom combined NLTK stopword list from the local nltk-data folder
my_stopwords.extend(['?', '&', '!', '’', ':', '..'])  # expand the stopword list with punctuation if necessary
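# If no custom 'en_fr_de' list is installed in nltk-data, a rough equivalent can
# be built from the standard NLTK stopword corpora instead (a sketch that assumes
# the custom list combines these three languages; adjust as needed):
# my_stopwords = (stopwords.words('english')
#                 + stopwords.words('french')
#                 + stopwords.words('german'))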
# count frequency of individual words excluding stop words
with open("C:\\Users\\mobarget\\Google Drive\\ACADEMIA\\BRILL\\INDEX\\INDEX.txt", encoding="utf-8") as book:
    read_book = book.read()
words = word_tokenize(read_book)
filtered_text = [w for w in words if w.lower() not in my_stopwords]
print(sorted(dict.fromkeys(filtered_text)))  # preview the unique tokens in alphabetical order
# count every occurrence; deduplicating the list first would reduce all frequencies to 1
frequencies = collections.Counter(filtered_text)
print(frequencies)
most_frequent = {k: v for (k, v) in frequencies.items() if 25 > v > 3}  # keep mid-frequency tokens as index candidates
print(most_frequent)
# write tokens and frequencies to table
with open("C:\\Users\\mobarget\\Google Drive\\ACADEMIA\\BRILL\\INDEX\\INDEX_freq.csv", "w", encoding="utf-8") as freq_csv:
write=csv.writer(freq_csv)
for key, value in frequencies.items():
write.writerow([key, value])
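# For index development it may help to sort the table by descending frequency
# before writing; a minimal variant using Counter.most_common()
# (the "_sorted" file name is only an illustration):
# with open("C:\\Users\\mobarget\\Google Drive\\ACADEMIA\\BRILL\\INDEX\\INDEX_freq_sorted.csv", "w", encoding="utf-8", newline="") as freq_csv:
#     csv.writer(freq_csv).writerows(frequencies.most_common())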
# count frequency of n-grams
# Combinations of words that often co-occur are called collocations.
# The simplest collocations are bigrams, i.e. pairs of adjacent words;
# three-word combinations are trigrams, and so forth (n-grams in general).
# extracting n-grams
bigram = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words)
# ignore all bigrams that occur fewer than 3 times
finder.apply_freq_filter(3)
# print the 30 top-ranked bigrams by pointwise mutual information (PMI)
print(finder.nbest(bigram.pmi, 30))
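# The same pattern extends to trigrams; the following sketch (not part of the
# original tutorial) uses NLTK's TrigramAssocMeasures and TrigramCollocationFinder:
from nltk.collocations import TrigramCollocationFinder
trigram = nltk.collocations.TrigramAssocMeasures()
tri_finder = TrigramCollocationFinder.from_words(words)
tri_finder.apply_freq_filter(3)  # drop trigrams that occur fewer than 3 times
print(tri_finder.nbest(trigram.pmi, 30))  # 30 top-ranked trigrams by PMI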
# find the character offset of the first occurrence of a word or phrase in the text
# (str.index raises ValueError if the string is absent; str.find returns -1 instead)
print(read_book.index('newspaper'))
print(read_book.index('journal'))
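# str.index only reports the first match; to list every occurrence (useful for
# mapping locators in an index), a sketch using the standard re module
# (the search term 'newspaper' is just an example):
import re
for match in re.finditer('newspaper', read_book):
    print(match.start())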