-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean.py
46 lines (34 loc) · 1.63 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# Common English function words, listed in the capitalisations the project
# wants to recognise. Clean.tokenize checks tokens against this list with a
# case-sensitive ``in`` test, which is why e.g. both "the" and "The" appear.
filler_words = [
    "a", "after", "all", "All", "am", "an", "and", "Any", "any", "are",
    "as", "As", "at", "be", "been", "but", "by", "can", "could", "did",
    "during", "for", "For", "from", "had", "has", "have", "he", "hence",
    "henceforth", "hereafter", "herein", "however", "in", "In", "indeed",
    "is", "it", "It", "of", "on", "On", "one", "or", "our", "out", "per",
    "shall", "she", "since", "so", "the", "The", "then", "There",
    "thereafter", "therefore", "they", "this", "through", "thus", "to",
    "until", "was", "were", "when", "whereby", "which", "while", "will",
    "with", "would", "you", "With", "While", "When", "What", "We", "get",
    "Very", "very", "much", "many", "Many", "too", "being", "should",
    "Upon", "upon", "Too", "there", "due", "Due",
]
class Clean:
    """Strip punctuation from a piece of text and split it into words.

    Attributes:
        to_clean: The raw text passed to the constructor.
        new_str: Output of the most recent ``remove_punctuation`` call;
            an empty string until that method has been run.
    """

    # NOTE(review): nothing in this class reads ``toggle`` and its original
    # scope (class attribute vs. unused local in ``__init__``) is unclear.
    # It is kept at class level so any ``Clean.toggle`` / ``obj.toggle``
    # access keeps working -- confirm whether it can be removed.
    toggle = False

    def __init__(self, str_input):
        """Store the text to be cleaned.

        Args:
            str_input: The raw text. ``None`` is accepted and is treated
                as empty text by the cleaning methods.
        """
        self.to_clean = str_input
        self.new_str = ""

    # TODO: Find a neat way to remove html formatting

    def remove_punctuation(self, keep_whitespace=False):
        """Return ``to_clean`` reduced to ASCII letters and digits.

        The result is rebuilt from scratch on every call (and stored in
        ``new_str``), so calling this method repeatedly always yields the
        same string instead of appending to the previous result.

        Args:
            keep_whitespace: When true, whitespace characters are kept as
                well so that word boundaries survive. The default drops
                them, which matches the method's historical output.

        Returns:
            The cleaned string.
        """
        source = self.to_clean or ""

        def is_kept(ch):
            # Explicit ASCII ranges rather than str.isalnum(), which would
            # also let non-ASCII letters and digits through.
            if 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9':
                return True
            return keep_whitespace and ch.isspace()

        self.new_str = "".join(ch for ch in source if is_kept(ch))
        return self.new_str

    def tokenize(self, stop_words=None):
        """Split the text into words and drop the filler words.

        Punctuation is removed first while whitespace is preserved, so the
        text can still be split into individual words. As a side effect
        ``new_str`` holds that whitespace-preserving cleaned text afterwards.

        Args:
            stop_words: Iterable of words to discard. Defaults to the
                module-level ``filler_words`` list. Matching is
                case-sensitive.

        Returns:
            A list of the remaining words, in their original order.
        """
        if stop_words is None:
            stop_words = filler_words
        # Build the lookup set once so each membership test is O(1).
        excluded = frozenset(stop_words)

        cleaned = self.remove_punctuation(keep_whitespace=True)
        # split() with no argument collapses runs of whitespace and never
        # produces empty tokens, unlike split(' ').
        return [word for word in cleaned.split() if word not in excluded]