-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
88 lines (68 loc) · 2.59 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import re
import nltk
import copy
from nltk import ngrams
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import Term
def data_set_preprocessing(essay_list, stopword):
    """Run the full preprocessing pipeline on every essay, in place.

    For each essay the text in ``essay.essay_content`` goes through, in
    order: case folding, tokenizing, stopword removal, stemming, n-gram
    expansion and term-frequency normalization.  The resulting term list
    is handed to ``essay.set_term_list``.

    Args:
        essay_list: Iterable of essay objects exposing ``essay_content``
            and ``set_term_list``.
        stopword: Stopword container, forwarded to ``stopword_removal``.

    Returns:
        None.  The essays are updated through ``set_term_list``.
    """
    for essay in essay_list:
        terms = tokenizing(case_folding(essay.essay_content))
        terms = stopword_removal(stopword, terms)
        terms = stemming(terms)
        terms = ngram(terms)
        essay.set_term_list(normalize_term_list(terms))
def case_folding(essay):
    """Lower-case *essay* and collapse everything that is not a-z.

    Each maximal run of characters outside ``a``-``z`` (digits,
    punctuation, whitespace, non-ASCII letters) is replaced by a single
    space.  Leading and trailing runs also become a space, so the result
    may start or end with one.

    Args:
        essay: The raw essay text.

    Returns:
        The folded text, containing only ``a``-``z`` and single spaces.
    """
    lowered = essay.lower()
    return re.sub(r'[^a-z]+', ' ', lowered)
def tokenizing(essay):
    """Split *essay* into tokens and wrap each one in a ``Term``.

    Args:
        essay: Text to tokenize; passed unchanged to
            ``nltk.word_tokenize``.

    Returns:
        A list of ``Term.Term`` objects, one per token, in token order.
    """
    return [Term.Term(token) for token in nltk.word_tokenize(essay)]
def stopword_removal(stopword, term_list):
    """Return the terms whose content is not a stopword.

    Args:
        stopword: Container tested with ``in`` against each term's
            ``term_content``.
        term_list: Iterable of objects exposing ``term_content``.

    Returns:
        A new list with the surviving terms: the same objects, in their
        original order.
    """
    lookup = stopword
    if isinstance(stopword, (list, tuple)):
        # A sequence is scanned linearly on every ``in`` test; hashing it
        # once makes each lookup constant time.  Any other container
        # (set, dict, str, ...) keeps its own membership semantics.
        try:
            lookup = frozenset(stopword)
        except TypeError:
            # Unhashable entries: fall back to scanning the sequence.
            lookup = stopword
    return [term for term in term_list if term.term_content not in lookup]
# Sastrawi stemmer shared by every stemming() call; built on first use.
_STEMMER = None


def _get_stemmer():
    """Return the shared stemmer, creating it the first time it is needed."""
    global _STEMMER
    if _STEMMER is None:
        _STEMMER = StemmerFactory().create_stemmer()
    return _STEMMER


def stemming(term_list):
    """Stem the content of every term and return fresh ``Term`` objects.

    The stemmer is created once and reused across calls instead of being
    rebuilt through a new ``StemmerFactory`` on every call (this function
    runs once per essay).
    NOTE(review): assumes the stemmer gives the same output for a word
    regardless of earlier ``stem`` calls -- confirm against Sastrawi.

    Args:
        term_list: Iterable of objects exposing ``term_content``.

    Returns:
        A new list of ``Term.Term`` objects built from the stemmed
        content, in the same order as *term_list*.  The input terms are
        not modified.
    """
    stemmer = _get_stemmer()
    return [
        Term.Term(stemmer.stem(str(term.term_content)))
        for term in term_list
    ]
def ngram(term_list):
    """Return the original terms followed by every adjacent bigram.

    The result starts with a deep copy of *term_list* (the unigrams) and
    is extended with one ``Term`` per pair of neighbouring words, whose
    content is the two words joined by a single space.

    The bigram text is built by joining the two words directly.  The
    earlier approach scrubbed ``str(tuple)`` with a regex, which gives
    the same text for pure a-z tokens but mangles anything else.

    Args:
        term_list: List of objects exposing ``term_content``.

    Returns:
        A new list: copies of the input terms, then the bigram terms in
        left-to-right order.
    """
    term_list_ngram = copy.deepcopy(term_list)
    # Join then split on whitespace so empty contents drop out and any
    # content containing whitespace is broken into separate words.
    words = ' '.join(str(term.term_content) for term in term_list).split()
    term_list_ngram.extend(
        Term.Term(' '.join(pair)) for pair in ngrams(words, 2)
    )
    return term_list_ngram
def normalize_term_list(term_list):
    """Collapse duplicate terms and attach raw and normalized frequencies.

    Every distinct ``term_content`` yields one new ``Term`` (in order of
    first appearance).  Its ``tf`` is set to the number of occurrences in
    *term_list* and its ``ntf`` to ``tf / max_tf``, where ``max_tf`` is
    the largest ``tf`` among the new terms.

    Occurrences are counted in a single pass with a dict rather than by
    repeated list scans.  This requires hashable contents (strings).

    Args:
        term_list: Iterable of objects exposing ``term_content``.

    Returns:
        A list of new ``Term.Term`` objects, one per distinct content.
        Empty when *term_list* is empty.
    """
    counts = {}
    first_seen = []  # distinct contents, in order of first appearance
    for term in term_list:
        content = term.term_content
        if content in counts:
            counts[content] += 1
        else:
            counts[content] = 1
            first_seen.append(content)

    new_term_list = []
    max_tf = 0
    for content in first_seen:
        t = Term.Term(content)
        # Look the count up through the new term's own content, as the
        # previous list.count() call did.
        t.set_tf(counts.get(t.term_content, 0))
        new_term_list.append(t)
        if t.tf > max_tf:
            max_tf = t.tf

    for term in new_term_list:
        term.set_ntf(float(term.tf) / float(max_tf))
    return new_term_list
def convert_term_content_to_string_array(term_list):
    """Return the ``term_content`` of every term as a plain list.

    Args:
        term_list: Iterable of objects exposing ``term_content``.

    Returns:
        A new list with one entry per term, in the same order, with
        duplicates kept.
    """
    return [term.term_content for term in term_list]