# main.py -- forked from yassersouri/classify-text
# 135 lines (108 loc), 3.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import util
import sklearn.datasets
import sklearn.metrics
import sklearn.cross_validation
import sklearn.svm
import sklearn.naive_bayes
import sklearn.neighbors
from colorama import init
from termcolor import colored
import sys
import os
import glob
def main():
    """Ask for the dataset directory, regroup it into two classes, and test.

    One line is read from standard input and used as the dataset location.
    That directory is handed to ``reorganize_dataset`` and then to
    ``main_test``. Blank input (or end-of-file) selects ``'dataset'``, the
    same fallback ``main_test`` applies, so both steps always work on the
    same directory.
    """
    # colorama's init(), imported at the top of the file
    init()

    # get the dataset
    print(colored("Where is the dataset?", 'cyan', attrs=['bold']))
    print(colored('warning: files might get deleted if they are incompatible with utf8', 'yellow'))
    ans = sys.stdin.readline()

    # Trim all surrounding whitespace (newline, carriage return, tabs,
    # blanks). An empty answer must not reach reorganize_dataset as '':
    # it would then build its glob pattern from '' + '/*', i.e. the
    # filesystem root.
    path = ans.strip() or 'dataset'

    # preprocess data into two folders instead of 6
    print(colored("Reorganizing folders, into two classes", 'cyan', attrs=['bold']))
    reorganize_dataset(path)
    print('\n\n')

    # do the main test
    main_test(path)
def reorganize_dataset(path):
    """Merge the six category folders under ``path`` into two class folders.

    Files of the "liked" categories are moved into ``<path>/likes`` and files
    of the "disliked" categories into ``<path>/dislikes``. Each moved file is
    renamed to ``<category>_<original name>``, and every category folder is
    removed once its files have been moved.

    Nothing is done when ``path`` holds exactly two entries, which is taken
    to mean the dataset has already been reorganized. Category folders that
    are not present are skipped, so a partially reorganized or incomplete
    dataset no longer fails on ``os.rmdir``.

    Args:
        path: directory that contains the category folders.

    Raises:
        OSError: if a file cannot be moved or a category folder cannot be
            removed (for instance when it still holds dot-files, which the
            ``*`` glob pattern does not match).
    """
    groups = (
        ('likes', ('rec.sport.hockey', 'sci.crypt', 'sci.electronics')),
        ('dislikes', ('sci.space', 'rec.motorcycles', 'misc.forsale')),
    )

    # Two entries are treated as "already split into likes/dislikes".
    if len(glob.glob(os.path.join(path, '*'))) == 2:
        return

    for target, categories in groups:
        # create the `likes` / `dislikes` directory if it is missing
        target_dir = os.path.join(path, target)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        for category in categories:
            source_dir = os.path.join(path, category)
            if not os.path.isdir(source_dir):
                # Already merged on an earlier run, or never part of this
                # dataset; os.rmdir on it would raise.
                continue
            for source in glob.glob(os.path.join(source_dir, '*')):
                # os.path.basename copes with whichever separator glob
                # returned, unlike splitting on '/'.
                new_name = category + '_' + os.path.basename(source)
                os.rename(source, os.path.join(target_dir, new_name))
            os.rmdir(source_dir)
def main_test(path=None):
    """Run the whole pipeline on a dataset directory and print the result.

    Steps, in order: delete files reported as incompatible with utf8, load
    the directory with ``sklearn.datasets.load_files``, refine the loaded
    texts, build bag-of-words counts, weight them with TF-IDF, and evaluate
    a k-nearest-neighbours classifier on a train/test split.

    Args:
        path: dataset directory; any falsy value falls back to ``'dataset'``.
    """
    # Imported here explicitly: none of the file-level imports names
    # sklearn.feature_extraction.text, so relying on the attribute being
    # present on the ``sklearn`` package is an accident of other imports.
    from sklearn.feature_extraction.text import TfidfTransformer

    dir_path = path or 'dataset'

    remove_incompatible_files(dir_path)
    print('\n\n')

    # load data
    print(colored('Loading files into memory', 'green', attrs=['bold']))
    files = sklearn.datasets.load_files(dir_path)

    # refine all emails
    print(colored('Refining all files', 'green', attrs=['bold']))
    # The return value is ignored; presumably this rewrites files.data in
    # place -- TODO confirm against util.refine_all_emails.
    util.refine_all_emails(files.data)

    # calculate the BOW representation
    print(colored('Calculating BOW', 'green', attrs=['bold']))
    word_counts = util.bagOfWords(files.data)

    # TFIDF
    print(colored('Calculating TFIDF', 'green', attrs=['bold']))
    X = TfidfTransformer(use_idf=True).fit_transform(word_counts)

    print('\n\n')

    # create classifier
    # clf = sklearn.naive_bayes.MultinomialNB()
    # clf = sklearn.svm.LinearSVC()
    n_neighbors = 11
    # 'uniform' is the other choice the original code toyed with; only
    # 'distance' ever took effect.
    weights = 'distance'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # test the classifier
    print('\n\n')
    print(colored('Testing classifier with train-test split', 'magenta', attrs=['bold']))
    test_classifier(X, files.target, clf, test_size=0.2, y_names=files.target_names, confusion=False)
def remove_incompatible_files(dir_path):
    """Report and delete the files under ``dir_path`` flagged by ``util``.

    The list comes from ``util.find_incompatible_files``; its length is
    printed, and when it is non-empty the list is passed to
    ``util.delete_incompatible_files``.

    Args:
        dir_path: dataset directory to scan.
    """
    # find incompatible files
    print(colored('Finding files incompatible with utf8: ', 'green', attrs=['bold']))
    offenders = util.find_incompatible_files(dir_path)
    found = len(offenders)
    print('%s %s' % (colored(found, 'yellow'), 'files found'))

    # nothing flagged, nothing to delete
    if found == 0:
        return

    # delete them
    print(colored('Deleting incompatible files', 'red', attrs=['bold']))
    util.delete_incompatible_files(offenders)
def test_classifier(X, y, clf, test_size=0.4, y_names=None, confusion=False):
    """Fit ``clf`` on a train split and print how it does on the test split.

    Args:
        X: feature matrix handed to ``train_test_split``.
        y: labels matching ``X``.
        clf: estimator; its ``fit`` and ``predict`` methods are called.
        test_size: passed to ``train_test_split`` as ``test_size``; also
            printed as a percentage.
        y_names: passed as ``target_names`` to the classification report.
        confusion: when true a confusion matrix is printed, otherwise a
            classification report.
    """
    # train-test split
    print('test size is: %2.0f%%' % (test_size * 100))
    split = sklearn.cross_validation.train_test_split(X, y, test_size=test_size)
    train_features, held_out_features, train_labels, held_out_labels = split

    clf.fit(train_features, train_labels)
    predictions = clf.predict(held_out_features)

    if confusion:
        print(colored('Confusion Matrix:', 'magenta', attrs=['bold']))
        print(sklearn.metrics.confusion_matrix(held_out_labels, predictions))
        return

    print(colored('Classification report:', 'magenta', attrs=['bold']))
    print(sklearn.metrics.classification_report(held_out_labels, predictions, target_names=y_names))
# Script entry point: run the interactive pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()