forked from larsmaaloee/deep-belief-nets-for-topic-modeling
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
200 lines (160 loc) · 9.13 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
__author__ = 'larsmaaloee'
import os
from Testing import dbn_testing
from Testing import visualise
from DataPreparation import data_processing
from DBN import dbn
from env_paths import archive_outputs
def example1():
    '''
    Run simulation on the 20 Newsgroups dataset "20news-bydate.tar.gz" from http://qwone.com/~jason/20Newsgroups/
    through a network structure 2000-500-500-128 (binary outputs).

    Expects the training data under ./input/20news-bydate/20news-bydate-train and the
    test data under ./input/20news-bydate/20news-bydate-test, one subfolder per category.
    '''
    ### Archiving output files ###
    # Archive output files so that the new simulation for example 1 will not use the data already present.
    archive_outputs()

    ### DATA PREPARATION ###
    # Define training and test set paths.
    train_path = os.path.join('input', '20news-bydate', '20news-bydate-train')
    test_path = os.path.join('input', '20news-bydate', '20news-bydate-test')

    # Generate list of all the subfolders in the training path,
    # skipping hidden entries (e.g. .DS_Store).
    train_paths = [os.path.join(train_path, p)
                   for p in os.listdir(train_path) if not p.startswith('.')]
    print(train_paths)

    # Generate list of all the subfolders in the test path, skipping hidden entries.
    test_paths = [os.path.join(test_path, p)
                  for p in os.listdir(test_path) if not p.startswith('.')]
    print(test_paths)

    # Stem documents and compute a .p file (serialized file).
    data_processing.stem_docs_parallel(train_paths)
    data_processing.stem_docs_parallel(test_paths)

    # Generate bag-of-words matrix for the training set (all documents used).
    dat_proc_train = data_processing.DataProcessing(train_paths, words_count=2000, trainingset_size=1.0)
    dat_proc_train.generate_bows()

    # Generate bag-of-words matrix for the test set, reusing the training vocabulary.
    dat_proc_test = data_processing.DataProcessing(test_paths, trainingset_size=0.0,
                                                   trainingset_attributes=data_processing.get_attributes())
    dat_proc_test.generate_bows()

    ### DBN TRAINING ###
    # Generate network 2000-500-500-128 (binary outputs), training 50 epochs.
    deepbelief = dbn.DBN(2000, data_processing.get_batch_list(), [500, 500], 128, 50, binary_output=True)
    # Pretrain with a replicated softmax model at the bottom and restricted boltzmann machines in the remaining layers.
    deepbelief.run_pretraining(learning_rate=0.01, weight_cost=0.0002, momentum=0.9, gibbs_steps=1)
    # Construct deep autoencoder and finetune using backpropagation with conjugate gradient as optimization.
    deepbelief.run_finetuning(load_from_serialization=True)

    ### EVALUATION ###
    # Evaluate on the test set with binary output units.
    # (Renamed from 'eval' to avoid shadowing the builtin.)
    tester = dbn_testing.DBNTesting(testing=True, binary_output=True)
    # Evaluate the output space on the 1, 3, 7, 15 nearest neighbors.
    tester.generate_accuracy_measurement_parallel([1, 3, 7, 15])

    ### VISUALISATION ###
    # Initialise visualization. Only plot 6 categories so that the plot will not get too cluttered.
    v = visualise.Visualise(testing=True, classes_to_visualise=["rec.sport.hockey", "comp.graphics", "sci.crypt",
                                                                "soc.religion.christian", "talk.politics.mideast",
                                                                "talk.politics.guns"])
    # Visualise the output data with 4 principal components.
    v.visualise_data_pca_2d(input_data=False, number_of_components=4)
    # Visualise the output data with 2 principal components.
    v.visualise_data_pca_2d_two_components(1, 2, input_data=False)
    # Visualise the output data in 3d with 3 principal components.
    v.visualise_data_pca_3d(1, 2, 3, input_data=False)
    # Visualise the output in a 3D movie. For this you need to install mencoder and ffmpeg (only tested on OSX).
    # v.visualise_data_pca_3d_movie(1, 2, 3, input_data=False)
def example2():
    '''
    Run simulation on the 20 Newsgroups dataset "20news-18828.tar.gz" from http://qwone.com/~jason/20Newsgroups/
    through a network structure 2000-500-250-125-10 (real valued outputs).

    Expects all category subfolders under ./input/20news-18828 (no predefined train/test split).
    '''
    ### Archiving output files ###
    # Archive output files so that the new simulation for example 2 will not use the data already present.
    archive_outputs()

    ### DATA PREPARATION ###
    # All categories live in subfolders of a single data path.
    datapath = os.path.join('input', '20news-18828')

    # Generate list of all the subfolders in the data path,
    # skipping hidden entries (e.g. .DS_Store).
    datapaths = [os.path.join(datapath, p)
                 for p in os.listdir(datapath) if not p.startswith('.')]
    print(datapaths)

    # Stem documents and compute a .p file (serialized file).
    data_processing.stem_docs_parallel(datapaths)

    # Generate bag-of-words matrix for the training set.
    # NOTE(review): the original comment claimed a 0.7/0.3 train/test split, but the
    # code passes trainingset_size=1.0 here and 0.0 below — confirm the intended split.
    dat_proc_train = data_processing.DataProcessing(datapaths, words_count=2000, trainingset_size=1.0)
    dat_proc_train.generate_bows()

    # Generate bag-of-words matrix for the test set, reusing the training vocabulary.
    dat_proc_test = data_processing.DataProcessing(datapaths, trainingset_size=0.0,
                                                   trainingset_attributes=data_processing.get_attributes())
    dat_proc_test.generate_bows()

    ### DBN TRAINING ###
    # Generate network 2000-500-250-125-10 (real valued outputs), training 50 epochs.
    deepbelief = dbn.DBN(2000, data_processing.get_batch_list(), [500, 250, 125], 10, 50, binary_output=False)
    # Pretrain with a replicated softmax model at the bottom and restricted boltzmann machines in the remaining layers.
    deepbelief.run_pretraining(learning_rate=0.01, weight_cost=0.0002, momentum=0.9, gibbs_steps=1)
    # Construct deep autoencoder and finetune using backpropagation with conjugate gradient as optimization.
    deepbelief.run_finetuning(load_from_serialization=True)

    ### EVALUATION ###
    # Evaluate on the test set with real-valued output units.
    # (Renamed from 'eval' to avoid shadowing the builtin.)
    tester = dbn_testing.DBNTesting(testing=True, binary_output=False)
    # Evaluate the output space on the 1, 3, 7, 15 nearest neighbors.
    tester.generate_accuracy_measurement_parallel([1, 3, 7, 15])

    ### VISUALISATION ###
    # Initialise visualization. Only plot 6 categories so that the plot will not get too cluttered.
    v = visualise.Visualise(testing=True, classes_to_visualise=["rec.sport.hockey", "comp.graphics", "sci.crypt",
                                                                "soc.religion.christian", "talk.politics.mideast",
                                                                "talk.politics.guns"])
    # Visualise the output data with 4 principal components.
    v.visualise_data_pca_2d(input_data=False, number_of_components=4)
    # Visualise the output data with 2 principal components.
    v.visualise_data_pca_2d_two_components(1, 2, input_data=False)
    # Visualise the output data in 3d with 3 principal components.
    v.visualise_data_pca_3d(1, 2, 3, input_data=False)
    # Visualise the output in a 3D movie. For this you need to install mencoder and ffmpeg (only tested on OSX).
    # v.visualise_data_pca_3d_movie(1, 2, 3, input_data=False)
def example3():
    '''
    In the output folder ./output you'll find "20news-19997.tar.gz" from http://qwone.com/~jason/20Newsgroups/
    processed so that you can run evaluation on the data directly. The data has been trained through a network
    of 2000-500-250-125-10 (real valued output). Unzip the compressed chunks by running the output/_unzip.sh shell
    script.
    '''
    ### EVALUATION ###
    # Evaluate on the test set with real-valued output units.
    # (Renamed from 'eval' to avoid shadowing the builtin.)
    tester = dbn_testing.DBNTesting(testing=True, binary_output=False)
    # Evaluate the output space on the 1, 3, 7, 15 nearest neighbors.
    tester.generate_accuracy_measurement_parallel([1, 3, 7, 15])

    ### VISUALISATION ###
    # Initialise visualization. Only plot 6 categories so that the plot will not get too cluttered.
    v = visualise.Visualise(testing=True, classes_to_visualise=["rec.sport.hockey", "comp.graphics", "sci.crypt",
                                                                "soc.religion.christian", "talk.politics.mideast",
                                                                "talk.politics.guns"])
    # Visualise the output data with 4 principal components.
    v.visualise_data_pca_2d(input_data=False, number_of_components=4)
    # Visualise the output data with 2 principal components.
    v.visualise_data_pca_2d_two_components(1, 2, input_data=False)
    # Visualise the output data in 3d with 3 principal components.
    v.visualise_data_pca_3d(1, 2, 3, input_data=False)
def run_examples():
    '''
    In order to run example 1 and example 2, please download from http://qwone.com/~jason/20Newsgroups/:
    Example1: "20news-bydate.tar.gz"
    (extract so training data is under ./input/20news-bydate/20news-bydate-train and
    test data under ./input/20news-bydate/20news-bydate-test — the paths example1 reads.)
    Example2: "20news-18828.tar.gz"
    (extract all data to ./input/20news-18828 — the path example2 reads.)
    Example3: Runs out of the box on the output data given in ./output folder.

    Only example1 runs by default; uncomment the others as needed.
    '''
    example1()
    # example2()
    # example3()
# Script entry point: run the configured example simulation(s).
if __name__ == '__main__':
    run_examples()