-
Notifications
You must be signed in to change notification settings - Fork 0
/
train1.py
107 lines (84 loc) · 3.51 KB
/
train1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import sys
import os
import json
import pandas
import numpy
import optparse
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, GRU
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict
from pprint import pprint
def train(csv_file):
    """Train a character-level GRU binary classifier on URL/text samples.

    Args:
        csv_file: Path to a headerless CSV whose first column holds the
            raw text sample (e.g. a URL) and whose second column holds the
            binary label (0/1) — inferred from the sigmoid output and
            binary_crossentropy loss below.

    Side effects:
        Writes 'build/word-dictionary.json' (character -> index mapping),
        'urls-lstm-weights.h5', 'urls-lstm-model.h5', and
        'urls-lstm-model.json' into the current working directory.
    """
    dataframe = pandas.read_csv(csv_file, engine='python', quotechar='"', header=None)
    # Shuffle the rows so the positional train/test split below is not
    # biased by the file's original ordering.
    dataset = dataframe.sample(frac=1).values

    X = dataset[:, 0]  # raw text samples
    Y = dataset[:, 1]  # binary labels

    # Character-level tokenization; filter only tabs/newlines so every
    # other character (including URL punctuation) gets its own index.
    tokenizer = Tokenizer(filters='\t\n', char_level=True)
    tokenizer.fit_on_texts(X)

    # Persist the character dictionary so inference code can reuse the
    # exact same mapping. exist_ok avoids the check-then-create race of
    # the previous os.path.exists() guard; utf-8 is required because we
    # dump with ensure_ascii=False.
    word_dict_file = 'build/word-dictionary.json'
    os.makedirs(os.path.dirname(word_dict_file), exist_ok=True)
    with open(word_dict_file, 'w', encoding='utf-8') as outfile:
        json.dump(tokenizer.word_index, outfile, ensure_ascii=False)

    num_words = len(tokenizer.word_index) + 1  # +1: index 0 is reserved for padding
    X = tokenizer.texts_to_sequences(X)

    # 2083 is the commonly cited maximum URL length (IE's historical cap)
    # — presumably chosen so no sample is truncated; confirm against data.
    max_log_length = 2083
    train_size = int(len(dataset) * .75)

    X_processed = sequence.pad_sequences(X, maxlen=max_log_length)
    X_train, X_test = X_processed[:train_size], X_processed[train_size:]
    Y_train, Y_test = Y[:train_size], Y[train_size:]

    model = Sequential()
    model.add(Embedding(num_words, 32, input_length=max_log_length))
    model.add(Dropout(0.2))
    model.add(GRU(16, recurrent_dropout=0.2))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())

    model.fit(X_train, Y_train, validation_split=0.25, epochs=20, batch_size=500)

    # Evaluate on the held-out 25% slice.
    score, acc = model.evaluate(X_test, Y_test, verbose=1, batch_size=500)
    print("Model Accuracy: {:0.2f}%".format(acc * 100))

    # Save weights, full model, and architecture-only JSON.
    model.save_weights('urls-lstm-weights.h5')
    model.save('urls-lstm-model.h5')
    with open('urls-lstm-model.json', 'w') as outfile:
        outfile.write(model.to_json())
if __name__ == '__main__':
    # optparse is deprecated since Python 2.7; argparse provides the same
    # -f/--file flag with identical semantics (default None -> fall back
    # to 'data.csv'). Imported locally so only script execution pays for it.
    import argparse

    parser = argparse.ArgumentParser(description='Train the URL classifier.')
    parser.add_argument('-f', '--file', dest='file', help='data file')
    args = parser.parse_args()

    # Use the supplied CSV path, or the conventional default.
    csv_file = args.file if args.file is not None else 'data.csv'
    train(csv_file)