From 38e411c37ceed561910a040d82fdf7797288ae14 Mon Sep 17 00:00:00 2001
From: Vinayakumar R
Date: Tue, 23 Jan 2018 23:48:35 +0530
Subject: [PATCH] Add files via upload

---
 coset/task1copy.py     | 122 ++++++++++++++++++++++++++++++++++++++++
 coset/task1copytest.py | 125 +++++++++++++++++++++++++++++++++++++++++
 coset/task2copy.py     | 122 ++++++++++++++++++++++++++++++++++++++++
 coset/task2copytest.py | 125 +++++++++++++++++++++++++++++++++++++++++
 coset/task3copy.py     | 124 ++++++++++++++++++++++++++++++++++++++++
 coset/task3copytest.py | 124 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 742 insertions(+)
 create mode 100644 coset/task1copy.py
 create mode 100644 coset/task1copytest.py
 create mode 100644 coset/task2copy.py
 create mode 100644 coset/task2copytest.py
 create mode 100644 coset/task3copy.py
 create mode 100644 coset/task3copytest.py

diff --git a/coset/task1copy.py b/coset/task1copy.py
new file mode 100644
index 0000000..bd7939c
--- /dev/null
+++ b/coset/task1copy.py
@@ -0,0 +1,122 @@
+import numpy as np
+import pandas as pd
+
+from gensim import corpora
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import SnowballStemmer
+
+from keras.preprocessing import sequence
+from keras.utils import np_utils
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Embedding
+from keras.layers import LSTM
+from sklearn import preprocessing
+from sklearn.metrics import (precision_score, recall_score,
+                             f1_score, accuracy_score, mean_squared_error, mean_absolute_error)
+np.random.seed(0)
+from keras import callbacks
+from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
+
+if __name__ == "__main__":
+
+    #load data
+    train_df = pd.read_csv('data/task1-train.csv', sep='\t', header=0)
+    test_df = pd.read_csv('data/task1-test.csv', sep='\t', header=0)
+
+    raw_docs_train = train_df['Phrase'].values
+    raw_docs_test = test_df['Phrase'].values
+    #raw_docs_train = raw_docs_train.decode("utf8")
+    #raw_docs_test = raw_docs_test.decode("utf8")
+    #print(raw_docs_test)
+    sentiment_train = train_df['Sentiment'].values
+    num_labels = len(np.unique(sentiment_train))
+    sentiment_test = test_df['Sentiment'].values
+    #text pre-processing
+    stop_words = set(stopwords.words('french'))
+    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
+    stemmer = SnowballStemmer('french')
+
+    print "pre-processing train docs..."
+    processed_docs_train = []
+    #print(raw_docs_train)
+    #np.savetxt("traindata.txt",raw_docs_train,fmt="%s")
+    for doc in raw_docs_train:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_train.append(stemmed)
+
+    print "pre-processing test docs..."
+    processed_docs_test = []
+    for doc in raw_docs_test:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_test.append(stemmed)
+
+    processed_docs_all = np.concatenate((processed_docs_train, processed_docs_test), axis=0)
+
+    dictionary = corpora.Dictionary(processed_docs_all)
+    dictionary_size = len(dictionary.keys())
+    print "dictionary size: ", dictionary_size
+    #dictionary.save('dictionary.dict')
+    #corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
+
+    print "converting to token ids..."
+    word_id_train, word_id_len = [], []
+    for doc in processed_docs_train:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_train.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    word_id_test, word_ids = [], []
+    for doc in processed_docs_test:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_test.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)
+
+    #pad sequences
+    word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
+    word_id_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
+    print(num_labels)
+    #y_train_enc = np_utils.to_categorical(sentiment_train,)
+    #le = preprocessing.LabelEncoder()
+    #le.fit(sentiment_train)
+    #v = le.transform(sentiment_train)
+
+    y_train_enc = np_utils.to_categorical(sentiment_train)
+
+    #le1 = preprocessing.LabelEncoder()
+    #le1.fit(sentiment_test)
+    #v1 = le1.transform(sentiment_test)
+    y_test_enc = np_utils.to_categorical(sentiment_test)
+
+
+    #LSTM
+    print "fitting LSTM ..."
+    model = Sequential()
+    model.add(Embedding(dictionary_size, 256, dropout=0.2))
+    model.add(LSTM(256, dropout_W=0.2, dropout_U=0.2))
+    model.add(Dense(num_labels))
+    model.add(Activation('softmax'))
+
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+    checkpointer = callbacks.ModelCheckpoint(filepath="logs/checkpoint-{epoch:02d}.hdf5", verbose=1, save_best_only=True, monitor='loss')
+    csv_logger = CSVLogger('logs/training_set_iranalysis1.csv',separator=',', append=False)
+
+    model.fit(word_id_train, y_train_enc, nb_epoch=1000, batch_size=64, validation_data=(word_id_test, y_test_enc), verbose=1, callbacks=[checkpointer,csv_logger])
+    model.save("logs/lstm_model.hdf5")
+    test_pred = model.predict_classes(word_id_test)
+
+    #make a submission
+    #test_df['Sentiment'] = test_pred.reshape(-1,1)
+    #header = ['PhraseId', 'Sentiment']
+    #test_df.to_csv('./lstm_sentiment.csv', columns=header, index=False, header=True)
+    accuracy = accuracy_score(sentiment_test, test_pred)
+    print(accuracy)
+
diff --git a/coset/task1copytest.py b/coset/task1copytest.py
new file mode 100644
index 0000000..55863ca
--- /dev/null
+++ b/coset/task1copytest.py
@@ -0,0 +1,125 @@
+import numpy as np
+import pandas as pd
+
+from gensim import corpora
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import SnowballStemmer
+
+from keras.preprocessing import sequence
+from keras.utils import np_utils
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Embedding
+from keras.layers import LSTM, GRU, SimpleRNN
+from sklearn import preprocessing
+from sklearn.metrics import (precision_score, recall_score,
+                             f1_score, accuracy_score, mean_squared_error, mean_absolute_error)
+np.random.seed(0)
+from keras import callbacks
+from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
+
+if __name__ == "__main__":
+
+    #load data
+    train_df = pd.read_csv('data/task1-train.csv', sep='\t', header=0)
+    test_df = pd.read_csv('cleaned/task1-test.csv', sep='\t', header=0)
+
+    raw_docs_train = train_df['Phrase'].values
+    raw_docs_test = test_df['Phrase'].values
+    #raw_docs_train = raw_docs_train.decode("utf8")
+    #raw_docs_test = raw_docs_test.decode("utf8")
+    #print(raw_docs_test)
+    sentiment_train = train_df['Sentiment'].values
+    num_labels = len(np.unique(sentiment_train))
+    #sentiment_test = test_df.iloc[:,0]
+    #text pre-processing
+    stop_words = set(stopwords.words('french'))
+    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
+    stemmer = SnowballStemmer('french')
+
+    print "pre-processing train docs..."
+    processed_docs_train = []
+    #print(raw_docs_train)
+    #np.savetxt("traindata.txt",raw_docs_train,fmt="%s")
+    for doc in raw_docs_train:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_train.append(stemmed)
+
+    print "pre-processing test docs..."
+    processed_docs_test = []
+    for doc in raw_docs_test:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_test.append(stemmed)
+
+    processed_docs_all = processed_docs_test
+
+    dictionary = corpora.Dictionary(processed_docs_all)
+    dictionary_size = len(dictionary.keys())
+    print "dictionary size: ", dictionary_size
+    #dictionary.save('dictionary.dict')
+    #corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
+    '''
+    print "converting to token ids..."
+    word_id_train, word_id_len = [], []
+    for doc in processed_docs_train:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_train.append(word_ids)
+        word_id_len.append(len(word_ids))
+    '''
+    word_id_test, word_id_len = [], []
+    for doc in processed_docs_test:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_test.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)
+
+    #pad sequences
+    #word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
+    word_id_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
+    print(num_labels)
+    #y_train_enc = np_utils.to_categorical(sentiment_train,)
+    #le = preprocessing.LabelEncoder()
+    #le.fit(sentiment_train)
+    #v = le.transform(sentiment_train)
+    #print(v)
+    #y_train_enc = np_utils.to_categorical(v)
+
+    #le1 = preprocessing.LabelEncoder()
+    #le1.fit(sentiment_test)
+    #v1 = le1.transform(sentiment_test)
+
+
+
+    #LSTM
+    print "fitting LSTM ..."
+    model = Sequential()
+    model.add(Embedding(dictionary_size, 256, dropout=0.2))
+    model.add(LSTM(256, dropout_W=0.2, dropout_U=0.2))
+    model.add(Dense(num_labels))
+    model.add(Activation('softmax'))
+    model.load_weights("logs/checkpoint-589.hdf5")
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+    checkpointer = callbacks.ModelCheckpoint(filepath="logs2/checkpoint-{epoch:02d}.hdf5", verbose=1, save_best_only=True, monitor='loss')
+    csv_logger = CSVLogger('logs1/training_set_iranalysis1.csv',separator=',', append=False)
+
+    #model.fit(word_id_train, y_train_enc, nb_epoch=1000, batch_size=256, validation_split=0.33, verbose=1, callbacks=[checkpointer,csv_logger])
+
+    test_pred = model.predict_classes(word_id_test)
+
+    #make a submission
+    #test_df['Sentiment'] = test_pred.reshape(-1,1)
+    #header = ['PhraseId', 'Sentiment']
+    #test_df.to_csv('./lstm_sentiment.csv', columns=header, index=False, header=True)
+    #accuracy = accuracy_score(v1, test_pred)
+    print(test_pred.shape)
+    print(test_pred)
+    np.savetxt("res/task1-predicted3.txt", test_pred, fmt="%01d")
+
+
diff --git a/coset/task2copy.py b/coset/task2copy.py
new file mode 100644
index 0000000..f5b3dbd
--- /dev/null
+++ b/coset/task2copy.py
@@ -0,0 +1,122 @@
+import numpy as np
+import pandas as pd
+
+from gensim import corpora
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import SnowballStemmer
+
+from keras.preprocessing import sequence
+from keras.utils import np_utils
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Embedding
+from keras.layers import LSTM, SimpleRNN
+from sklearn import preprocessing
+from sklearn.metrics import (precision_score, recall_score,
+                             f1_score, accuracy_score, mean_squared_error, mean_absolute_error)
+np.random.seed(0)
+from keras import callbacks
+from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
+
+if __name__ == "__main__":
+
+    #load data
+    train_df = pd.read_csv('data/task2-train.csv', sep='\t', header=0)
+    test_df = pd.read_csv('data/task2-test.csv', sep='\t', header=0)
+
+    raw_docs_train = train_df['Phrase'].values
+    raw_docs_test = test_df['Phrase'].values
+    #raw_docs_train = raw_docs_train.decode("utf8")
+    #raw_docs_test = raw_docs_test.decode("utf8")
+    #print(raw_docs_test)
+    sentiment_train = train_df['Sentiment'].values
+    num_labels = len(np.unique(sentiment_train))
+    sentiment_test = test_df['Sentiment'].values
+    #text pre-processing
+    stop_words = set(stopwords.words('french'))
+    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
+    stemmer = SnowballStemmer('french')
+
+    print "pre-processing train docs..."
+    processed_docs_train = []
+    #print(raw_docs_train)
+    #np.savetxt("traindata.txt",raw_docs_train,fmt="%s")
+    for doc in raw_docs_train:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_train.append(stemmed)
+
+    print "pre-processing test docs..."
+    processed_docs_test = []
+    for doc in raw_docs_test:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_test.append(stemmed)
+
+    processed_docs_all = np.concatenate((processed_docs_train, processed_docs_test), axis=0)
+
+    dictionary = corpora.Dictionary(processed_docs_all)
+    dictionary_size = len(dictionary.keys())
+    print "dictionary size: ", dictionary_size
+    #dictionary.save('dictionary.dict')
+    #corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
+
+    print "converting to token ids..."
+    word_id_train, word_id_len = [], []
+    for doc in processed_docs_train:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_train.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    word_id_test, word_ids = [], []
+    for doc in processed_docs_test:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_test.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)
+
+    #pad sequences
+    word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
+    word_id_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
+    print(num_labels)
+    #y_train_enc = np_utils.to_categorical(sentiment_train,)
+    #le = preprocessing.LabelEncoder()
+    #le.fit(sentiment_train)
+    #v = le.transform(sentiment_train)
+
+    y_train_enc = np_utils.to_categorical(sentiment_train)
+
+    #le1 = preprocessing.LabelEncoder()
+    #le1.fit(sentiment_test)
+    #v1 = le1.transform(sentiment_test)
+    y_test_enc = np_utils.to_categorical(sentiment_test)
+
+
+    #LSTM
+    print "fitting LSTM ..."
+    model = Sequential()
+    model.add(Embedding(dictionary_size, 256, dropout=0.2))
+    model.add(SimpleRNN(256, dropout_W=0.2, dropout_U=0.2))
+    model.add(Dense(num_labels))
+    model.add(Activation('softmax'))
+
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+    checkpointer = callbacks.ModelCheckpoint(filepath="logs1/checkpoint-{epoch:02d}.hdf5", verbose=1, save_best_only=True, monitor='loss')
+    csv_logger = CSVLogger('logs1/training_set_iranalysis1.csv',separator=',', append=False)
+
+    model.fit(word_id_train, y_train_enc, nb_epoch=1000, batch_size=256, validation_data=(word_id_test, y_test_enc), verbose=1, callbacks=[checkpointer,csv_logger])
+    model.save("logs1/rnn_model.hdf5")
+    test_pred = model.predict_classes(word_id_test)
+
+    #make a submission
+    #test_df['Sentiment'] = test_pred.reshape(-1,1)
+    #header = ['PhraseId', 'Sentiment']
+    #test_df.to_csv('./lstm_sentiment.csv', columns=header, index=False, header=True)
+    accuracy = accuracy_score(sentiment_test, test_pred)
+    print(accuracy)
+
diff --git a/coset/task2copytest.py b/coset/task2copytest.py
new file mode 100644
index 0000000..b277754
--- /dev/null
+++ b/coset/task2copytest.py
@@ -0,0 +1,125 @@
+import numpy as np
+import pandas as pd
+
+from gensim import corpora
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import SnowballStemmer
+
+from keras.preprocessing import sequence
+from keras.utils import np_utils
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Embedding
+from keras.layers import LSTM, GRU, SimpleRNN
+from sklearn import preprocessing
+from sklearn.metrics import (precision_score, recall_score,
+                             f1_score, accuracy_score, mean_squared_error, mean_absolute_error)
+np.random.seed(0)
+from keras import callbacks
+from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
+
+if __name__ == "__main__":
+
+    #load data
+    train_df = pd.read_csv('data/task2-train.csv', sep='\t', header=0)
+    test_df = pd.read_csv('cleaned/task2-test.csv', sep='\t', header=0)
+
+    raw_docs_train = train_df['Phrase'].values
+    raw_docs_test = test_df['Phrase'].values
+    #raw_docs_train = raw_docs_train.decode("utf8")
+    #raw_docs_test = raw_docs_test.decode("utf8")
+    #print(raw_docs_test)
+    sentiment_train = train_df['Sentiment'].values
+    num_labels = len(np.unique(sentiment_train))
+    #sentiment_test = test_df.iloc[:,0]
+    #text pre-processing
+    stop_words = set(stopwords.words('french'))
+    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
+    stemmer = SnowballStemmer('french')
+
+    print "pre-processing train docs..."
+    processed_docs_train = []
+    #print(raw_docs_train)
+    #np.savetxt("traindata.txt",raw_docs_train,fmt="%s")
+    for doc in raw_docs_train:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_train.append(stemmed)
+
+    print "pre-processing test docs..."
+    processed_docs_test = []
+    for doc in raw_docs_test:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_test.append(stemmed)
+
+    processed_docs_all = processed_docs_test
+
+    dictionary = corpora.Dictionary(processed_docs_all)
+    dictionary_size = len(dictionary.keys())
+    print "dictionary size: ", dictionary_size
+    #dictionary.save('dictionary.dict')
+    #corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
+    '''
+    print "converting to token ids..."
+    word_id_train, word_id_len = [], []
+    for doc in processed_docs_train:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_train.append(word_ids)
+        word_id_len.append(len(word_ids))
+    '''
+    word_id_test, word_id_len = [], []
+    for doc in processed_docs_test:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_test.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)
+
+    #pad sequences
+    #word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
+    word_id_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
+    print(num_labels)
+    #y_train_enc = np_utils.to_categorical(sentiment_train,)
+    #le = preprocessing.LabelEncoder()
+    #le.fit(sentiment_train)
+    #v = le.transform(sentiment_train)
+    #print(v)
+    #y_train_enc = np_utils.to_categorical(v)
+
+    #le1 = preprocessing.LabelEncoder()
+    #le1.fit(sentiment_test)
+    #v1 = le1.transform(sentiment_test)
+
+
+
+    #LSTM
+    print "fitting LSTM ..."
+    model = Sequential()
+    model.add(Embedding(dictionary_size, 256, dropout=0.2))
+    model.add(SimpleRNN(256, dropout_W=0.2, dropout_U=0.2))
+    model.add(Dense(num_labels))
+    model.add(Activation('softmax'))
+    model.load_weights("logs1/rnn_model.hdf5")
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+    checkpointer = callbacks.ModelCheckpoint(filepath="logs2/checkpoint-{epoch:02d}.hdf5", verbose=1, save_best_only=True, monitor='loss')
+    csv_logger = CSVLogger('logs1/training_set_iranalysis1.csv',separator=',', append=False)
+
+    #model.fit(word_id_train, y_train_enc, nb_epoch=1000, batch_size=256, validation_split=0.33, verbose=1, callbacks=[checkpointer,csv_logger])
+
+    test_pred = model.predict_classes(word_id_test)
+
+    #make a submission
+    #test_df['Sentiment'] = test_pred.reshape(-1,1)
+    #header = ['PhraseId', 'Sentiment']
+    #test_df.to_csv('./lstm_sentiment.csv', columns=header, index=False, header=True)
+    #accuracy = accuracy_score(v1, test_pred)
+    print(test_pred.shape)
+    print(test_pred)
+    np.savetxt("res/task2-predicted3.txt", test_pred, fmt="%01d")
+
+
diff --git a/coset/task3copy.py b/coset/task3copy.py
new file mode 100644
index 0000000..2b86846
--- /dev/null
+++ b/coset/task3copy.py
@@ -0,0 +1,124 @@
+import numpy as np
+import pandas as pd
+
+from gensim import corpora
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import SnowballStemmer
+
+from keras.preprocessing import sequence
+from keras.utils import np_utils
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Embedding
+from keras.layers import LSTM, GRU
+from sklearn import preprocessing
+from sklearn.metrics import (precision_score, recall_score,
+                             f1_score, accuracy_score, mean_squared_error, mean_absolute_error)
+np.random.seed(0)
+from keras import callbacks
+from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
+
+if __name__ == "__main__":
+
+    #load data
+    train_df = pd.read_csv('data/task3-train.csv', sep='\t', header=0)
+    test_df = pd.read_csv('data/task3-test.csv', sep='\t', header=0)
+
+    raw_docs_train = train_df['Phrase'].values
+    raw_docs_test = test_df['Phrase'].values
+    #raw_docs_train = raw_docs_train.decode("utf8")
+    #raw_docs_test = raw_docs_test.decode("utf8")
+    #print(raw_docs_test)
+    sentiment_train = train_df['Sentiment'].values
+    num_labels = len(np.unique(sentiment_train))
+    sentiment_test = test_df['Sentiment'].values
+    #text pre-processing
+    stop_words = set(stopwords.words('french'))
+    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
+    stemmer = SnowballStemmer('french')
+
+    print "pre-processing train docs..."
+    processed_docs_train = []
+    #print(raw_docs_train)
+    #np.savetxt("traindata.txt",raw_docs_train,fmt="%s")
+    for doc in raw_docs_train:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_train.append(stemmed)
+
+    print "pre-processing test docs..."
+    processed_docs_test = []
+    for doc in raw_docs_test:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_test.append(stemmed)
+
+    processed_docs_all = np.concatenate((processed_docs_train, processed_docs_test), axis=0)
+
+    dictionary = corpora.Dictionary(processed_docs_all)
+    dictionary_size = len(dictionary.keys())
+    print "dictionary size: ", dictionary_size
+    #dictionary.save('dictionary.dict')
+    #corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
+
+    print "converting to token ids..."
+    word_id_train, word_id_len = [], []
+    for doc in processed_docs_train:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_train.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    word_id_test, word_ids = [], []
+    for doc in processed_docs_test:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_test.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)
+
+    #pad sequences
+    word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
+    word_id_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
+    print(num_labels)
+    #y_train_enc = np_utils.to_categorical(sentiment_train,)
+    #le = preprocessing.LabelEncoder()
+    #le.fit(sentiment_train)
+    #v = le.transform(sentiment_train)
+
+    y_train_enc = np_utils.to_categorical(sentiment_train)
+
+    #le1 = preprocessing.LabelEncoder()
+    #le1.fit(sentiment_test)
+    #v1 = le1.transform(sentiment_test)
+
+    y_test_enc = np_utils.to_categorical(sentiment_test)
+
+
+    #LSTM
+    print "fitting LSTM ..."
+    model = Sequential()
+    model.add(Embedding(dictionary_size, 256, dropout=0.2))
+    model.add(GRU(256, dropout_W=0.2, dropout_U=0.2))
+    model.add(Dense(num_labels))
+    model.add(Activation('softmax'))
+
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+    checkpointer = callbacks.ModelCheckpoint(filepath="logs2/checkpoint-{epoch:02d}.hdf5", verbose=1, save_best_only=True, monitor='loss')
+    csv_logger = CSVLogger('logs2/training_set_iranalysis1.csv',separator=',', append=False)
+
+    model.fit(word_id_train, y_train_enc, nb_epoch=1000, batch_size=256, validation_data=(word_id_test, y_test_enc), verbose=1, callbacks=[checkpointer,csv_logger])
+    model.save("logs2/gru_model.hdf5")
+
+    test_pred = model.predict_classes(word_id_test)
+
+    #make a submission
+    #test_df['Sentiment'] = test_pred.reshape(-1,1)
+    #header = ['PhraseId', 'Sentiment']
+    #test_df.to_csv('./lstm_sentiment.csv', columns=header, index=False, header=True)
+    accuracy = accuracy_score(sentiment_test, test_pred)
+    print(accuracy)
+
diff --git a/coset/task3copytest.py b/coset/task3copytest.py
new file mode 100644
index 0000000..29d30bc
--- /dev/null
+++ b/coset/task3copytest.py
@@ -0,0 +1,124 @@
+import numpy as np
+import pandas as pd
+
+from gensim import corpora
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import SnowballStemmer
+
+from keras.preprocessing import sequence
+from keras.utils import np_utils
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Embedding
+from keras.layers import LSTM, GRU
+from sklearn import preprocessing
+from sklearn.metrics import (precision_score, recall_score,
+                             f1_score, accuracy_score, mean_squared_error, mean_absolute_error)
+np.random.seed(0)
+from keras import callbacks
+from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
+
+if __name__ == "__main__":
+
+    #load data
+    train_df = pd.read_csv('data/task3-train.csv', sep='\t', header=0)
+    test_df = pd.read_csv('cleaned/task3-test.csv', sep='\t', header=0)
+
+    raw_docs_train = train_df['Phrase'].values
+    raw_docs_test = test_df['Phrase'].values
+    #raw_docs_train = raw_docs_train.decode("utf8")
+    #raw_docs_test = raw_docs_test.decode("utf8")
+    #print(raw_docs_test)
+    sentiment_train = train_df['Sentiment'].values
+    num_labels = len(np.unique(sentiment_train))
+    #sentiment_test = test_df.iloc[:,0]
+    #text pre-processing
+    stop_words = set(stopwords.words('french'))
+    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
+    stemmer = SnowballStemmer('french')
+
+    print "pre-processing train docs..."
+    processed_docs_train = []
+    #print(raw_docs_train)
+    #np.savetxt("traindata.txt",raw_docs_train,fmt="%s")
+    for doc in raw_docs_train:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_train.append(stemmed)
+
+    print "pre-processing test docs..."
+    processed_docs_test = []
+    for doc in raw_docs_test:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_test.append(stemmed)
+
+    processed_docs_all = processed_docs_test
+
+    dictionary = corpora.Dictionary(processed_docs_all)
+    dictionary_size = len(dictionary.keys())
+    print "dictionary size: ", dictionary_size
+    #dictionary.save('dictionary.dict')
+    #corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
+    '''
+    print "converting to token ids..."
+    word_id_train, word_id_len = [], []
+    for doc in processed_docs_train:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_train.append(word_ids)
+        word_id_len.append(len(word_ids))
+    '''
+    word_id_test, word_id_len = [], []
+    for doc in processed_docs_test:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_test.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)
+
+    #pad sequences
+    #word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
+    word_id_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
+    print(num_labels)
+    #y_train_enc = np_utils.to_categorical(sentiment_train,)
+    #le = preprocessing.LabelEncoder()
+    #le.fit(sentiment_train)
+    #v = le.transform(sentiment_train)
+    #print(v)
+    #y_train_enc = np_utils.to_categorical(v)
+
+    #le1 = preprocessing.LabelEncoder()
+    #le1.fit(sentiment_test)
+    #v1 = le1.transform(sentiment_test)
+
+
+
+    #LSTM
+    print "fitting LSTM ..."
+    model = Sequential()
+    model.add(Embedding(dictionary_size, 256, dropout=0.2))
+    model.add(GRU(256, dropout_W=0.2, dropout_U=0.2))
+    model.add(Dense(num_labels))
+    model.add(Activation('softmax'))
+    model.load_weights("logs2/gru_model.hdf5")
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+    checkpointer = callbacks.ModelCheckpoint(filepath="logs2/checkpoint-{epoch:02d}.hdf5", verbose=1, save_best_only=True, monitor='loss')
+    csv_logger = CSVLogger('logs2/training_set_iranalysis1.csv',separator=',', append=False)
+
+    #model.fit(word_id_train, y_train_enc, nb_epoch=1000, batch_size=256, validation_split=0.33, verbose=1, callbacks=[checkpointer,csv_logger])
+
+    test_pred = model.predict_classes(word_id_test)
+
+    #make a submission
+    #test_df['Sentiment'] = test_pred.reshape(-1,1)
+    #header = ['PhraseId', 'Sentiment']
+    #test_df.to_csv('./lstm_sentiment.csv', columns=header, index=False, header=True)
+    #accuracy = accuracy_score(v1, test_pred)
+    print(test_pred.shape)
+    print(test_pred)
+    np.savetxt("res/task3-predicted3.txt", test_pred, fmt="%01d")
+