From 38e411c37ceed561910a040d82fdf7797288ae14 Mon Sep 17 00:00:00 2001
From: Vinayakumar R
Date: Tue, 23 Jan 2018 23:48:35 +0530
Subject: [PATCH] Add files via upload

---
 coset/task1copy.py     | 122 ++++++++++++++++++++++++++++++++++++++++
 coset/task1copytest.py | 125 +++++++++++++++++++++++++++++++++++++++++
 coset/task2copy.py     | 122 ++++++++++++++++++++++++++++++++++++++++
 coset/task2copytest.py | 125 +++++++++++++++++++++++++++++++++++++++++
 coset/task3copy.py     | 124 ++++++++++++++++++++++++++++++++++++++++
 coset/task3copytest.py | 124 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 742 insertions(+)
 create mode 100644 coset/task1copy.py
 create mode 100644 coset/task1copytest.py
 create mode 100644 coset/task2copy.py
 create mode 100644 coset/task2copytest.py
 create mode 100644 coset/task3copy.py
 create mode 100644 coset/task3copytest.py

diff --git a/coset/task1copy.py b/coset/task1copy.py
new file mode 100644
index 0000000..bd7939c
--- /dev/null
+++ b/coset/task1copy.py
@@ -0,0 +1,122 @@
+import numpy as np
+import pandas as pd
+
+from gensim import corpora
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import SnowballStemmer
+
+from keras.preprocessing import sequence
+from keras.utils import np_utils
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Embedding
+from keras.layers import LSTM
+from sklearn import preprocessing
+from sklearn.metrics import (precision_score, recall_score,
+                             f1_score, accuracy_score, mean_squared_error, mean_absolute_error)
+np.random.seed(0)
+from keras import callbacks
+from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
+
+if __name__ == "__main__":
+
+    #load data
+    train_df = pd.read_csv('data/task1-train.csv', sep='\t', header=0)
+    test_df = pd.read_csv('data/task1-test.csv', sep='\t', header=0)
+
+    raw_docs_train = train_df['Phrase'].values
+    raw_docs_test = test_df['Phrase'].values
+    #raw_docs_train = raw_docs_train.decode("utf8")
+    #raw_docs_test = raw_docs_test.decode("utf8")
+    #print(raw_docs_test)
+    sentiment_train = train_df['Sentiment'].values
+    num_labels = len(np.unique(sentiment_train))
+    sentiment_test = test_df['Sentiment'].values
+    #text pre-processing
+    stop_words = set(stopwords.words('french'))
+    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
+    stemmer = SnowballStemmer('french')
+
+    print "pre-processing train docs..."
+    processed_docs_train = []
+    #print(raw_docs_train)
+    #np.savetxt("traindata.txt",raw_docs_train,fmt="%s")
+    for doc in raw_docs_train:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_train.append(stemmed)
+
+    print "pre-processing test docs..."
+    processed_docs_test = []
+    for doc in raw_docs_test:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_test.append(stemmed)
+
+    processed_docs_all = np.concatenate((processed_docs_train, processed_docs_test), axis=0)
+
+    dictionary = corpora.Dictionary(processed_docs_all)
+    dictionary_size = len(dictionary.keys())
+    print "dictionary size: ", dictionary_size
+    #dictionary.save('dictionary.dict')
+    #corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
+
+    print "converting to token ids..."
+    word_id_train, word_id_len = [], []
+    for doc in processed_docs_train:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_train.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    word_id_test, word_ids = [], []
+    for doc in processed_docs_test:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_test.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)
+
+    #pad sequences
+    word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
+    word_id_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
+    print(num_labels)
+    #y_train_enc = np_utils.to_categorical(sentiment_train,)
+    #le = preprocessing.LabelEncoder()
+    #le.fit(sentiment_train)
+    #v = le.transform(sentiment_train)
+
+    y_train_enc = np_utils.to_categorical(sentiment_train)
+
+    #le1 = preprocessing.LabelEncoder()
+    #le1.fit(sentiment_test)
+    #v1 = le1.transform(sentiment_test)
+    y_test_enc = np_utils.to_categorical(sentiment_test)
+
+
+    #LSTM
+    print "fitting LSTM ..."
+    model = Sequential()
+    model.add(Embedding(dictionary_size, 256, dropout=0.2))
+    model.add(LSTM(256, dropout_W=0.2, dropout_U=0.2))
+    model.add(Dense(num_labels))
+    model.add(Activation('softmax'))
+
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+    checkpointer = callbacks.ModelCheckpoint(filepath="logs/checkpoint-{epoch:02d}.hdf5", verbose=1, save_best_only=True, monitor='loss')
+    csv_logger = CSVLogger('logs/training_set_iranalysis1.csv',separator=',', append=False)
+
+    model.fit(word_id_train, y_train_enc, nb_epoch=1000, batch_size=64, validation_data=(word_id_test, y_test_enc), verbose=1, callbacks=[checkpointer,csv_logger])
+    model.save("logs/lstm_model.hdf5")
+    test_pred = model.predict_classes(word_id_test)
+
+    #make a submission
+    #test_df['Sentiment'] = test_pred.reshape(-1,1)
+    #header = ['PhraseId', 'Sentiment']
+    #test_df.to_csv('./lstm_sentiment.csv', columns=header, index=False, header=True)
+    accuracy = accuracy_score(sentiment_test, test_pred)
+    print(accuracy)
+
diff --git a/coset/task1copytest.py b/coset/task1copytest.py
new file mode 100644
index 0000000..55863ca
--- /dev/null
+++ b/coset/task1copytest.py
@@ -0,0 +1,125 @@
+import numpy as np
+import pandas as pd
+
+from gensim import corpora
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import SnowballStemmer
+
+from keras.preprocessing import sequence
+from keras.utils import np_utils
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Embedding
+from keras.layers import LSTM, GRU, SimpleRNN
+from sklearn import preprocessing
+from sklearn.metrics import (precision_score, recall_score,
+                             f1_score, accuracy_score, mean_squared_error, mean_absolute_error)
+np.random.seed(0)
+from keras import callbacks
+from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
+
+if __name__ == "__main__":
+
+    #load data
+    train_df = pd.read_csv('data/task1-train.csv', sep='\t', header=0)
+    test_df = pd.read_csv('cleaned/task1-test.csv', sep='\t', header=0)
+
+    raw_docs_train = train_df['Phrase'].values
+    raw_docs_test = test_df['Phrase'].values
+    #raw_docs_train = raw_docs_train.decode("utf8")
+    #raw_docs_test = raw_docs_test.decode("utf8")
+    #print(raw_docs_test)
+    sentiment_train = train_df['Sentiment'].values
+    num_labels = len(np.unique(sentiment_train))
+    #sentiment_test = test_df.iloc[:,0]
+    #text pre-processing
+    stop_words = set(stopwords.words('french'))
+    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
+    stemmer = SnowballStemmer('french')
+
+    print "pre-processing train docs..."
+    processed_docs_train = []
+    #print(raw_docs_train)
+    #np.savetxt("traindata.txt",raw_docs_train,fmt="%s")
+    for doc in raw_docs_train:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_train.append(stemmed)
+
+    print "pre-processing test docs..."
+    processed_docs_test = []
+    for doc in raw_docs_test:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_test.append(stemmed)
+
+    processed_docs_all = processed_docs_test
+
+    dictionary = corpora.Dictionary(processed_docs_all)
+    dictionary_size = len(dictionary.keys())
+    print "dictionary size: ", dictionary_size
+    #dictionary.save('dictionary.dict')
+    #corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
+    '''
+    print "converting to token ids..."
+    word_id_train, word_id_len = [], []
+    for doc in processed_docs_train:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_train.append(word_ids)
+        word_id_len.append(len(word_ids))
+    '''
+    word_id_test, word_id_len = [], []
+    for doc in processed_docs_test:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_test.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)
+
+    #pad sequences
+    #word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
+    word_id_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
+    print(num_labels)
+    #y_train_enc = np_utils.to_categorical(sentiment_train,)
+    #le = preprocessing.LabelEncoder()
+    #le.fit(sentiment_train)
+    #v = le.transform(sentiment_train)
+    #print(v)
+    #y_train_enc = np_utils.to_categorical(v)
+
+    #le1 = preprocessing.LabelEncoder()
+    #le1.fit(sentiment_test)
+    #v1 = le1.transform(sentiment_test)
+
+
+
+    #LSTM
+    print "fitting LSTM ..."
+    model = Sequential()
+    model.add(Embedding(dictionary_size, 256, dropout=0.2))
+    model.add(LSTM(256, dropout_W=0.2, dropout_U=0.2))
+    model.add(Dense(num_labels))
+    model.add(Activation('softmax'))
+    model.load_weights("logs/checkpoint-589.hdf5")
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+    checkpointer = callbacks.ModelCheckpoint(filepath="logs2/checkpoint-{epoch:02d}.hdf5", verbose=1, save_best_only=True, monitor='loss')
+    csv_logger = CSVLogger('logs1/training_set_iranalysis1.csv',separator=',', append=False)
+
+    #model.fit(word_id_train, y_train_enc, nb_epoch=1000, batch_size=256, validation_split=0.33, verbose=1, callbacks=[checkpointer,csv_logger])
+
+    test_pred = model.predict_classes(word_id_test)
+
+    #make a submission
+    #test_df['Sentiment'] = test_pred.reshape(-1,1)
+    #header = ['PhraseId', 'Sentiment']
+    #test_df.to_csv('./lstm_sentiment.csv', columns=header, index=False, header=True)
+    #accuracy = accuracy_score(v1, test_pred)
+    print(test_pred.shape)
+    print(test_pred)
+    np.savetxt("res/task1-predicted3.txt", test_pred, fmt="%01d")
+
+
diff --git a/coset/task2copy.py b/coset/task2copy.py
new file mode 100644
index 0000000..f5b3dbd
--- /dev/null
+++ b/coset/task2copy.py
@@ -0,0 +1,122 @@
+import numpy as np
+import pandas as pd
+
+from gensim import corpora
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import SnowballStemmer
+
+from keras.preprocessing import sequence
+from keras.utils import np_utils
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Embedding
+from keras.layers import LSTM, SimpleRNN
+from sklearn import preprocessing
+from sklearn.metrics import (precision_score, recall_score,
+                             f1_score, accuracy_score, mean_squared_error, mean_absolute_error)
+np.random.seed(0)
+from keras import callbacks
+from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
+
+if __name__ == "__main__":
+
+    #load data
+    train_df = pd.read_csv('data/task2-train.csv', sep='\t', header=0)
+    test_df = pd.read_csv('data/task2-test.csv', sep='\t', header=0)
+
+    raw_docs_train = train_df['Phrase'].values
+    raw_docs_test = test_df['Phrase'].values
+    #raw_docs_train = raw_docs_train.decode("utf8")
+    #raw_docs_test = raw_docs_test.decode("utf8")
+    #print(raw_docs_test)
+    sentiment_train = train_df['Sentiment'].values
+    num_labels = len(np.unique(sentiment_train))
+    sentiment_test = test_df['Sentiment'].values
+    #text pre-processing
+    stop_words = set(stopwords.words('french'))
+    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
+    stemmer = SnowballStemmer('french')
+
+    print "pre-processing train docs..."
+    processed_docs_train = []
+    #print(raw_docs_train)
+    #np.savetxt("traindata.txt",raw_docs_train,fmt="%s")
+    for doc in raw_docs_train:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_train.append(stemmed)
+
+    print "pre-processing test docs..."
+    processed_docs_test = []
+    for doc in raw_docs_test:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_test.append(stemmed)
+
+    processed_docs_all = np.concatenate((processed_docs_train, processed_docs_test), axis=0)
+
+    dictionary = corpora.Dictionary(processed_docs_all)
+    dictionary_size = len(dictionary.keys())
+    print "dictionary size: ", dictionary_size
+    #dictionary.save('dictionary.dict')
+    #corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
+
+    print "converting to token ids..."
+    word_id_train, word_id_len = [], []
+    for doc in processed_docs_train:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_train.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    word_id_test, word_ids = [], []
+    for doc in processed_docs_test:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_test.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)
+
+    #pad sequences
+    word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
+    word_id_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
+    print(num_labels)
+    #y_train_enc = np_utils.to_categorical(sentiment_train,)
+    #le = preprocessing.LabelEncoder()
+    #le.fit(sentiment_train)
+    #v = le.transform(sentiment_train)
+
+    y_train_enc = np_utils.to_categorical(sentiment_train)
+
+    #le1 = preprocessing.LabelEncoder()
+    #le1.fit(sentiment_test)
+    #v1 = le1.transform(sentiment_test)
+    y_test_enc = np_utils.to_categorical(sentiment_test)
+
+
+    #LSTM
+    print "fitting LSTM ..."
+    model = Sequential()
+    model.add(Embedding(dictionary_size, 256, dropout=0.2))
+    model.add(SimpleRNN(256, dropout_W=0.2, dropout_U=0.2))
+    model.add(Dense(num_labels))
+    model.add(Activation('softmax'))
+
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+    checkpointer = callbacks.ModelCheckpoint(filepath="logs1/checkpoint-{epoch:02d}.hdf5", verbose=1, save_best_only=True, monitor='loss')
+    csv_logger = CSVLogger('logs1/training_set_iranalysis1.csv',separator=',', append=False)
+
+    model.fit(word_id_train, y_train_enc, nb_epoch=1000, batch_size=256, validation_data=(word_id_test, y_test_enc), verbose=1, callbacks=[checkpointer,csv_logger])
+    model.save("logs1/rnn_model.hdf5")
+    test_pred = model.predict_classes(word_id_test)
+
+    #make a submission
+    #test_df['Sentiment'] = test_pred.reshape(-1,1)
+    #header = ['PhraseId', 'Sentiment']
+    #test_df.to_csv('./lstm_sentiment.csv', columns=header, index=False, header=True)
+    accuracy = accuracy_score(sentiment_test, test_pred)
+    print(accuracy)
+
diff --git a/coset/task2copytest.py b/coset/task2copytest.py
new file mode 100644
index 0000000..b277754
--- /dev/null
+++ b/coset/task2copytest.py
@@ -0,0 +1,125 @@
+import numpy as np
+import pandas as pd
+
+from gensim import corpora
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import SnowballStemmer
+
+from keras.preprocessing import sequence
+from keras.utils import np_utils
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Embedding
+from keras.layers import LSTM, GRU, SimpleRNN
+from sklearn import preprocessing
+from sklearn.metrics import (precision_score, recall_score,
+                             f1_score, accuracy_score, mean_squared_error, mean_absolute_error)
+np.random.seed(0)
+from keras import callbacks
+from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
+
+if __name__ == "__main__":
+
+    #load data
+    train_df = pd.read_csv('data/task2-train.csv', sep='\t', header=0)
+    test_df = pd.read_csv('cleaned/task2-test.csv', sep='\t', header=0)
+
+    raw_docs_train = train_df['Phrase'].values
+    raw_docs_test = test_df['Phrase'].values
+    #raw_docs_train = raw_docs_train.decode("utf8")
+    #raw_docs_test = raw_docs_test.decode("utf8")
+    #print(raw_docs_test)
+    sentiment_train = train_df['Sentiment'].values
+    num_labels = len(np.unique(sentiment_train))
+    #sentiment_test = test_df.iloc[:,0]
+    #text pre-processing
+    stop_words = set(stopwords.words('french'))
+    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
+    stemmer = SnowballStemmer('french')
+
+    print "pre-processing train docs..."
+    processed_docs_train = []
+    #print(raw_docs_train)
+    #np.savetxt("traindata.txt",raw_docs_train,fmt="%s")
+    for doc in raw_docs_train:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_train.append(stemmed)
+
+    print "pre-processing test docs..."
+    processed_docs_test = []
+    for doc in raw_docs_test:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_test.append(stemmed)
+
+    processed_docs_all = processed_docs_test
+
+    dictionary = corpora.Dictionary(processed_docs_all)
+    dictionary_size = len(dictionary.keys())
+    print "dictionary size: ", dictionary_size
+    #dictionary.save('dictionary.dict')
+    #corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
+    '''
+    print "converting to token ids..."
+    word_id_train, word_id_len = [], []
+    for doc in processed_docs_train:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_train.append(word_ids)
+        word_id_len.append(len(word_ids))
+    '''
+    word_id_test, word_id_len = [], []
+    for doc in processed_docs_test:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_test.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)
+
+    #pad sequences
+    #word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
+    word_id_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
+    print(num_labels)
+    #y_train_enc = np_utils.to_categorical(sentiment_train,)
+    #le = preprocessing.LabelEncoder()
+    #le.fit(sentiment_train)
+    #v = le.transform(sentiment_train)
+    #print(v)
+    #y_train_enc = np_utils.to_categorical(v)
+
+    #le1 = preprocessing.LabelEncoder()
+    #le1.fit(sentiment_test)
+    #v1 = le1.transform(sentiment_test)
+
+
+
+    #LSTM
+    print "fitting LSTM ..."
+    model = Sequential()
+    model.add(Embedding(dictionary_size, 256, dropout=0.2))
+    model.add(SimpleRNN(256, dropout_W=0.2, dropout_U=0.2))
+    model.add(Dense(num_labels))
+    model.add(Activation('softmax'))
+    model.load_weights("logs1/rnn_model.hdf5")
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+    checkpointer = callbacks.ModelCheckpoint(filepath="logs2/checkpoint-{epoch:02d}.hdf5", verbose=1, save_best_only=True, monitor='loss')
+    csv_logger = CSVLogger('logs1/training_set_iranalysis1.csv',separator=',', append=False)
+
+    #model.fit(word_id_train, y_train_enc, nb_epoch=1000, batch_size=256, validation_split=0.33, verbose=1, callbacks=[checkpointer,csv_logger])
+
+    test_pred = model.predict_classes(word_id_test)
+
+    #make a submission
+    #test_df['Sentiment'] = test_pred.reshape(-1,1)
+    #header = ['PhraseId', 'Sentiment']
+    #test_df.to_csv('./lstm_sentiment.csv', columns=header, index=False, header=True)
+    #accuracy = accuracy_score(v1, test_pred)
+    print(test_pred.shape)
+    print(test_pred)
+    np.savetxt("res/task2-predicted3.txt", test_pred, fmt="%01d")
+
+
diff --git a/coset/task3copy.py b/coset/task3copy.py
new file mode 100644
index 0000000..2b86846
--- /dev/null
+++ b/coset/task3copy.py
@@ -0,0 +1,124 @@
+import numpy as np
+import pandas as pd
+
+from gensim import corpora
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import SnowballStemmer
+
+from keras.preprocessing import sequence
+from keras.utils import np_utils
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Embedding
+from keras.layers import LSTM, GRU
+from sklearn import preprocessing
+from sklearn.metrics import (precision_score, recall_score,
+                             f1_score, accuracy_score, mean_squared_error, mean_absolute_error)
+np.random.seed(0)
+from keras import callbacks
+from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
+
+if __name__ == "__main__":
+
+    #load data
+    train_df = pd.read_csv('data/task3-train.csv', sep='\t', header=0)
+    test_df = pd.read_csv('data/task3-test.csv', sep='\t', header=0)
+
+    raw_docs_train = train_df['Phrase'].values
+    raw_docs_test = test_df['Phrase'].values
+    #raw_docs_train = raw_docs_train.decode("utf8")
+    #raw_docs_test = raw_docs_test.decode("utf8")
+    #print(raw_docs_test)
+    sentiment_train = train_df['Sentiment'].values
+    num_labels = len(np.unique(sentiment_train))
+    sentiment_test = test_df['Sentiment'].values
+    #text pre-processing
+    stop_words = set(stopwords.words('french'))
+    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
+    stemmer = SnowballStemmer('french')
+
+    print "pre-processing train docs..."
+    processed_docs_train = []
+    #print(raw_docs_train)
+    #np.savetxt("traindata.txt",raw_docs_train,fmt="%s")
+    for doc in raw_docs_train:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_train.append(stemmed)
+
+    print "pre-processing test docs..."
+    processed_docs_test = []
+    for doc in raw_docs_test:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_test.append(stemmed)
+
+    processed_docs_all = np.concatenate((processed_docs_train, processed_docs_test), axis=0)
+
+    dictionary = corpora.Dictionary(processed_docs_all)
+    dictionary_size = len(dictionary.keys())
+    print "dictionary size: ", dictionary_size
+    #dictionary.save('dictionary.dict')
+    #corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
+
+    print "converting to token ids..."
+    word_id_train, word_id_len = [], []
+    for doc in processed_docs_train:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_train.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    word_id_test, word_ids = [], []
+    for doc in processed_docs_test:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_test.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)
+
+    #pad sequences
+    word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
+    word_id_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
+    print(num_labels)
+    #y_train_enc = np_utils.to_categorical(sentiment_train,)
+    #le = preprocessing.LabelEncoder()
+    #le.fit(sentiment_train)
+    #v = le.transform(sentiment_train)
+
+    y_train_enc = np_utils.to_categorical(sentiment_train)
+
+    #le1 = preprocessing.LabelEncoder()
+    #le1.fit(sentiment_test)
+    #v1 = le1.transform(sentiment_test)
+
+    y_test_enc = np_utils.to_categorical(sentiment_test)
+
+
+    #LSTM
+    print "fitting LSTM ..."
+    model = Sequential()
+    model.add(Embedding(dictionary_size, 256, dropout=0.2))
+    model.add(GRU(256, dropout_W=0.2, dropout_U=0.2))
+    model.add(Dense(num_labels))
+    model.add(Activation('softmax'))
+
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+    checkpointer = callbacks.ModelCheckpoint(filepath="logs2/checkpoint-{epoch:02d}.hdf5", verbose=1, save_best_only=True, monitor='loss')
+    csv_logger = CSVLogger('logs2/training_set_iranalysis1.csv',separator=',', append=False)
+
+    model.fit(word_id_train, y_train_enc, nb_epoch=1000, batch_size=256, validation_data=(word_id_test, y_test_enc), verbose=1, callbacks=[checkpointer,csv_logger])
+    model.save("logs2/gru_model.hdf5")
+
+    test_pred = model.predict_classes(word_id_test)
+
+    #make a submission
+    #test_df['Sentiment'] = test_pred.reshape(-1,1)
+    #header = ['PhraseId', 'Sentiment']
+    #test_df.to_csv('./lstm_sentiment.csv', columns=header, index=False, header=True)
+    accuracy = accuracy_score(sentiment_test, test_pred)
+    print(accuracy)
+
diff --git a/coset/task3copytest.py b/coset/task3copytest.py
new file mode 100644
index 0000000..29d30bc
--- /dev/null
+++ b/coset/task3copytest.py
@@ -0,0 +1,124 @@
+import numpy as np
+import pandas as pd
+
+from gensim import corpora
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import SnowballStemmer
+
+from keras.preprocessing import sequence
+from keras.utils import np_utils
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Embedding
+from keras.layers import LSTM, GRU
+from sklearn import preprocessing
+from sklearn.metrics import (precision_score, recall_score,
+                             f1_score, accuracy_score, mean_squared_error, mean_absolute_error)
+np.random.seed(0)
+from keras import callbacks
+from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
+
+if __name__ == "__main__":
+
+    #load data
+    train_df = pd.read_csv('data/task3-train.csv', sep='\t', header=0)
+    test_df = pd.read_csv('cleaned/task3-test.csv', sep='\t', header=0)
+
+    raw_docs_train = train_df['Phrase'].values
+    raw_docs_test = test_df['Phrase'].values
+    #raw_docs_train = raw_docs_train.decode("utf8")
+    #raw_docs_test = raw_docs_test.decode("utf8")
+    #print(raw_docs_test)
+    sentiment_train = train_df['Sentiment'].values
+    num_labels = len(np.unique(sentiment_train))
+    #sentiment_test = test_df.iloc[:,0]
+    #text pre-processing
+    stop_words = set(stopwords.words('french'))
+    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
+    stemmer = SnowballStemmer('french')
+
+    print "pre-processing train docs..."
+    processed_docs_train = []
+    #print(raw_docs_train)
+    #np.savetxt("traindata.txt",raw_docs_train,fmt="%s")
+    for doc in raw_docs_train:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_train.append(stemmed)
+
+    print "pre-processing test docs..."
+    processed_docs_test = []
+    for doc in raw_docs_test:
+        doc = doc.decode("utf8")
+        tokens = word_tokenize(doc)
+        filtered = [word for word in tokens if word not in stop_words]
+        stemmed = [stemmer.stem(word) for word in filtered]
+        processed_docs_test.append(stemmed)
+
+    processed_docs_all = processed_docs_test
+
+    dictionary = corpora.Dictionary(processed_docs_all)
+    dictionary_size = len(dictionary.keys())
+    print "dictionary size: ", dictionary_size
+    #dictionary.save('dictionary.dict')
+    #corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
+    '''
+    print "converting to token ids..."
+    word_id_train, word_id_len = [], []
+    for doc in processed_docs_train:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_train.append(word_ids)
+        word_id_len.append(len(word_ids))
+    '''
+    word_id_test, word_id_len = [], []
+    for doc in processed_docs_test:
+        word_ids = [dictionary.token2id[word] for word in doc]
+        word_id_test.append(word_ids)
+        word_id_len.append(len(word_ids))
+
+    seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)
+
+    #pad sequences
+    #word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
+    word_id_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
+    print(num_labels)
+    #y_train_enc = np_utils.to_categorical(sentiment_train,)
+    #le = preprocessing.LabelEncoder()
+    #le.fit(sentiment_train)
+    #v = le.transform(sentiment_train)
+    #print(v)
+    #y_train_enc = np_utils.to_categorical(v)
+
+    #le1 = preprocessing.LabelEncoder()
+    #le1.fit(sentiment_test)
+    #v1 = le1.transform(sentiment_test)
+
+
+
+    #LSTM
+    print "fitting LSTM ..."
+    model = Sequential()
+    model.add(Embedding(dictionary_size, 256, dropout=0.2))
+    model.add(GRU(256, dropout_W=0.2, dropout_U=0.2))
+    model.add(Dense(num_labels))
+    model.add(Activation('softmax'))
+    model.load_weights("logs2/gru_model.hdf5")
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+    checkpointer = callbacks.ModelCheckpoint(filepath="logs2/checkpoint-{epoch:02d}.hdf5", verbose=1, save_best_only=True, monitor='loss')
+    csv_logger = CSVLogger('logs2/training_set_iranalysis1.csv',separator=',', append=False)
+
+    #model.fit(word_id_train, y_train_enc, nb_epoch=1000, batch_size=256, validation_split=0.33, verbose=1, callbacks=[checkpointer,csv_logger])
+
+    test_pred = model.predict_classes(word_id_test)
+
+    #make a submission
+    #test_df['Sentiment'] = test_pred.reshape(-1,1)
+    #header = ['PhraseId', 'Sentiment']
+    #test_df.to_csv('./lstm_sentiment.csv', columns=header, index=False, header=True)
+    #accuracy = accuracy_score(v1, test_pred)
+    print(test_pred.shape)
+    print(test_pred)
+    np.savetxt("res/task3-predicted3.txt", test_pred, fmt="%01d")
+