urvigodha · urvigodha · Mar 25, 2020 · Mar 7, 2020 · Mar 7, 2020 · Mar 8, 2020
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,7 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
+.idea/
+
 *.py[cod]
 *$py.class
 
@@ -109,6 +111,8 @@ venv/
 ENV/
 env.bak/
 venv.bak/
+.vs 
+.vscode 
 
 # Spyder project settings
 .spyderproject
@@ -128,5 +132,9 @@ dmypy.json
 # Pyre type checker
 .pyre/
 
+# JetBrains IDEs (PyCharm, PhpStorm, ...)
+.idea
+
 # Mac crap
 .DS_Store
+
diff --git a/README.md b/README.md
@@ -96,8 +96,21 @@ other contributions at this point, unless to resolve errors or typos.
 Code formatting guidelines should strinctly adhere  to [Python Black](https://pypi.org/project/black/) formatting guidelines. Please ensure that all PRs pass a local black formatting check.
 
 
+
+
 ## Information for Outreachy participants
 
+__Please note that this project is currently closed to new Outreachy
+contributions.__
+
+- At this time, we are only considering Outreachy candidates who have submitted
+  a PR on or before _Friday March 20_.
+- If you have submitted a PR by this date, you may continue working on existing
+  PRs or create new ones as usual. All your contributions will be considered.
+- If you have not yet submitted a PR by this date, we will unfortunately not be
+  able to consider you as an Outreachy candidate for this round.
+
+
 This project is intentionally broadly scoped, and the initial phase will be
   exploratory.
 
@@ -140,24 +153,47 @@ Contributions can be made by submitting a [pull request](https://help.github.com
   request review. This tag ('work in progress') indicates that the PR is not
   ready to be merged. When it is ready for final submission, you can modify the
   title to remove the "WIP:" tag.
+- Should you use a separate jupyter notebook for comparing different models? If
+  you had a PR merged in to satisfy issue #2 already and are now comparing
+  models for another issue, then a new notebook would be helpful. That being
+  said, a notebook should satisfy the following criteria:
+
+    a) it should run beginning to end without error
+
+    b) it should be easy to follow and have a clear narrative presenting context,
+   data, results, and interpretation. This may mean some redundancy in code, but
+   will often mean that your notebook is much more helpful to other people
+   looking at it in isolation (including reviewers).
 
 
 ## Getting started
 
 1. Install [Anaconda](https://www.anaconda.com/download) or [Miniconda](https://conda.io/miniconda.html).
 
-2. Setup and activate environment:
+2. Fork this repository and clone it to your local machine (using the git CLI).
+
+3. Setup and activate environment:
 
 ```
  $ conda env create -f environment.yml
  $ conda activate presc
 ```
 
-3. Run Jupyter. The notebook will open in your browser at `localhost:8888` by default.
+
+__For Windows:__ Open Anaconda Prompt and `cd` into the folder where you cloned the repository:
+
+```
+cd PRESC
+```
+Then run the commands above to create and activate the environment.
+
+
+4. Run Jupyter. The notebook will open in your browser at `localhost:8888` by default.
 
 ```
  $ jupyter notebook
 ```
+After running this command you will see the notebook containing the datasets, and you can start working with it.
 
 We recommend everyone start by working on
 [#2](https://github.com/mozilla/PRESC/issues/2).

diff --git a/dev/Addi-11/calibration.ipynb b/dev/Addi-11/calibration.ipynb
diff --git a/dev/Addi-11/calibration.py b/dev/Addi-11/calibration.py
@@ -0,0 +1,45 @@
+from sklearn.calibration import CalibratedClassifierCV, calibration_curve
+import matplotlib.pyplot as plt
+from sklearn.metrics import brier_score_loss
+
+def calibration(clf, x_train, y_train, x_val, y_val):
+    '''
+    Plot calibration curves for the sigmoid- and isotonic-calibrated versions of a classification model.
+
+    Parameters:
+        clf : classification model; it is wrapped in CalibratedClassifierCV (cv=5) and fitted on the training data
+        x_train : array-like, shape(n_train_samples, n_features)
+        y_train : of length n_train_samples
+        x_val : array-like, shape(n_val_samples, n_features)
+        y_val : of length n_val_samples
+
+    Returns:
+        None; the figure is displayed with plt.show()
+
+    '''
+
+    # (calibration method, legend label template); the template is completed with the Brier loss
+    methods = [('sigmoid', 'Sigmoid (Brier loss={:.3f})'), ('isotonic', 'Isotonic(Brier loss={:.3f})')]
+
+    curves = []
+    for method, label in methods:
+        calibrated_model = CalibratedClassifierCV(clf, method=method, cv=5)
+        calibrated_model.fit(x_train, y_train)
+
+        # Column 1 of predict_proba is used as the positive-class score (assumes binary labels, matching pos_label=1)
+        y_score = calibrated_model.predict_proba(x_val)[:,1]
+        # No normalize=True: y_score is already a probability, and min-max rescaling it would distort the curve
+        fop, apv = calibration_curve(y_val, y_score, n_bins=10)
+        curves.append((apv, fop, label.format(brier_score_loss(y_val, y_score, pos_label=1))))
+
+    plt.figure(figsize=(10,6))
+    # Reference diagonal: a perfectly calibrated model lies on y = x
+    plt.plot([0,1],[0,1])
+    for apv, fop, label in curves:
+        plt.plot(apv, fop, label=label)
+    plt.grid()
+    plt.xlabel("Average Probability")
+    plt.ylabel("Fraction of Positive")
+    plt.title("Calibration Plots")
+    plt.legend()
+    plt.show()
diff --git a/dev/Addi-11/classifiers.py b/dev/Addi-11/classifiers.py
@@ -0,0 +1,151 @@
+# This file contains various classifiers to be used on the dataset 
+from evaluation import evaluate
+from sklearn.svm import SVC
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.metrics import plot_precision_recall_curve, confusion_matrix, plot_confusion_matrix
+import matplotlib.pyplot as plt
+
+class Classifier:
+    '''
+    Collection of classification models which can be trained on the dataset, plus an evaluation helper.
+    '''
+    def svm_classifier(self, x_train, y_train):
+        '''
+        Support Vector Machine classifier (SVC with gamma='auto').
+        Parameters :
+            x_train : array-like, shape (n_samples, n_features)
+            y_train : of length n_samples
+
+        Returns :
+            classifier : trained classification model
+        '''
+        classifier = SVC(gamma='auto')
+        classifier.fit(x_train, y_train)
+        return classifier
+
+    def KNeighbors(self, x_train, y_train):
+        '''
+        K-Nearest Neighbours is a supervised classifier, which takes a bunch of labelled points and uses them to learn how to label other points, with respect to their degree of closeness.
+
+        Parameters :
+            x_train : array-like, shape (n_samples, n_features)
+            y_train : of length n_samples
+
+        Returns :
+            classifier : trained classification model
+        '''
+        classifier = KNeighborsClassifier()
+        classifier.fit(x_train, y_train)
+        return classifier
+
+    def Logistic_Reg(self, x_train, y_train):
+        '''
+        Logistic Regression, takes some input and calculates the probability of the outcome using the logistic (sigmoid) function.
+
+        Parameters :
+            x_train : array-like, shape (n_samples, n_features)
+            y_train : of length n_samples
+
+        Returns :
+            classifier : trained classification model
+        '''
+        classifier = LogisticRegression()
+        classifier.fit(x_train, y_train)
+        return classifier
+
+    def Decision_Tree(self, x_train, y_train):
+        '''
+        Decision Tree Classifier, a mechanical way to make a decision by dividing the inputs into smaller decisions.
+
+        Parameters :
+            x_train : array-like, shape (n_samples, n_features)
+            y_train : of length n_samples
+
+        Returns :
+            classifier : trained classification model
+        '''
+        classifier = DecisionTreeClassifier()
+        classifier.fit(x_train, y_train)
+        return classifier
+
+    def Random_Forest(self, x_train, y_train):
+        '''
+        Random Forest Classifier, an ensemble of decision trees, each built with some randomness, whose individual predictions are combined into the final decision.
+
+        Parameters :
+            x_train : array-like, shape (n_samples, n_features)
+            y_train : of length n_samples
+
+        Returns :
+            classifier : trained classification model
+        '''
+        classifier = RandomForestClassifier()
+        classifier.fit(x_train, y_train)
+        return classifier
+
+    def Gaussian(self, x_train, y_train):
+        '''
+        Gaussian Naive Bayes, classification technique based on Bayes' Theorem with an assumption of independence among predictors. It is easy to build and particularly useful for very large data sets.
+
+        Parameters :
+            x_train : array-like, shape (n_samples, n_features)
+            y_train : of length n_samples
+
+        Returns :
+            classifier : trained classification model
+        '''
+        classifier = GaussianNB()
+        classifier.fit(x_train, y_train)
+        return classifier
+
+    def evaluation(self, classifier, x_val, y_val):
+        '''
+        Evaluate the performance of a trained model on the validation set.
+        Prints Accuracy, Precision, Recall and F1 score (as returned by `evaluate`),
+        then plots the Precision Recall Curve and the Confusion Matrix.
+
+        Parameters :
+            classifier : trained classification model
+            x_val : array-like, shape(n_samples, n_features)
+            y_val : of length n_samples
+
+        Returns :
+            None
+        '''
+        # y_score is fed to confusion_matrix below, so it is presumably the predicted labels - confirm in evaluate()
+        accuracy, precision, recall, f_score , y_score = evaluate(classifier, x_val, y_val)
+        print("Accuracy : ",accuracy)
+        print("Precision: ", precision)
+        print("Recall: ", recall)
+        print("F1 score : ",f_score)
+
+        # Plotting Precision Recall Curve
+        # NOTE(review): plot_precision_recall_curve was removed in scikit-learn 1.2 (successor: PrecisionRecallDisplay.from_estimator)
+        print("Precision vs Recall Curve")
+        plot_precision_recall_curve(classifier,x_val, y_val)
+
+        # Plotting Confusion Matrix
+        print("Confusion Matrix")
+        labels = ['Class 1', 'Class 2']
+        cm = confusion_matrix(y_val, y_score)
+        fig = plt.figure()
+        ax = fig.add_subplot(111)
+        cax = ax.matshow(cm)
+        plt.title('Confusion matrix of the classifier')
+        fig.colorbar(cax)
+        # Pin one tick per class before labelling, so labels do not depend on where the auto-locator puts its ticks
+        ax.set_xticks(range(len(labels)))
+        ax.set_yticks(range(len(labels)))
+        ax.set_xticklabels(labels)
+        ax.set_yticklabels(labels)
+        plt.xlabel('Predicted')
+        plt.ylabel('True')
+        plt.show()
+
+
+
+
diff --git a/dev/Addi-11/data_split_examine.py b/dev/Addi-11/data_split_examine.py
@@ -0,0 +1,49 @@
+# This file compares various evaluation metrics for different data splits 
+
+import numpy as np
+from sklearn.model_selection import train_test_split
+import pandas as pd
+from IPython.display import HTML
+from pylab import *
+from dataloader import get_x_y
+from evaluation import evaluate
+from classifiers import Classifier
+
+
+
+test_sizes = np.arange(0.005,1,0.05)  # test-set fractions 0.005, 0.055, ..., 0.955 (20 values)
+columns = ['Training data','Testing Data','Accuracy %', 'Precision', 'Recall', 'F1_score']  # one results row per test size
+df = pd.DataFrame(columns = columns)  # module-level results table: filled by data_split_examine, read by visualise_split
+
+def data_split_examine(clf):
+    '''
+    The function calculates evaluation metrics like f1_score, accuracy, precision, recall for various test data sizes
+
+    Parameters:
+        clf : name of the `Classifier` method used to train the model (e.g. "svm_classifier")
+
+    Return:
+        None; fills the module-level `df` (one row per entry of `test_sizes`) and displays it
+    '''
+    from IPython.display import display  # explicit import: `display` is only a builtin inside notebooks
+    model = Classifier()
+    for index, test_size in enumerate(test_sizes):
+        x, y = get_x_y()
+        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size)
+        classifier = getattr(model, clf)(x_train, y_train)  # look the trainer method up by name
+        accuracy, precision, recall, f_score, _ = evaluate(classifier, x_test, y_test)
+        train_pct, test_pct = round((1 - test_size) * 100), round(test_size * 100)
+        df.loc[index + 1] = [train_pct, test_pct, accuracy * 100, precision, recall, f_score]
+
+    display(df)
+
+def visualise_split(clf):
+    '''
+    Plot accuracy (%) against test data size from the module-level `df`; `clf` is only used in the plot title.
+    '''
+    _, ax = plt.subplots()
+    ax.set_title("Relation btw accuracy and test data size for {} classifier".format(clf))
+    ax.set_ylabel("Accuracy %")
+    ax.set_xlabel("Test Data Size")
+    ax.set_ylim([50,100])
+    ax.plot(df['Testing Data'], df['Accuracy %'])