Union Python

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import rfpimp

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from yellowbrick.regressor import ResidualsPlot
from sklearn.linear_model import Ridge

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import decomposition
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder #allows transform to 1, 0

from mpl_toolkits.mplot3d import Axes3D
from statsmodels.graphics.gofplots import qqplot
from sklearn.metrics import classification_report, recall_score, confusion_matrix, roc_auc_score, precision_score, f1_score, roc_curve, auc, plot_confusion_matrix,plot_roc_curve
1.0 Exploratory data analysis
# load custom data set & predict data.

union_train_1 = pd.read_csv(r'JEFFREY-train.csv')
pred = pd.read_csv(r'comp_pred.csv')

union_train_1.sample(5)
ID	gender	Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	StateOfResidence	Connectivity	FeatureA	...	FeatureC	FeatureD	FeatureE	FeatureF	DuesFrequency	PaperlessBilling	PaymentMethod	MonthlyDues	TotalDues	LeftUnion
1770	Jeffrey508-19480	Female	1	Yes	No	27	Yes	MO	Fiber optic	Yes	...	No	No	No	No	Month-to-month	Yes	Bank transfer (automatic)	75.20	1929.35	Yes
2437	Jeffrey2226-19369	Male	1	Yes	No	49	Yes	IL	Fiber optic	No	...	No	No	No	Yes	Month-to-month	No	Credit card (automatic)	90.05	4547.25	Yes
978	Jeffrey1429-18544	Male	0	Yes	Yes	29	Yes	MO	No	Maryville	...	Maryville	Maryville	Maryville	Maryville	Month-to-month	Yes	Mailed check	20.65	654.85	No
174	Jeffrey1066-13859	Male	0	No	Yes	3	Yes	MO	No	Maryville	...	Maryville	Maryville	Maryville	Maryville	Month-to-month	No	Bank transfer (automatic)	20.20	50.6	No
2199	Jeffrey1962-19351	Male	0	Yes	Yes	50	Yes	IL	DSL	No	...	Yes	Yes	Yes	No	One year	Yes	Electronic check	69.65	3442.15	No
5 rows × 21 columns

pred.sample(2)
DS_ID	gender	Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	StateOfResidence	Connectivity	FeatureA	FeatureB	FeatureC	FeatureD	FeatureE	FeatureF	DuesFrequency	PaperlessBilling	PaymentMethod	MonthlyDues	TotalDues
569	10569	Female	0	No	No	11	No	MO	DSL	No	Yes	Yes	Yes	No	No	One year	Yes	Credit card (automatic)	40.40	422.6
834	10834	Female	1	No	No	4	Yes	MO	Fiber optic	No	No	No	No	Yes	No	Month-to-month	Yes	Electronic check	78.85	292.8
# Review sample statistics

union_train_1.describe()
Management	MonthsInUnion	MonthlyDues
count	2999.000000	2999.000000	2999.000000
mean	0.156052	34.500834	63.754652
std	0.362965	48.716480	30.274755
min	0.000000	0.000000	18.700000
25%	0.000000	9.000000	30.500000
50%	0.000000	29.000000	69.700000
75%	0.000000	55.000000	89.475000
max	1.000000	917.000000	118.750000
union_train_1['Married'].value_counts()
No     2106
Yes     893
Name: Married, dtype: int64
fig, ax = plt.subplots(1, 2, figsize=(10,5))
x=union_train_1['Married'].value_counts().index
y=union_train_1['Married'].value_counts().values.tolist()
data = union_train_1.groupby("Married").size()
sns.set(style="dark", color_codes=True)
pal = sns.color_palette("YlGnBu", len(data))
rank = data.argsort().argsort() 
sns.barplot(x=x,y=y,palette=np.array(pal[::-1])[rank],ax = ax[0])
for p in ax[0].patches:
        ax[0].annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.3, p.get_height()),
                    ha='center', va='bottom',
                    color= 'black')
ax[0].set_xlabel('Married Yes/No', weight='semibold', fontname = 'monospace')
        
ax[1].pie(y, labels = x, colors = pal, autopct='%1.1f%%',
        explode=[0.03 for i in union_train_1['Married'].value_counts().index])
plt.legend(bbox_to_anchor=(1, 1))
plt.suptitle ('Married Yes/No',weight = 'bold')
plt.show()

union_train_1['Connectivity'].value_counts()
Fiber optic    1277
DSL            1042
No              680
Name: Connectivity, dtype: int64
fig, ax = plt.subplots(1, 2, figsize=(10,5))
x=union_train_1['Connectivity'].value_counts().index
y=union_train_1['Connectivity'].value_counts().values.tolist()
data = union_train_1.groupby("Connectivity").size()
sns.set(style="dark", color_codes=True)
pal = sns.color_palette("YlGnBu", len(data))
rank = data.argsort().argsort() 
sns.barplot(x=x,y=y,palette=np.array(pal[::-1])[rank],ax = ax[0])
for p in ax[0].patches:
        ax[0].annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.3, p.get_height()),
                    ha='center', va='bottom',
                    color= 'black')
ax[0].set_xlabel('Connectivity Yes/No', weight='semibold', fontname = 'monospace')
        
ax[1].pie(y, labels = x, colors = pal, autopct='%1.1f%%',
        explode=[0.03 for i in union_train_1['Connectivity'].value_counts().index])
plt.legend(bbox_to_anchor=(1, 1))
plt.suptitle ('Connectivity Yes/No',weight = 'bold')
plt.show()

union_train_1['PaperlessBilling'].value_counts()
Yes    1761
No     1238
Name: PaperlessBilling, dtype: int64
fig, ax = plt.subplots(1, 2, figsize=(10,5))
x=union_train_1['PaperlessBilling'].value_counts().index
y=union_train_1['PaperlessBilling'].value_counts().values.tolist()
data = union_train_1.groupby("PaperlessBilling").size()
sns.set(style="dark", color_codes=True)
pal = sns.color_palette("YlGnBu", len(data))
rank = data.argsort().argsort() 
sns.barplot(x=x,y=y,palette=np.array(pal[::-1])[rank],ax = ax[0])
for p in ax[0].patches:
        ax[0].annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.3, p.get_height()),
                    ha='center', va='bottom',
                    color= 'black')
ax[0].set_xlabel('PaperlessBilling Yes/No', weight='semibold', fontname = 'monospace')
        
ax[1].pie(y, labels = x, colors = pal, autopct='%1.1f%%',
        explode=[0.03 for i in union_train_1['PaperlessBilling'].value_counts().index])
plt.legend(bbox_to_anchor=(1, 1))
plt.suptitle ('PaperlessBilling Yes/No',weight = 'bold')
plt.show()

union_train_1['PaymentMethod'].value_counts()
Electronic check             1001
Mailed check                  702
Bank transfer (automatic)     667
Credit card (automatic)       629
Name: PaymentMethod, dtype: int64
fig, ax = plt.subplots(1, 2, figsize=(20,10))
x=union_train_1['PaymentMethod'].value_counts().index
y=union_train_1['PaymentMethod'].value_counts().values.tolist()
data = union_train_1.groupby("PaymentMethod").size()
sns.set(style="dark", color_codes=True)
pal = sns.color_palette("YlGnBu", len(data))
rank = data.argsort().argsort() 
sns.barplot(x=x,y=y,palette=np.array(pal[::-1])[rank],ax = ax[0])
for p in ax[0].patches:
        ax[0].annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.3, p.get_height()),
                    ha='center', va='bottom',
                    color= 'black')
ax[0].set_xlabel('PaymentMethod Yes/No', weight='semibold', fontname = 'monospace')
        
ax[1].pie(y, labels = x, colors = pal, autopct='%1.1f%%',
        explode=[0.03 for i in union_train_1['PaymentMethod'].value_counts().index])
plt.legend(bbox_to_anchor=(1, 1))
plt.suptitle ('PaymentMethod Yes/No',weight = 'bold')
plt.show()

union_train_1['LeftUnion'].value_counts()
No     2198
Yes     801
Name: LeftUnion, dtype: int64
Original_data_percentage_left = round(801/2198,3)
Original_data_percentage_left
0.364
fig, ax = plt.subplots(1, 2, figsize=(10,5))
x=union_train_1['LeftUnion'].value_counts().index
y=union_train_1['LeftUnion'].value_counts().values.tolist()
data = union_train_1.groupby("LeftUnion").size()
sns.set(style="dark", color_codes=True)
pal = sns.color_palette("YlGnBu", len(data))
rank = data.argsort().argsort() 
sns.barplot(x=x,y=y,palette=np.array(pal[::-1])[rank],ax = ax[0])
for p in ax[0].patches:
        ax[0].annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.3, p.get_height()),
                    ha='center', va='bottom',
                    color= 'black')
ax[0].set_xlabel('Left Union Yes/No', weight='semibold', fontname = 'monospace')
        
ax[1].pie(y, labels = x, colors = pal, autopct='%1.1f%%',
        explode=[0.03 for i in union_train_1['LeftUnion'].value_counts().index])
plt.legend(bbox_to_anchor=(1, 1))
plt.suptitle ('Left Union Yes/No',weight = 'bold')
plt.show()

sns.histplot(x = union_train_1['MonthsInUnion'],kde = True)
plt.show()

There appears to be some outliers. I'll explore further with a violin plot. I suspect outliers beyond 400-500 months of union membership. This will need further investigation. USA citizen does not appear relevant in the decision to leave the union.

ax = sns.catplot(x="MonthsInUnion", y="USAcitizen", hue="LeftUnion", kind="violin",
                 split=True, palette="pastel", data=union_train_1, height=4.2, aspect=1.4)

sns.histplot(x = union_train_1['MonthlyDues'],kde = True)
plt.show()

g = sns.FacetGrid(union_train_1, row='Management', col="gender", hue="LeftUnion", height=3.5)
g.map(plt.scatter, "MonthsInUnion", "MonthlyDues", alpha=0.6)
g.add_legend();

Gender doesnt affect Left Union rate. The amount of monthly Dues and Months in Union do appear to play a role.

ax = sns.catplot(x="Connectivity", y="MonthlyDues", hue="LeftUnion", kind="violin",
                 split=True, palette="pastel", data=union_train_1, height=4.2, aspect=1.4)

fig, ax=plt.subplots(figsize=(10,5))
sns.countplot(data = union_train_1, x='StateOfResidence', order=union_train_1['StateOfResidence'].value_counts().index, palette='viridis', hue='LeftUnion')
plt.xticks(rotation=90)
plt.xlabel('State', fontsize=10, fontweight='bold')
plt.ylabel('Members', fontsize=10, fontweight='bold')
plt.title('State wise Members', fontsize=12, fontweight='bold')
plt.show()

Step 2.0 Data cleansing
union_train_1.shape
(2999, 21)
union_train_1.columns
Index(['ID', 'gender', 'Management', 'USAcitizen', 'Married', 'MonthsInUnion',
       'ContinuingEd', 'StateOfResidence', 'Connectivity', 'FeatureA',
       'FeatureB', 'FeatureC', 'FeatureD', 'FeatureE', 'FeatureF',
       'DuesFrequency', 'PaperlessBilling', 'PaymentMethod', 'MonthlyDues',
       'TotalDues', 'LeftUnion'],
      dtype='object')
union_train_1['MonthsInUnion'].unique()  ## watch for outliers
array([  1,   2,   3,   4,   5,   6,   8,   7,   9,  11,  10,  12,  15,
        13,  16,  17,  14,  18,  20,  19,  21,  23,  22,  24,  27,  26,
        25,  28,  29,  30,  33,  32,  31,  35,  34,  37,  36,  38,  40,
        42,  43,  44,  39,  41,  48,  46,  45,  49,  52,  50,  47,  51,
        55,  53,  54,  58,  57,  56,  62,  60,  61,  59,  64,  63,  69,
        67,  65,  68,  70,  71,  66,  72, 744, 614, 602, 827, 917, 658,
       782, 731, 899, 788,   0], dtype=int64)
union_train_1['PaymentMethod'].value_counts()
Electronic check             1001
Mailed check                  702
Bank transfer (automatic)     667
Credit card (automatic)       629
Name: PaymentMethod, dtype: int64
# check for nulls

union_train_1.isnull().sum()
ID                  0
gender              0
Management          0
USAcitizen          0
Married             0
MonthsInUnion       0
ContinuingEd        0
StateOfResidence    0
Connectivity        0
FeatureA            0
FeatureB            0
FeatureC            0
FeatureD            0
FeatureE            0
FeatureF            0
DuesFrequency       0
PaperlessBilling    0
PaymentMethod       0
MonthlyDues         0
TotalDues           0
LeftUnion           0
dtype: int64
union_train_1.describe
<bound method NDFrame.describe of                      ID  gender  Management USAcitizen Married  MonthsInUnion  \
0      Jeffrey866-18344    Male           0        Yes     Yes              1   
1     Jeffrey1948-13509    Male           0         No      No              1   
2     Jeffrey1080-19415  Female           0        Yes     Yes              1   
3     Jeffrey1957-19631  Female           0         No      No              1   
4     Jeffrey1170-16685    Male           0         No      No              1   
...                 ...     ...         ...        ...     ...            ...   
2994   Jeffrey939-15618  Female           0        Yes     Yes              0   
2995  Jeffrey1280-19245    Male           0         No     Yes              0   
2996  Jeffrey1345-13328  Female           0        Yes     Yes              0   
2997  Jeffrey1405-13244    Male           0         No     Yes              0   
2998  Jeffrey1600-19062  Female           0        Yes     Yes              0   

     ContinuingEd StateOfResidence Connectivity   FeatureA  ...   FeatureC  \
0             Yes               MO           No  Maryville  ...  Maryville   
1             Yes               MO           No  Maryville  ...  Maryville   
2             Yes               MO           No  Maryville  ...  Maryville   
3             Yes               MO           No  Maryville  ...  Maryville   
4             Yes               MO           No  Maryville  ...  Maryville   
...           ...              ...          ...        ...  ...        ...   
2994          Yes               MO           No  Maryville  ...  Maryville   
2995          Yes               MO           No  Maryville  ...  Maryville   
2996          Yes               IL          DSL         No  ...        Yes   
2997          Yes               IL          DSL        Yes  ...         No   
2998          Yes               MO          DSL        Yes  ...        Yes   

       FeatureD   FeatureE   FeatureF   DuesFrequency PaperlessBilling  \
0     Maryville  Maryville  Maryville        One year               No   
1     Maryville  Maryville  Maryville  Month-to-month              Yes   
2     Maryville  Maryville  Maryville  Month-to-month              Yes   
3     Maryville  Maryville  Maryville  Month-to-month               No   
4     Maryville  Maryville  Maryville  Month-to-month               No   
...         ...        ...        ...             ...              ...   
2994  Maryville  Maryville  Maryville        Two year               No   
2995  Maryville  Maryville  Maryville        Two year               No   
2996        Yes        Yes         No        Two year               No   
2997        Yes         No         No        Two year              Yes   
2998         No        Yes        Yes        Two year               No   

                  PaymentMethod MonthlyDues  TotalDues LeftUnion  
0                  Mailed check       18.80       18.8        No  
1                  Mailed check       18.85      18.85       Yes  
2                  Mailed check       19.00         19        No  
3                  Mailed check       19.15      19.15        No  
4              Electronic check       19.20       19.2        No  
...                         ...         ...        ...       ...  
2994               Mailed check       20.00                   No  
2995               Mailed check       20.25                   No  
2996               Mailed check       73.35                   No  
2997  Bank transfer (automatic)       61.90                   No  
2998               Mailed check       80.85                   No  

[2999 rows x 21 columns]>
# One hot encoding for gender

dummies = pd.get_dummies(union_train_1.gender)
dummies_pred = pd.get_dummies(pred.gender)
dummies.head(3)
Female	Male
0	0	1
1	0	1
2	1	0
pred.sample()
DS_ID	gender	Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	StateOfResidence	Connectivity	FeatureA	FeatureB	FeatureC	FeatureD	FeatureE	FeatureF	DuesFrequency	PaperlessBilling	PaymentMethod	MonthlyDues	TotalDues
53	10053	Female	1	Yes	No	8	Yes	IL	Fiber optic	No	Yes	No	No	No	No	Month-to-month	Yes	Credit card (automatic)	80.65	633.3
union_train_3 = pd.concat([union_train_1, dummies], axis='columns')
union_train_3 = union_train_3.drop('gender',axis='columns')

pred_1 = pd.concat([pred, dummies_pred], axis = 'columns')
pred_1 = pred_1.drop('gender',axis='columns')
pred_1.head(3)
DS_ID	Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	StateOfResidence	Connectivity	FeatureA	FeatureB	...	FeatureD	FeatureE	FeatureF	DuesFrequency	PaperlessBilling	PaymentMethod	MonthlyDues	TotalDues	Female	Male
0	10000	0	Yes	No	1	No	MO	DSL	No	Yes	...	No	No	No	Month-to-month	Yes	Electronic check	29.85	29.85	1	0
1	10001	0	No	No	34	Yes	MO	DSL	Yes	No	...	No	No	No	One year	No	Mailed check	56.95	1889.50	0	1
2	10002	0	No	No	2	Yes	MO	DSL	Yes	Yes	...	No	No	No	Month-to-month	Yes	Mailed check	53.85	108.15	0	1
3 rows × 21 columns

# One hot encoding StateOfResidence

dummies = pd.get_dummies(union_train_3.StateOfResidence)
pred_dummies = pd.get_dummies(pred_1.StateOfResidence)
union_train_4 = pd.concat([union_train_3, dummies], axis='columns')
union_train_4 = union_train_4.drop('StateOfResidence',axis='columns')

pred_2 = pd.concat([pred_1, pred_dummies], axis='columns')
pred_2 = pred_2.drop('StateOfResidence',axis='columns')
pred_2.head()
DS_ID	Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	Connectivity	FeatureA	FeatureB	FeatureC	...	FeatureF	DuesFrequency	PaperlessBilling	PaymentMethod	MonthlyDues	TotalDues	Female	Male	IL	MO
0	10000	0	Yes	No	1	No	DSL	No	Yes	No	...	No	Month-to-month	Yes	Electronic check	29.85	29.85	1	0	0	1
1	10001	0	No	No	34	Yes	DSL	Yes	No	Yes	...	No	One year	No	Mailed check	56.95	1889.50	0	1	0	1
2	10002	0	No	No	2	Yes	DSL	Yes	Yes	No	...	No	Month-to-month	Yes	Mailed check	53.85	108.15	0	1	0	1
3	10003	0	No	No	45	No	DSL	Yes	No	Yes	...	No	One year	No	Bank transfer (automatic)	42.30	1840.75	0	1	0	1
4	10004	0	No	No	2	Yes	Fiber optic	No	No	No	...	No	Month-to-month	Yes	Electronic check	70.70	151.65	1	0	0	1
5 rows × 22 columns

# One hot encoding DuesFrequency

dummies = pd.get_dummies(union_train_4.DuesFrequency)

pred_dummies = pd.get_dummies(pred_2.DuesFrequency)
union_train_5 = pd.concat([union_train_4, dummies], axis='columns')
union_train_5 = union_train_5.drop('DuesFrequency',axis='columns')

pred_3 = pd.concat([pred_2, pred_dummies], axis='columns')
pred_3 = pred_3.drop('DuesFrequency',axis='columns')
# One hot encoding PaymentMethod

dummies = pd.get_dummies(union_train_5.PaymentMethod)

pred_dummies = pd.get_dummies(pred_3.PaymentMethod)
union_train_6 = pd.concat([union_train_5, dummies], axis='columns')
union_train_6 = union_train_6.drop('PaymentMethod',axis='columns')

pred_4 = pd.concat([pred_3, pred_dummies], axis='columns')
pred_4 = pred_4.drop('PaymentMethod',axis='columns')
# One hot encoding features a-e

features = ['FeatureA', 'FeatureB', 'FeatureC', 'FeatureD', 'FeatureE', 'FeatureF']

for f in features:
    print(f'{f:>25} has {union_train_6[f].nunique()} unique values')
                 FeatureA has 3 unique values
                 FeatureB has 3 unique values
                 FeatureC has 3 unique values
                 FeatureD has 3 unique values
                 FeatureE has 3 unique values
                 FeatureF has 3 unique values
union_train_6 = pd.get_dummies(union_train_6, columns=features, prefix_sep='_', drop_first=True)
union_train_6
ID	Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	Connectivity	PaperlessBilling	MonthlyDues	TotalDues	...	FeatureB_No	FeatureB_Yes	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes
0	Jeffrey866-18344	0	Yes	Yes	1	Yes	No	No	18.80	18.8	...	0	0	0	0	0	0	0	0	0	0
1	Jeffrey1948-13509	0	No	No	1	Yes	No	Yes	18.85	18.85	...	0	0	0	0	0	0	0	0	0	0
2	Jeffrey1080-19415	0	Yes	Yes	1	Yes	No	Yes	19.00	19	...	0	0	0	0	0	0	0	0	0	0
3	Jeffrey1957-19631	0	No	No	1	Yes	No	No	19.15	19.15	...	0	0	0	0	0	0	0	0	0	0
4	Jeffrey1170-16685	0	No	No	1	Yes	No	No	19.20	19.2	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2994	Jeffrey939-15618	0	Yes	Yes	0	Yes	No	No	20.00		...	0	0	0	0	0	0	0	0	0	0
2995	Jeffrey1280-19245	0	No	Yes	0	Yes	No	No	20.25		...	0	0	0	0	0	0	0	0	0	0
2996	Jeffrey1345-13328	0	Yes	Yes	0	Yes	DSL	No	73.35		...	0	1	0	1	0	1	0	1	1	0
2997	Jeffrey1405-13244	0	No	Yes	0	Yes	DSL	Yes	61.90		...	0	1	1	0	0	1	1	0	1	0
2998	Jeffrey1600-19062	0	Yes	Yes	0	Yes	DSL	No	80.85		...	0	1	0	1	1	0	0	1	0	1
2999 rows × 34 columns

pred_4 = pd.get_dummies(pred_4, columns=features, prefix_sep='_', drop_first=True)
pred_4
DS_ID	Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	Connectivity	PaperlessBilling	MonthlyDues	TotalDues	...	FeatureB_No	FeatureB_Yes	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes
0	10000	0	Yes	No	1	No	DSL	Yes	29.85	29.85	...	0	1	1	0	1	0	1	0	1	0
1	10001	0	No	No	34	Yes	DSL	No	56.95	1889.50	...	1	0	0	1	1	0	1	0	1	0
2	10002	0	No	No	2	Yes	DSL	Yes	53.85	108.15	...	0	1	1	0	1	0	1	0	1	0
3	10003	0	No	No	45	No	DSL	No	42.30	1840.75	...	1	0	0	1	0	1	1	0	1	0
4	10004	0	No	No	2	Yes	Fiber optic	Yes	70.70	151.65	...	1	0	1	0	1	0	1	0	1	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
995	10995	0	No	No	35	Yes	Fiber optic	No	85.30	2917.50	...	0	1	1	0	0	1	1	0	1	0
996	10996	0	No	No	34	Yes	Fiber optic	Yes	70.00	2416.10	...	1	0	1	0	1	0	1	0	1	0
997	10997	1	No	No	4	Yes	Fiber optic	Yes	94.30	424.45	...	1	0	1	0	1	0	0	1	0	1
998	10998	0	Yes	No	72	Yes	No	No	20.70	1492.10	...	0	0	0	0	0	0	0	0	0	0
999	10999	0	No	No	2	Yes	DSL	No	70.30	132.40	...	1	0	1	0	0	1	0	1	0	1
1000 rows × 33 columns

# drop ID [& features a - f]

union_train_7 = union_train_6.drop(['ID'],axis='columns')   #  , 'FeatureA', 'FeatureB', 'FeatureC', 'FeatureD', 'FeatureE', 'FeatureF'],axis='columns')

pred_5 = pred_4.drop(['DS_ID'],axis='columns') #, 'FeatureA', 'FeatureB', 'FeatureC', 'FeatureD', 'FeatureE', 'FeatureF'],axis='columns')
I chose to drop the ID columns, and Feature A - Feature F. These do not appear to be relevent features in the model.

# convert USAcitizen, Married, ContinuingEd, PaperLessBilling, LeftUnion to 1,0 using sklearn preprocessing

lb = LabelEncoder()

union_train_7['USAcitizen'] = lb.fit_transform(union_train_7['USAcitizen'])
union_train_7['Married'] = lb.fit_transform(union_train_7['Married'])
union_train_7['ContinuingEd'] = lb.fit_transform(union_train_7['ContinuingEd'])
union_train_7['PaperlessBilling'] = lb.fit_transform(union_train_7['PaperlessBilling'])
union_train_7['LeftUnion'] = lb.fit_transform(union_train_7['LeftUnion'])

union_train_7.head()
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	Connectivity	PaperlessBilling	MonthlyDues	TotalDues	LeftUnion	...	FeatureB_No	FeatureB_Yes	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes
0	0	1	1	1	1	No	0	18.80	18.8	0	...	0	0	0	0	0	0	0	0	0	0
1	0	0	0	1	1	No	1	18.85	18.85	1	...	0	0	0	0	0	0	0	0	0	0
2	0	1	1	1	1	No	1	19.00	19	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	1	1	No	0	19.15	19.15	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	1	1	No	0	19.20	19.2	0	...	0	0	0	0	0	0	0	0	0	0
5 rows × 33 columns

lb_pred = LabelEncoder()

pred_5['USAcitizen'] = lb_pred.fit_transform(pred_5['USAcitizen'])
pred_5['Married'] = lb_pred.fit_transform(pred_5['Married'])
pred_5['ContinuingEd'] = lb_pred.fit_transform(pred_5['ContinuingEd'])
pred_5['PaperlessBilling'] = lb_pred.fit_transform(pred_5['PaperlessBilling'])

pred_5.head()
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	Connectivity	PaperlessBilling	MonthlyDues	TotalDues	Female	...	FeatureB_No	FeatureB_Yes	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes
0	0	1	0	1	0	DSL	1	29.85	29.85	1	...	0	1	1	0	1	0	1	0	1	0
1	0	0	0	34	1	DSL	0	56.95	1889.50	0	...	1	0	0	1	1	0	1	0	1	0
2	0	0	0	2	1	DSL	1	53.85	108.15	0	...	0	1	1	0	1	0	1	0	1	0
3	0	0	0	45	0	DSL	0	42.30	1840.75	0	...	1	0	0	1	0	1	1	0	1	0
4	0	0	0	2	1	Fiber optic	1	70.70	151.65	1	...	1	0	1	0	1	0	1	0	1	0
5 rows × 32 columns

# Check column order train data

union_train_7.columns
Index(['Management', 'USAcitizen', 'Married', 'MonthsInUnion', 'ContinuingEd',
       'Connectivity', 'PaperlessBilling', 'MonthlyDues', 'TotalDues',
       'LeftUnion', 'Female', 'Male', 'IL', 'MO', 'Month-to-month', 'One year',
       'Two year', 'Bank transfer (automatic)', 'Credit card (automatic)',
       'Electronic check', 'Mailed check', 'FeatureA_No', 'FeatureA_Yes',
       'FeatureB_No', 'FeatureB_Yes', 'FeatureC_No', 'FeatureC_Yes',
       'FeatureD_No', 'FeatureD_Yes', 'FeatureE_No', 'FeatureE_Yes',
       'FeatureF_No', 'FeatureF_Yes'],
      dtype='object')
# check column order pred data

pred_5.columns
Index(['Management', 'USAcitizen', 'Married', 'MonthsInUnion', 'ContinuingEd',
       'Connectivity', 'PaperlessBilling', 'MonthlyDues', 'TotalDues',
       'Female', 'Male', 'IL', 'MO', 'Month-to-month', 'One year', 'Two year',
       'Bank transfer (automatic)', 'Credit card (automatic)',
       'Electronic check', 'Mailed check', 'FeatureA_No', 'FeatureA_Yes',
       'FeatureB_No', 'FeatureB_Yes', 'FeatureC_No', 'FeatureC_Yes',
       'FeatureD_No', 'FeatureD_Yes', 'FeatureE_No', 'FeatureE_Yes',
       'FeatureF_No', 'FeatureF_Yes'],
      dtype='object')
# reorganize columns train data, move predictor to end

clist = ['Management', 'USAcitizen', 'Married', 'MonthsInUnion', 'ContinuingEd',
       'Connectivity', 'PaperlessBilling', 'MonthlyDues', 'TotalDues',
       'LeftUnion', 'Female', 'Male', 'IL', 'MO', 'Month-to-month', 'One year',
       'Two year', 'Bank transfer (automatic)', 'Credit card (automatic)',
       'Electronic check', 'Mailed check', 'FeatureA_No', 'FeatureA_Yes',
       'FeatureB_No', 'FeatureB_Yes', 'FeatureC_No', 'FeatureC_Yes',
       'FeatureD_No', 'FeatureD_Yes', 'FeatureE_No', 'FeatureE_Yes',
       'FeatureF_No', 'FeatureF_Yes']

union_train_8 = union_train_7[clist]
union_train_8.head()      
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	Connectivity	PaperlessBilling	MonthlyDues	TotalDues	LeftUnion	...	FeatureB_No	FeatureB_Yes	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes
0	0	1	1	1	1	No	0	18.80	18.8	0	...	0	0	0	0	0	0	0	0	0	0
1	0	0	0	1	1	No	1	18.85	18.85	1	...	0	0	0	0	0	0	0	0	0	0
2	0	1	1	1	1	No	1	19.00	19	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	1	1	No	0	19.15	19.15	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	1	1	No	0	19.20	19.2	0	...	0	0	0	0	0	0	0	0	0	0
5 rows × 33 columns

# categorize connectivity column

union_train_8.loc[(union_train_8['Connectivity'] == 'Fiber optic') | (union_train_8['Connectivity'] == 'DSL'), 'connectivity'] = 1
union_train_8.loc[(union_train_8['Connectivity'] != 'Fiber optic') & (union_train_8['Connectivity'] != 'DSL'), 'connectivity'] = 0
union_train_9 = union_train_8.drop(['Connectivity'],axis='columns')

union_train_9.head()
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	PaperlessBilling	MonthlyDues	TotalDues	LeftUnion	Female	...	FeatureB_Yes	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes	connectivity
0	0	1	1	1	1	0	18.80	18.8	0	0	...	0	0	0	0	0	0	0	0	0	0.0
1	0	0	0	1	1	1	18.85	18.85	1	0	...	0	0	0	0	0	0	0	0	0	0.0
2	0	1	1	1	1	1	19.00	19	0	1	...	0	0	0	0	0	0	0	0	0	0.0
3	0	0	0	1	1	0	19.15	19.15	0	1	...	0	0	0	0	0	0	0	0	0	0.0
4	0	0	0	1	1	0	19.20	19.2	0	0	...	0	0	0	0	0	0	0	0	0	0.0
5 rows × 33 columns

# categorized pred data Connectivity 1,0

pred_5.loc[(pred_5['Connectivity'] == 'Fiber optic') | (pred_5['Connectivity'] == 'DSL'), 'connectivity'] = 1
pred_5.loc[(pred_5['Connectivity'] != 'Fiber optic') & (pred_5['Connectivity'] != 'DSL'), 'connectivity'] = 0
pred_6 = pred_5.drop(['Connectivity'],axis='columns')

pred_6.head()
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	PaperlessBilling	MonthlyDues	TotalDues	Female	Male	...	FeatureB_Yes	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes	connectivity
0	0	1	0	1	0	1	29.85	29.85	1	0	...	1	1	0	1	0	1	0	1	0	1.0
1	0	0	0	34	1	0	56.95	1889.50	0	1	...	0	0	1	1	0	1	0	1	0	1.0
2	0	0	0	2	1	1	53.85	108.15	0	1	...	1	1	0	1	0	1	0	1	0	1.0
3	0	0	0	45	0	0	42.30	1840.75	0	1	...	0	0	1	0	1	1	0	1	0	1.0
4	0	0	0	2	1	1	70.70	151.65	1	0	...	0	1	0	1	0	1	0	1	0	1.0
5 rows × 32 columns

union_train_9['TotalDues'] = pd.to_numeric(union_train_9['TotalDues'], downcast="float", errors='coerce')

# check for nan

union_train_9.isnull().sum()
Management                   0
USAcitizen                   0
Married                      0
MonthsInUnion                0
ContinuingEd                 0
PaperlessBilling             0
MonthlyDues                  0
TotalDues                    7
LeftUnion                    0
Female                       0
Male                         0
IL                           0
MO                           0
Month-to-month               0
One year                     0
Two year                     0
Bank transfer (automatic)    0
Credit card (automatic)      0
Electronic check             0
Mailed check                 0
FeatureA_No                  0
FeatureA_Yes                 0
FeatureB_No                  0
FeatureB_Yes                 0
FeatureC_No                  0
FeatureC_Yes                 0
FeatureD_No                  0
FeatureD_Yes                 0
FeatureE_No                  0
FeatureE_Yes                 0
FeatureF_No                  0
FeatureF_Yes                 0
connectivity                 0
dtype: int64
# callable function to find nans(nulls)

def nans(df): return df[df.isnull().any(axis=1)]
nans(union_train_9)
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	PaperlessBilling	MonthlyDues	TotalDues	LeftUnion	Female	...	FeatureB_Yes	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes	connectivity
2992	0	1	1	0	0	1	52.55	NaN	0	1	...	0	0	1	0	1	0	1	1	0	1.0
2993	0	1	1	0	1	0	25.35	NaN	0	0	...	0	0	0	0	0	0	0	0	0	0.0
2994	0	1	1	0	1	0	20.00	NaN	0	1	...	0	0	0	0	0	0	0	0	0	0.0
2995	0	0	1	0	1	0	20.25	NaN	0	0	...	0	0	0	0	0	0	0	0	0	0.0
2996	0	1	1	0	1	0	73.35	NaN	0	1	...	1	0	1	0	1	0	1	1	0	1.0
2997	0	0	1	0	1	1	61.90	NaN	0	0	...	1	1	0	0	1	1	0	1	0	1.0
2998	0	1	1	0	1	0	80.85	NaN	0	1	...	1	0	1	1	0	0	1	0	1	1.0
7 rows × 33 columns

# drop 7 rows with null values

union_train_9 = union_train_9.drop(labels=[2992,2993,2994,2995,2996,2997,2998], axis=0)
I chose to drop 7 rows of data that has null values for total dues. Due to the size of the data, this will have minimal effect on the outcome.

# REMOVE call nans function to check prediction file for nulls

nans(pred_6)
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	PaperlessBilling	MonthlyDues	TotalDues	Female	Male	...	FeatureB_Yes	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes	connectivity
0 rows × 32 columns

# REMOVE

pred_6.drop(pred_6.index[1000:2999], inplace=True)
# REMOVE

nans(pred_6)
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	PaperlessBilling	MonthlyDues	TotalDues	Female	Male	...	FeatureB_Yes	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes	connectivity
0 rows × 32 columns

pred_6['TotalDues'] = pd.to_numeric(pred_6['TotalDues'], downcast= "integer", errors='coerce')

pred_6['connectivity'] = pd.to_numeric(pred_6['connectivity'], downcast= "integer", errors='coerce')
pred_6['Management'] = pd.to_numeric(pred_6['Management'], downcast= "integer", errors='coerce')
pred_6['Female'] = pd.to_numeric(pred_6['Female'], downcast= "integer", errors='coerce')
pred_6['Male'] = pd.to_numeric(pred_6['Male'], downcast= "integer", errors='coerce')


pred_6.dtypes
Management                      int8
USAcitizen                     int32
Married                        int32
MonthsInUnion                  int64
ContinuingEd                   int32
PaperlessBilling               int32
MonthlyDues                  float64
TotalDues                    float64
Female                          int8
Male                            int8
IL                             uint8
MO                             uint8
Month-to-month                 uint8
One year                       uint8
Two year                       uint8
Bank transfer (automatic)      uint8
Credit card (automatic)        uint8
Electronic check               uint8
Mailed check                   uint8
FeatureA_No                    uint8
FeatureA_Yes                   uint8
FeatureB_No                    uint8
FeatureB_Yes                   uint8
FeatureC_No                    uint8
FeatureC_Yes                   uint8
FeatureD_No                    uint8
FeatureD_Yes                   uint8
FeatureE_No                    uint8
FeatureE_Yes                   uint8
FeatureF_No                    uint8
FeatureF_Yes                   uint8
connectivity                    int8
dtype: object
pred_6.head()
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	PaperlessBilling	MonthlyDues	TotalDues	Female	Male	...	FeatureB_Yes	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes	connectivity
0	0	1	0	1	0	1	29.85	29.85	1	0	...	1	1	0	1	0	1	0	1	0	1
1	0	0	0	34	1	0	56.95	1889.50	0	1	...	0	0	1	1	0	1	0	1	0	1
2	0	0	0	2	1	1	53.85	108.15	0	1	...	1	1	0	1	0	1	0	1	0	1
3	0	0	0	45	0	0	42.30	1840.75	0	1	...	0	0	1	0	1	1	0	1	0	1
4	0	0	0	2	1	1	70.70	151.65	1	0	...	0	1	0	1	0	1	0	1	0	1
5 rows × 32 columns

union_train_9.dtypes

# union_train_9 = map(float, union_train_9)
# union_train_9
Management                     int64
USAcitizen                     int32
Married                        int32
MonthsInUnion                  int64
ContinuingEd                   int32
PaperlessBilling               int32
MonthlyDues                  float64
TotalDues                    float32
LeftUnion                      int32
Female                         uint8
Male                           uint8
IL                             uint8
MO                             uint8
Month-to-month                 uint8
One year                       uint8
Two year                       uint8
Bank transfer (automatic)      uint8
Credit card (automatic)        uint8
Electronic check               uint8
Mailed check                   uint8
FeatureA_No                    uint8
FeatureA_Yes                   uint8
FeatureB_No                    uint8
FeatureB_Yes                   uint8
FeatureC_No                    uint8
FeatureC_Yes                   uint8
FeatureD_No                    uint8
FeatureD_Yes                   uint8
FeatureE_No                    uint8
FeatureE_Yes                   uint8
FeatureF_No                    uint8
FeatureF_Yes                   uint8
connectivity                 float64
dtype: object
# convert values to int

union_train_9['Management'] = pd.to_numeric(union_train_9['Management'], downcast= "integer", errors='coerce')
union_train_9['connectivity'] = pd.to_numeric(union_train_9['connectivity'], downcast= "integer", errors='coerce')
union_train_9['USAcitizen'] = pd.to_numeric(union_train_9['USAcitizen'], downcast= "integer", errors='coerce')
union_train_9['Married'] = pd.to_numeric(union_train_9['Married'], downcast= "integer", errors='coerce')
union_train_9['ContinuingEd'] = pd.to_numeric(union_train_9['ContinuingEd'], downcast= "integer", errors='coerce')
union_train_9['PaperlessBilling'] = pd.to_numeric(union_train_9['PaperlessBilling'], downcast= "integer", errors='coerce')

union_train_9.dtypes
Management                      int8
USAcitizen                      int8
Married                         int8
MonthsInUnion                  int64
ContinuingEd                    int8
PaperlessBilling                int8
MonthlyDues                  float64
TotalDues                    float32
LeftUnion                      int32
Female                         uint8
Male                           uint8
IL                             uint8
MO                             uint8
Month-to-month                 uint8
One year                       uint8
Two year                       uint8
Bank transfer (automatic)      uint8
Credit card (automatic)        uint8
Electronic check               uint8
Mailed check                   uint8
FeatureA_No                    uint8
FeatureA_Yes                   uint8
FeatureB_No                    uint8
FeatureB_Yes                   uint8
FeatureC_No                    uint8
FeatureC_Yes                   uint8
FeatureD_No                    uint8
FeatureD_Yes                   uint8
FeatureE_No                    uint8
FeatureE_Yes                   uint8
FeatureF_No                    uint8
FeatureF_Yes                   uint8
connectivity                    int8
dtype: object
# check pred file columns

pred_6.columns
Index(['Management', 'USAcitizen', 'Married', 'MonthsInUnion', 'ContinuingEd',
       'PaperlessBilling', 'MonthlyDues', 'TotalDues', 'Female', 'Male', 'IL',
       'MO', 'Month-to-month', 'One year', 'Two year',
       'Bank transfer (automatic)', 'Credit card (automatic)',
       'Electronic check', 'Mailed check', 'FeatureA_No', 'FeatureA_Yes',
       'FeatureB_No', 'FeatureB_Yes', 'FeatureC_No', 'FeatureC_Yes',
       'FeatureD_No', 'FeatureD_Yes', 'FeatureE_No', 'FeatureE_Yes',
       'FeatureF_No', 'FeatureF_Yes', 'connectivity'],
      dtype='object')
union_train_9.columns
Index(['Management', 'USAcitizen', 'Married', 'MonthsInUnion', 'ContinuingEd',
       'PaperlessBilling', 'MonthlyDues', 'TotalDues', 'LeftUnion', 'Female',
       'Male', 'IL', 'MO', 'Month-to-month', 'One year', 'Two year',
       'Bank transfer (automatic)', 'Credit card (automatic)',
       'Electronic check', 'Mailed check', 'FeatureA_No', 'FeatureA_Yes',
       'FeatureB_No', 'FeatureB_Yes', 'FeatureC_No', 'FeatureC_Yes',
       'FeatureD_No', 'FeatureD_Yes', 'FeatureE_No', 'FeatureE_Yes',
       'FeatureF_No', 'FeatureF_Yes', 'connectivity'],
      dtype='object')
# reorganize pred columns

pred_clist = ['Management', 'USAcitizen', 'Married', 'MonthsInUnion', 'ContinuingEd',
       'PaperlessBilling', 'MonthlyDues', 'TotalDues', 'Female', 'Male', 'IL',
       'MO', 'Month-to-month', 'One year', 'Two year',
       'Bank transfer (automatic)', 'Credit card (automatic)',
       'Electronic check', 'Mailed check', 'FeatureA_No', 'FeatureA_Yes',
       'FeatureB_No', 'FeatureB_Yes', 'FeatureC_No', 'FeatureC_Yes',
       'FeatureD_No', 'FeatureD_Yes', 'FeatureE_No', 'FeatureE_Yes',
       'FeatureF_No', 'FeatureF_Yes', 'connectivity']

pred_7 = pred_6[pred_clist]
pred_7.sample()
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	PaperlessBilling	MonthlyDues	TotalDues	Female	Male	...	FeatureB_Yes	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes	connectivity
615	0	0	0	15	1	0	48.85	631.4	1	0	...	0	1	0	1	0	1	0	1	0	1
1 rows × 32 columns

# organize column position

clist = ['Management', 'USAcitizen', 'Married', 'MonthsInUnion', 'ContinuingEd',
       'PaperlessBilling', 'MonthlyDues', 'TotalDues', 'Female', 'Male', 'IL',
       'MO', 'Month-to-month', 'One year', 'Two year',
       'Bank transfer (automatic)', 'Credit card (automatic)',
       'Electronic check', 'Mailed check', 'FeatureA_No', 'FeatureA_Yes',
       'FeatureB_No', 'FeatureB_Yes', 'FeatureC_No', 'FeatureC_Yes',
       'FeatureD_No', 'FeatureD_Yes', 'FeatureE_No', 'FeatureE_Yes',
       'FeatureF_No', 'FeatureF_Yes', 'connectivity', 'LeftUnion']

union_train_9 = union_train_9[clist]
union_train_9.head()
      
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	PaperlessBilling	MonthlyDues	TotalDues	Female	Male	...	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes	connectivity	LeftUnion
0	0	1	1	1	1	0	18.80	18.799999	0	1	...	0	0	0	0	0	0	0	0	0	0
1	0	0	0	1	1	1	18.85	18.850000	0	1	...	0	0	0	0	0	0	0	0	0	1
2	0	1	1	1	1	1	19.00	19.000000	1	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	1	1	0	19.15	19.150000	1	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	1	1	0	19.20	19.200001	0	1	...	0	0	0	0	0	0	0	0	0	0
5 rows × 33 columns

Step 3.0 Outlier Analysis Quantitative
# Adapted from https://www.kaggle.com/gaganmaahi224/telco-customer-churn-prediction-with-11-ml-algos

x = ['MonthsInUnion','MonthlyDues', 'TotalDues']
def count_outliers(data,col):
        q1 = data[col].quantile(0.25,interpolation='nearest')
        q2 = data[col].quantile(0.5,interpolation='nearest')
        q3 = data[col].quantile(0.75,interpolation='nearest')
        q4 = data[col].quantile(1,interpolation='nearest')
        IQR = q3 -q1
        global LLP
        global ULP
        LLP = q1 - 1.5*IQR
        ULP = q3 + 1.5*IQR
        if data[col].min() > LLP and data[col].max() < ULP:
            print("No outliers in",i)
        else:
            print("There are outliers in",i)
            x = data[data[col]<LLP][col].size
            y = data[data[col]>ULP][col].size
            a.append(i)
            print('Count of outliers are:',x+y)
global a
a = []
for i in x:
    count_outliers(union_train_9,i)
There are outliers in MonthsInUnion
Count of outliers are: 10
No outliers in MonthlyDues
There are outliers in TotalDues
Count of outliers are: 11
3.1 Outlier Analysis Visualization
The code above indicates outliers in MonthsInUnion and TotalDues. I will explore furthur with visualization.

def Box_plots(df):
    plt.figure(figsize=(10, 4))
    plt.title("Box Plot")
    sns.boxplot(df)
    plt.show()
Box_plots(union_train_9['MonthsInUnion'])

def hist_plots(df):
    plt.figure(figsize=(10, 4))
    plt.hist(df)
    plt.title("Histogram Plot")
    plt.show()
hist_plots(union_train_9['MonthsInUnion'])

def scatter_plots(df1,df2):
    fig, ax = plt.subplots(figsize=(10,4))
    ax.scatter(df1,df2)
    ax.set_xlabel('MonthsInUnion')
    ax.set_ylabel('MonthlyDues')
    plt.title("MonthlyDues")
    plt.show()
scatter_plots(union_train_9['MonthlyDues'],union_train_9['MonthsInUnion'])

def dist_plots(df):
    plt.figure(figsize=(10, 4))
    sns.distplot(df)
    plt.title("Distribution plot")
    sns.despine()
    plt.show()
dist_plots(union_train_9['MonthsInUnion'])

def qq_plots(df):
    plt.figure(figsize=(10, 4))
    qqplot(df,line='s')
    plt.title("Normal QQPlot")
    plt.show()
qq_plots(union_train_9['MonthsInUnion'])
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

<Figure size 720x288 with 0 Axes>

3.2 Manual outlier adjustments
Make adjustments to Monthly Dues, and Total Dues. Based upon the facts Monthly dues average for the data set = $64, and I am limiting months in union to approx 40 years or approx. 500 months. Monthly Dues and Total dues do not appear to make sense mathematically.

Average retirment age in the U.S. is 61. Considering most data science members would have 4 plus years of higher education, I will limit the months in union to those members who have less than 500 months in the union.

## ADJUST OR REMOVE

union_train_9.query('MonthsInUnion  > 500')
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	PaperlessBilling	MonthlyDues	TotalDues	Female	Male	...	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes	connectivity	LeftUnion
2982	0	1	1	744	1	1	19.20	14284.799805	0	1	...	0	0	0	0	0	0	0	0	0	0
2983	0	0	0	614	0	0	35.75	21950.500000	0	1	...	0	1	0	1	1	0	1	0	1	0
2984	1	1	0	602	1	0	60.00	36120.000000	0	1	...	1	0	1	0	1	0	1	0	1	0
2985	0	0	0	827	1	1	59.80	49454.601562	0	1	...	1	0	0	1	0	1	1	0	1	1
2986	1	1	0	917	1	1	74.45	68270.648438	1	0	...	1	0	1	0	1	0	1	0	1	1
2987	0	0	0	658	1	1	104.95	69057.101562	0	1	...	0	1	1	0	0	1	0	1	1	0
2988	0	0	0	782	1	1	93.40	73038.796875	1	0	...	0	1	1	0	0	1	1	0	1	0
2989	0	1	1	731	1	1	103.20	75439.203125	1	0	...	0	1	1	0	0	1	0	1	1	0
2990	0	1	0	899	1	1	84.95	76370.046875	1	0	...	1	0	0	1	0	1	0	1	1	0
2991	0	0	0	788	1	1	104.50	82346.000000	0	1	...	0	1	1	0	0	1	0	1	1	1
10 rows × 33 columns

## ADJUST OR REMOVE

union_train_9 = union_train_9.query('MonthsInUnion < 500')
sns.histplot(x = union_train_9['MonthsInUnion'],kde = True)
plt.show()

Check monthly and yearly dues for outliers

sns.histplot(x = union_train_9['MonthlyDues'],kde = True)
plt.show()

sns.boxplot(union_train_9['MonthlyDues'])
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
<AxesSubplot:xlabel='MonthlyDues'>

## ADJUST OR REMOVE

union_train_9.query('MonthlyDues > 110')
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	PaperlessBilling	MonthlyDues	TotalDues	Female	Male	...	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes	connectivity	LeftUnion
1515	0	0	0	12	1	1	112.95	1384.750000	0	1	...	0	1	0	1	0	1	0	1	1	1
1901	0	1	0	21	1	1	111.20	2317.100098	0	1	...	0	1	0	1	0	1	0	1	1	1
2045	1	0	0	27	1	1	110.50	2857.600098	1	0	...	0	1	1	0	0	1	0	1	1	0
2280	0	0	0	34	1	0	116.25	3899.050049	0	1	...	0	1	0	1	0	1	0	1	1	0
2292	0	0	1	34	1	1	116.15	3946.899902	1	0	...	0	1	0	1	0	1	0	1	1	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2977	0	1	1	72	1	1	115.80	8476.500000	1	0	...	0	1	0	1	0	1	0	1	1	0
2978	0	1	0	72	1	1	117.15	8529.500000	1	0	...	0	1	0	1	0	1	0	1	1	0
2979	0	0	0	72	1	1	118.20	8547.150391	0	1	...	0	1	0	1	0	1	0	1	1	0
2980	0	1	0	71	1	1	116.25	8564.750000	0	1	...	0	1	0	1	0	1	0	1	1	0
2981	0	1	1	72	1	1	118.75	8672.450195	1	0	...	0	1	0	1	0	1	0	1	1	0
87 rows × 33 columns

## ADJUST OR REMOVE

union_train_9 = union_train_9.query('MonthlyDues < 110')
sns.histplot(x = union_train_9['TotalDues'],kde = True)
plt.show()

sns.boxplot(union_train_9['TotalDues'])
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
<AxesSubplot:xlabel='TotalDues'>

union_train_9.describe()
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	PaperlessBilling	MonthlyDues	TotalDues	Female	Male	...	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes	connectivity	LeftUnion
count	2893.000000	2893.000000	2893.000000	2893.000000	2893.000000	2893.000000	2893.000000	2893.000000	2893.000000	2893.000000	...	2893.000000	2893.000000	2893.000000	2893.000000	2893.000000	2893.000000	2893.000000	2893.000000	2893.000000	2893.000000
mean	0.155894	0.470446	0.296578	31.238161	0.900449	0.580712	62.230954	2078.132568	0.509160	0.490840	...	0.453163	0.313170	0.495334	0.270999	0.415831	0.350501	0.408918	0.357414	0.766333	0.268925
std	0.362817	0.499212	0.456828	24.080381	0.299452	0.493528	29.439395	2078.714111	0.500003	0.500003	...	0.497888	0.463863	0.500065	0.444552	0.492950	0.477209	0.491719	0.479321	0.423236	0.443478
min	0.000000	0.000000	0.000000	1.000000	0.000000	0.000000	18.700000	18.799999	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.000000	0.000000	0.000000	9.000000	1.000000	0.000000	29.100000	372.450012	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	0.000000
50%	0.000000	0.000000	0.000000	27.000000	1.000000	1.000000	69.000000	1284.199951	1.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	0.000000
75%	0.000000	1.000000	1.000000	53.000000	1.000000	1.000000	87.250000	3355.649902	1.000000	1.000000	...	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000
max	1.000000	1.000000	1.000000	72.000000	1.000000	1.000000	109.950000	8129.299805	1.000000	1.000000	...	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000
8 rows × 33 columns

union_train_9.query('TotalDues > 4000')
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	PaperlessBilling	MonthlyDues	TotalDues	Female	Male	...	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes	connectivity	LeftUnion
2307	0	1	1	43	1	1	91.25	4013.800049	0	1	...	0	1	0	1	1	0	1	0	1	0
2308	0	1	0	42	1	1	94.40	4014.600098	0	1	...	0	1	1	0	1	0	0	1	1	0
2309	0	1	1	52	1	0	79.20	4016.300049	0	1	...	0	1	1	0	0	1	0	1	1	0
2310	0	1	1	42	1	0	97.10	4016.750000	0	1	...	1	0	1	0	0	1	0	1	1	0
2311	0	0	0	45	1	1	89.30	4016.850098	1	0	...	1	0	1	0	0	1	1	0	1	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2941	1	1	0	71	1	1	109.70	7904.250000	1	0	...	0	1	0	1	0	1	0	1	1	0
2942	1	1	0	72	1	1	109.55	7920.700195	0	1	...	0	1	1	0	0	1	0	1	1	0
2943	0	1	1	70	1	1	108.15	7930.549805	1	0	...	0	1	1	0	0	1	0	1	1	0
2950	0	1	0	72	1	1	108.50	8003.799805	1	0	...	0	1	0	1	0	1	0	1	1	0
2960	0	1	1	72	1	0	109.70	8129.299805	1	0	...	0	1	1	0	0	1	0	1	1	0
591 rows × 33 columns

##ADJUST OR REMOVE

union_train_9 = union_train_9.query('TotalDues < 4000')
sns.boxplot(union_train_9['TotalDues'])
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
<AxesSubplot:xlabel='TotalDues'>

3.3 Scale non-binary columns
cols_to_scale = ['MonthlyDues', 'TotalDues', 'MonthsInUnion']

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

union_train_9[cols_to_scale] = scaler.fit_transform(union_train_9[cols_to_scale])
pred_7[cols_to_scale] = scaler.fit_transform(pred_7[cols_to_scale])
3.4 Clean pred data
a = (pred_7.isnull().sum())
print(a[a>0])
Series([], dtype: int64)
pred_7.fillna(0,inplace=True)
4.0 Correlation, Create Test and Train, Feature Engineering
4.1 Correlation Heat map
# correlation of all features

corr = union_train_9.corr()
fig4, ax = plt.subplots(figsize=(15,7))
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            annot=True,cmap="YlGnBu",annot_kws={'size': 12},fmt=".2f")
<AxesSubplot:>

X = union_train_9.drop(['LeftUnion'], axis = 1)  #Independent variables(features)select all but last column
y = union_train_9['LeftUnion']                   #Dependent variable
4.2 Feature Engineering (feature_Selection)
from sklearn.feature_selection import SelectKBest, chi2

X.sample(5)
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	PaperlessBilling	MonthlyDues	TotalDues	Female	Male	...	FeatureB_Yes	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes	connectivity
433	0	0	0	0.042254	0	0	0.259868	0.032125	0	1	...	1	1	0	0	1	1	0	1	0	1
1805	0	1	0	0.450704	0	0	0.447917	0.502902	1	0	...	0	0	1	0	1	0	1	0	1	1
1122	0	1	1	0.577465	1	0	0.014254	0.205126	1	0	...	0	0	0	0	0	0	0	0	0	0
1654	0	1	1	0.957746	1	1	0.075658	0.416571	1	0	...	0	0	0	0	0	0	0	0	0	0
2205	0	1	0	0.802817	1	1	0.464364	0.871096	1	0	...	0	1	0	0	1	1	0	1	0	1
5 rows × 32 columns

#*****************CHANGE k to add or delete features (32 features total)********************************

sel_Feat = SelectKBest(chi2, k=32).fit(X,y)          ##CHANGE k to add or delete features (1-32)

#*******************************************************************************************************
sel_Bool = sel_Feat.get_support()
sel_Bool
array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])
X_sel = X[X.columns[sel_Bool]]
X_sel.head()     ##This var affects split var X to X_sel!!
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	PaperlessBilling	MonthlyDues	TotalDues	Female	Male	...	FeatureB_Yes	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes	connectivity
0	0	1	1	0.0	1	0	0.001096	0.000000	0	1	...	0	0	0	0	0	0	0	0	0	0
1	0	0	0	0.0	1	1	0.001645	0.000013	0	1	...	0	0	0	0	0	0	0	0	0	0
2	0	1	1	0.0	1	1	0.003289	0.000050	1	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	0.0	1	0	0.004934	0.000088	1	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0.0	1	0	0.005482	0.000101	0	1	...	0	0	0	0	0	0	0	0	0	0
5 rows × 32 columns

X_sel.columns
Index(['Management', 'USAcitizen', 'Married', 'MonthsInUnion', 'ContinuingEd',
       'PaperlessBilling', 'MonthlyDues', 'TotalDues', 'Female', 'Male', 'IL',
       'MO', 'Month-to-month', 'One year', 'Two year',
       'Bank transfer (automatic)', 'Credit card (automatic)',
       'Electronic check', 'Mailed check', 'FeatureA_No', 'FeatureA_Yes',
       'FeatureB_No', 'FeatureB_Yes', 'FeatureC_No', 'FeatureC_Yes',
       'FeatureD_No', 'FeatureD_Yes', 'FeatureE_No', 'FeatureE_Yes',
       'FeatureF_No', 'FeatureF_Yes', 'connectivity'],
      dtype='object')
# correlation matrix with choice features

X_sel.corr()
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	PaperlessBilling	MonthlyDues	TotalDues	Female	Male	...	FeatureB_Yes	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes	connectivity
Management	1.000000	-0.019322	-0.209645	-0.043771	-0.005347	0.131617	0.238072	0.091305	-0.019151	0.019151	...	0.035437	0.164400	0.015734	0.232265	-0.068903	0.093705	0.096505	0.075627	0.116115	0.194613
USAcitizen	-0.019322	1.000000	0.495498	0.321132	-0.018498	-0.063017	-0.053103	0.218917	0.033487	-0.033487	...	-0.009115	-0.127777	0.055765	-0.128748	0.058436	-0.088269	0.006617	-0.095886	0.014984	-0.090148
Married	-0.209645	0.495498	1.000000	0.215732	-0.018319	-0.126253	-0.207811	0.054116	-0.001660	0.001660	...	-0.001947	-0.168167	-0.021180	-0.201638	0.020208	-0.130840	-0.063477	-0.105097	-0.092163	-0.203642
MonthsInUnion	-0.043771	0.321132	0.215732	1.000000	-0.095751	-0.131525	-0.193535	0.701683	0.023101	-0.023101	...	0.073545	-0.282492	0.076878	-0.313171	0.117328	-0.210181	-0.011829	-0.219556	-0.001520	-0.241011
ContinuingEd	-0.005347	-0.018498	-0.018319	-0.095751	1.000000	-0.025267	0.195308	-0.036141	0.009129	-0.009129	...	-0.131208	-0.089686	-0.152515	-0.073823	-0.176393	-0.147130	-0.078216	-0.129037	-0.098137	-0.235528
PaperlessBilling	0.131617	-0.063017	-0.126253	-0.131525	-0.025267	1.000000	0.358149	0.077001	-0.002360	0.002360	...	0.093256	0.255918	0.069508	0.295541	0.021679	0.143347	0.195457	0.137549	0.200257	0.343406
MonthlyDues	0.238072	-0.053103	-0.207811	-0.193535	0.195308	0.358149	1.000000	0.411395	0.012733	-0.012733	...	0.318246	0.404586	0.374818	0.516986	0.243773	0.230694	0.553569	0.225356	0.554297	0.781007
TotalDues	0.091305	0.218917	0.054116	0.701683	-0.036141	0.077001	0.411395	1.000000	0.025817	-0.025817	...	0.307398	-0.000639	0.332806	0.017429	0.318872	-0.014014	0.328373	-0.027995	0.340733	0.298411
Female	-0.019151	0.033487	-0.001660	0.023101	0.009129	-0.002360	0.012733	0.025817	1.000000	-1.000000	...	0.033373	0.016889	-0.008545	0.016937	-0.008828	0.029794	-0.022715	-0.021279	0.035297	0.010860
Male	0.019151	-0.033487	0.001660	-0.023101	-0.009129	0.002360	-0.012733	-0.025817	-1.000000	1.000000	...	-0.033373	-0.016889	0.008545	-0.016937	0.008828	-0.029794	0.022715	0.021279	-0.035297	-0.010860
IL	0.134264	0.046840	-0.068141	0.124069	0.246928	0.149888	0.358032	0.290429	0.016204	-0.016204	...	0.043550	0.057501	0.041896	0.109285	-0.021908	-0.039150	0.150265	-0.020452	0.127496	0.100776
MO	-0.134264	-0.046840	0.068141	-0.124069	-0.246928	-0.149888	-0.358032	-0.290429	-0.016204	0.016204	...	-0.043550	-0.057501	-0.041896	-0.109285	0.021908	0.039150	-0.150265	0.020452	-0.127496	-0.100776
Month-to-month	0.165383	-0.261202	-0.278647	-0.609798	0.032344	0.279590	0.384432	-0.261093	0.008951	-0.008951	...	0.045811	0.406261	-0.026287	0.498372	-0.142403	0.284234	0.116920	0.313108	0.083627	0.422348
One year	-0.070847	0.089851	0.096466	0.229536	-0.022380	-0.119852	-0.107001	0.202247	-0.028090	0.028090	...	0.044361	-0.143445	0.077463	-0.209187	0.161879	-0.071794	-0.009820	-0.096387	0.017934	-0.087847
Two year	-0.137717	0.239882	0.255281	0.539958	-0.018282	-0.232738	-0.378675	0.125658	0.017167	-0.017167	...	-0.103095	-0.369351	-0.045308	-0.419377	0.016188	-0.287405	-0.138234	-0.299042	-0.124205	-0.446177
Bank transfer (automatic)	-0.024474	0.120325	0.054677	0.259815	0.003361	-0.059881	-0.035046	0.194275	0.027341	-0.027341	...	0.002565	-0.077104	0.042996	-0.084238	0.053059	-0.040193	-0.002168	-0.041215	-0.001069	-0.045999
Credit card (automatic)	-0.039642	0.046420	0.045068	0.178839	0.003409	-0.032520	-0.056326	0.115631	-0.011212	0.011212	...	0.029142	-0.106786	0.048131	-0.106707	0.049288	-0.057389	-0.011769	-0.056503	-0.012760	-0.073966
Electronic check	0.218393	-0.083119	-0.153578	-0.242816	-0.014997	0.267271	0.375621	-0.035178	-0.000751	0.000751	...	0.044537	0.287153	0.036961	0.372278	-0.068747	0.148061	0.195337	0.149363	0.192123	0.348443
Mailed check	-0.175054	-0.056133	0.076033	-0.121473	0.009982	-0.201960	-0.317279	-0.229584	-0.013315	0.013315	...	-0.073954	-0.144720	-0.117141	-0.228265	-0.015505	-0.072258	-0.193822	-0.073500	-0.190544	-0.264149
FeatureA_No	0.206353	-0.145061	-0.209071	-0.318440	-0.064785	0.322368	0.549611	0.031621	-0.006609	0.006609	...	0.150548	0.475596	0.149727	0.543747	0.068253	0.350014	0.286918	0.374527	0.256914	0.656656
FeatureA_Yes	-0.036506	0.079068	0.029585	0.124222	-0.188190	-0.011956	0.203417	0.301834	0.020734	-0.020734	...	0.199608	0.130929	0.195669	0.066495	0.281554	0.231391	0.069811	0.189489	0.116889	0.319588
FeatureB_No	0.148079	-0.074614	-0.183930	-0.280249	-0.106356	0.235948	0.449053	0.018313	-0.017634	0.017634	...	-0.515434	0.429916	0.176894	0.462712	0.140439	0.409087	0.192412	0.391066	0.211572	0.630926
FeatureB_Yes	0.035437	-0.009115	-0.001947	0.073545	-0.131208	0.093256	0.318246	0.307398	0.033373	-0.033373	...	1.000000	0.181762	0.155894	0.162450	0.184187	0.151702	0.181960	0.162833	0.167785	0.339641
FeatureC_No	0.164400	-0.127777	-0.168167	-0.282492	-0.089686	0.255918	0.404586	-0.000639	0.016889	-0.016889	...	0.181762	1.000000	-0.513285	0.525074	0.068567	0.497132	0.097458	0.496896	0.097678	0.636433
FeatureC_Yes	0.015734	0.055765	-0.021180	0.076878	-0.152515	0.069508	0.374818	0.332806	-0.008545	0.008545	...	0.155894	-0.513285	1.000000	0.087392	0.273302	0.045204	0.299242	0.034697	0.308093	0.335299
FeatureD_No	0.232265	-0.128748	-0.201638	-0.313171	-0.073823	0.295541	0.516986	0.017429	0.016937	-0.016937	...	0.162450	0.525074	0.087392	1.000000	-0.505536	0.427433	0.196566	0.425805	0.197194	0.654946
FeatureD_Yes	-0.068903	0.058436	0.020208	0.117328	-0.176393	0.021679	0.243773	0.318872	-0.008828	0.008828	...	0.184187	0.068567	0.273302	-0.505536	1.000000	0.133767	0.182863	0.124730	0.191435	0.320902
FeatureE_No	0.093705	-0.088269	-0.130840	-0.210181	-0.147130	0.143347	0.230694	-0.014014	0.029794	-0.029794	...	0.151702	0.497132	0.045204	0.427433	0.133767	1.000000	-0.530205	0.588273	-0.057458	0.586356
FeatureE_Yes	0.096505	0.006617	-0.063477	-0.011829	-0.078216	0.195457	0.553569	0.328373	-0.022715	0.022715	...	0.181960	0.097458	0.299242	0.196566	0.182863	-0.530205	1.000000	-0.069371	0.467788	0.375931
FeatureF_No	0.075627	-0.095886	-0.105097	-0.219556	-0.129037	0.137549	0.225356	-0.027995	-0.021279	0.021279	...	0.162833	0.496896	0.034697	0.425805	0.124730	0.588273	-0.069371	1.000000	-0.532800	0.576654
FeatureF_Yes	0.116115	0.014984	-0.092163	-0.001520	-0.098137	0.200257	0.554297	0.340733	0.035297	-0.035297	...	0.167785	0.097678	0.308093	0.197194	0.191435	-0.057458	0.467788	-0.532800	1.000000	0.384127
connectivity	0.194613	-0.090148	-0.203642	-0.241011	-0.235528	0.343406	0.781007	0.298411	0.010860	-0.010860	...	0.339641	0.636433	0.335299	0.654946	0.320902	0.586356	0.375931	0.576654	0.384127	1.000000
32 rows × 32 columns

# change prediction file columns to match selected columns

pred_9 = pred_7[X_sel.columns]
pred_9.sample()
Management	USAcitizen	Married	MonthsInUnion	ContinuingEd	PaperlessBilling	MonthlyDues	TotalDues	Female	Male	...	FeatureB_Yes	FeatureC_No	FeatureC_Yes	FeatureD_No	FeatureD_Yes	FeatureE_No	FeatureE_Yes	FeatureF_No	FeatureF_Yes	connectivity
125	0	0	0	0.458333	1	1	0.719424	0.355738	0	1	...	0	1	0	0	1	0	1	1	0	1
1 rows × 32 columns

4.4 Create Train Test split
# Split the data into a training set and a testing set.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size = 0.30, random_state=50)
# Create Permutation feature importance post-feature selection

rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
rf.fit(X_train, y_train)

############################### Permutation feature importance #####################################

imp = rfpimp.importances(rf, X_test, y_test)

############################################## Plot ################################################

fig, ax = plt.subplots(figsize=(12,6))

ax.barh(imp.index, imp['Importance'], height=0.8, facecolor='grey', alpha=0.8, edgecolor='k')
ax.set_xlabel('Importance score')
ax.set_title('Permutation select features importance')
ax.text(0.8, 0.15, 'aegis4048.github.io', fontsize=12, ha='center', va='center',
        transform=ax.transAxes, color='grey', alpha=0.5)
plt.gca().invert_yaxis()

fig.tight_layout()

model = Ridge()
visualizer = ResidualsPlot(model)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.show()                 # Finalize and render the figure

<AxesSubplot:title={'center':'Residuals for Ridge Model'}, xlabel='Predicted Value', ylabel='Residuals'>
THE BEAST
import time
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, accuracy_score, classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Create the classifier.
clf = AdaBoostClassifier()

# Create the pipeline.
pipeline = Pipeline([('reduce_dim', PCA()),
                     ('clf', clf)])

# Create the parameters.
n_feature_options = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,15, 16, 17, 18, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
n_estimators = [50]
parameters = [{'reduce_dim': [PCA(iterated_power=7)],
               'reduce_dim__n_components': n_feature_options,
               'clf__n_estimators': n_estimators},
              {'reduce_dim': [SelectKBest()],
               'reduce_dim__k': n_feature_options,
               'clf__n_estimators': n_estimators}]

reducer_labels = ['PCA', 'KBest()']

# Create a function to get the best estimator and print the reports.
def compare_estimators():
#    t0 = time.time()

    # Create the KFold cross-validator.
    kf = KFold(n_splits=50, shuffle=True, random_state=23)

    # Create accuracy score to compare each combination.
    scoring = {'Accuracy': make_scorer(accuracy_score)}

    # Create the grid search.
    grid = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring,
                        cv=kf, refit='Accuracy')

    # Fit grid search combinations.
    grid.fit(X_train, y_train)

    # Make predictions.
    predictions = grid.predict(X_test)

    # Evaluate using sklearn.classification_report().
    report = classification_report(y_test, predictions)

    # Get the best parameters and scores.
    best_parameters = grid.best_params_
    best_score = grid.best_score_
    
    mean_scores = np.array(grid.cv_results_['mean_test_Accuracy'])
    # scores are in the order of param_grid iteration, which is alphabetical
    mean_scores = mean_scores.reshape(len(n_estimators), -1, len(n_feature_options))
    # select score for best C
    mean_scores = mean_scores.max(axis=0)
    bar_offsets = (np.arange(len(n_feature_options)) *
                   (len(reducer_labels) + 1) + .5)

    plt.figure(figsize=(10, 5))
    for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):
        plt.bar(bar_offsets + i, reducer_scores, label=label)

    plt.title("Comparing feature reduction techniques")
    plt.xlabel('Reduced number of features')
    plt.xticks(bar_offsets + len(reducer_labels) / 2, n_feature_options)
    plt.ylabel('Accuracy')
    plt.ylim((0, 1))
    plt.legend(loc='upper left')

    # Print the results.
    print("\nAccuracy score: ", accuracy_score(y_test, predictions))
    print("\nReport:\n")
    print(report)
    print("\nBest Mean Accuracy score: ", best_score)
    print("\nBest parameters:\n")
    print(best_parameters)
    print(confusion_matrix(y_test, predictions))
#    print("Time passed: ", round(time() - t0, 3), "s")
    
    return grid.best_estimator_

compare_estimators()
Accuracy score:  0.7771345875542692

Report:

              precision    recall  f1-score   support

           0       0.82      0.87      0.84       469
           1       0.68      0.59      0.63       222

    accuracy                           0.78       691
   macro avg       0.75      0.73      0.73       691
weighted avg       0.77      0.78      0.77       691


Best Mean Accuracy score:  0.7905303030303031

Best parameters:

{'clf__n_estimators': 50, 'reduce_dim': SelectKBest(k=27), 'reduce_dim__k': 27}
[[407  62]
 [ 92 130]]
Pipeline(steps=[('reduce_dim', SelectKBest(k=27)),
                ('clf', AdaBoostClassifier())])

from sklearn.model_selection import StratifiedShuffleSplit

# Create the classifier.
clf = AdaBoostClassifier()

# Create the parameters.
parameters = {'n_estimators': [10, 25, 50, 75],
              'algorithm': ['SAMME', 'SAMME.R'],
              'random_state': [3]}

# Find the best estimator and print the reports.
#t0 = time()

# Create the Stratified ShuffleSplit cross-validator.
sss = StratifiedShuffleSplit(n_splits=50, test_size=0.2, random_state=3)

# Create multiple evaluation metrics to compare each combination.
scoring = {'AUC': 'roc_auc',
           'Accuracy': make_scorer(accuracy_score),
           'Precision': 'precision',
           'Recall': 'recall',
           'f1': 'f1'}

# Create the grid search.
grid = GridSearchCV(estimator=clf,
                    param_grid=parameters,
                    scoring=scoring,
                    cv=sss, refit='Accuracy')

# Fit grid search combinations.
grid.fit(X_train, y_train)

# Make predictions.
predictions = grid.predict(X_test)

# Evaluate using sklearn.classification_report().
report = classification_report(y_test, predictions)

# Get the best parameters and scores.
best_parameters = grid.best_params_
best_score = grid.best_score_

# Print the results.
print("\nAccuracy score: ", accuracy_score(y_test, predictions))
print("\nReport:\n")
print(report)
print("\nBest Accuracy score: ", best_score)
print("\nBest parameters:\n")
print(best_parameters)
print(confusion_matrix(y_test, predictions))
#print("Time passed: ", round(time() - t0, 3), "s")

best_clf = grid.best_estimator_
Accuracy score:  0.748191027496382

Report:

              precision    recall  f1-score   support

           0       0.79      0.85      0.82       469
           1       0.63      0.53      0.57       222

    accuracy                           0.75       691
   macro avg       0.71      0.69      0.70       691
weighted avg       0.74      0.75      0.74       691


Best Accuracy score:  0.7817956656346748

Best parameters:

{'algorithm': 'SAMME', 'n_estimators': 50, 'random_state': 3}
[[400  69]
 [105 117]]
5.0 Algorithms
5.1 Linear Regression
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_score = round(lr_clf.score(X_test,y_test),3)
lr_score
0.297
results_6 = lr_clf.predict(pred_9) #Linear Redression
# another method

ols = linear_model.LinearRegression()
model = ols.fit(X_train, y_train)
response = model.predict(pred_9)
r2 = round(model.score(X_train, y_train), 3)
5.2 ExtraTreeClassifier
This provided best score for competition. I left this in the final model to compare to the Ensemble.

from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import BaggingClassifier

extra_tree = ExtraTreeClassifier(random_state=0)
cls = BaggingClassifier(extra_tree, random_state=0).fit(X_train, y_train)
extra_tree_score = round(cls.score(X_test, y_test),3)
extra_tree_score
0.734
results_3 = cls.predict(pred_9) #Extra tree classifier
5.3 XGboost
From week 5 VOS lecture. I plan to use this for future research.

import xgboost as xgb

train = xgb.DMatrix(X_train, label=y_train)
test = xgb.DMatrix(X_test, label=y_test)
param = {
    'max_depth': 4,
    'eta': 0.3,
    'objective': 'multi:softmax',
    'num_class': 3} 
epochs = 10 
model = xgb.train(param, train, epochs)
[18:03:46] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.0/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softmax' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
predictions = model.predict(test)
#print(predictions)
xg_score = round(accuracy_score(y_test, predictions),3)
xg_score
0.781
xg_pred = xgb.DMatrix(pred_9)

xg_pred_1 = model.predict(xg_pred)
# xg_pred_1
5.4 CatBoost
Another model for future research.

categorical_features_indices = np.where(X_sel.dtypes != np.float)[0]

catboost_5 = CatBoostClassifier(verbose=False, random_state=0, scale_pos_weight=5)

catboost_5.fit(X_train, y_train,cat_features=categorical_features_indices,eval_set=(X_test, y_test))
y_pred = catboost_5.predict(X_test)

model_names = ['Catboost_adjusted_weight_5']
fig, ax = plt.subplots(figsize=(10, 6))
plot_confusion_matrix(catboost_5, X_test, y_test, cmap="YlGn", ax=ax);

results_8 = catboost_5.predict(pred_9)
catboost_5.fit(X_test, y_test)
<catboost.core.CatBoostClassifier at 0x23f8f2d3700>
print(catboost_5.get_best_score())
{'learn': {'Logloss': 0.2098173936044513}}
catboost_score = round(catboost_5.score(X_test, y_test),3)
5.5 Ensemble
clf1 = LogisticRegression(solver='lbfgs', max_iter=10000)
clf2 = RandomForestClassifier(n_estimators=200, random_state=1)  #200 hyperparam
clf3 = GaussianNB()
clf4 = SVC(kernel='linear', probability=True)
clf5 = DecisionTreeClassifier(max_depth=5)      #5 tuned
clf6 = KNeighborsClassifier(n_neighbors=26)

eclf = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3), ('svc', clf4), ('dt', clf5), ('knn', clf6)],
    voting='soft', weights=[4, 4, 0, 3, 2, 2])

for clf, label in zip([clf1, clf2, clf3, clf4, clf5, clf6, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'SVC', 'Decision Tree', 'knn', 'Ensemble']):
    scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    
eparams = {'C': [0.1, 1, 10, 100],'gamma': [1, 0.1, 0.01, 0.001]}

grid = GridSearchCV(SVC(),param_grid=eparams, refit=True, verbose=2)
grid.fit(X_train,y_train)
Accuracy: 0.79 (+/- 0.01) [Logistic Regression]
Accuracy: 0.76 (+/- 0.01) [Random Forest]
Accuracy: 0.71 (+/- 0.01) [naive Bayes]
Accuracy: 0.79 (+/- 0.01) [SVC]
Accuracy: 0.75 (+/- 0.01) [Decision Tree]
Accuracy: 0.75 (+/- 0.01) [knn]
Accuracy: 0.78 (+/- 0.01) [Ensemble]
Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END .....................................C=0.1, gamma=1; total time=   0.3s
[CV] END .....................................C=0.1, gamma=1; total time=   0.3s
[CV] END .....................................C=0.1, gamma=1; total time=   0.3s
[CV] END .....................................C=0.1, gamma=1; total time=   0.3s
[CV] END .....................................C=0.1, gamma=1; total time=   0.3s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.2s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.2s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.2s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.2s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.2s
[CV] END ..................................C=0.1, gamma=0.01; total time=   0.1s
[CV] END ..................................C=0.1, gamma=0.01; total time=   0.2s
[CV] END ..................................C=0.1, gamma=0.01; total time=   0.1s
[CV] END ..................................C=0.1, gamma=0.01; total time=   0.2s
[CV] END ..................................C=0.1, gamma=0.01; total time=   0.1s
[CV] END .................................C=0.1, gamma=0.001; total time=   0.2s
[CV] END .................................C=0.1, gamma=0.001; total time=   0.2s
[CV] END .................................C=0.1, gamma=0.001; total time=   0.2s
[CV] END .................................C=0.1, gamma=0.001; total time=   0.1s
[CV] END .................................C=0.1, gamma=0.001; total time=   0.1s
[CV] END .......................................C=1, gamma=1; total time=   0.3s
[CV] END .......................................C=1, gamma=1; total time=   0.3s
[CV] END .......................................C=1, gamma=1; total time=   0.4s
[CV] END .......................................C=1, gamma=1; total time=   0.3s
[CV] END .......................................C=1, gamma=1; total time=   0.4s
[CV] END .....................................C=1, gamma=0.1; total time=   0.2s
[CV] END .....................................C=1, gamma=0.1; total time=   0.2s
[CV] END .....................................C=1, gamma=0.1; total time=   0.2s
[CV] END .....................................C=1, gamma=0.1; total time=   0.2s
[CV] END .....................................C=1, gamma=0.1; total time=   0.2s
[CV] END ....................................C=1, gamma=0.01; total time=   0.2s
[CV] END ....................................C=1, gamma=0.01; total time=   0.2s
[CV] END ....................................C=1, gamma=0.01; total time=   0.2s
[CV] END ....................................C=1, gamma=0.01; total time=   0.1s
[CV] END ....................................C=1, gamma=0.01; total time=   0.2s
[CV] END ...................................C=1, gamma=0.001; total time=   0.2s
[CV] END ...................................C=1, gamma=0.001; total time=   0.2s
[CV] END ...................................C=1, gamma=0.001; total time=   0.1s
[CV] END ...................................C=1, gamma=0.001; total time=   0.2s
[CV] END ...................................C=1, gamma=0.001; total time=   0.1s
[CV] END ......................................C=10, gamma=1; total time=   0.3s
[CV] END ......................................C=10, gamma=1; total time=   0.3s
[CV] END ......................................C=10, gamma=1; total time=   0.4s
[CV] END ......................................C=10, gamma=1; total time=   0.4s
[CV] END ......................................C=10, gamma=1; total time=   0.3s
[CV] END ....................................C=10, gamma=0.1; total time=   0.2s
[CV] END ....................................C=10, gamma=0.1; total time=   0.2s
[CV] END ....................................C=10, gamma=0.1; total time=   0.2s
[CV] END ....................................C=10, gamma=0.1; total time=   0.2s
[CV] END ....................................C=10, gamma=0.1; total time=   0.2s
[CV] END ...................................C=10, gamma=0.01; total time=   0.1s
[CV] END ...................................C=10, gamma=0.01; total time=   0.2s
[CV] END ...................................C=10, gamma=0.01; total time=   0.2s
[CV] END ...................................C=10, gamma=0.01; total time=   0.2s
[CV] END ...................................C=10, gamma=0.01; total time=   0.2s
[CV] END ..................................C=10, gamma=0.001; total time=   0.2s
[CV] END ..................................C=10, gamma=0.001; total time=   0.2s
[CV] END ..................................C=10, gamma=0.001; total time=   0.2s
[CV] END ..................................C=10, gamma=0.001; total time=   0.1s
[CV] END ..................................C=10, gamma=0.001; total time=   0.1s
[CV] END .....................................C=100, gamma=1; total time=   0.4s
[CV] END .....................................C=100, gamma=1; total time=   0.4s
[CV] END .....................................C=100, gamma=1; total time=   0.4s
[CV] END .....................................C=100, gamma=1; total time=   0.4s
[CV] END .....................................C=100, gamma=1; total time=   0.4s
[CV] END ...................................C=100, gamma=0.1; total time=   0.4s
[CV] END ...................................C=100, gamma=0.1; total time=   0.4s
[CV] END ...................................C=100, gamma=0.1; total time=   0.5s
[CV] END ...................................C=100, gamma=0.1; total time=   0.4s
[CV] END ...................................C=100, gamma=0.1; total time=   0.4s
[CV] END ..................................C=100, gamma=0.01; total time=   0.2s
[CV] END ..................................C=100, gamma=0.01; total time=   0.2s
[CV] END ..................................C=100, gamma=0.01; total time=   0.3s
[CV] END ..................................C=100, gamma=0.01; total time=   0.2s
[CV] END ..................................C=100, gamma=0.01; total time=   0.2s
[CV] END .................................C=100, gamma=0.001; total time=   0.1s
[CV] END .................................C=100, gamma=0.001; total time=   0.1s
[CV] END .................................C=100, gamma=0.001; total time=   0.1s
[CV] END .................................C=100, gamma=0.001; total time=   0.1s
[CV] END .................................C=100, gamma=0.001; total time=   0.2s
GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100],
                         'gamma': [1, 0.1, 0.01, 0.001]},
             verbose=2)
clf2.fit(X_train, y_train)
RandomForestClassifier(n_estimators=200, random_state=1)
clf2.score(X_train, y_train)
1.0
Step 6.0 Train accuracy & Predict
clf1.fit(X_train, y_train)                 # Logistic Regression
pred_clf1 = clf1.predict(pred_9)

clf2.fit(X_train, y_train)                 # Random Forest
pred_clf2 = clf2.predict(pred_9)

clf3.fit(X_train, y_train)                 # naive Bayes
pred_clf3 = clf3.predict(pred_9)

clf4.fit(X_train, y_train)                 # svc
pred_clf4 = clf4.predict(pred_9)

clf5.fit(X_train, y_train)                 # Decision Tree
pred_clf5 = clf5.predict(pred_9)

clf6.fit(X_train, y_train)                 # KNN
pred_clf6 = clf6.predict(pred_9)

ensemble = grid.predict(pred_9)
# Calculate train scores

clf_1_score = round(clf1.score(X_train, y_train),3)  # Logistic Regression
clf_2_score = round(clf2.score(X_train, y_train),3)  # Random Forest
clf_3_score = round(clf3.score(X_train, y_train),3)  # naive Bayes
clf_4_score = round(clf4.score(X_train, y_train),3)  # SVC score
clf_5_score = round(clf5.score(X_train, y_train),3)  # Decision Tree
clf_6_score = round(clf6.score(X_train, y_train),3)  # KNN

ensemble_train_acc = round(grid.score(X_train,y_train),3)
6.1 Load and send submission file
predictions = best_clf.predict(pred_9)

submission2 = pd.DataFrame({
        "DS_id": pred_2["DS_ID"],
        "BEAST MODE": predictions
        })

submission2.to_csv('BestScore.csv', index=False)
submission = pd.DataFrame({
        "DS_id": pred_2["DS_ID"],
        "Linear Regression": results_6,
        "Linear Regression ver2": response,
        "Extra Tree": results_3,
        "XG Boost": xg_pred_1,
        "CatBoost": results_8,
        "Ensemble Logistic Regression": pred_clf1,
        "Ensemble Random Forest": pred_clf2,
        "Ensemble naive Bayes": pred_clf3,     
        "Ensemble SVC": pred_clf4,
        "Ensemble Decision Tree": pred_clf5,
        "Ensemble KNN": pred_clf6,
        "Ensemble": ensemble
        })

submission.to_csv('DSCI508_FinalPred_Fossi_Jeff.csv', index=False)
Step 7. Summary
print('                Left Union Model results')
print('=====================================================')
print('Original Data Percentage who left              ', Original_data_percentage_left)
print('Original Data Percentage who stayed            ', 1.0-Original_data_percentage_left)
print('Linear Regression                              ', lr_score)
print('Linear Regression ver2 R-square                ', r2)
print('Extra Tree Classifier                          ', extra_tree_score)
print('XG Boost                                       ', xg_score)
print('Cat Boost                                      ', catboost_score)
print('----------------------------------------------------')
print('Ensemble Logistic Regression TEST & TRAIN Score', clf_1_score) 
print('Ensemble Random Forest TEST & TRAIN Score      ', clf_2_score) 
print('Ensemble naive Bayes TEST & TRAIN Score        ', clf_3_score) 
print('Ensemble SVC TEST & TRAIN Score                ', clf_4_score) 
print('Ensemble Decision Tree TEST & TRAIN Score      ', clf_5_score) 
print('Ensemble KNN TEST & TRAIN Score                ', clf_6_score) 
print('=====================================================')
print('Ensemble combined score                        ', ensemble_train_acc)
                Left Union Model results
=====================================================
Original Data Percentage who left               0.364
Original Data Percentage who stayed             0.636
Linear Regression                               0.297
Linear Regression ver2 R-square                 0.309
Extra Tree Classifier                           0.734
XG Boost                                        0.781
Cat Boost                                       0.848
----------------------------------------------------
Ensemble Logistic Regression TEST & TRAIN Score 0.796
Ensemble Random Forest TEST & TRAIN Score       1.0
Ensemble naive Bayes TEST & TRAIN Score         0.711
Ensemble SVC TEST & TRAIN Score                 0.801
Ensemble Decision Tree TEST & TRAIN Score       0.8
Ensemble KNN TEST & TRAIN Score                 0.791
=====================================================
Ensemble combined score                         0.796
Step 8.0 References
https://scikit-learn.org/stable/modules/ensemble.html

https://www.kaggle.com/gaganmaahi224/telco-customer-churn-prediction-with-11-ml-algos

https://aegis4048.github.io/mutiple_linear_regression_and_visualization_in_python

https://www.scikit-yb.org/en/latest/api/regressor/residuals.html