-
Notifications
You must be signed in to change notification settings - Fork 0
/
Union Python
1623 lines (1398 loc) · 81.4 KB
/
Union Python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import rfpimp
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from yellowbrick.regressor import ResidualsPlot
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import decomposition
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder #allows transform to 1, 0
from mpl_toolkits.mplot3d import Axes3D
from statsmodels.graphics.gofplots import qqplot
from sklearn.metrics import classification_report, recall_score, confusion_matrix, roc_auc_score, precision_score, f1_score, roc_curve, auc, plot_confusion_matrix,plot_roc_curve
1.0 Exploratory data analysis
# load custom data set & predict data.
union_train_1 = pd.read_csv(r'JEFFREY-train.csv')
pred = pd.read_csv(r'comp_pred.csv')
union_train_1.sample(5)
ID gender Management USAcitizen Married MonthsInUnion ContinuingEd StateOfResidence Connectivity FeatureA ... FeatureC FeatureD FeatureE FeatureF DuesFrequency PaperlessBilling PaymentMethod MonthlyDues TotalDues LeftUnion
1770 Jeffrey508-19480 Female 1 Yes No 27 Yes MO Fiber optic Yes ... No No No No Month-to-month Yes Bank transfer (automatic) 75.20 1929.35 Yes
2437 Jeffrey2226-19369 Male 1 Yes No 49 Yes IL Fiber optic No ... No No No Yes Month-to-month No Credit card (automatic) 90.05 4547.25 Yes
978 Jeffrey1429-18544 Male 0 Yes Yes 29 Yes MO No Maryville ... Maryville Maryville Maryville Maryville Month-to-month Yes Mailed check 20.65 654.85 No
174 Jeffrey1066-13859 Male 0 No Yes 3 Yes MO No Maryville ... Maryville Maryville Maryville Maryville Month-to-month No Bank transfer (automatic) 20.20 50.6 No
2199 Jeffrey1962-19351 Male 0 Yes Yes 50 Yes IL DSL No ... Yes Yes Yes No One year Yes Electronic check 69.65 3442.15 No
5 rows × 21 columns
pred.sample(2)
DS_ID gender Management USAcitizen Married MonthsInUnion ContinuingEd StateOfResidence Connectivity FeatureA FeatureB FeatureC FeatureD FeatureE FeatureF DuesFrequency PaperlessBilling PaymentMethod MonthlyDues TotalDues
569 10569 Female 0 No No 11 No MO DSL No Yes Yes Yes No No One year Yes Credit card (automatic) 40.40 422.6
834 10834 Female 1 No No 4 Yes MO Fiber optic No No No No Yes No Month-to-month Yes Electronic check 78.85 292.8
# Review sample statistics
union_train_1.describe()
Management MonthsInUnion MonthlyDues
count 2999.000000 2999.000000 2999.000000
mean 0.156052 34.500834 63.754652
std 0.362965 48.716480 30.274755
min 0.000000 0.000000 18.700000
25% 0.000000 9.000000 30.500000
50% 0.000000 29.000000 69.700000
75% 0.000000 55.000000 89.475000
max 1.000000 917.000000 118.750000
union_train_1['Married'].value_counts()
No 2106
Yes 893
Name: Married, dtype: int64
fig, ax = plt.subplots(1, 2, figsize=(10,5))
x=union_train_1['Married'].value_counts().index
y=union_train_1['Married'].value_counts().values.tolist()
data = union_train_1.groupby("Married").size()
sns.set(style="dark", color_codes=True)
pal = sns.color_palette("YlGnBu", len(data))
rank = data.argsort().argsort()
sns.barplot(x=x,y=y,palette=np.array(pal[::-1])[rank],ax = ax[0])
for p in ax[0].patches:
ax[0].annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.3, p.get_height()),
ha='center', va='bottom',
color= 'black')
ax[0].set_xlabel('Married Yes/No', weight='semibold', fontname = 'monospace')
ax[1].pie(y, labels = x, colors = pal, autopct='%1.1f%%',
explode=[0.03 for i in union_train_1['Married'].value_counts().index])
plt.legend(bbox_to_anchor=(1, 1))
plt.suptitle ('Married Yes/No',weight = 'bold')
plt.show()
union_train_1['Connectivity'].value_counts()
Fiber optic 1277
DSL 1042
No 680
Name: Connectivity, dtype: int64
fig, ax = plt.subplots(1, 2, figsize=(10,5))
x=union_train_1['Connectivity'].value_counts().index
y=union_train_1['Connectivity'].value_counts().values.tolist()
data = union_train_1.groupby("Connectivity").size()
sns.set(style="dark", color_codes=True)
pal = sns.color_palette("YlGnBu", len(data))
rank = data.argsort().argsort()
sns.barplot(x=x,y=y,palette=np.array(pal[::-1])[rank],ax = ax[0])
for p in ax[0].patches:
ax[0].annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.3, p.get_height()),
ha='center', va='bottom',
color= 'black')
ax[0].set_xlabel('Connectivity Yes/No', weight='semibold', fontname = 'monospace')
ax[1].pie(y, labels = x, colors = pal, autopct='%1.1f%%',
explode=[0.03 for i in union_train_1['Connectivity'].value_counts().index])
plt.legend(bbox_to_anchor=(1, 1))
plt.suptitle ('Connectivity Yes/No',weight = 'bold')
plt.show()
union_train_1['PaperlessBilling'].value_counts()
Yes 1761
No 1238
Name: PaperlessBilling, dtype: int64
fig, ax = plt.subplots(1, 2, figsize=(10,5))
x=union_train_1['PaperlessBilling'].value_counts().index
y=union_train_1['PaperlessBilling'].value_counts().values.tolist()
data = union_train_1.groupby("PaperlessBilling").size()
sns.set(style="dark", color_codes=True)
pal = sns.color_palette("YlGnBu", len(data))
rank = data.argsort().argsort()
sns.barplot(x=x,y=y,palette=np.array(pal[::-1])[rank],ax = ax[0])
for p in ax[0].patches:
ax[0].annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.3, p.get_height()),
ha='center', va='bottom',
color= 'black')
ax[0].set_xlabel('PaperlessBilling Yes/No', weight='semibold', fontname = 'monospace')
ax[1].pie(y, labels = x, colors = pal, autopct='%1.1f%%',
explode=[0.03 for i in union_train_1['PaperlessBilling'].value_counts().index])
plt.legend(bbox_to_anchor=(1, 1))
plt.suptitle ('PaperlessBilling Yes/No',weight = 'bold')
plt.show()
union_train_1['PaymentMethod'].value_counts()
Electronic check 1001
Mailed check 702
Bank transfer (automatic) 667
Credit card (automatic) 629
Name: PaymentMethod, dtype: int64
fig, ax = plt.subplots(1, 2, figsize=(20,10))
x=union_train_1['PaymentMethod'].value_counts().index
y=union_train_1['PaymentMethod'].value_counts().values.tolist()
data = union_train_1.groupby("PaymentMethod").size()
sns.set(style="dark", color_codes=True)
pal = sns.color_palette("YlGnBu", len(data))
rank = data.argsort().argsort()
sns.barplot(x=x,y=y,palette=np.array(pal[::-1])[rank],ax = ax[0])
for p in ax[0].patches:
ax[0].annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.3, p.get_height()),
ha='center', va='bottom',
color= 'black')
ax[0].set_xlabel('PaymentMethod Yes/No', weight='semibold', fontname = 'monospace')
ax[1].pie(y, labels = x, colors = pal, autopct='%1.1f%%',
explode=[0.03 for i in union_train_1['PaymentMethod'].value_counts().index])
plt.legend(bbox_to_anchor=(1, 1))
plt.suptitle ('PaymentMethod Yes/No',weight = 'bold')
plt.show()
union_train_1['LeftUnion'].value_counts()
No 2198
Yes 801
Name: LeftUnion, dtype: int64
Original_data_percentage_left = round(801/2198,3)
Original_data_percentage_left
0.364
fig, ax = plt.subplots(1, 2, figsize=(10,5))
x=union_train_1['LeftUnion'].value_counts().index
y=union_train_1['LeftUnion'].value_counts().values.tolist()
data = union_train_1.groupby("LeftUnion").size()
sns.set(style="dark", color_codes=True)
pal = sns.color_palette("YlGnBu", len(data))
rank = data.argsort().argsort()
sns.barplot(x=x,y=y,palette=np.array(pal[::-1])[rank],ax = ax[0])
for p in ax[0].patches:
ax[0].annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.3, p.get_height()),
ha='center', va='bottom',
color= 'black')
ax[0].set_xlabel('Left Union Yes/No', weight='semibold', fontname = 'monospace')
ax[1].pie(y, labels = x, colors = pal, autopct='%1.1f%%',
explode=[0.03 for i in union_train_1['LeftUnion'].value_counts().index])
plt.legend(bbox_to_anchor=(1, 1))
plt.suptitle ('Left Union Yes/No',weight = 'bold')
plt.show()
sns.histplot(x = union_train_1['MonthsInUnion'],kde = True)
plt.show()
There appears to be some outliers. I'll explore further with a violin plot. I suspect outliers beyond 400-500 months of union membership. This will need further investigation. USA citizen does not appear relevant in the decision to leave the union.
ax = sns.catplot(x="MonthsInUnion", y="USAcitizen", hue="LeftUnion", kind="violin",
split=True, palette="pastel", data=union_train_1, height=4.2, aspect=1.4)
sns.histplot(x = union_train_1['MonthlyDues'],kde = True)
plt.show()
g = sns.FacetGrid(union_train_1, row='Management', col="gender", hue="LeftUnion", height=3.5)
g.map(plt.scatter, "MonthsInUnion", "MonthlyDues", alpha=0.6)
g.add_legend();
Gender doesnt affect Left Union rate. The amount of monthly Dues and Months in Union do appear to play a role.
ax = sns.catplot(x="Connectivity", y="MonthlyDues", hue="LeftUnion", kind="violin",
split=True, palette="pastel", data=union_train_1, height=4.2, aspect=1.4)
fig, ax=plt.subplots(figsize=(10,5))
sns.countplot(data = union_train_1, x='StateOfResidence', order=union_train_1['StateOfResidence'].value_counts().index, palette='viridis', hue='LeftUnion')
plt.xticks(rotation=90)
plt.xlabel('State', fontsize=10, fontweight='bold')
plt.ylabel('Members', fontsize=10, fontweight='bold')
plt.title('State wise Members', fontsize=12, fontweight='bold')
plt.show()
Step 2.0 Data cleansing
union_train_1.shape
(2999, 21)
union_train_1.columns
Index(['ID', 'gender', 'Management', 'USAcitizen', 'Married', 'MonthsInUnion',
'ContinuingEd', 'StateOfResidence', 'Connectivity', 'FeatureA',
'FeatureB', 'FeatureC', 'FeatureD', 'FeatureE', 'FeatureF',
'DuesFrequency', 'PaperlessBilling', 'PaymentMethod', 'MonthlyDues',
'TotalDues', 'LeftUnion'],
dtype='object')
union_train_1['MonthsInUnion'].unique() ## watch for outliers
array([ 1, 2, 3, 4, 5, 6, 8, 7, 9, 11, 10, 12, 15,
13, 16, 17, 14, 18, 20, 19, 21, 23, 22, 24, 27, 26,
25, 28, 29, 30, 33, 32, 31, 35, 34, 37, 36, 38, 40,
42, 43, 44, 39, 41, 48, 46, 45, 49, 52, 50, 47, 51,
55, 53, 54, 58, 57, 56, 62, 60, 61, 59, 64, 63, 69,
67, 65, 68, 70, 71, 66, 72, 744, 614, 602, 827, 917, 658,
782, 731, 899, 788, 0], dtype=int64)
union_train_1['PaymentMethod'].value_counts()
Electronic check 1001
Mailed check 702
Bank transfer (automatic) 667
Credit card (automatic) 629
Name: PaymentMethod, dtype: int64
# check for nulls
union_train_1.isnull().sum()
ID 0
gender 0
Management 0
USAcitizen 0
Married 0
MonthsInUnion 0
ContinuingEd 0
StateOfResidence 0
Connectivity 0
FeatureA 0
FeatureB 0
FeatureC 0
FeatureD 0
FeatureE 0
FeatureF 0
DuesFrequency 0
PaperlessBilling 0
PaymentMethod 0
MonthlyDues 0
TotalDues 0
LeftUnion 0
dtype: int64
union_train_1.describe
<bound method NDFrame.describe of ID gender Management USAcitizen Married MonthsInUnion \
0 Jeffrey866-18344 Male 0 Yes Yes 1
1 Jeffrey1948-13509 Male 0 No No 1
2 Jeffrey1080-19415 Female 0 Yes Yes 1
3 Jeffrey1957-19631 Female 0 No No 1
4 Jeffrey1170-16685 Male 0 No No 1
... ... ... ... ... ... ...
2994 Jeffrey939-15618 Female 0 Yes Yes 0
2995 Jeffrey1280-19245 Male 0 No Yes 0
2996 Jeffrey1345-13328 Female 0 Yes Yes 0
2997 Jeffrey1405-13244 Male 0 No Yes 0
2998 Jeffrey1600-19062 Female 0 Yes Yes 0
ContinuingEd StateOfResidence Connectivity FeatureA ... FeatureC \
0 Yes MO No Maryville ... Maryville
1 Yes MO No Maryville ... Maryville
2 Yes MO No Maryville ... Maryville
3 Yes MO No Maryville ... Maryville
4 Yes MO No Maryville ... Maryville
... ... ... ... ... ... ...
2994 Yes MO No Maryville ... Maryville
2995 Yes MO No Maryville ... Maryville
2996 Yes IL DSL No ... Yes
2997 Yes IL DSL Yes ... No
2998 Yes MO DSL Yes ... Yes
FeatureD FeatureE FeatureF DuesFrequency PaperlessBilling \
0 Maryville Maryville Maryville One year No
1 Maryville Maryville Maryville Month-to-month Yes
2 Maryville Maryville Maryville Month-to-month Yes
3 Maryville Maryville Maryville Month-to-month No
4 Maryville Maryville Maryville Month-to-month No
... ... ... ... ... ...
2994 Maryville Maryville Maryville Two year No
2995 Maryville Maryville Maryville Two year No
2996 Yes Yes No Two year No
2997 Yes No No Two year Yes
2998 No Yes Yes Two year No
PaymentMethod MonthlyDues TotalDues LeftUnion
0 Mailed check 18.80 18.8 No
1 Mailed check 18.85 18.85 Yes
2 Mailed check 19.00 19 No
3 Mailed check 19.15 19.15 No
4 Electronic check 19.20 19.2 No
... ... ... ... ...
2994 Mailed check 20.00 No
2995 Mailed check 20.25 No
2996 Mailed check 73.35 No
2997 Bank transfer (automatic) 61.90 No
2998 Mailed check 80.85 No
[2999 rows x 21 columns]>
# One hot encoding for gender
dummies = pd.get_dummies(union_train_1.gender)
dummies_pred = pd.get_dummies(pred.gender)
dummies.head(3)
Female Male
0 0 1
1 0 1
2 1 0
pred.sample()
DS_ID gender Management USAcitizen Married MonthsInUnion ContinuingEd StateOfResidence Connectivity FeatureA FeatureB FeatureC FeatureD FeatureE FeatureF DuesFrequency PaperlessBilling PaymentMethod MonthlyDues TotalDues
53 10053 Female 1 Yes No 8 Yes IL Fiber optic No Yes No No No No Month-to-month Yes Credit card (automatic) 80.65 633.3
union_train_3 = pd.concat([union_train_1, dummies], axis='columns')
union_train_3 = union_train_3.drop('gender',axis='columns')
pred_1 = pd.concat([pred, dummies_pred], axis = 'columns')
pred_1 = pred_1.drop('gender',axis='columns')
pred_1.head(3)
DS_ID Management USAcitizen Married MonthsInUnion ContinuingEd StateOfResidence Connectivity FeatureA FeatureB ... FeatureD FeatureE FeatureF DuesFrequency PaperlessBilling PaymentMethod MonthlyDues TotalDues Female Male
0 10000 0 Yes No 1 No MO DSL No Yes ... No No No Month-to-month Yes Electronic check 29.85 29.85 1 0
1 10001 0 No No 34 Yes MO DSL Yes No ... No No No One year No Mailed check 56.95 1889.50 0 1
2 10002 0 No No 2 Yes MO DSL Yes Yes ... No No No Month-to-month Yes Mailed check 53.85 108.15 0 1
3 rows × 21 columns
# One hot encoding StateOfResidence
dummies = pd.get_dummies(union_train_3.StateOfResidence)
pred_dummies = pd.get_dummies(pred_1.StateOfResidence)
union_train_4 = pd.concat([union_train_3, dummies], axis='columns')
union_train_4 = union_train_4.drop('StateOfResidence',axis='columns')
pred_2 = pd.concat([pred_1, pred_dummies], axis='columns')
pred_2 = pred_2.drop('StateOfResidence',axis='columns')
pred_2.head()
DS_ID Management USAcitizen Married MonthsInUnion ContinuingEd Connectivity FeatureA FeatureB FeatureC ... FeatureF DuesFrequency PaperlessBilling PaymentMethod MonthlyDues TotalDues Female Male IL MO
0 10000 0 Yes No 1 No DSL No Yes No ... No Month-to-month Yes Electronic check 29.85 29.85 1 0 0 1
1 10001 0 No No 34 Yes DSL Yes No Yes ... No One year No Mailed check 56.95 1889.50 0 1 0 1
2 10002 0 No No 2 Yes DSL Yes Yes No ... No Month-to-month Yes Mailed check 53.85 108.15 0 1 0 1
3 10003 0 No No 45 No DSL Yes No Yes ... No One year No Bank transfer (automatic) 42.30 1840.75 0 1 0 1
4 10004 0 No No 2 Yes Fiber optic No No No ... No Month-to-month Yes Electronic check 70.70 151.65 1 0 0 1
5 rows × 22 columns
# One hot encoding DuesFrequency
dummies = pd.get_dummies(union_train_4.DuesFrequency)
pred_dummies = pd.get_dummies(pred_2.DuesFrequency)
union_train_5 = pd.concat([union_train_4, dummies], axis='columns')
union_train_5 = union_train_5.drop('DuesFrequency',axis='columns')
pred_3 = pd.concat([pred_2, pred_dummies], axis='columns')
pred_3 = pred_3.drop('DuesFrequency',axis='columns')
# One hot encoding PaymentMethod
dummies = pd.get_dummies(union_train_5.PaymentMethod)
pred_dummies = pd.get_dummies(pred_3.PaymentMethod)
union_train_6 = pd.concat([union_train_5, dummies], axis='columns')
union_train_6 = union_train_6.drop('PaymentMethod',axis='columns')
pred_4 = pd.concat([pred_3, pred_dummies], axis='columns')
pred_4 = pred_4.drop('PaymentMethod',axis='columns')
# One hot encoding features a-e
features = ['FeatureA', 'FeatureB', 'FeatureC', 'FeatureD', 'FeatureE', 'FeatureF']
for f in features:
print(f'{f:>25} has {union_train_6[f].nunique()} unique values')
FeatureA has 3 unique values
FeatureB has 3 unique values
FeatureC has 3 unique values
FeatureD has 3 unique values
FeatureE has 3 unique values
FeatureF has 3 unique values
union_train_6 = pd.get_dummies(union_train_6, columns=features, prefix_sep='_', drop_first=True)
union_train_6
ID Management USAcitizen Married MonthsInUnion ContinuingEd Connectivity PaperlessBilling MonthlyDues TotalDues ... FeatureB_No FeatureB_Yes FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes
0 Jeffrey866-18344 0 Yes Yes 1 Yes No No 18.80 18.8 ... 0 0 0 0 0 0 0 0 0 0
1 Jeffrey1948-13509 0 No No 1 Yes No Yes 18.85 18.85 ... 0 0 0 0 0 0 0 0 0 0
2 Jeffrey1080-19415 0 Yes Yes 1 Yes No Yes 19.00 19 ... 0 0 0 0 0 0 0 0 0 0
3 Jeffrey1957-19631 0 No No 1 Yes No No 19.15 19.15 ... 0 0 0 0 0 0 0 0 0 0
4 Jeffrey1170-16685 0 No No 1 Yes No No 19.20 19.2 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2994 Jeffrey939-15618 0 Yes Yes 0 Yes No No 20.00 ... 0 0 0 0 0 0 0 0 0 0
2995 Jeffrey1280-19245 0 No Yes 0 Yes No No 20.25 ... 0 0 0 0 0 0 0 0 0 0
2996 Jeffrey1345-13328 0 Yes Yes 0 Yes DSL No 73.35 ... 0 1 0 1 0 1 0 1 1 0
2997 Jeffrey1405-13244 0 No Yes 0 Yes DSL Yes 61.90 ... 0 1 1 0 0 1 1 0 1 0
2998 Jeffrey1600-19062 0 Yes Yes 0 Yes DSL No 80.85 ... 0 1 0 1 1 0 0 1 0 1
2999 rows × 34 columns
pred_4 = pd.get_dummies(pred_4, columns=features, prefix_sep='_', drop_first=True)
pred_4
DS_ID Management USAcitizen Married MonthsInUnion ContinuingEd Connectivity PaperlessBilling MonthlyDues TotalDues ... FeatureB_No FeatureB_Yes FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes
0 10000 0 Yes No 1 No DSL Yes 29.85 29.85 ... 0 1 1 0 1 0 1 0 1 0
1 10001 0 No No 34 Yes DSL No 56.95 1889.50 ... 1 0 0 1 1 0 1 0 1 0
2 10002 0 No No 2 Yes DSL Yes 53.85 108.15 ... 0 1 1 0 1 0 1 0 1 0
3 10003 0 No No 45 No DSL No 42.30 1840.75 ... 1 0 0 1 0 1 1 0 1 0
4 10004 0 No No 2 Yes Fiber optic Yes 70.70 151.65 ... 1 0 1 0 1 0 1 0 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 10995 0 No No 35 Yes Fiber optic No 85.30 2917.50 ... 0 1 1 0 0 1 1 0 1 0
996 10996 0 No No 34 Yes Fiber optic Yes 70.00 2416.10 ... 1 0 1 0 1 0 1 0 1 0
997 10997 1 No No 4 Yes Fiber optic Yes 94.30 424.45 ... 1 0 1 0 1 0 0 1 0 1
998 10998 0 Yes No 72 Yes No No 20.70 1492.10 ... 0 0 0 0 0 0 0 0 0 0
999 10999 0 No No 2 Yes DSL No 70.30 132.40 ... 1 0 1 0 0 1 0 1 0 1
1000 rows × 33 columns
# drop ID [& features a - f]
union_train_7 = union_train_6.drop(['ID'],axis='columns') # , 'FeatureA', 'FeatureB', 'FeatureC', 'FeatureD', 'FeatureE', 'FeatureF'],axis='columns')
pred_5 = pred_4.drop(['DS_ID'],axis='columns') #, 'FeatureA', 'FeatureB', 'FeatureC', 'FeatureD', 'FeatureE', 'FeatureF'],axis='columns')
I chose to drop the ID columns, and Feature A - Feature F. These do not appear to be relevent features in the model.
# convert USAcitizen, Married, ContinuingEd, PaperLessBilling, LeftUnion to 1,0 using sklearn preprocessing
lb = LabelEncoder()
union_train_7['USAcitizen'] = lb.fit_transform(union_train_7['USAcitizen'])
union_train_7['Married'] = lb.fit_transform(union_train_7['Married'])
union_train_7['ContinuingEd'] = lb.fit_transform(union_train_7['ContinuingEd'])
union_train_7['PaperlessBilling'] = lb.fit_transform(union_train_7['PaperlessBilling'])
union_train_7['LeftUnion'] = lb.fit_transform(union_train_7['LeftUnion'])
union_train_7.head()
Management USAcitizen Married MonthsInUnion ContinuingEd Connectivity PaperlessBilling MonthlyDues TotalDues LeftUnion ... FeatureB_No FeatureB_Yes FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes
0 0 1 1 1 1 No 0 18.80 18.8 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 1 1 No 1 18.85 18.85 1 ... 0 0 0 0 0 0 0 0 0 0
2 0 1 1 1 1 No 1 19.00 19 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 1 1 No 0 19.15 19.15 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 1 1 No 0 19.20 19.2 0 ... 0 0 0 0 0 0 0 0 0 0
5 rows × 33 columns
lb_pred = LabelEncoder()
pred_5['USAcitizen'] = lb_pred.fit_transform(pred_5['USAcitizen'])
pred_5['Married'] = lb_pred.fit_transform(pred_5['Married'])
pred_5['ContinuingEd'] = lb_pred.fit_transform(pred_5['ContinuingEd'])
pred_5['PaperlessBilling'] = lb_pred.fit_transform(pred_5['PaperlessBilling'])
pred_5.head()
Management USAcitizen Married MonthsInUnion ContinuingEd Connectivity PaperlessBilling MonthlyDues TotalDues Female ... FeatureB_No FeatureB_Yes FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes
0 0 1 0 1 0 DSL 1 29.85 29.85 1 ... 0 1 1 0 1 0 1 0 1 0
1 0 0 0 34 1 DSL 0 56.95 1889.50 0 ... 1 0 0 1 1 0 1 0 1 0
2 0 0 0 2 1 DSL 1 53.85 108.15 0 ... 0 1 1 0 1 0 1 0 1 0
3 0 0 0 45 0 DSL 0 42.30 1840.75 0 ... 1 0 0 1 0 1 1 0 1 0
4 0 0 0 2 1 Fiber optic 1 70.70 151.65 1 ... 1 0 1 0 1 0 1 0 1 0
5 rows × 32 columns
# Check column order train data
union_train_7.columns
Index(['Management', 'USAcitizen', 'Married', 'MonthsInUnion', 'ContinuingEd',
'Connectivity', 'PaperlessBilling', 'MonthlyDues', 'TotalDues',
'LeftUnion', 'Female', 'Male', 'IL', 'MO', 'Month-to-month', 'One year',
'Two year', 'Bank transfer (automatic)', 'Credit card (automatic)',
'Electronic check', 'Mailed check', 'FeatureA_No', 'FeatureA_Yes',
'FeatureB_No', 'FeatureB_Yes', 'FeatureC_No', 'FeatureC_Yes',
'FeatureD_No', 'FeatureD_Yes', 'FeatureE_No', 'FeatureE_Yes',
'FeatureF_No', 'FeatureF_Yes'],
dtype='object')
# check column order pred data
pred_5.columns
Index(['Management', 'USAcitizen', 'Married', 'MonthsInUnion', 'ContinuingEd',
'Connectivity', 'PaperlessBilling', 'MonthlyDues', 'TotalDues',
'Female', 'Male', 'IL', 'MO', 'Month-to-month', 'One year', 'Two year',
'Bank transfer (automatic)', 'Credit card (automatic)',
'Electronic check', 'Mailed check', 'FeatureA_No', 'FeatureA_Yes',
'FeatureB_No', 'FeatureB_Yes', 'FeatureC_No', 'FeatureC_Yes',
'FeatureD_No', 'FeatureD_Yes', 'FeatureE_No', 'FeatureE_Yes',
'FeatureF_No', 'FeatureF_Yes'],
dtype='object')
# reorganize columns train data, move predictor to end
clist = ['Management', 'USAcitizen', 'Married', 'MonthsInUnion', 'ContinuingEd',
'Connectivity', 'PaperlessBilling', 'MonthlyDues', 'TotalDues',
'LeftUnion', 'Female', 'Male', 'IL', 'MO', 'Month-to-month', 'One year',
'Two year', 'Bank transfer (automatic)', 'Credit card (automatic)',
'Electronic check', 'Mailed check', 'FeatureA_No', 'FeatureA_Yes',
'FeatureB_No', 'FeatureB_Yes', 'FeatureC_No', 'FeatureC_Yes',
'FeatureD_No', 'FeatureD_Yes', 'FeatureE_No', 'FeatureE_Yes',
'FeatureF_No', 'FeatureF_Yes']
union_train_8 = union_train_7[clist]
union_train_8.head()
Management USAcitizen Married MonthsInUnion ContinuingEd Connectivity PaperlessBilling MonthlyDues TotalDues LeftUnion ... FeatureB_No FeatureB_Yes FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes
0 0 1 1 1 1 No 0 18.80 18.8 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 1 1 No 1 18.85 18.85 1 ... 0 0 0 0 0 0 0 0 0 0
2 0 1 1 1 1 No 1 19.00 19 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 1 1 No 0 19.15 19.15 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 1 1 No 0 19.20 19.2 0 ... 0 0 0 0 0 0 0 0 0 0
5 rows × 33 columns
# categorize connectivity column
union_train_8.loc[(union_train_8['Connectivity'] == 'Fiber optic') | (union_train_8['Connectivity'] == 'DSL'), 'connectivity'] = 1
union_train_8.loc[(union_train_8['Connectivity'] != 'Fiber optic') & (union_train_8['Connectivity'] != 'DSL'), 'connectivity'] = 0
union_train_9 = union_train_8.drop(['Connectivity'],axis='columns')
union_train_9.head()
Management USAcitizen Married MonthsInUnion ContinuingEd PaperlessBilling MonthlyDues TotalDues LeftUnion Female ... FeatureB_Yes FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes connectivity
0 0 1 1 1 1 0 18.80 18.8 0 0 ... 0 0 0 0 0 0 0 0 0 0.0
1 0 0 0 1 1 1 18.85 18.85 1 0 ... 0 0 0 0 0 0 0 0 0 0.0
2 0 1 1 1 1 1 19.00 19 0 1 ... 0 0 0 0 0 0 0 0 0 0.0
3 0 0 0 1 1 0 19.15 19.15 0 1 ... 0 0 0 0 0 0 0 0 0 0.0
4 0 0 0 1 1 0 19.20 19.2 0 0 ... 0 0 0 0 0 0 0 0 0 0.0
5 rows × 33 columns
# categorized pred data Connectivity 1,0
pred_5.loc[(pred_5['Connectivity'] == 'Fiber optic') | (pred_5['Connectivity'] == 'DSL'), 'connectivity'] = 1
pred_5.loc[(pred_5['Connectivity'] != 'Fiber optic') & (pred_5['Connectivity'] != 'DSL'), 'connectivity'] = 0
pred_6 = pred_5.drop(['Connectivity'],axis='columns')
pred_6.head()
Management USAcitizen Married MonthsInUnion ContinuingEd PaperlessBilling MonthlyDues TotalDues Female Male ... FeatureB_Yes FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes connectivity
0 0 1 0 1 0 1 29.85 29.85 1 0 ... 1 1 0 1 0 1 0 1 0 1.0
1 0 0 0 34 1 0 56.95 1889.50 0 1 ... 0 0 1 1 0 1 0 1 0 1.0
2 0 0 0 2 1 1 53.85 108.15 0 1 ... 1 1 0 1 0 1 0 1 0 1.0
3 0 0 0 45 0 0 42.30 1840.75 0 1 ... 0 0 1 0 1 1 0 1 0 1.0
4 0 0 0 2 1 1 70.70 151.65 1 0 ... 0 1 0 1 0 1 0 1 0 1.0
5 rows × 32 columns
union_train_9['TotalDues'] = pd.to_numeric(union_train_9['TotalDues'], downcast="float", errors='coerce')
# check for nan
union_train_9.isnull().sum()
Management 0
USAcitizen 0
Married 0
MonthsInUnion 0
ContinuingEd 0
PaperlessBilling 0
MonthlyDues 0
TotalDues 7
LeftUnion 0
Female 0
Male 0
IL 0
MO 0
Month-to-month 0
One year 0
Two year 0
Bank transfer (automatic) 0
Credit card (automatic) 0
Electronic check 0
Mailed check 0
FeatureA_No 0
FeatureA_Yes 0
FeatureB_No 0
FeatureB_Yes 0
FeatureC_No 0
FeatureC_Yes 0
FeatureD_No 0
FeatureD_Yes 0
FeatureE_No 0
FeatureE_Yes 0
FeatureF_No 0
FeatureF_Yes 0
connectivity 0
dtype: int64
# callable function to find nans(nulls)
def nans(df): return df[df.isnull().any(axis=1)]
nans(union_train_9)
Management USAcitizen Married MonthsInUnion ContinuingEd PaperlessBilling MonthlyDues TotalDues LeftUnion Female ... FeatureB_Yes FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes connectivity
2992 0 1 1 0 0 1 52.55 NaN 0 1 ... 0 0 1 0 1 0 1 1 0 1.0
2993 0 1 1 0 1 0 25.35 NaN 0 0 ... 0 0 0 0 0 0 0 0 0 0.0
2994 0 1 1 0 1 0 20.00 NaN 0 1 ... 0 0 0 0 0 0 0 0 0 0.0
2995 0 0 1 0 1 0 20.25 NaN 0 0 ... 0 0 0 0 0 0 0 0 0 0.0
2996 0 1 1 0 1 0 73.35 NaN 0 1 ... 1 0 1 0 1 0 1 1 0 1.0
2997 0 0 1 0 1 1 61.90 NaN 0 0 ... 1 1 0 0 1 1 0 1 0 1.0
2998 0 1 1 0 1 0 80.85 NaN 0 1 ... 1 0 1 1 0 0 1 0 1 1.0
7 rows × 33 columns
# drop 7 rows with null values
union_train_9 = union_train_9.drop(labels=[2992,2993,2994,2995,2996,2997,2998], axis=0)
I chose to drop 7 rows of data that has null values for total dues. Due to the size of the data, this will have minimal effect on the outcome.
# REMOVE call nans function to check prediction file for nulls
nans(pred_6)
Management USAcitizen Married MonthsInUnion ContinuingEd PaperlessBilling MonthlyDues TotalDues Female Male ... FeatureB_Yes FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes connectivity
0 rows × 32 columns
# REMOVE
pred_6.drop(pred_6.index[1000:2999], inplace=True)
# REMOVE
nans(pred_6)
Management USAcitizen Married MonthsInUnion ContinuingEd PaperlessBilling MonthlyDues TotalDues Female Male ... FeatureB_Yes FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes connectivity
0 rows × 32 columns
pred_6['TotalDues'] = pd.to_numeric(pred_6['TotalDues'], downcast= "integer", errors='coerce')
pred_6['connectivity'] = pd.to_numeric(pred_6['connectivity'], downcast= "integer", errors='coerce')
pred_6['Management'] = pd.to_numeric(pred_6['Management'], downcast= "integer", errors='coerce')
pred_6['Female'] = pd.to_numeric(pred_6['Female'], downcast= "integer", errors='coerce')
pred_6['Male'] = pd.to_numeric(pred_6['Male'], downcast= "integer", errors='coerce')
pred_6.dtypes
Management int8
USAcitizen int32
Married int32
MonthsInUnion int64
ContinuingEd int32
PaperlessBilling int32
MonthlyDues float64
TotalDues float64
Female int8
Male int8
IL uint8
MO uint8
Month-to-month uint8
One year uint8
Two year uint8
Bank transfer (automatic) uint8
Credit card (automatic) uint8
Electronic check uint8
Mailed check uint8
FeatureA_No uint8
FeatureA_Yes uint8
FeatureB_No uint8
FeatureB_Yes uint8
FeatureC_No uint8
FeatureC_Yes uint8
FeatureD_No uint8
FeatureD_Yes uint8
FeatureE_No uint8
FeatureE_Yes uint8
FeatureF_No uint8
FeatureF_Yes uint8
connectivity int8
dtype: object
pred_6.head()
Management USAcitizen Married MonthsInUnion ContinuingEd PaperlessBilling MonthlyDues TotalDues Female Male ... FeatureB_Yes FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes connectivity
0 0 1 0 1 0 1 29.85 29.85 1 0 ... 1 1 0 1 0 1 0 1 0 1
1 0 0 0 34 1 0 56.95 1889.50 0 1 ... 0 0 1 1 0 1 0 1 0 1
2 0 0 0 2 1 1 53.85 108.15 0 1 ... 1 1 0 1 0 1 0 1 0 1
3 0 0 0 45 0 0 42.30 1840.75 0 1 ... 0 0 1 0 1 1 0 1 0 1
4 0 0 0 2 1 1 70.70 151.65 1 0 ... 0 1 0 1 0 1 0 1 0 1
5 rows × 32 columns
union_train_9.dtypes
# union_train_9 = map(float, union_train_9)
# union_train_9
Management int64
USAcitizen int32
Married int32
MonthsInUnion int64
ContinuingEd int32
PaperlessBilling int32
MonthlyDues float64
TotalDues float32
LeftUnion int32
Female uint8
Male uint8
IL uint8
MO uint8
Month-to-month uint8
One year uint8
Two year uint8
Bank transfer (automatic) uint8
Credit card (automatic) uint8
Electronic check uint8
Mailed check uint8
FeatureA_No uint8
FeatureA_Yes uint8
FeatureB_No uint8
FeatureB_Yes uint8
FeatureC_No uint8
FeatureC_Yes uint8
FeatureD_No uint8
FeatureD_Yes uint8
FeatureE_No uint8
FeatureE_Yes uint8
FeatureF_No uint8
FeatureF_Yes uint8
connectivity float64
dtype: object
# convert values to int
union_train_9['Management'] = pd.to_numeric(union_train_9['Management'], downcast= "integer", errors='coerce')
union_train_9['connectivity'] = pd.to_numeric(union_train_9['connectivity'], downcast= "integer", errors='coerce')
union_train_9['USAcitizen'] = pd.to_numeric(union_train_9['USAcitizen'], downcast= "integer", errors='coerce')
union_train_9['Married'] = pd.to_numeric(union_train_9['Married'], downcast= "integer", errors='coerce')
union_train_9['ContinuingEd'] = pd.to_numeric(union_train_9['ContinuingEd'], downcast= "integer", errors='coerce')
union_train_9['PaperlessBilling'] = pd.to_numeric(union_train_9['PaperlessBilling'], downcast= "integer", errors='coerce')
union_train_9.dtypes
Management int8
USAcitizen int8
Married int8
MonthsInUnion int64
ContinuingEd int8
PaperlessBilling int8
MonthlyDues float64
TotalDues float32
LeftUnion int32
Female uint8
Male uint8
IL uint8
MO uint8
Month-to-month uint8
One year uint8
Two year uint8
Bank transfer (automatic) uint8
Credit card (automatic) uint8
Electronic check uint8
Mailed check uint8
FeatureA_No uint8
FeatureA_Yes uint8
FeatureB_No uint8
FeatureB_Yes uint8
FeatureC_No uint8
FeatureC_Yes uint8
FeatureD_No uint8
FeatureD_Yes uint8
FeatureE_No uint8
FeatureE_Yes uint8
FeatureF_No uint8
FeatureF_Yes uint8
connectivity int8
dtype: object
# check pred file columns
pred_6.columns
Index(['Management', 'USAcitizen', 'Married', 'MonthsInUnion', 'ContinuingEd',
'PaperlessBilling', 'MonthlyDues', 'TotalDues', 'Female', 'Male', 'IL',
'MO', 'Month-to-month', 'One year', 'Two year',
'Bank transfer (automatic)', 'Credit card (automatic)',
'Electronic check', 'Mailed check', 'FeatureA_No', 'FeatureA_Yes',
'FeatureB_No', 'FeatureB_Yes', 'FeatureC_No', 'FeatureC_Yes',
'FeatureD_No', 'FeatureD_Yes', 'FeatureE_No', 'FeatureE_Yes',
'FeatureF_No', 'FeatureF_Yes', 'connectivity'],
dtype='object')
union_train_9.columns
Index(['Management', 'USAcitizen', 'Married', 'MonthsInUnion', 'ContinuingEd',
'PaperlessBilling', 'MonthlyDues', 'TotalDues', 'LeftUnion', 'Female',
'Male', 'IL', 'MO', 'Month-to-month', 'One year', 'Two year',
'Bank transfer (automatic)', 'Credit card (automatic)',
'Electronic check', 'Mailed check', 'FeatureA_No', 'FeatureA_Yes',
'FeatureB_No', 'FeatureB_Yes', 'FeatureC_No', 'FeatureC_Yes',
'FeatureD_No', 'FeatureD_Yes', 'FeatureE_No', 'FeatureE_Yes',
'FeatureF_No', 'FeatureF_Yes', 'connectivity'],
dtype='object')
# reorganize pred columns
pred_clist = ['Management', 'USAcitizen', 'Married', 'MonthsInUnion', 'ContinuingEd',
'PaperlessBilling', 'MonthlyDues', 'TotalDues', 'Female', 'Male', 'IL',
'MO', 'Month-to-month', 'One year', 'Two year',
'Bank transfer (automatic)', 'Credit card (automatic)',
'Electronic check', 'Mailed check', 'FeatureA_No', 'FeatureA_Yes',
'FeatureB_No', 'FeatureB_Yes', 'FeatureC_No', 'FeatureC_Yes',
'FeatureD_No', 'FeatureD_Yes', 'FeatureE_No', 'FeatureE_Yes',
'FeatureF_No', 'FeatureF_Yes', 'connectivity']
pred_7 = pred_6[pred_clist]
pred_7.sample()
Management USAcitizen Married MonthsInUnion ContinuingEd PaperlessBilling MonthlyDues TotalDues Female Male ... FeatureB_Yes FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes connectivity
615 0 0 0 15 1 0 48.85 631.4 1 0 ... 0 1 0 1 0 1 0 1 0 1
1 rows × 32 columns
# organize column position
clist = ['Management', 'USAcitizen', 'Married', 'MonthsInUnion', 'ContinuingEd',
'PaperlessBilling', 'MonthlyDues', 'TotalDues', 'Female', 'Male', 'IL',
'MO', 'Month-to-month', 'One year', 'Two year',
'Bank transfer (automatic)', 'Credit card (automatic)',
'Electronic check', 'Mailed check', 'FeatureA_No', 'FeatureA_Yes',
'FeatureB_No', 'FeatureB_Yes', 'FeatureC_No', 'FeatureC_Yes',
'FeatureD_No', 'FeatureD_Yes', 'FeatureE_No', 'FeatureE_Yes',
'FeatureF_No', 'FeatureF_Yes', 'connectivity', 'LeftUnion']
union_train_9 = union_train_9[clist]
union_train_9.head()
Management USAcitizen Married MonthsInUnion ContinuingEd PaperlessBilling MonthlyDues TotalDues Female Male ... FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes connectivity LeftUnion
0 0 1 1 1 1 0 18.80 18.799999 0 1 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 1 1 1 18.85 18.850000 0 1 ... 0 0 0 0 0 0 0 0 0 1
2 0 1 1 1 1 1 19.00 19.000000 1 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 1 1 0 19.15 19.150000 1 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 1 1 0 19.20 19.200001 0 1 ... 0 0 0 0 0 0 0 0 0 0
5 rows × 33 columns
Step 3.0 Outlier Analysis Quantitative
# Adapted from https://www.kaggle.com/gaganmaahi224/telco-customer-churn-prediction-with-11-ml-algos
x = ['MonthsInUnion','MonthlyDues', 'TotalDues']
def count_outliers(data,col):
q1 = data[col].quantile(0.25,interpolation='nearest')
q2 = data[col].quantile(0.5,interpolation='nearest')
q3 = data[col].quantile(0.75,interpolation='nearest')
q4 = data[col].quantile(1,interpolation='nearest')
IQR = q3 -q1
global LLP
global ULP
LLP = q1 - 1.5*IQR
ULP = q3 + 1.5*IQR
if data[col].min() > LLP and data[col].max() < ULP:
print("No outliers in",i)
else:
print("There are outliers in",i)
x = data[data[col]<LLP][col].size
y = data[data[col]>ULP][col].size
a.append(i)
print('Count of outliers are:',x+y)
global a
a = []
for i in x:
count_outliers(union_train_9,i)
There are outliers in MonthsInUnion
Count of outliers are: 10
No outliers in MonthlyDues
There are outliers in TotalDues
Count of outliers are: 11
3.1 Outlier Analysis Visualization
The code above indicates outliers in MonthsInUnion and TotalDues. I will explore furthur with visualization.
def Box_plots(df):
plt.figure(figsize=(10, 4))
plt.title("Box Plot")
sns.boxplot(df)
plt.show()
Box_plots(union_train_9['MonthsInUnion'])
def hist_plots(df):
plt.figure(figsize=(10, 4))
plt.hist(df)
plt.title("Histogram Plot")
plt.show()
hist_plots(union_train_9['MonthsInUnion'])
def scatter_plots(df1,df2):
fig, ax = plt.subplots(figsize=(10,4))
ax.scatter(df1,df2)
ax.set_xlabel('MonthsInUnion')
ax.set_ylabel('MonthlyDues')
plt.title("MonthlyDues")
plt.show()
scatter_plots(union_train_9['MonthlyDues'],union_train_9['MonthsInUnion'])
def dist_plots(df):
plt.figure(figsize=(10, 4))
sns.distplot(df)
plt.title("Distribution plot")
sns.despine()
plt.show()
dist_plots(union_train_9['MonthsInUnion'])
def qq_plots(df):
plt.figure(figsize=(10, 4))
qqplot(df,line='s')
plt.title("Normal QQPlot")
plt.show()
qq_plots(union_train_9['MonthsInUnion'])
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
<Figure size 720x288 with 0 Axes>
3.2 Manual outlier adjustments
Make adjustments to Monthly Dues, and Total Dues. Based upon the facts Monthly dues average for the data set = $64, and I am limiting months in union to approx 40 years or approx. 500 months. Monthly Dues and Total dues do not appear to make sense mathematically.
Average retirment age in the U.S. is 61. Considering most data science members would have 4 plus years of higher education, I will limit the months in union to those members who have less than 500 months in the union.
## ADJUST OR REMOVE
union_train_9.query('MonthsInUnion > 500')
Management USAcitizen Married MonthsInUnion ContinuingEd PaperlessBilling MonthlyDues TotalDues Female Male ... FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes connectivity LeftUnion
2982 0 1 1 744 1 1 19.20 14284.799805 0 1 ... 0 0 0 0 0 0 0 0 0 0
2983 0 0 0 614 0 0 35.75 21950.500000 0 1 ... 0 1 0 1 1 0 1 0 1 0
2984 1 1 0 602 1 0 60.00 36120.000000 0 1 ... 1 0 1 0 1 0 1 0 1 0
2985 0 0 0 827 1 1 59.80 49454.601562 0 1 ... 1 0 0 1 0 1 1 0 1 1
2986 1 1 0 917 1 1 74.45 68270.648438 1 0 ... 1 0 1 0 1 0 1 0 1 1
2987 0 0 0 658 1 1 104.95 69057.101562 0 1 ... 0 1 1 0 0 1 0 1 1 0
2988 0 0 0 782 1 1 93.40 73038.796875 1 0 ... 0 1 1 0 0 1 1 0 1 0
2989 0 1 1 731 1 1 103.20 75439.203125 1 0 ... 0 1 1 0 0 1 0 1 1 0
2990 0 1 0 899 1 1 84.95 76370.046875 1 0 ... 1 0 0 1 0 1 0 1 1 0
2991 0 0 0 788 1 1 104.50 82346.000000 0 1 ... 0 1 1 0 0 1 0 1 1 1
10 rows × 33 columns
## ADJUST OR REMOVE
union_train_9 = union_train_9.query('MonthsInUnion < 500')
sns.histplot(x = union_train_9['MonthsInUnion'],kde = True)
plt.show()
Check monthly and yearly dues for outliers
sns.histplot(x = union_train_9['MonthlyDues'],kde = True)
plt.show()
sns.boxplot(union_train_9['MonthlyDues'])
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
<AxesSubplot:xlabel='MonthlyDues'>
## ADJUST OR REMOVE
union_train_9.query('MonthlyDues > 110')
Management USAcitizen Married MonthsInUnion ContinuingEd PaperlessBilling MonthlyDues TotalDues Female Male ... FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes connectivity LeftUnion
1515 0 0 0 12 1 1 112.95 1384.750000 0 1 ... 0 1 0 1 0 1 0 1 1 1
1901 0 1 0 21 1 1 111.20 2317.100098 0 1 ... 0 1 0 1 0 1 0 1 1 1
2045 1 0 0 27 1 1 110.50 2857.600098 1 0 ... 0 1 1 0 0 1 0 1 1 0
2280 0 0 0 34 1 0 116.25 3899.050049 0 1 ... 0 1 0 1 0 1 0 1 1 0
2292 0 0 1 34 1 1 116.15 3946.899902 1 0 ... 0 1 0 1 0 1 0 1 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2977 0 1 1 72 1 1 115.80 8476.500000 1 0 ... 0 1 0 1 0 1 0 1 1 0
2978 0 1 0 72 1 1 117.15 8529.500000 1 0 ... 0 1 0 1 0 1 0 1 1 0
2979 0 0 0 72 1 1 118.20 8547.150391 0 1 ... 0 1 0 1 0 1 0 1 1 0
2980 0 1 0 71 1 1 116.25 8564.750000 0 1 ... 0 1 0 1 0 1 0 1 1 0
2981 0 1 1 72 1 1 118.75 8672.450195 1 0 ... 0 1 0 1 0 1 0 1 1 0
87 rows × 33 columns
## ADJUST OR REMOVE
union_train_9 = union_train_9.query('MonthlyDues < 110')
sns.histplot(x = union_train_9['TotalDues'],kde = True)
plt.show()
sns.boxplot(union_train_9['TotalDues'])
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
<AxesSubplot:xlabel='TotalDues'>
union_train_9.describe()
Management USAcitizen Married MonthsInUnion ContinuingEd PaperlessBilling MonthlyDues TotalDues Female Male ... FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes connectivity LeftUnion
count 2893.000000 2893.000000 2893.000000 2893.000000 2893.000000 2893.000000 2893.000000 2893.000000 2893.000000 2893.000000 ... 2893.000000 2893.000000 2893.000000 2893.000000 2893.000000 2893.000000 2893.000000 2893.000000 2893.000000 2893.000000
mean 0.155894 0.470446 0.296578 31.238161 0.900449 0.580712 62.230954 2078.132568 0.509160 0.490840 ... 0.453163 0.313170 0.495334 0.270999 0.415831 0.350501 0.408918 0.357414 0.766333 0.268925
std 0.362817 0.499212 0.456828 24.080381 0.299452 0.493528 29.439395 2078.714111 0.500003 0.500003 ... 0.497888 0.463863 0.500065 0.444552 0.492950 0.477209 0.491719 0.479321 0.423236 0.443478
min 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 18.700000 18.799999 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 9.000000 1.000000 0.000000 29.100000 372.450012 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000
50% 0.000000 0.000000 0.000000 27.000000 1.000000 1.000000 69.000000 1284.199951 1.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000
75% 0.000000 1.000000 1.000000 53.000000 1.000000 1.000000 87.250000 3355.649902 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
max 1.000000 1.000000 1.000000 72.000000 1.000000 1.000000 109.950000 8129.299805 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
8 rows × 33 columns
union_train_9.query('TotalDues > 4000')
Management USAcitizen Married MonthsInUnion ContinuingEd PaperlessBilling MonthlyDues TotalDues Female Male ... FeatureC_No FeatureC_Yes FeatureD_No FeatureD_Yes FeatureE_No FeatureE_Yes FeatureF_No FeatureF_Yes connectivity LeftUnion
2307 0 1 1 43 1 1 91.25 4013.800049 0 1 ... 0 1 0 1 1 0 1 0 1 0
2308 0 1 0 42 1 1 94.40 4014.600098 0 1 ... 0 1 1 0 1 0 0 1 1 0
2309 0 1 1 52 1 0 79.20 4016.300049 0 1 ... 0 1 1 0 0 1 0 1 1 0
2310 0 1 1 42 1 0 97.10 4016.750000 0 1 ... 1 0 1 0 0 1 0 1 1 0
2311 0 0 0 45 1 1 89.30 4016.850098 1 0 ... 1 0 1 0 0 1 1 0 1 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2941 1 1 0 71 1 1 109.70 7904.250000 1 0 ... 0 1 0 1 0 1 0 1 1 0
2942 1 1 0 72 1 1 109.55 7920.700195 0 1 ... 0 1 1 0 0 1 0 1 1 0
2943 0 1 1 70 1 1 108.15 7930.549805 1 0 ... 0 1 1 0 0 1 0 1 1 0
2950 0 1 0 72 1 1 108.50 8003.799805 1 0 ... 0 1 0 1 0 1 0 1 1 0
2960 0 1 1 72 1 0 109.70 8129.299805 1 0 ... 0 1 1 0 0 1 0 1 1 0
591 rows × 33 columns
##ADJUST OR REMOVE
union_train_9 = union_train_9.query('TotalDues < 4000')
sns.boxplot(union_train_9['TotalDues'])
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
<AxesSubplot:xlabel='TotalDues'>
3.3 Scale non-binary columns
cols_to_scale = ['MonthlyDues', 'TotalDues', 'MonthsInUnion']
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
union_train_9[cols_to_scale] = scaler.fit_transform(union_train_9[cols_to_scale])