-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
118 lines (80 loc) · 5.11 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
# Names of the dataset columns that are treated as numeric during cleaning.
numerical_columns = [
    "Age",
    "Annual_Income",
    "Monthly_Inhand_Salary",
    "Num_Bank_Accounts",
    "Num_Credit_Card",
    "Interest_Rate",
    "Num_of_Loan",
    "Delay_from_due_date",
    "Num_of_Delayed_Payment",
    "Changed_Credit_Limit",
    "Num_Credit_Inquiries",
    "Outstanding_Debt",
    "Credit_Utilization_Ratio",
    "Credit_History_Age",
    "Total_EMI_per_month",
    "Amount_invested_monthly",
    "Monthly_Balance",
]
def show_stats(dataset):
    """Print a quick overview of *dataset* and return it unchanged.

    Prints the frame's shape, the number of missing entries per column and
    the dtype of every column.

    Args:
        dataset: The pandas DataFrame to describe.

    Returns:
        The same DataFrame object that was passed in, so the call can be used
        as a step in a ``df = step(df)`` pipeline without losing the data.
    """
    print(dataset.shape)
    # Count missing (NaN/None) entries per column.
    print(dataset.isna().sum())
    # Iterating over ``dtypes`` (rather than ``dataset[col].dtype``) also works
    # when the frame has duplicate column labels.
    for col, dtype in dataset.dtypes.items():
        print(f"Column '{col}' has data type: {dtype}")
    return dataset
def formatting(dataset):
    """Clean the numeric columns of *dataset* and return the cleaned frame.

    Steps, in order:
      1. Strip stray underscores from every numeric column and parse it as a
         number; values that cannot be parsed become NaN.
      2. Keep only the rows whose numeric columns are all ``>= 0``.
      3. Cast the integer-typed columns to ``int``.
      4. Drop duplicate rows.
      5. Print ``describe()``, the list of columns still containing NaN, and
         ``head()`` of the result.

    Only the columns listed in ``numeric_dtypes`` are converted. Every other
    column (identifiers, names, categorical text, the ``Credit_Score`` target)
    is left exactly as read: pushing those through ``pd.to_numeric`` would
    turn all of their values into NaN.

    Args:
        dataset: DataFrame containing at least the columns named in
            ``numeric_dtypes``. The numeric columns are converted in place on
            this object.

    Returns:
        A new DataFrame holding the rows that survived filtering and
        de-duplication.

    Raises:
        KeyError: If one of the numeric columns is missing from *dataset*.
    """
    # Target dtype of each numeric column (same columns as the module-level
    # ``numerical_columns`` list).
    numeric_dtypes = {
        'Age': int,
        'Annual_Income': float,
        'Monthly_Inhand_Salary': float,
        'Num_Bank_Accounts': int,
        'Num_Credit_Card': int,
        'Interest_Rate': int,
        'Num_of_Loan': int,
        'Delay_from_due_date': int,
        'Num_of_Delayed_Payment': int,
        'Changed_Credit_Limit': float,
        'Num_Credit_Inquiries': int,
        'Outstanding_Debt': float,
        'Credit_Utilization_Ratio': float,
        'Credit_History_Age': int,
        'Total_EMI_per_month': float,
        'Amount_invested_monthly': float,
        'Monthly_Balance': float,
    }
    numeric_columns = list(numeric_dtypes)

    # Remove underscores from numerical columns, then parse them. Going
    # through ``astype(str)`` keeps this working for columns that are not
    # purely strings; missing values end up as NaN after ``to_numeric``.
    for col in numeric_columns:
        as_text = dataset[col].astype(str).str.replace('_', '', regex=False)
        dataset[col] = pd.to_numeric(as_text, errors='coerce')

    # Remove negative values. NaN compares False against ``>= 0``, so rows
    # with a missing or unparseable value in any numeric column are dropped
    # here as well.
    # NOTE(review): a numeric column that is entirely non-numeric text would
    # therefore remove every row - confirm the raw column formats.
    non_negative = (dataset[numeric_columns] >= 0).all(axis=1)
    dataset = dataset[non_negative]

    # Integer casts are only safe now that no NaN is left in these columns.
    int_columns = {col: int for col, kind in numeric_dtypes.items() if kind is int}
    dataset = dataset.astype(int_columns)

    dataset = dataset.drop_duplicates()

    print(dataset.describe())
    columns_with_nan = dataset.columns[dataset.isna().any()].tolist()
    print(columns_with_nan)
    print(dataset.head())
    return dataset
def interpolate_missing_values(dataset):
    """Fill missing values in selected numeric columns by linear interpolation.

    Each listed column that is present in *dataset* and has a numeric dtype
    is interpolated linearly along the row order. ``limit_direction='both'``
    also fills gaps at the very start and end of a column with the nearest
    valid value. Columns that are absent or non-numeric are skipped.

    NOTE(review): interpolating along row order assumes neighbouring rows are
    related (e.g. consecutive records of the same customer) - confirm this
    holds for the data before relying on the filled values.

    Args:
        dataset: DataFrame to fill; the listed columns are updated in place.

    Returns:
        The same DataFrame object, so the call can be used as a
        ``df = step(df)`` pipeline stage.
    """
    # INTERPOLATION
    columns_to_interpolate = ['Num_of_Delayed_Payment', 'Num_Credit_Inquiries', 'Amount_invested_monthly', 'Monthly_Balance']
    for col in columns_to_interpolate:
        if col not in dataset.columns:
            continue
        # Interpolation is only defined for numeric data; leave text columns alone.
        if not pd.api.types.is_numeric_dtype(dataset[col]):
            continue
        dataset[col] = dataset[col].interpolate(method='linear', limit_direction='both')
    return dataset
def convert_objects_to_labels(dataset):
    """Replace the categorical columns of *dataset* with integer labels.

    Each listed column is encoded independently with scikit-learn's
    ``LabelEncoder`` and overwritten in place.

    Args:
        dataset: DataFrame containing all of the categorical columns listed
            below.

    Returns:
        The same DataFrame object, so the call can be used as a
        ``df = step(df)`` pipeline stage.

    Raises:
        KeyError: If one of the categorical columns is missing.
    """
    # Specify the names of the categorical columns and converting them to labels
    categorical_columns = ["Month", "Occupation", "Type_of_Loan", "Credit_Mix", "Payment_Behaviour", "Payment_of_Min_Amount", "Credit_History_Age"]
    le = preprocessing.LabelEncoder()
    for col_name in categorical_columns:
        column = dataset[col_name]
        if column.dtype == object:
            # An object column can mix strings with NaN for missing entries;
            # casting to str gives the encoder uniformly typed values (missing
            # entries become their own "nan" category).
            column = column.astype(str)
        dataset[col_name] = le.fit_transform(column)
    return dataset
def remove_underscores(dataset):
    """Strip underscores from text-typed numeric columns and parse them.

    For every listed column whose dtype is still ``object``, underscores are
    removed from the values and the result is converted with
    ``pd.to_numeric``; anything that still cannot be parsed becomes NaN.
    Columns that already have a non-object dtype are left untouched.

    Args:
        dataset: DataFrame containing all of the numeric columns listed
            below; converted columns are overwritten in place.

    Returns:
        The same DataFrame object, so the call can be used as a
        ``df = step(df)`` pipeline stage.

    Raises:
        KeyError: If one of the listed columns is missing.
    """
    # Fixing numerical columns that have _ in them
    numerical_columns = ["Age", "Annual_Income", "Monthly_Inhand_Salary", "Num_Bank_Accounts", "Num_Credit_Card", "Interest_Rate", "Num_of_Loan", "Delay_from_due_date", "Num_of_Delayed_Payment", "Changed_Credit_Limit", "Num_Credit_Inquiries", "Outstanding_Debt", "Credit_Utilization_Ratio", "Credit_History_Age", "Total_EMI_per_month", "Amount_invested_monthly", "Monthly_Balance"]
    for col in numerical_columns:
        if dataset[col].dtype == 'object':  # Check if the column contains string values
            # Remove the underscores first: passing "12_" straight to
            # to_numeric(errors='coerce') would discard the value as NaN.
            without_underscores = dataset[col].astype(str).str.replace('_', '', regex=False)
            dataset[col] = pd.to_numeric(without_underscores, errors='coerce')
    return dataset
def define_values(dataset):
    """Build the train/test feature and target arrays from *dataset*.

    The target is the ``Credit_Score`` column. The features are every column
    except the identifier/excluded columns listed below. The data is split
    80/20 with a fixed random seed.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # We are predicting the credit score
    target = dataset['Credit_Score'].values

    # Columns that are not used as predictors of the credit score
    excluded_columns = ['ID', 'Customer_ID', 'Name', 'SSN', 'Credit_Score', 'Month', 'Type_of_Loan', 'Credit_History_Age']
    features = dataset.drop(columns=excluded_columns).values

    # train_test_split yields [X_train, X_test, y_train, y_test]
    return tuple(train_test_split(features, target, test_size=0.2, random_state=0))
def scale(X_train, X_test):
    """Standardize both feature sets using statistics learned from X_train.

    The scaler is fitted on the training features only and then applied to
    both the training and the test features.

    Returns:
        A tuple ``(X_train_scaled, X_test_scaled)``.
    """
    scaler = StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)
def random_forest(X_train, X_test, y_train, y_test):
    """Train a random forest on the training data and report test performance.

    A 100-tree ``RandomForestClassifier`` with a fixed random seed is fitted
    on ``(X_train, y_train)`` and used to predict ``X_test``.

    Returns:
        The text produced by ``classification_report`` comparing ``y_test``
        with the predictions.
    """
    # fit() returns the fitted estimator, so construction and training chain
    model = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
    predictions = model.predict(X_test)
    return classification_report(y_test, predictions)
# Load every column as text so the cleaning steps control all type parsing.
train_df = pd.read_csv('data/train.csv', dtype='object')

# show_stats only prints diagnostics, so train_df must not be rebound from it.
show_stats(train_df)

# Cleaning / encoding pipeline: each step takes the frame and hands it on.
train_df = formatting(train_df)
train_df = interpolate_missing_values(train_df)
train_df = convert_objects_to_labels(train_df)
train_df = remove_underscores(train_df)

# Split, standardize, then train and evaluate the classifier.
X_train, X_test, y_train, y_test = define_values(train_df)
X_train, X_test = scale(X_train, X_test)
print(random_forest(X_train, X_test, y_train, y_test))