-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexp_analysis.py
182 lines (145 loc) · 5.78 KB
/
exp_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
from utils import *
import pandas as pd
import pdb
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
from classification import train_RF
import time
from sklearn.decomposition import PCA
import seaborn as sns
data_file_name_europe = "data/creditcard.csv"
data_file_name_paysim = "data/paysim_data_600.csv"
NUM_OF_TREES = 250
DEBUG = False
def hist_of_utils(data):
"""
This function takes the entire dataframe and attaches a new pandas column to it containing the utility
which is defined as the amount of the transaction * the probability of a transaction being fraud.
The function also print a histogram of the utility of all transactions.
param data: Pandas frame containing all the features and the label of the data
return void:
"""
print('Training RF')
clf = train_RF(data, NUM_OF_TREES)
print('Attaching proba to dataset')
probs_test = clf.predict_proba(data.drop(columns = ['Class'], axis = 1))[:,1]
data['Utility'] = probs_test * data['Amount']
bins = [0.0, 0.01, 1.0, 10.0, 100.0, 1000.0, 10000.0]
#hist = np.histogram(data['Utility'], bins = bins)
print(data['Utility'].value_counts(bins = bins))
sum = data['Utility'].value_counts(bins = bins).sum()
print('Capturing ', sum, ' / ', len(data))
return
def hist_of_amounts(data):
'''
This function takes the features of the data and creates a histogram of it
:param data: Pandas frame containing all the features of the data
'''
amount_s = data['Amount']
amounts_of_trans = data['Amount'].to_numpy()
#hist = np.histogram(amounts_of_trans, bins = 10)
import pdb; pdb.set_trace()
print('Length of dataframe', len(data))
plt.hist(amounts_of_trans, bins = 100)
plt.title("Histogram of amounts of transactions within the range of 0.00 - 100,000")
plt.xlabel('Amount of transactions in Euros')
plt.ylabel('Frequency')
plt.savefig('results/amounts_of_transactions_paysim.png')
return
def obtain_stats_of_features(features, labels):
'''
This function prints standard statistics about the data set such as the mean amount + SD, maximum and minimum amount and the start and end time.
:param features: Pandas dataframe containing the features of the data
:param labels: Pandas Series containing the labels of the data features
'''
std = features['Amount'].std()
min = features['Amount'].min()
mean = features['Amount'].mean()
max = features['Amount'].max()
min_time = features['Time'].min()
max_time = features['Time'].max()
print('standard deviation = ', std)
print('lowest transaction = ', min)
print('mean = ', mean)
print('highest transaction = ', max)
print('start time = ', min_time)
print('end time = ', max_time)
def visualize_pca(features, labels):
'''
This function applies PCA and plots the data along the principle components
:param features: Pandas dataframe containing the features of the data
:param labels: Pandas Series containing the labels of the data
'''
pca = PCA()
Xt = pca.fit_transform(features)
plot = plt.scatter(Xt[:,0], Xt[:,1], c=labels)
plt.legend(handles=plot.legend_elements()[0], labels = list(labels))
plt.savefig('results/pca_plot_paysim.png')
return
def plot_explained_variance(df):
'''
This function shows the explained variance per pca principle component
:param df: Dataframe containing the features of the data
'''
features_train = StandardScaler().fit_transform(df)
pca = PCA()
principalComponents = pca.fit_transform(features_train)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.savefig('results/pca_explained_variance_paysim.png')
return
def delete_outliers(df):
'''
This function delete the outliers based on the amount of the transactions
:param df: Pandas Dataframe containing the features of the data
'''
df = df[df['Amount'] < 10000]
return df
def hist_fraud_time(df):
'''
This function plots the number of fraud transactions over time
:param df: Pandas dataframe containing the features of the data. This df should also contain the class labels.
'''
#Filter data on frauds
fraud_df = df[df['Class'] == 1]
max_time = df['Time'].max()
range_length = 10
time_ticks = np.arange(0, max_time, 20)
spacing = int(max_time / range_length)
bins = np.linspace(0,max_time, num = spacing)
cum_fraud = fraud_df['Time'].value_counts(bins = bins, sort = False)
cum_fraud.plot(xticks=[])
#ax = cum_fraud.plot.hist(bins=len(cum_fraud))
plt.xlabel('Time')
plt.xticks(time_ticks)
plt.ylabel('Frequency of fraud transactions')
plt.title('Number of fraud transactions over time')
plt.savefig('results/fraud_over_time_paysim.png')
def main():
df = read_csv_as_pd(data_file_name_paysim)
df = preprocess_paysim(df)
if DEBUG:
df = df.sample( n = 100)
features = df.drop(columns = ['Class'], axis = 1)
labels = df['Class']
#Either one of the lines below should be uncommented to visualize part of the exploratory analysis.
#visualize_pca(features, labels)
#plot_explained_variance(features)
#df_small_amount = df[df['Amount'] < 100000]
#df_small_amount = df[df['Amount'] < 500]
#hist_of_amounts(df_small_amount)
#hist_fraud_time(df)
#obtain_stats_of_features(features, labels)
#new_df = delete_outliers(df)
#hist_of_utils(df)
if __name__ == "__main__":
st = time.time()
main()
run_time = time.time() - st
print('execution time = ', run_time, 'seconds')