-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare.py
55 lines (48 loc) · 2.42 KB
/
prepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import wrangle as w
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
import warnings
# NOTE: importing this module installs a global "ignore" filter, so every
# warning category is suppressed from this point on (not only sklearn/pandas).
warnings.filterwarnings("ignore")
def visualize_scaler(scaler, df, columns_to_scale, bins=10):
    """Plot before/after histograms for each column scaled by ``scaler``.

    One row of two histograms is drawn per column: the left panel shows the
    original values from ``df``, the right panel shows the values after
    ``scaler.fit_transform``.

    Parameters
    ----------
    scaler : object
        Any object exposing ``fit_transform``. It is fit on ``df`` here, so
        the instance passed in is modified.
    df : pandas.DataFrame
        Source data; it is copied and left untouched.
    columns_to_scale : sequence of str
        Column labels to scale and plot.
    bins : int, default 10
        Passed through to ``Axes.hist`` for both panels.

    Returns
    -------
    None
        The figure is created through ``plt.subplots`` and is not returned.
    """
    columns = list(columns_to_scale)

    # squeeze=False keeps ``axs`` two-dimensional even for a single column;
    # otherwise the (ax1, ax2) unpacking below fails when len(columns) == 1.
    fig, axs = plt.subplots(len(columns), 2, figsize=(16, 9), squeeze=False)

    df_scaled = df.copy()
    df_scaled[columns] = scaler.fit_transform(df[columns])

    scaler_name = scaler.__class__.__name__
    for (ax1, ax2), col in zip(axs, columns):
        ax1.hist(df[col], bins=bins)
        ax1.set(title=f'{col} before scaling', xlabel=col, ylabel='count')
        ax2.hist(df_scaled[col], bins=bins)
        ax2.set(title=f'{col} after scaling with {scaler_name}', xlabel=col, ylabel='count')

    plt.tight_layout()
    # Intentionally returns nothing (matches the original behaviour).
def scale_data(train,
               validate,
               test,
               columns_to_scale=None,
               return_scaler=False,
               scaler=None):
    """Scale the selected columns of the train, validate, and test splits.

    The scaler is fit on ``train`` only and then applied to all three splits.
    The input frames are not modified; scaled copies are returned.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        The three data splits. Each must contain every column in
        ``columns_to_scale``.
    columns_to_scale : list of str, optional
        Columns to scale. Defaults to
        ``['bedrooms', 'bathrooms', 'tax_amount', 'finished_area']``.
    return_scaler : bool, default False
        If True, the fitted scaler object is returned as the first element.
    scaler : object, optional
        Object exposing ``fit`` and ``transform``. Defaults to a new
        ``QuantileTransformer()``. A supplied instance is fit in place.

    Returns
    -------
    tuple
        ``(train_scaled, validate_scaled, test_scaled)``, or
        ``(scaler, train_scaled, validate_scaled, test_scaled)`` when
        ``return_scaler`` is True.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if columns_to_scale is None:
        columns_to_scale = ['bedrooms', 'bathrooms', 'tax_amount', 'finished_area']
    columns = list(columns_to_scale)

    if scaler is None:
        scaler = QuantileTransformer()

    # Fit on the training split only so validate/test do not leak into it.
    scaler.fit(train[columns])

    def _scaled_copy(split):
        # Return a copy of ``split`` with the chosen columns transformed.
        scaled = split.copy()
        # Assign as a plain array: rows are matched by position, so the
        # split's own index (even with duplicate labels) is preserved and no
        # index re-alignment is needed.
        scaled[columns] = np.asarray(scaler.transform(split[columns]))
        return scaled

    train_scaled = _scaled_copy(train)
    validate_scaled = _scaled_copy(validate)
    test_scaled = _scaled_copy(test)

    if return_scaler:
        return scaler, train_scaled, validate_scaled, test_scaled
    return train_scaled, validate_scaled, test_scaled