-
Notifications
You must be signed in to change notification settings - Fork 0
/
Pump_Classifier.py
97 lines (92 loc) · 3.25 KB
/
Pump_Classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import pandas as pd
from joblib import load
from json import dumps
from numpy import log10
def predict_pump_status(json_input):
    """Predict a pump status from a JSON record and return it as JSON.

    Parameters
    ----------
    json_input : str or file-like
        JSON passed to ``pandas.read_json`` with ``orient='index'``
        (outer keys become row labels, inner keys become columns).
        A literal JSON string is wrapped in ``StringIO`` before parsing;
        anything else is handed to pandas unchanged.

    Returns
    -------
    str
        A JSON document of the form ``{"output": <prediction>}`` holding the
        classifier's prediction for the first row of the input.
    """
    from io import StringIO

    # load encoder and model
    encoder = load('pump_classifier_transformer.joblib')
    wp_classifier = load('pump_classifier_model.joblib')

    # Passing literal JSON text straight to read_json is deprecated in
    # pandas >= 2.1, so wrap it in a file-like object. Strings that do not
    # look like JSON (e.g. a path) are left for pandas to resolve as before.
    if isinstance(json_input, str) and json_input.lstrip().startswith(('{', '[')):
        json_input = StringIO(json_input)

    # convert input to DataFrame (for cleaning function)
    input_df = pd.read_json(json_input, orient='index', convert_dates=False)
    # clean data (for model format)
    cleaned_input_df = clean_pump_data(input_df)
    # encode data
    encoded_input = encoder.transform(cleaned_input_df)
    # generate prediction (first row only)
    input_prediction = wp_classifier.predict(encoded_input)[0]

    # NumPy scalars such as int64/float64 are not JSON serializable;
    # unwrap them to the equivalent built-in Python value first.
    if hasattr(input_prediction, 'item'):
        input_prediction = input_prediction.item()

    # prepare prediction for JSON conversion and serialize it
    wp_predict = {"output": input_prediction}
    return dumps(wp_predict, indent=2)
def custom_pump_impute(dataframe):
    """Replace null values in the pump columns with fixed defaults.

    The frame is modified in place and the same object is returned.
    Each column listed below has its nulls filled with the paired value;
    ``scheme_management`` additionally treats the literal text ``'None'``
    as missing and maps both cases to ``'unknown'``.
    """
    # column name -> value written wherever that column is null
    defaults = dict(
        amount_tsh=1.0,
        gps_height=0,
        basin='Lake Victoria',
        population=0,
        public_meeting='unknown',
        extraction_type_class='gravity',
        management_group='user-group',
        payment_type='never pay',
        water_quality='soft',
        quantity_group='enough',
        source_type='spring',
        waterpoint_type='communal standpipe',
        date_recorded='1850-01-01',
        construction_year=0,
    )

    for column, replacement in defaults.items():
        filled = dataframe[column].fillna(replacement)
        dataframe[column] = filled

    def _scheme_or_unknown(value):
        # real nulls and the string 'None' both count as missing
        if pd.isna(value) or value == 'None':
            return 'unknown'
        return value

    dataframe.scheme_management = dataframe.scheme_management.apply(
        _scheme_or_unknown
    )
    return dataframe
def clean_pump_data(pump_dataframe):
    """Turn raw pump records into the cleaned feature frame.

    Steps, in order: drop the unused columns, fill nulls via
    ``custom_pump_impute``, derive ``pump_age`` from the recorded year and
    the construction year, drop the two source columns, log-scale
    ``amount_tsh`` and ``population``, and stringify ``public_meeting``.

    Returns a new DataFrame; the drop happens before any assignment, so the
    caller's frame is not written to by this function.
    """
    unused_columns = [
        'num_private',
        'subvillage',
        'lga',
        'ward',
        'region',
        'region_code',
        'district_code',
        'scheme_name',
        'extraction_type',
        'extraction_type_group',
        'management',
        'payment',
        'quality_group',
        'quantity',
        'source',
        'source_class',
        'waterpoint_type_group',
        'funder',
        'installer',
        'wpt_name',
        'recorded_by',
        'permit'
    ]

    def _year_of(date_text):
        # the first four characters of the date string are the year
        return int(date_text[:4])

    def _bounded_age(age):
        # ages outside 0..100 are replaced by the sentinel -100
        if age < 0 or age > 100:
            return -100
        return age

    def _log_scale(value):
        # +1 keeps zero inputs finite under log10
        return log10(value + 1)

    features = pump_dataframe.drop(columns=unused_columns)
    features = custom_pump_impute(features)

    # pump age = year the record was taken minus the construction year
    recorded_year = features.date_recorded.apply(_year_of)
    raw_age = recorded_year - features.construction_year
    features['pump_age'] = raw_age.apply(_bounded_age)

    # the two source columns are no longer needed once pump_age exists
    features = features.drop(columns=['date_recorded', 'construction_year'])

    # log transformations
    features.amount_tsh = features.amount_tsh.apply(_log_scale)
    features.population = features.population.apply(_log_scale)

    # public_meeting values (booleans or 'unknown') become plain strings
    features.public_meeting = features.public_meeting.apply(str)
    return features