-
Notifications
You must be signed in to change notification settings - Fork 14
/
Utils.py
143 lines (125 loc) · 5.2 KB
/
Utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# -*- coding: utf-8 -*-
"""
Created on Wed May 17 23:07:54 2017
@author: vrtjso
"""
import numpy as np
import pandas as pd
import random
from operator import le, eq
from sklearn.decomposition import PCA
from sklearn import model_selection, preprocessing
def RMSLE(y, yfit):
    """Root Mean Squared Logarithmic Error between targets ``y`` and predictions ``yfit``.

    Accepts any equal-length array-likes; returns a float.
    Vectorized with numpy instead of the original Python loop, and uses
    ``log1p`` which is more accurate than ``log(x + 1)`` for small x.
    """
    y = np.asarray(y, dtype=float)
    yfit = np.asarray(yfit, dtype=float)
    return float(np.sqrt(np.mean((np.log1p(yfit) - np.log1p(y)) ** 2)))
#Objective function used for xgb
def objective(yfit, dtrain):
    """Custom squared-log-error objective for xgboost.

    Computes the per-row gradient ``g`` and hessian ``h`` of
    ``(log(yfit + 1) - log(y + 1)) ** 2`` with respect to ``yfit``.

    yfit: numpy array of raw predictions.
    dtrain: object exposing ``get_label()`` (an xgboost DMatrix).
    Returns (g, h) as numpy arrays.
    """
    y = dtrain.get_label()
    # Shared sub-expression of both derivatives; the dead loop-based
    # implementation that used to live here has been removed.
    log_diff = np.log(yfit + 1) - np.log(y + 1)
    g = 2 * log_diff / (yfit + 1)
    h = (2 - 2 * log_diff) / ((yfit + 1) ** 2)
    return g, h
#Metric used for xgb cv
def eval_metric(yfit, dtrain):
    """RMSLE evaluation metric for xgboost cv; returns ('error', score)."""
    labels = dtrain.get_label()
    return 'error', RMSLE(labels, yfit)
def CreateOutput(prediction):
    """Write a Kaggle submission file.

    Reads the row ids from ``test.csv``, attaches ``prediction`` as the
    ``price_doc`` column and writes ``Submission.csv`` without an index.
    """
    # Only the id column is needed; usecols avoids parsing the whole file.
    output = pd.read_csv('test.csv', usecols=['id'])
    output['price_doc'] = prediction
    output.to_csv('Submission.csv', index=False)
#load data
def loadTraindata(takeLog=True):
    """Load the engineered training set from ``train_featured.csv``.

    Returns ``(Ytrain, Xtrain)`` as numpy arrays: the ``price_doc`` target
    and the feature matrix with the target and weight column ``w`` dropped.

    NOTE(review): ``takeLog`` is accepted for interface compatibility but
    is currently unused -- the target is returned untransformed. Confirm
    whether a log transform of the target was intended here.
    """
    filename = 'train_featured.csv'
    rawDf = pd.read_csv(filename)
    Ytrain = rawDf['price_doc'].values
    # axis passed by keyword: positional axis was removed in pandas 2.0
    Xtrain = rawDf.drop(['price_doc', 'w'], axis=1).values
    return Ytrain, Xtrain
def loadTestdata():
    """Load the engineered test features from ``test_featured.csv`` as a numpy array."""
    frame = pd.read_csv('test_featured.csv')
    return frame.values
#load random small part of data for fast model testing
def loadSample(n=300):
    """Load a random subsample of ``n`` training rows for fast model testing.

    Picks ``size - n`` random 1-based row indices to skip (row 0 is the
    header and is always kept) so only ``n`` data rows are parsed.
    Returns ``(Ytrain, Xtrain)`` with the ``log_price`` target split out.
    Note: the file is read twice -- once just to count its rows.
    """
    filename = 'train_featured.csv'
    size = pd.read_csv(filename).shape[0]
    skip = sorted(random.sample(range(1, size + 1), size - n))
    rawDf = pd.read_csv(filename, skiprows=skip)
    Ytrain = rawDf['log_price'].values
    # axis passed by keyword: positional axis was removed in pandas 2.0
    Xtrain = rawDf.drop(['log_price'], axis=1).values
    return Ytrain, Xtrain
#Used to undersample strange values
def sample_vals(df, price_value, ratio, condition):
    """Undersample rows whose ``price_doc`` satisfies ``condition`` vs ``price_value``.

    Only rows with ``product_type == 0`` are eligible for resampling.
    A random fraction ``ratio`` of the matching rows is kept; every other
    row passes through untouched. Returns the combined DataFrame.
    """
    mask = condition(df.price_doc, price_value) & (df.product_type == 0)
    kept_matches = df.loc[mask].sample(frac=ratio)
    untouched = df.loc[~mask]
    return pd.concat([kept_matches, untouched], axis=0)
#Encoding dummy variables
def Encoding(TestEncoding = True):
    """One-hot encode the categorical columns of train.csv / test.csv.

    Drops id-like and low-variance columns, dummy-encodes every
    object-dtype column (``sub_area`` keeps all levels, the rest drop the
    first level), additionally treats the numeric ``material`` column as
    categorical, and writes the result to ``test_encoded.csv`` /
    ``train_encoded.csv``.
    """
    filename = 'test.csv' if TestEncoding else 'train.csv'
    rawDf = pd.read_csv(filename)
    # Drop id columns with no predictive use
    rawDf = rawDf.drop(["id","ID_metro","ID_railroad_station_walk","ID_railroad_station_avto",
                        "ID_big_road1", "ID_big_road2", "ID_railroad_terminal", "ID_bus_terminal"], axis=1)
    # Drop variables with small variance
    rawDf = rawDf.drop(["culture_objects_top_25_raion","oil_chemistry_raion","railroad_terminal_raion",
                        "nuclear_reactor_raion", "build_count_foam", "big_road1_1line","railroad_1line",
                        "office_sqm_500", "trc_sqm_500", "cafe_count_500_price_4000", "cafe_count_500_price_high",
                        "mosque_count_500", "leisure_count_500", "office_sqm_1000", "trc_sqm_1000",
                        "cafe_count_1000_price_high", "mosque_count_1000", "cafe_count_1500_price_high",
                        "mosque_count_1500", "cafe_count_2000_price_high"], axis=1)
    result = rawDf
    for i in range(1, rawDf.shape[1]):  # start at 1: do not encode timestamp
        # .iloc replaces the .ix accessor (removed in pandas 1.0);
        # plain `object` replaces np.object (removed in numpy 1.24)
        if rawDf.iloc[:, i].dtype == object:
            varName = rawDf.columns[i]
            if varName == 'sub_area':
                # keep every sub_area level
                dummy_ranks = pd.get_dummies(rawDf[varName], prefix = varName)
            else:
                dummy_ranks = pd.get_dummies(rawDf[varName], prefix = varName, drop_first=True)
            result = pd.concat([result, dummy_ranks], axis=1)
            result = result.drop(varName, axis=1)
    varName = 'material'  # special case: numeric codes that are really categories
    dummy_ranks = pd.get_dummies(rawDf[varName], prefix = varName)
    result = pd.concat([result, dummy_ranks], axis=1)
    result = result.drop(varName, axis=1)
    outputFile = 'test_encoded.csv' if TestEncoding else 'train_encoded.csv'
    result.to_csv(outputFile, index=False)
#Use PCA to merge highly correlated features belonging to the same family
def FeatureCombination(Df, s='', num_feature=2):
    """Collapse a family of correlated columns into ``num_feature`` PCA components.

    All columns whose name starts with prefix ``s`` are imputed, scaled,
    and projected with PCA; the components are appended to ``Df`` as
    ``s_1 .. s_num_feature`` and the original columns are dropped.
    Mutates ``Df`` in place (all object-dtype columns get label-encoded)
    and also returns it.
    """
    # sklearn is already a dependency of this module;
    # Imputer was removed in sklearn 0.22 in favor of SimpleImputer.
    from sklearn.impute import SimpleImputer

    feature_set = [c for c in Df.columns if c.startswith(s)]
    print('combining', len(feature_set), 'features')
    # Label-encode object columns BEFORE extracting the values; the
    # original extracted first, so string-valued features in the family
    # would have reached the imputer/scaler as raw objects.
    for c in Df.columns:
        if Df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(Df[c].values))
            Df[c] = lbl.transform(list(Df[c].values))
    data = Df[feature_set].values
    data = SimpleImputer().fit_transform(data)
    data = preprocessing.scale(data)
    pca = PCA(num_feature)
    pca.fit(data)
    print('explained_variance_ratio_:', pca.explained_variance_ratio_)
    trans = pca.transform(data)
    for i in range(num_feature):
        Df[s + '_%d' % (i + 1)] = trans[:, i]
    Df.drop(feature_set, axis=1, inplace=True)
    return Df