-
Notifications
You must be signed in to change notification settings - Fork 6
/
preprocess_data.py
62 lines (48 loc) · 1.88 KB
/
preprocess_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""
Adapted from https://www.kaggle.com/myonin/music-recommendation-random-forest-xgboost
"""
#from sklearn import cross_validation, grid_search, metrics, ensemble
import xgboost as xgb
import numpy as np
import pandas as pd
# Directories the script reads its inputs from and writes its result to.
PATH = "data/"
PATH_OUT = "clean_data/"

# Start from the training table; every later step works on `df`.
df = pd.read_csv(f'{PATH}/train.csv')
# Uncomment to work on a 1% random sample of the rows instead.
# df = df.sample(frac=0.01)

# Attach the song attributes: left join on `song_id`, so every training row
# is kept even when it has no match in songs.csv.
songs = pd.read_csv(f'{PATH}/songs.csv')
df = df.merge(songs, how='left', on='song_id')
del songs  # the side table is no longer needed once merged

# Attach the member attributes the same way, keyed on `msno`.
members = pd.read_csv(f'{PATH}/members.csv')
df = df.merge(members, how='left', on='msno')
del members
# Replace missing values: 'unknown' for object (string) columns, 0 for the rest.
# Each column is assigned back directly instead of using chained indexing
# (`df[col][mask] = ...`), which writes to an intermediate object and is not
# guaranteed to update `df` (it is a no-op under pandas Copy-on-Write).
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna('unknown')
# Whatever is still missing after the object columns were filled gets 0.
df = df.fillna(value=0)
# Create dates: parse the YYYYMMDD-encoded columns and derive
# `<column>_year`, `<column>_month` and `<column>_day` from each of them.
# Parsing errors are allowed to raise here: the previous errors='ignore'
# (deprecated in pandas) handed back the unparsed input on failure, which only
# surfaced later as an AttributeError on the `.dt` accessor.
for date_col in ('registration_init_time', 'expiration_date'):
    df[date_col] = pd.to_datetime(df[date_col], format='%Y%m%d')
    df[f'{date_col}_year'] = df[date_col].dt.year
    df[f'{date_col}_month'] = df[date_col].dt.month
    df[f'{date_col}_day'] = df[date_col].dt.day
# Dates to category: the two date columns become categoricals so that they are
# integer-encoded together with the other categorical columns below.
for date_name in ('registration_init_time', 'expiration_date'):
    df[date_name] = df[date_name].astype('category')

# Object data to category: every remaining object column becomes categorical.
object_columns = df.select_dtypes(include=['object']).columns
for name in object_columns:
    df[name] = df[name].astype('category')

# Encoding categorical features: replace each categorical column with its
# integer category codes.
category_columns = df.select_dtypes(include=['category']).columns
for name in category_columns:
    df[name] = df[name].cat.codes
# Drop the columns that are not kept in the cleaned output. The `columns=`
# keyword is used because passing the axis positionally (`drop(labels, 1)`)
# is rejected by pandas 2.0 and later.
df = df.drop(columns=['expiration_date', 'lyricist'])

# Write the cleaned table; the DataFrame index is written as the first column.
df.to_csv(f"{PATH_OUT}/output.csv")