-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_preparation.py
60 lines (54 loc) · 1.79 KB
/
data_preparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
# Locations of the raw inputs and of the prepared output table.
IMAGES_PATH = 'data/images/'
TABULAR_PATH = 'data/styles.csv'
SAVE_PATH = 'data/prepared_data.csv'

# Load the tabular metadata, skipping (with a warning) malformed rows that
# have too many columns. `error_bad_lines` was deprecated in pandas 1.3 and
# removed in 2.0 in favour of `on_bad_lines`; older pandas versions reject
# the new keyword with a TypeError, in which case the old one is used.
try:
    df = pd.read_csv(TABULAR_PATH, on_bad_lines='warn')
except TypeError:
    df = pd.read_csv(TABULAR_PATH, error_bad_lines=False)
print(len(df))

# Image file name expected for each row: "<id>.jpg". Built with a vectorised
# string concatenation instead of a row-wise apply.
df['image'] = df['id'].astype(str) + ".jpg"

# Keep rows that have an image in the IMAGES_PATH.
available_images = set(os.listdir(IMAGES_PATH))
df = df.loc[df['image'].isin(available_images)]

# The 'year' column is not carried into the prepared data.
df = df.drop('year', axis=1)
plt.style.use('bmh')

## Exploratory Analysis
# Each chart below is a horizontal bar chart of the value counts of one
# column, sorted ascending. Every chart starts its own figure: Series.plot
# draws onto the current axes, so without a fresh figure the bars of one
# column would be drawn on top of the previous chart.

# Season (also written to season.png)
plt.figure()
df.season.value_counts().sort_values().plot(kind='barh')
plt.savefig('season.png')
plt.show()

# Master Category
plt.figure()
df.masterCategory.value_counts().sort_values().plot(kind='barh')
plt.show()

# Sub Category
plt.figure()
df.subCategory.value_counts().sort_values().plot(kind='barh')
plt.tight_layout()
plt.show()

# Article Type (tall figure so the many category labels stay readable)
plt.figure(figsize=(10, 50))
df.articleType.value_counts().sort_values().plot(kind='barh')
plt.tight_layout()
plt.show()

# Base colour
plt.figure()
df.baseColour.value_counts().sort_values().plot(kind='barh')
plt.tight_layout()
plt.show()

# Usage
plt.figure()
df.usage.value_counts().sort_values().plot(kind='barh')
plt.show()
## Data Preparation
# Balancing label samples: every season is reduced to the same number of
# rows so that each label is equally represented in the prepared data.
spring = df[df.season == 'Spring']
summer = df[df.season == 'Summer']
print(len(spring))
winter = df[df.season == 'Winter']
fall = df[df.season == 'Fall']

# Target size is the smallest season. Sampling more rows than a frame holds
# raises ValueError, so the size must not be tied to one particular season.
num_of_samples = min(len(spring), len(summer), len(winter), len(fall))

# A season that already has the target size is kept untouched (original row
# order); larger seasons are randomly downsampled without replacement.
frames = [
    season_df if len(season_df) == num_of_samples
    else season_df.sample(n=num_of_samples)
    for season_df in (spring, summer, winter, fall)
]
final_df = pd.concat(frames)

# Visual check of the balanced label distribution, on its own figure.
plt.figure()
final_df.season.value_counts().sort_values().plot(kind='barh')
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None" line).
final_df.info(verbose=True)
plt.show()

# Persist the balanced frame. Previously the unbalanced `df` was written,
# which left the balancing above without any effect on the saved data.
final_df.to_csv(SAVE_PATH, index=False)