
Commit

Issue #1 - General refactorization + update as suggested by pandas creators
jukiewiczm committed Jan 19, 2020
1 parent e35186c commit 6f3f526
Showing 10 changed files with 44 additions and 33 deletions.
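Most of the edits below replace pandas' .values attribute with the .to_numpy() method, the accessor the pandas maintainers recommend for converting a Series or DataFrame to a NumPy array. A minimal sketch of the pattern on made-up data (illustrative only, not taken from the repository):

    import pandas as pd

    s = pd.Series([3.0, 1.0, 2.0])
    arr_old = s.values        # still works, but discouraged in recent pandas docs
    arr_new = s.to_numpy()    # explicit, recommended conversion
    assert (arr_old == arr_new).all()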
6 changes: 3 additions & 3 deletions eda/basic_datasets_overview.ipynb
@@ -2031,7 +2031,7 @@
}
],
"source": [
-"item_category = data.nlargest(1, 'count')['item_category_id'].values[0]\n",
+"item_category = data.nlargest(1, 'count')['item_category_id'].to_numpy()[0]\n",
"item_categories[item_categories['item_category_id'] == int(item_category)]"
]
},
@@ -2283,7 +2283,7 @@
}
],
"source": [
-"item_id = train_postproc[train_postproc['item_price'] == int(data['item_price'].nlargest(1).values[0])]['item_id'].values[0]\n",
+"item_id = train_postproc[train_postproc['item_price'] == int(data['item_price'].nlargest(1).to_numpy()[0])]['item_id'].to_numpy()[0]\n",
"items[items['item_id'] == item_id]\n",
"train_postproc[train_postproc['item_id'] == item_id]"
]
@@ -2572,7 +2572,7 @@
}
],
"source": [
-"item_id = int(data.nlargest(1, 'count')['item_id'].values[0])\n",
+"item_id = int(data.nlargest(1, 'count')['item_id'].to_numpy()[0])\n",
"items[items['item_id'] == item_id]"
]
},
6 changes: 3 additions & 3 deletions eda/basic_datasets_overview.py
@@ -291,7 +291,7 @@ def my_figure(**options):
# In[30]:


-item_category = data.nlargest(1, 'count')['item_category_id'].values[0]
+item_category = data.nlargest(1, 'count')['item_category_id'].to_numpy()[0]
item_categories[item_categories['item_category_id'] == int(item_category)]


@@ -329,7 +329,7 @@ def my_figure(**options):
# In[33]:


-item_id = train_postproc[train_postproc['item_price'] == int(data['item_price'].nlargest(1).values[0])]['item_id'].values[0]
+item_id = train_postproc[train_postproc['item_price'] == int(data['item_price'].nlargest(1).to_numpy()[0])]['item_id'].to_numpy()[0]
items[items['item_id'] == item_id]
train_postproc[train_postproc['item_id'] == item_id]

@@ -373,7 +373,7 @@ def my_figure(**options):
# In[37]:


-item_id = int(data.nlargest(1, 'count')['item_id'].values[0])
+item_id = int(data.nlargest(1, 'count')['item_id'].to_numpy()[0])
items[items['item_id'] == item_id]


2 changes: 1 addition & 1 deletion modeling/feature_importance.py
@@ -88,7 +88,7 @@ def score(X, y):
score_func = get_score_function(model, cols, constant_data, evaluate_embeddings)

base_score, score_decreases = get_score_importances(
-score_func, score_set.values, score_y.values, random_state=234234, n_iter=1
+score_func, score_set.to_numpy(), score_y.to_numpy(), random_state=234234, n_iter=1
)

feature_importances = np.mean(score_decreases, axis=0)
35 changes: 23 additions & 12 deletions modeling/model_validation.py
@@ -11,11 +11,20 @@ def __init__(self, target_col, model, metric=root_mean_squared_error):
self.model = model
self.metric = metric

-def run(self, train, test, copy=False):
+def run(self, train, test, inner_train_validation=True, copy=False):
+    """
+    :param inner_train_validation: if this flag is set, you will see validation results during training, but it will
+    take longer to train the model
+    :param copy: if input sets should stay untouched (not modified in place) after the process, use this flag.
+    Warning: the process will take more memory with this flag on.
+    """
y_train, train = self.data_target_split(train, copy)
y_test, test = self.data_target_split(test, copy)

-self.model.fit(train, y_train, (test, y_test))
+if inner_train_validation:
+    self.model.fit(train, y_train, (test, y_test))
+else:
+    self.model.fit(train, y_train)

predictions = self.model.transform(test)

@@ -38,19 +47,20 @@ def __init__(self, target_col, model, time_column_name, metric=root_mean_squared
self.time_column_name = time_column_name
self.one_set_validation = OneSetValidation(target_col, model, metric)

-def run(self, train_set, train_period, test_period, num_splits, copy=False):
+def run(self, train_set, train_period, test_period, num_splits, inner_train_validation=True):
time_values = train_set[self.time_column_name].drop_duplicates().sort_values(ascending=False)

scores = []

for i in range(num_splits):
test_max, test_min = time_values.iloc[i], time_values.iloc[i+test_period-1]
train_max, train_min = time_values.iloc[i+test_period], time_values.iloc[i+test_period+train_period-1]

-train_subset = train_set[train_set[self.time_column_name].between(train_min, train_max)]
-test_subset = train_set[train_set[self.time_column_name].between(test_min, test_max)]
+# Explicit copy to avoid the "assignment on copy" warning
+train_subset = train_set[train_set[self.time_column_name].between(train_min, train_max)].copy()
+test_subset = train_set[train_set[self.time_column_name].between(test_min, test_max)].copy()

-scores.append(self.one_set_validation.run(train_subset, test_subset, copy))
+scores.append(self.one_set_validation.run(train_subset, test_subset, inner_train_validation))
+del train_subset, test_subset

result = np.array(scores)

@@ -86,8 +96,9 @@ def run(self, train_set, train_period, test_period, num_splits, copy=False):

print("Time taken: {}s".format(str(end - start)))

-# For now, as the RNN model is memory extensive, give up on the multi period validation.
-# full_train_set = valid_train_set.append(valid_test_set, ignore_index=True)
-# multi_period_validation = TimeBasedValidation(target_col, model, 'date_block_num')
-# multi_eval_result = multi_period_validation.run(full_train_set, 29, 1, 5, True)
-# print("Multi evaluation result:\n", multi_eval_result)
+full_train_set = valid_train_set.append(valid_test_set, ignore_index=True)
+del valid_train_set, valid_test_set
+
+multi_period_validation = TimeBasedValidation(target_col, model, 'date_block_num')
+multi_eval_result = multi_period_validation.run(full_train_set, 30, 1, 4, False)
+print("Multi evaluation result:\n", multi_eval_result)
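For orientation, a sketch of how the reworked validation classes might be driven after this change. The constructor and run signatures come from the diff above; the model object, the data frames, and the 'item_cnt_month' target column are placeholders:

    from modeling.model_validation import OneSetValidation, TimeBasedValidation  # assumed import path based on the file location

    # hypothetical usage; train_df, test_df, full_train_set and model are assumed to exist
    one_set = OneSetValidation('item_cnt_month', model)
    score = one_set.run(train_df, test_df, inner_train_validation=False)  # skip validation during fit to train faster

    multi = TimeBasedValidation('item_cnt_month', model, 'date_block_num')
    scores = multi.run(full_train_set, train_period=30, test_period=1, num_splits=4,
                       inner_train_validation=False)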
4 changes: 2 additions & 2 deletions modeling/models/rnn/dense_sets_holder.py
@@ -28,7 +28,7 @@ def preproc_dataset(dataset, train_ids):
fit_dataset = dataset.merge(ids.reset_index(), on=id_attr_name).\
sort_values(id_attr_name).drop([id_attr_name, "index"], axis=1)
dataset = dataset.sort_values(id_attr_name).drop(id_attr_name, axis=1)
-scaler = DenseSetsHolder.standardize(fit_dataset.values)
-dataset = torch.Tensor(scaler.transform(dataset.values))
+scaler = DenseSetsHolder.standardize(fit_dataset.to_numpy())
+dataset = torch.Tensor(scaler.transform(dataset.to_numpy()))

return dataset
6 changes: 3 additions & 3 deletions modeling/models/rnn/rnn_data_preprocessor.py
@@ -28,10 +28,10 @@ def preprocess_common(self, dataset, fit_standardizer=True):
dataset[col] = np.log(dataset[col] + 1e-3)

if fit_standardizer:
-self.standardizer.fit(dataset[self.data_colnames].values)
+self.standardizer.fit(dataset[self.data_colnames].to_numpy())

# Use the standardizer
-dataset[self.data_colnames] = self.standardizer.transform(dataset[self.data_colnames].values)
+dataset[self.data_colnames] = self.standardizer.transform(dataset[self.data_colnames].to_numpy())

# Fill NA's introduced in dataset preparation procedure in Spark
dataset.fillna(0, inplace=True)
@@ -51,7 +51,7 @@ def preprocess_train_dataset(self, dataset, y_dataset):
# When preprocessing training, we care about the time order (as we'll be creating time series), hence the sort.
# In addition, the copy here is made so source dataset will not be changed.
dataset = dataset.sort_values(['date_block_num'])
-y_dataset = y_dataset.values[dataset.index].squeeze().tolist()
+y_dataset = y_dataset.loc[dataset.index].to_numpy().squeeze().tolist()

dataset = self.preprocess_common(dataset)

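The last change in this file also switches the target lookup from positional indexing on the raw array (y_dataset.values[dataset.index]) to label-based indexing (y_dataset.loc[dataset.index]), which keeps targets aligned with the sorted rows even when the index labels are not 0..n-1. A small illustration on invented data:

    import pandas as pd

    y = pd.Series([10, 20, 30], index=[7, 3, 5])
    order = [3, 5, 7]                      # e.g. row labels after sorting another frame

    aligned = y.loc[order].to_numpy()      # label-based: array([20, 30, 10])
    # y.to_numpy()[order] would index by position and raise IndexError here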
2 changes: 1 addition & 1 deletion modeling/models/rnn/rnn_dataset.py
@@ -23,7 +23,7 @@ def __getitem__(self, idx):

mixed = (items_lookup + items_categories_lookup + shops_lookup) / 3.0

train = self.train[idx]
train = torch.cat([train, items_lookup, items_categories_lookup, shops_lookup], dim=1)

train = torch.cat([train, mixed], dim=1)

2 changes: 1 addition & 1 deletion modeling/models/rnn/torch_rnn_net.py
@@ -209,7 +209,7 @@ def fit(self, train, y_train, valid_set_tuple=None):
results = self.transform(test_loader, False, False)

test_loss = torch.sqrt(
-F.mse_loss(results.clamp(0, 20), torch.Tensor(y_test.values).squeeze())
+F.mse_loss(results.clamp(0, 20), torch.Tensor(y_test.to_numpy()).squeeze())
).item()

test_losses.append(test_loss)
10 changes: 5 additions & 5 deletions modeling/models/torch_embedding_net.py
@@ -59,8 +59,8 @@ def forward(self, input):
return input_concat

def preprocess_data(self, dataset):
-index_cols = dataset[self.cols_in_order].values
-normalized_cols = self.standardizer.transform(dataset[self.cols_rest].values)
+index_cols = dataset[self.cols_in_order].to_numpy()
+normalized_cols = self.standardizer.transform(dataset[self.cols_rest].to_numpy())
normalized = np.concatenate([index_cols, normalized_cols], axis=1)
normalized_wo_nan = np.nan_to_num(normalized)
return torch.from_numpy(normalized_wo_nan).float().to(self.device)
@@ -73,18 +73,18 @@ def fit(self, train, y_train, valid_set_tuple=None):
batch_report_interval = batches_num // 4

self.cols_rest = sorted(list(set(train.columns.tolist()) - set(self.cols_in_order)))
-self.standardizer.fit(train[self.cols_rest].values)
+self.standardizer.fit(train[self.cols_rest].to_numpy())

train_processed = self.preprocess_data(train)

-train_dataset = TensorDataset(train_processed, torch.from_numpy(y_train.values).to(self.device).float())
+train_dataset = TensorDataset(train_processed, torch.from_numpy(y_train.to_numpy()).to(self.device).float())

loader_train = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=0)

valid_set_tuple_postproc = None
if valid_set_tuple:
test, y_test = valid_set_tuple
-valid_set_tuple_postproc = self.preprocess_data(test), torch.Tensor(y_test.values).squeeze()
+valid_set_tuple_postproc = self.preprocess_data(test), torch.Tensor(y_test.to_numpy()).squeeze()

self.zero_grad()
self.train()
4 changes: 2 additions & 2 deletions modeling/utils.py
@@ -24,7 +24,7 @@ def prepare_aggregate_submission(dir_path, out_path, func=lambda x: x.mean(axis=
(takes an average of them by default).
"""
files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if f.endswith(".csv.gz")]
-id_col = pd.read_csv(files[0])['ID'].values
-predictions = func(np.array([pd.read_csv(f)['item_cnt_month'].values for f in files]))
+id_col = pd.read_csv(files[0])['ID'].to_numpy()
+predictions = func(np.array([pd.read_csv(f)['item_cnt_month'].to_numpy() for f in files]))
result = pd.DataFrame({'ID': id_col, 'item_cnt_month': predictions})
result.to_csv(out_path, index=False, header=True, compression="gzip")
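A possible invocation of the helper above, aggregating every .csv.gz submission in a directory into one file. The paths are placeholders, and the median variant is merely an alternative to the default column-wise mean:

    import numpy as np
    from modeling.utils import prepare_aggregate_submission  # assumed import path based on the file location

    # default: mean of the stacked 'item_cnt_month' predictions across files
    prepare_aggregate_submission("submissions/", "submissions/ensemble_mean.csv.gz")

    # e.g. a median ensemble instead
    prepare_aggregate_submission("submissions/", "submissions/ensemble_median.csv.gz",
                                 func=lambda x: np.median(x, axis=0))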
