From 6f3f52677020d115178786c97a2ecf1aac8a2cd1 Mon Sep 17 00:00:00 2001
From: Mateusz Jukiewicz
Date: Fri, 17 Jan 2020 01:12:29 +0100
Subject: [PATCH] Issue #1 - General refactorization + update as suggested by
 pandas creators

---
 eda/basic_datasets_overview.ipynb            |  6 ++--
 eda/basic_datasets_overview.py               |  6 ++--
 modeling/feature_importance.py               |  2 +-
 modeling/model_validation.py                 | 35 +++++++++++++-------
 modeling/models/rnn/dense_sets_holder.py     |  4 +--
 modeling/models/rnn/rnn_data_preprocessor.py |  6 ++--
 modeling/models/rnn/rnn_dataset.py           |  2 +-
 modeling/models/rnn/torch_rnn_net.py         |  2 +-
 modeling/models/torch_embedding_net.py       | 10 +++---
 modeling/utils.py                            |  4 +--
 10 files changed, 44 insertions(+), 33 deletions(-)

diff --git a/eda/basic_datasets_overview.ipynb b/eda/basic_datasets_overview.ipynb
index 3614c6f..ceb011e 100644
--- a/eda/basic_datasets_overview.ipynb
+++ b/eda/basic_datasets_overview.ipynb
@@ -2031,7 +2031,7 @@
     }
    ],
    "source": [
-    "item_category = data.nlargest(1, 'count')['item_category_id'].values[0]\n",
+    "item_category = data.nlargest(1, 'count')['item_category_id'].to_numpy()[0]\n",
     "item_categories[item_categories['item_category_id'] == int(item_category)]"
    ]
   },
@@ -2283,7 +2283,7 @@
     }
    ],
    "source": [
-    "item_id = train_postproc[train_postproc['item_price'] == int(data['item_price'].nlargest(1).values[0])]['item_id'].values[0]\n",
+    "item_id = train_postproc[train_postproc['item_price'] == int(data['item_price'].nlargest(1).to_numpy()[0])]['item_id'].to_numpy()[0]\n",
     "items[items['item_id'] == item_id]\n",
     "train_postproc[train_postproc['item_id'] == item_id]"
    ]
   },
@@ -2572,7 +2572,7 @@
     }
    ],
    "source": [
-    "item_id = int(data.nlargest(1, 'count')['item_id'].values[0])\n",
+    "item_id = int(data.nlargest(1, 'count')['item_id'].to_numpy()[0])\n",
     "items[items['item_id'] == item_id]"
    ]
   },
diff --git a/eda/basic_datasets_overview.py b/eda/basic_datasets_overview.py
index 2a9fa79..c718a8d 100644
--- a/eda/basic_datasets_overview.py
+++ b/eda/basic_datasets_overview.py
@@ -291,7 +291,7 @@ def my_figure(**options):
 # In[30]:
 
 
-item_category = data.nlargest(1, 'count')['item_category_id'].values[0]
+item_category = data.nlargest(1, 'count')['item_category_id'].to_numpy()[0]
 item_categories[item_categories['item_category_id'] == int(item_category)]
 
 
@@ -329,7 +329,7 @@ def my_figure(**options):
 # In[33]:
 
 
-item_id = train_postproc[train_postproc['item_price'] == int(data['item_price'].nlargest(1).values[0])]['item_id'].values[0]
+item_id = train_postproc[train_postproc['item_price'] == int(data['item_price'].nlargest(1).to_numpy()[0])]['item_id'].to_numpy()[0]
 items[items['item_id'] == item_id]
 train_postproc[train_postproc['item_id'] == item_id]
 
 
@@ -373,7 +373,7 @@ def my_figure(**options):
 # In[37]:
 
 
-item_id = int(data.nlargest(1, 'count')['item_id'].values[0])
+item_id = int(data.nlargest(1, 'count')['item_id'].to_numpy()[0])
 items[items['item_id'] == item_id]
 
 
diff --git a/modeling/feature_importance.py b/modeling/feature_importance.py
index db4d20b..883a46f 100644
--- a/modeling/feature_importance.py
+++ b/modeling/feature_importance.py
@@ -88,7 +88,7 @@ def score(X, y):
     score_func = get_score_function(model, cols, constant_data, evaluate_embeddings)
 
     base_score, score_decreases = get_score_importances(
-        score_func, score_set.values, score_y.values, random_state=234234, n_iter=1
+        score_func, score_set.to_numpy(), score_y.to_numpy(), random_state=234234, n_iter=1
     )
 
     feature_importances = np.mean(score_decreases, axis=0)
diff --git a/modeling/model_validation.py b/modeling/model_validation.py
index ad1984d..b41821e 100644
--- a/modeling/model_validation.py
+++ b/modeling/model_validation.py
@@ -11,11 +11,20 @@ def __init__(self, target_col, model, metric=root_mean_squared_error):
         self.model = model
         self.metric = metric
 
-    def run(self, train, test, copy=False):
+    def run(self, train, test, inner_train_validation=True, copy=False):
+        """
+        :param inner_train_validation: if this flag is set, you will see validation results during training, but it
+        will take longer to train the model
+        :param copy: set this flag if the input sets should stay untouched (not modified in place) after the process.
+        Warning: the process will take more memory with this flag on.
+        """
         y_train, train = self.data_target_split(train, copy)
         y_test, test = self.data_target_split(test, copy)
 
-        self.model.fit(train, y_train, (test, y_test))
+        if inner_train_validation:
+            self.model.fit(train, y_train, (test, y_test))
+        else:
+            self.model.fit(train, y_train)
 
         predictions = self.model.transform(test)
 
@@ -38,19 +47,20 @@ def __init__(self, target_col, model, time_column_name, metric=root_mean_squared
         self.time_column_name = time_column_name
         self.one_set_validation = OneSetValidation(target_col, model, metric)
 
-    def run(self, train_set, train_period, test_period, num_splits, copy=False):
+    def run(self, train_set, train_period, test_period, num_splits, inner_train_validation=True):
         time_values = train_set[self.time_column_name].drop_duplicates().sort_values(ascending=False)
         scores = []
-
         for i in range(num_splits):
             test_max, test_min = time_values.iloc[i], time_values.iloc[i+test_period-1]
             train_max, train_min = time_values.iloc[i+test_period], time_values.iloc[i+test_period+train_period-1]
 
-            train_subset = train_set[train_set[self.time_column_name].between(train_min, train_max)]
-            test_subset = train_set[train_set[self.time_column_name].between(test_min, test_max)]
+            # Explicit copy to avoid the "assignment on copy" warning
+            train_subset = train_set[train_set[self.time_column_name].between(train_min, train_max)].copy()
+            test_subset = train_set[train_set[self.time_column_name].between(test_min, test_max)].copy()
 
-            scores.append(self.one_set_validation.run(train_subset, test_subset, copy))
+            scores.append(self.one_set_validation.run(train_subset, test_subset, inner_train_validation))
+            del train_subset, test_subset
 
         result = np.array(scores)
 
@@ -86,8 +96,9 @@ def run(self, train_set, train_period, test_period, num_splits, copy=False):
 
     print("Time taken: {}s".format(str(end - start)))
 
-    # For now, as the RNN model is memory extensive, give up on the multi period validation.
-    # full_train_set = valid_train_set.append(valid_test_set, ignore_index=True)
-    # multi_period_validation = TimeBasedValidation(target_col, model, 'date_block_num')
-    # multi_eval_result = multi_period_validation.run(full_train_set, 29, 1, 5, True)
-    # print("Multi evaluation result:\n", multi_eval_result)
+    full_train_set = valid_train_set.append(valid_test_set, ignore_index=True)
+    del valid_train_set, valid_test_set
+
+    multi_period_validation = TimeBasedValidation(target_col, model, 'date_block_num')
+    multi_eval_result = multi_period_validation.run(full_train_set, 30, 1, 4, False)
+    print("Multi evaluation result:\n", multi_eval_result)
diff --git a/modeling/models/rnn/dense_sets_holder.py b/modeling/models/rnn/dense_sets_holder.py
index 85d2fea..dfdd962 100644
--- a/modeling/models/rnn/dense_sets_holder.py
+++ b/modeling/models/rnn/dense_sets_holder.py
@@ -28,7 +28,7 @@ def preproc_dataset(dataset, train_ids):
         fit_dataset = dataset.merge(ids.reset_index(), on=id_attr_name).\
             sort_values(id_attr_name).drop([id_attr_name, "index"], axis=1)
         dataset = dataset.sort_values(id_attr_name).drop(id_attr_name, axis=1)
-        scaler = DenseSetsHolder.standardize(fit_dataset.values)
-        dataset = torch.Tensor(scaler.transform(dataset.values))
+        scaler = DenseSetsHolder.standardize(fit_dataset.to_numpy())
+        dataset = torch.Tensor(scaler.transform(dataset.to_numpy()))
 
         return dataset
diff --git a/modeling/models/rnn/rnn_data_preprocessor.py b/modeling/models/rnn/rnn_data_preprocessor.py
index 88187a6..56b8665 100644
--- a/modeling/models/rnn/rnn_data_preprocessor.py
+++ b/modeling/models/rnn/rnn_data_preprocessor.py
@@ -28,10 +28,10 @@ def preprocess_common(self, dataset, fit_standardizer=True):
             dataset[col] = np.log(dataset[col] + 1e-3)
 
         if fit_standardizer:
-            self.standardizer.fit(dataset[self.data_colnames].values)
+            self.standardizer.fit(dataset[self.data_colnames].to_numpy())
 
         # Use the standardizer
-        dataset[self.data_colnames] = self.standardizer.transform(dataset[self.data_colnames].values)
+        dataset[self.data_colnames] = self.standardizer.transform(dataset[self.data_colnames].to_numpy())
 
         # Fill NA's introduced in dataset preparation procedure in Spark
         dataset.fillna(0, inplace=True)
@@ -51,7 +51,7 @@ def preprocess_train_dataset(self, dataset, y_dataset):
         # When preprocessing training, we care about the time order (as we'll be creating time series), hence the sort.
         # In addition, the copy here is made so source dataset will not be changed.
         dataset = dataset.sort_values(['date_block_num'])
-        y_dataset = y_dataset.values[dataset.index].squeeze().tolist()
+        y_dataset = y_dataset.loc[dataset.index].to_numpy().squeeze().tolist()
 
         dataset = self.preprocess_common(dataset)
 
diff --git a/modeling/models/rnn/rnn_dataset.py b/modeling/models/rnn/rnn_dataset.py
index f7b5780..c51730c 100644
--- a/modeling/models/rnn/rnn_dataset.py
+++ b/modeling/models/rnn/rnn_dataset.py
@@ -23,7 +23,7 @@ def __getitem__(self, idx):
 
         mixed = (items_lookup + items_categories_lookup + shops_lookup) / 3.0
 
-        train = self.train[idx]
+        train = torch.cat([train, items_lookup, items_categories_lookup, shops_lookup], dim=1)
 
         train = torch.cat([train, mixed], dim=1)
 
diff --git a/modeling/models/rnn/torch_rnn_net.py b/modeling/models/rnn/torch_rnn_net.py
index 83fef0b..8271ce3 100644
--- a/modeling/models/rnn/torch_rnn_net.py
+++ b/modeling/models/rnn/torch_rnn_net.py
@@ -209,7 +209,7 @@ def fit(self, train, y_train, valid_set_tuple=None):
                 results = self.transform(test_loader, False, False)
 
                 test_loss = torch.sqrt(
-                    F.mse_loss(results.clamp(0, 20), torch.Tensor(y_test.values).squeeze())
+                    F.mse_loss(results.clamp(0, 20), torch.Tensor(y_test.to_numpy()).squeeze())
                 ).item()
 
                 test_losses.append(test_loss)
diff --git a/modeling/models/torch_embedding_net.py b/modeling/models/torch_embedding_net.py
index f771181..3455e19 100644
--- a/modeling/models/torch_embedding_net.py
+++ b/modeling/models/torch_embedding_net.py
@@ -59,8 +59,8 @@ def forward(self, input):
         return input_concat
 
     def preprocess_data(self, dataset):
-        index_cols = dataset[self.cols_in_order].values
-        normalized_cols = self.standardizer.transform(dataset[self.cols_rest].values)
+        index_cols = dataset[self.cols_in_order].to_numpy()
+        normalized_cols = self.standardizer.transform(dataset[self.cols_rest].to_numpy())
         normalized = np.concatenate([index_cols, normalized_cols], axis=1)
         normalized_wo_nan = np.nan_to_num(normalized)
         return torch.from_numpy(normalized_wo_nan).float().to(self.device)
@@ -73,18 +73,18 @@ def fit(self, train, y_train, valid_set_tuple=None):
         batch_report_interval = batches_num // 4
 
         self.cols_rest = sorted(list(set(train.columns.tolist()) - set(self.cols_in_order)))
-        self.standardizer.fit(train[self.cols_rest].values)
+        self.standardizer.fit(train[self.cols_rest].to_numpy())
 
         train_processed = self.preprocess_data(train)
 
-        train_dataset = TensorDataset(train_processed, torch.from_numpy(y_train.values).to(self.device).float())
+        train_dataset = TensorDataset(train_processed, torch.from_numpy(y_train.to_numpy()).to(self.device).float())
         loader_train = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=0)
 
         valid_set_tuple_postproc = None
         if valid_set_tuple:
             test, y_test = valid_set_tuple
-            valid_set_tuple_postproc = self.preprocess_data(test), torch.Tensor(y_test.values).squeeze()
+            valid_set_tuple_postproc = self.preprocess_data(test), torch.Tensor(y_test.to_numpy()).squeeze()
 
         self.zero_grad()
         self.train()
diff --git a/modeling/utils.py b/modeling/utils.py
index 92268f3..d2ecd56 100644
--- a/modeling/utils.py
+++ b/modeling/utils.py
@@ -24,7 +24,7 @@ def prepare_aggregate_submission(dir_path, out_path, func=lambda x: x.mean(axis=
     (takes an average of them by default).
""" files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if f.endswith(".csv.gz")] - id_col = pd.read_csv(files[0])['ID'].values - predictions = func(np.array([pd.read_csv(f)['item_cnt_month'].values for f in files])) + id_col = pd.read_csv(files[0])['ID'].to_numpy() + predictions = func(np.array([pd.read_csv(f)['item_cnt_month'].to_numpy() for f in files])) result = pd.DataFrame({'ID': id_col, 'item_cnt_month': predictions}) result.to_csv(out_path, index=False, header=True, compression="gzip")