From e333239dd140627961192ec59b9dbf068cc5a9ba Mon Sep 17 00:00:00 2001
From: Myrto Papadopoulou
Date: Tue, 7 May 2019 06:19:12 -0700
Subject: [PATCH 01/51] Added rfRegressor class to random forest.

- Added rfRegressor class to random forest, along with relevant stateless API
  functions. Implementation currently commented out as DecisionTreeRegressor
  is not yet implemented.
- Updated RF_metrics struct to include regression metrics too. Metrics
  supported are mean absolute error, mean squared error and median absolute
  error, as per SKL-rf.
---
 cuML/src/randomforest/randomforest.cu | 278 +++++++++++++++++++++++++-
 cuML/src/randomforest/randomforest.h  |  31 ++-
 2 files changed, 301 insertions(+), 8 deletions(-)

diff --git a/cuML/src/randomforest/randomforest.cu b/cuML/src/randomforest/randomforest.cu
index ce5291a095..f2a4b8a1eb 100644
--- a/cuML/src/randomforest/randomforest.cu
+++ b/cuML/src/randomforest/randomforest.cu
@@ -23,13 +23,30 @@ namespace ML {
  * @brief Construct RF_metrics.
  * @param[in] cfg_accuracy: accuracy.
  */
-RF_metrics::RF_metrics(float cfg_accuracy) : accuracy(cfg_accuracy) {};
+RF_metrics::RF_metrics(float cfg_accuracy) : rf_type(RF_type::CLASSIFICATION), accuracy(cfg_accuracy) {};
 
 /**
- * @brief Print accuracy metric.
+ * @brief Construct RF_metrics.
+ * @param[in] cfg_mean_abs_error: mean absolute error.
+ * @param[in] cfg_mean_squared_error: mean squared error.
+ * @param[in] cfg_median_abs_error: median absolute error.
+ */
+RF_metrics::RF_metrics(double cfg_mean_abs_error, double cfg_mean_squared_error, double cfg_median_abs_error) :
+	rf_type(RF_type::REGRESSION), mean_abs_error(cfg_mean_abs_error), mean_squared_error(cfg_mean_squared_error),
+	median_abs_error(cfg_median_abs_error) {};
+
+/**
+ * @brief Print either the accuracy metric for classification, or the mean absolute error,
+ * mean squared error, and median absolute error metrics for regression.
  */
 void RF_metrics::print() {
-	std::cout << "Accuracy: " << accuracy << std::endl;
+	if (rf_type == RF_type::CLASSIFICATION) {
+		std::cout << "Accuracy: " << accuracy << std::endl;
+	} else if (rf_type == RF_type::REGRESSION) {
+		std::cout << "Mean Absolute Error: " << mean_abs_error << std::endl;
+		std::cout << "Mean Squared Error: " << mean_squared_error << std::endl;
+		std::cout << "Median Absolute Error: " << median_abs_error << std::endl;
+	}
 }
 
 /**
@@ -340,14 +357,181 @@ RF_metrics rfClassifier<T>::cross_validate(const cumlHandle& user_handle, const
 	return stats;
 }
 
+
+/**
+ * @brief Construct rfRegressor object.
+ * @tparam T: data type for input data (float or double).
+ * @param[in] cfg_rf_params: Random forest hyper-parameter struct.
+ */
+template <typename T>
+rfRegressor<T>::rfRegressor(RF_params cfg_rf_params): rf<T>::rf(cfg_rf_params, RF_type::REGRESSION) {};
+
+/**
+ * @brief Build (i.e., fit, train) random forest regressor for input data.
+ * @tparam T: data type for input data (float or double).
+ * @param[in] user_handle: cumlHandle
+ * @param[in] input: train data (n_rows samples, n_cols features) in column major format, excluding labels. Device pointer.
+ * @param[in] n_rows: number of training data samples.
+ * @param[in] n_cols: number of features (i.e., columns) excluding target feature.
+ * @param[in] labels: 1D array of target features (float or double), with one label per training sample. Device pointer.
+ */
+template <typename T>
+void rfRegressor<T>::fit(const cumlHandle& user_handle, T * input, int n_rows, int n_cols, T * labels) {
+
+	ASSERT(!this->trees, "Cannot fit an existing forest.");
+	ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows);
+	ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols);
+
+#if 0 // commented out as DecisionTreeRegressor not implemented
+	rfRegressor<T>::trees = new DecisionTree::DecisionTreeRegressor<T>[this->rf_params.n_trees];
+	int n_sampled_rows = this->rf_params.rows_sample * n_rows;
+
+	const cumlHandle_impl& handle = user_handle.getImpl();
+	cudaStream_t stream = user_handle.getStream();
+
+	for (int i = 0; i < this->rf_params.n_trees; i++) {
+		// Select n_sampled_rows (with replacement) numbers from [0, n_rows) per tree.
+		// selected_rows: randomly generated IDs for bootstrapped samples (w/ replacement); a device ptr.
+		MLCommon::device_buffer<unsigned int> selected_rows(handle.getDeviceAllocator(), stream, n_sampled_rows);
+
+		if (this->rf_params.bootstrap) {
+			MLCommon::Random::Rng r(i * 1000); // Ensure the seed for each tree is different and meaningful.
+			r.uniformInt(selected_rows.data(), n_sampled_rows, (unsigned int) 0, (unsigned int) n_rows, stream);
+		} else {
+			std::vector<unsigned int> h_selected_rows(n_rows);
+			std::iota(h_selected_rows.begin(), h_selected_rows.end(), 0);
+			std::random_shuffle(h_selected_rows.begin(), h_selected_rows.end());
+			h_selected_rows.resize(n_sampled_rows);
+			MLCommon::updateDevice(selected_rows.data(), h_selected_rows.data(), n_sampled_rows, stream);
+		}
+
+		/* Build individual tree in the forest.
+		   - input is a pointer to orig data that has n_cols features and n_rows rows.
+		   - n_sampled_rows: # rows sampled for the tree's bootstrap sample.
+		   - selected_rows: points to a list of row #s (w/ n_sampled_rows elements) used to build the bootstrapped sample.
+		   Expectation: Each tree node will contain (a) # n_sampled_rows and (b) a pointer to a list of row numbers w.r.t original data.
+		*/
+		this->trees[i].fit(user_handle, input, n_cols, n_rows, labels, selected_rows.data(), n_sampled_rows, /*n_unique_labels,*/ this->rf_params.tree_params);
+
+		//Cleanup
+		selected_rows.release(stream);
+	}
+#endif
+
+}
+
+/**
+ * @brief Predict target feature for input data; regression for single feature supported.
+ * @tparam T: data type for input data (float or double).
+ * @param[in] user_handle: cumlHandle (currently unused; API placeholder)
+ * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer.
+ * @param[in] n_rows: number of data samples.
+ * @param[in] n_cols: number of features (excluding target feature).
+ * @param[in, out] predictions: n_rows predicted labels. CPU pointer, user allocated.
+ * @param[in] verbose: flag for debugging purposes.
+ */
+template <typename T>
+void rfRegressor<T>::predict(const cumlHandle& user_handle, const T * input, int n_rows, int n_cols, T * predictions, bool verbose) const {
+
+	ASSERT(this->trees, "Cannot predict! No trees in the forest.");
+	ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows);
+	ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols);
+	ASSERT(predictions != nullptr, "Error! User has not allocated memory for predictions.");
+
+#if 0 // commented out as DecisionTreeRegressor not implemented
+	int row_size = n_cols;
+
+	for (int row_id = 0; row_id < n_rows; row_id++) {
+
+		if (verbose) {
+			std::cout << "\n\n";
+			std::cout << "Predict for sample: ";
+			for (int i = 0; i < n_cols; i++) std::cout << input[row_id*row_size + i] << ", ";
+			std::cout << std::endl;
+		}
+
+		T sum_predictions = 0;
+
+		for (int i = 0; i < this->rf_params.n_trees; i++) {
+			//Return prediction for one sample.
+			if (verbose) {
+				std::cout << "Printing tree " << i << std::endl;
+				this->trees[i].print();
+			}
+			T prediction;
+			this->trees[i].predict(user_handle, &input[row_id * row_size], 1, n_cols, &prediction, verbose);
+			sum_predictions += prediction;
+		}
+		// Random forest's prediction is the arithmetic mean of all its decision tree predictions.
+		predictions[row_id] = sum_predictions / this->rf_params.n_trees;
+	}
+#endif
+}
+
+/**
+ * @brief Predict target feature for input data and validate against ref_labels.
+ * @tparam T: data type for input data (float or double).
+ * @param[in] user_handle: cumlHandle (currently unused; API placeholder)
+ * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer.
+ * @param[in] ref_labels: label values for cross validation (n_rows elements); CPU pointer.
+ * @param[in] n_rows: number of data samples.
+ * @param[in] n_cols: number of features (excluding target feature).
+ * @param[in, out] predictions: n_rows predicted labels. CPU pointer, user allocated.
+ * @param[in] verbose: flag for debugging purposes.
+ */
+template <typename T>
+RF_metrics rfRegressor<T>::cross_validate(const cumlHandle& user_handle, const T * input, const T * ref_labels, int n_rows, int n_cols, T * predictions, bool verbose) const {
+
+	predict(user_handle, input, n_rows, n_cols, predictions, verbose);
+
+#if 0 // commented out as DecisionTreeRegressor not implemented
+	double abs_difference_sum = 0;
+	double mse_sum = 0;
+	std::vector<double> abs_diffs;
+
+	for (int i = 0; i < n_rows; i++) {
+		double abs_diff = std::abs(predictions[i] - ref_labels[i]); // std::abs: plain abs() would truncate to int
+		abs_difference_sum += abs_diff;
+		mse_sum += pow(predictions[i] - ref_labels[i], 2);
+		abs_diffs.push_back(abs_diff);
+	}
+
+	double mean_abs_error = abs_difference_sum / n_rows;
+	double mean_squared_error = mse_sum / n_rows;
+
+	std::sort(abs_diffs.begin(), abs_diffs.end());
+	double median_abs_error = 0;
+	int middle = n_rows / 2;
+	if (n_rows % 2 == 1) {
+		median_abs_error = abs_diffs[middle];
+	} else {
+		median_abs_error = (abs_diffs[middle] + abs_diffs[middle - 1]) / 2;
+	}
+
+	RF_metrics stats(mean_abs_error, mean_squared_error, median_abs_error);
+	if (verbose) stats.print();
+
+	return stats;
+#endif
+	RF_metrics placeholder(0); // temporary, so that the code compiles. TODO: remove once the code above is uncommented.
+	return placeholder;
+}
+
+//=================== regression end
+
 template class rf<float>;
 template class rf<double>;
 
 template class rfClassifier<float>;
 template class rfClassifier<double>;
 
+template class rfRegressor<float>;
+template class rfRegressor<double>;
+
 // Stateless API functions: fit, predict and cross_validate
 
+// ----------------------------- Classification ----------------------------------- //
+
 /**
  * @brief Build (i.e., fit, train) random forest classifier for input data of type float.
  * @param[in] user_handle: cumlHandle
@@ -440,5 +624,93 @@ RF_metrics cross_validate(const cumlHandle& user_handle, const rfClassifier<double>
 	return rf_classifier->cross_validate(user_handle, input, ref_labels, n_rows, n_cols, predictions, verbose);
 }
 
+// ----------------------------- Regression ----------------------------------- //
+
+/**
+ * @brief Build (i.e., fit, train) random forest regressor for input data of type float.
+ * @param[in] user_handle: cumlHandle
+ * @param[in,out] rf_regressor: pointer to the rfRegressor object, previously constructed by the user.
+ * @param[in] input: train data (n_rows samples, n_cols features) in column major format, excluding labels. Device pointer.
+ * @param[in] n_rows: number of training data samples.
+ * @param[in] n_cols: number of features (i.e., columns) excluding target feature.
+ * @param[in] labels: 1D array of target features (float), with one label per training sample. Device pointer.
+ */
+void fit(const cumlHandle& user_handle, rfRegressor<float> * rf_regressor, float * input, int n_rows, int n_cols, float * labels) {
+	rf_regressor->fit(user_handle, input, n_rows, n_cols, labels);
+}
+
+/**
+ * @brief Build (i.e., fit, train) random forest regressor for input data of type double.
+ * @param[in] user_handle: cumlHandle
+ * @param[in,out] rf_regressor: pointer to the rfRegressor object, previously constructed by the user.
+ * @param[in] input: train data (n_rows samples, n_cols features) in column major format, excluding labels. Device pointer.
+ * @param[in] n_rows: number of training data samples.
+ * @param[in] n_cols: number of features (i.e., columns) excluding target feature.
+ * @param[in] labels: 1D array of target features (double), with one label per training sample. Device pointer.
+ */
+void fit(const cumlHandle& user_handle, rfRegressor<double> * rf_regressor, double * input, int n_rows, int n_cols, double * labels) {
+	rf_regressor->fit(user_handle, input, n_rows, n_cols, labels);
+}
+
+/**
+ * @brief Predict target feature for input data of type float; regression for single feature supported.
+ * @param[in] user_handle: cumlHandle (currently unused; API placeholder)
+ * @param[in] rf_regressor: pointer to the rfRegressor object. The user should have previously called fit to build the random forest.
+ * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer.
+ * @param[in] n_rows: number of data samples.
+ * @param[in] n_cols: number of features (excluding target feature).
+ * @param[in, out] predictions: n_rows predicted labels. CPU pointer, user allocated.
+ * @param[in] verbose: flag for debugging purposes.
+ */
+void predict(const cumlHandle& user_handle, const rfRegressor<float> * rf_regressor, const float * input, int n_rows, int n_cols, float * predictions, bool verbose) {
+	rf_regressor->predict(user_handle, input, n_rows, n_cols, predictions, verbose);
+}
+
+/**
+ * @brief Predict target feature for input data of type double; regression for single feature supported.
+ * @param[in] user_handle: cumlHandle (currently unused; API placeholder)
+ * @param[in] rf_regressor: pointer to the rfRegressor object. The user should have previously called fit to build the random forest.
+ * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer.
+ * @param[in] n_rows: number of data samples.
+ * @param[in] n_cols: number of features (excluding target feature).
+ * @param[in, out] predictions: n_rows predicted labels. CPU pointer, user allocated.
+ * @param[in] verbose: flag for debugging purposes.
+ */
+void predict(const cumlHandle& user_handle, const rfRegressor<double> * rf_regressor, const double * input, int n_rows, int n_cols, double * predictions, bool verbose) {
+	rf_regressor->predict(user_handle, input, n_rows, n_cols, predictions, verbose);
+}
+
+/**
+ * @brief Predict target feature for input data of type float and validate against ref_labels.
+ * @param[in] user_handle: cumlHandle (currently unused; API placeholder)
+ * @param[in] rf_regressor: pointer to the rfRegressor object. The user should have previously called fit to build the random forest.
+ * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer.
+ * @param[in] ref_labels: label values for cross validation (n_rows elements); CPU pointer.
+ * @param[in] n_rows: number of data samples.
+ * @param[in] n_cols: number of features (excluding target feature).
+ * @param[in, out] predictions: n_rows predicted labels. CPU pointer, user allocated.
+ * @param[in] verbose: flag for debugging purposes.
+ */
+RF_metrics cross_validate(const cumlHandle& user_handle, const rfRegressor<float> * rf_regressor, const float * input, const float * ref_labels,
+				int n_rows, int n_cols, float * predictions, bool verbose) {
+	return rf_regressor->cross_validate(user_handle, input, ref_labels, n_rows, n_cols, predictions, verbose);
+}
+
+/**
+ * @brief Predict target feature for input data of type double and validate against ref_labels.
+ * @param[in] user_handle: cumlHandle (currently unused; API placeholder)
+ * @param[in] rf_regressor: pointer to the rfRegressor object. The user should have previously called fit to build the random forest.
+ * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer.
+ * @param[in] ref_labels: label values for cross validation (n_rows elements); CPU pointer.
+ * @param[in] n_rows: number of data samples.
+ * @param[in] n_cols: number of features (excluding target feature).
+ * @param[in, out] predictions: n_rows predicted labels. CPU pointer, user allocated.
+ * @param[in] verbose: flag for debugging purposes.
+ */
+RF_metrics cross_validate(const cumlHandle& user_handle, const rfRegressor<double> * rf_regressor, const double * input, const double * ref_labels,
+				int n_rows, int n_cols, double * predictions, bool verbose) {
+	return rf_regressor->cross_validate(user_handle, input, ref_labels, n_rows, n_cols, predictions, verbose);
+}
+
 }; // end namespace ML
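
(Review note, not part of the patch: a minimal sketch of how the stateless regression API above is meant to be driven once DecisionTreeRegressor lands. All the glue is hypothetical — `handle`, the device buffers `d_input`/`d_labels`, the host buffers `h_test`/`h_ref_labels`, and the populated `RF_params` are assumed to exist and follow the layouts documented above.)

```cpp
// Hypothetical usage sketch, assuming RF_params fields (n_trees, bootstrap,
// rows_sample, tree_params) have been filled in by the caller.
ML::rfRegressor<float> regressor(params);  // params validity-checked in the rf base ctor

// Train on device-resident, column-major data (d_input: n_rows x n_cols).
ML::fit(handle, &regressor, d_input, n_rows, n_cols, d_labels);

// Inference is host-side: h_test is row-major on the CPU, predictions user-allocated.
std::vector<float> preds(n_test_rows);
ML::predict(handle, &regressor, h_test, n_test_rows, n_cols, preds.data(), false);

// Predict and score against reference labels in one call.
ML::RF_metrics metrics = ML::cross_validate(handle, &regressor, h_test, h_ref_labels,
                                            n_test_rows, n_cols, preds.data(), false);
metrics.print();  // prints MAE, MSE, median absolute error for RF_type::REGRESSION
```
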
diff --git a/cuML/src/randomforest/randomforest.h b/cuML/src/randomforest/randomforest.h
index f9d039e210..c612863dc7 100644
--- a/cuML/src/randomforest/randomforest.h
+++ b/cuML/src/randomforest/randomforest.h
@@ -25,17 +25,26 @@ namespace ML {
 
+enum RF_type {
+	CLASSIFICATION, REGRESSION,
+};
+
 struct RF_metrics {
-	float accuracy;
+	RF_type rf_type;
+
+	// Classification metrics
+	float accuracy = -1.0f;
+
+	// Regression metrics - TODO FIXME change the type?
+	double mean_abs_error = -1.0;
+	double mean_squared_error = -1.0;
+	double median_abs_error = -1.0;
 
 	RF_metrics(float cfg_accuracy);
+	RF_metrics(double cfg_mean_abs_error, double cfg_mean_squared_error, double cfg_median_abs_error);
 	void print();
 };
 
-enum RF_type {
-	CLASSIFICATION, REGRESSION,
-};
-
 struct RF_params {
 	/**
	 * Control bootstrapping. If set, each tree in the forest is built on a bootstrapped sample with replacement.
@@ -102,6 +111,18 @@ class rfClassifier : public rf<T> {
 
 };
 
+template <typename T>
+class rfRegressor : public rf<T> {
+	public:
+
+	rfRegressor(RF_params cfg_rf_params);
+
+	void fit(const cumlHandle& user_handle, T * input, int n_rows, int n_cols, T * labels);
+	void predict(const cumlHandle& user_handle, const T * input, int n_rows, int n_cols, T * predictions, bool verbose=false) const;
+	RF_metrics cross_validate(const cumlHandle& user_handle, const T * input, const T * ref_labels, int n_rows, int n_cols, T * predictions, bool verbose=false) const;
+
+};
+
 // Stateless API functions: fit, predict and cross_validate.
 void fit(const cumlHandle& user_handle, rfClassifier<float> * rf_classifier, float * input, int n_rows, int n_cols, int * labels, int n_unique_labels);
 void fit(const cumlHandle& user_handle, rfClassifier<double> * rf_classifier, double * input, int n_rows, int n_cols, int * labels, int n_unique_labels);
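
(Editor note between patches: the three regression metrics introduced in PATCH 01 mirror scikit-learn's `mean_absolute_error`, `mean_squared_error` and `median_absolute_error`. A self-contained toy version of the computation in `rfRegressor::cross_validate`, with the even-count median case worked out; this is illustration only, not repository code.)

```cpp
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

int main() {
	// Predictions vs. reference labels for 4 samples.
	std::vector<double> pred = {1.0, 2.0, 3.0, 5.0};
	std::vector<double> ref  = {1.5, 2.0, 2.0, 9.0};

	double abs_sum = 0, mse_sum = 0;
	std::vector<double> abs_diffs;
	for (size_t i = 0; i < pred.size(); i++) {
		double d = pred[i] - ref[i];
		abs_sum += std::abs(d);
		mse_sum += d * d;
		abs_diffs.push_back(std::abs(d));
	}
	int n = static_cast<int>(pred.size());
	double mae = abs_sum / n;  // (0.5 + 0 + 1 + 4) / 4   = 1.375
	double mse = mse_sum / n;  // (0.25 + 0 + 1 + 16) / 4 = 4.3125

	std::sort(abs_diffs.begin(), abs_diffs.end());  // 0, 0.5, 1, 4
	// n is even here, so average the two middle absolute differences.
	double median = (abs_diffs[n / 2] + abs_diffs[n / 2 - 1]) / 2;  // (1 + 0.5) / 2 = 0.75
	std::cout << mae << " " << mse << " " << median << std::endl;   // 1.375 4.3125 0.75
	return 0;
}
```
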
From af365707e5e1f613f1e38764240fd3f02e6aca10 Mon Sep 17 00:00:00 2001
From: Myrto Papadopoulou
Date: Tue, 7 May 2019 09:32:31 -0700
Subject: [PATCH 02/51] Added base dt class and DecisionTreeRegressor

- Added a base dt class, similar to what we had in rf.
- Added a DecisionTreeRegressor class. API only, no implementation for now.
---
 cuML/src/decisiontree/decisiontree.cu | 365 +++++++++++++++++++------
 cuML/src/decisiontree/decisiontree.h  | 110 +++++---
 cuML/src/randomforest/randomforest.cu |  10 -
 cuML/src/randomforest/randomforest.h  |   2 +-
 4 files changed, 350 insertions(+), 137 deletions(-)

diff --git a/cuML/src/decisiontree/decisiontree.cu b/cuML/src/decisiontree/decisiontree.cu
index 911e1bdf65..1ad633a6a2 100644
--- a/cuML/src/decisiontree/decisiontree.cu
+++ b/cuML/src/decisiontree/decisiontree.cu
@@ -86,6 +86,47 @@ void DecisionTreeParams::print() const {
 	std::cout << "min_rows_per_node: " << min_rows_per_node << std::endl;
 }
 
+
+/**
+ * @brief Print high-level tree information.
+ * @tparam T: data type for input data (float or double).
+ */
+template<typename T>
+void dt<T>::print_tree_summary() const {
+	std::cout << " Decision Tree depth --> " << depth_counter << " and n_leaves --> " << leaf_counter << std::endl;
+	std::cout << " Total temporary memory usage --> " << ((double)total_temp_mem / (1024*1024)) << " MB" << std::endl;
+	std::cout << " Tree growing time --> " << construct_time << " seconds" << std::endl;
+	std::cout << " Shared memory used --> " << shmem_used << " bytes " << std::endl;
+}
+
+/**
+ * @brief Print detailed tree information.
+ * @tparam T: data type for input data (float or double).
+ */
+template<typename T>
+void dt<T>::print() const {
+	print_tree_summary();
+	print_node("", this->root, false);
+}
+
+template<typename T>
+void dt<T>::print_node(const std::string& prefix, const TreeNode<T>* const node, bool isLeft) const {
+
+	if (node != nullptr) {
+		std::cout << prefix;
+
+		std::cout << (isLeft ? "├" : "└");
+
+		// print the value of the node
+		std::cout << node << std::endl;
+
+		// enter the next tree level - left and right branch
+		print_node(prefix + (isLeft ? "│   " : "    "), node->left, true);
+		print_node(prefix + (isLeft ? "│   " : "    "), node->right, false);
+	}
+}
+
+
 /**
  * @brief Build (i.e., fit, train) Decision Tree classifier for input data.
  * @tparam T: data type for input data (float or double).
@@ -105,6 +146,7 @@ template<typename T>
 void DecisionTreeClassifier<T>::fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, int *labels,
 		unsigned int *rowids, const int n_sampled_rows, int unique_labels, DecisionTreeParams tree_params) {
+
 	tree_params.validity_check();
 	if (tree_params.n_bins > n_sampled_rows) {
 		std::cout << "Warning! Calling with number of bins > number of rows! ";
@@ -115,6 +157,13 @@ void DecisionTreeClassifier<T>::fit(
 		tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features);
 }
 
+template<typename T>
+void DecisionTreeClassifier<T>::fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, T *labels,
+		unsigned int *rowids, const int n_sampled_rows, DecisionTreeParams tree_params) {
+	ASSERT(false, "Unsupported fit method for DecisionTreeClassifier");
+}
+
+
 /**
  * @brief Predict target feature for input data; n-ary classification for single feature supported. Inference of trees is CPU only for now.
  * @tparam T: data type for input data (float or double).
 * @param[in] handle: cumlHandle (currently unused; API placeholder)
 * @param[in] rows: test data (n_rows samples, n_cols features) in row major format. CPU pointer.
 * @param[in] n_rows: number of data samples.
 * @param[in] n_cols: number of features (excluding target feature).
 * @param[in,out] predictions: n_rows predicted labels. CPU pointer, user allocated.
 * @param[in] verbose: flag for debugging purposes.
 */
@@ -127,91 +176,74 @@ template<typename T>
 void DecisionTreeClassifier<T>::predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, int* predictions, bool verbose) const {
-	ASSERT(root, "Cannot predict w/ empty tree!");
+	ASSERT(this->root, "Cannot predict w/ empty tree!");
 	ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows);
 	ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols);
 	classify_all(rows, n_rows, n_cols, predictions, verbose);
 }
 
-/**
- * @brief Print high-level tree information.
- * @tparam T: data type for input data (float or double).
- */
-template<typename T>
-void DecisionTreeClassifier<T>::print_tree_summary() const {
-	std::cout << " Decision Tree depth --> " << depth_counter << " and n_leaves --> " << leaf_counter << std::endl;
-	std::cout << " Total temporary memory usage--> " << ((double)total_temp_mem / (1024*1024)) << " MB" << std::endl;
-	std::cout << " Tree growing time --> " << construct_time << " seconds" << std::endl;
-	std::cout << " Shared memory used --> " << shmem_used << " bytes " << std::endl;
-}
-
-/**
- * @brief Print detailed tree information.
- * @tparam T: data type for input data (float or double).
- */
 template<typename T>
-void DecisionTreeClassifier<T>::print() const {
-	print_tree_summary();
-	print_node("", root, false);
+void DecisionTreeClassifier<T>::predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, T* predictions, bool verbose) const {
+	ASSERT(false, "Unsupported predict method for DecisionTreeClassifier");
 }
 
 template<typename T>
 void DecisionTreeClassifier<T>::plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids, const int n_sampled_rows,
 		int unique_labels, int maxdepth, int max_leaf_nodes, const float colper, int n_bins, int split_algo_flag, int cfg_min_rows_per_node, bool cfg_bootstrap_features) {
 
-	split_algo = split_algo_flag;
-	dinfo.NLocalrows = nrows;
-	dinfo.NGlobalrows = nrows;
-	dinfo.Ncols = ncols;
-	nbins = n_bins;
-	treedepth = maxdepth;
-	maxleaves = max_leaf_nodes;
-	tempmem.resize(MAXSTREAMS);
-	n_unique_labels = unique_labels;
-	min_rows_per_node = cfg_min_rows_per_node;
-	bootstrap_features = cfg_bootstrap_features;
+	this->split_algo = split_algo_flag;
+	this->dinfo.NLocalrows = nrows;
+	this->dinfo.NGlobalrows = nrows;
+	this->dinfo.Ncols = ncols;
+	this->nbins = n_bins;
+	this->treedepth = maxdepth;
+	this->maxleaves = max_leaf_nodes;
+	this->tempmem.resize(this->MAXSTREAMS);
+	this->n_unique_labels = unique_labels;
+	this->min_rows_per_node = cfg_min_rows_per_node;
+	this->bootstrap_features = cfg_bootstrap_features;
 
 	//Bootstrap features
-	feature_selector.resize(dinfo.Ncols);
-	if (bootstrap_features) {
-		srand( n_bins );
-		for(int i=0; i < dinfo.Ncols; i++) {
-			feature_selector.push_back( rand() % dinfo.Ncols );
+	this->feature_selector.resize(this->dinfo.Ncols);
+	if (this->bootstrap_features) {
+		srand(n_bins);
+		for(int i=0; i < this->dinfo.Ncols; i++) {
+			this->feature_selector.push_back( rand() % this->dinfo.Ncols );
 		}
 	} else {
-		std::iota(feature_selector.begin(), feature_selector.end(), 0);
+		std::iota(this->feature_selector.begin(), this->feature_selector.end(), 0);
 	}
-
-	std::random_shuffle(feature_selector.begin(),feature_selector.end());
-	feature_selector.resize((int) (colper * dinfo.Ncols));
-
+
+	std::random_shuffle(this->feature_selector.begin(), this->feature_selector.end());
+	this->feature_selector.resize((int) (colper * this->dinfo.Ncols));
+
 	cudaDeviceProp prop;
 	CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));
-	max_shared_mem = prop.sharedMemPerBlock;
+	this->max_shared_mem = prop.sharedMemPerBlock;
 
-	if (split_algo == SPLIT_ALGO::HIST) {
-		shmem_used += 2 * sizeof(T) * ncols;
-		shmem_used += nbins * n_unique_labels * sizeof(int) * ncols;
+	if (this->split_algo == SPLIT_ALGO::HIST) {
+		this->shmem_used += 2 * sizeof(T) * ncols;
+		this->shmem_used += this->nbins * this->n_unique_labels * sizeof(int) * ncols;
 	} else {
-		shmem_used += nbins * n_unique_labels * sizeof(int) * ncols;
+		this->shmem_used += this->nbins * this->n_unique_labels * sizeof(int) * ncols;
 	}
-	ASSERT(shmem_used <= max_shared_mem, "Shared memory per block limit %zd , requested %zd \n", max_shared_mem, shmem_used);
+	ASSERT(this->shmem_used <= this->max_shared_mem, "Shared memory per block limit %zd , requested %zd \n", this->max_shared_mem, this->shmem_used);
 
-	for (int i = 0; i < MAXSTREAMS; i++) {
-		tempmem[i] = std::make_shared<TemporaryMemory<T>>(handle, n_sampled_rows, ncols, MAXSTREAMS, unique_labels, n_bins, split_algo);
-		if (split_algo == SPLIT_ALGO::GLOBAL_QUANTILE) {
-			preprocess_quantile(data, rowids, n_sampled_rows, ncols, dinfo.NLocalrows, n_bins, tempmem[i]);
+	for (int i = 0; i < this->MAXSTREAMS; i++) {
+		this->tempmem[i] = std::make_shared<TemporaryMemory<T>>(handle, n_sampled_rows, ncols, this->MAXSTREAMS, unique_labels, n_bins, this->split_algo);
+		if (this->split_algo == SPLIT_ALGO::GLOBAL_QUANTILE) {
+			preprocess_quantile(data, rowids, n_sampled_rows, ncols, this->dinfo.NLocalrows, n_bins, this->tempmem[i]);
 		}
 	}
-	total_temp_mem = tempmem[0]->totalmem;
-	total_temp_mem *= MAXSTREAMS;
+	this->total_temp_mem = this->tempmem[0]->totalmem;
+	this->total_temp_mem *= this->MAXSTREAMS;
 	GiniInfo split_info;
 	MLCommon::TimerCPU timer;
-	root = grow_tree(data, colper, labels, 0, rowids, n_sampled_rows, split_info);
-	construct_time = timer.getElapsedSeconds();
+	this->root = grow_tree(data, colper, labels, 0, rowids, n_sampled_rows, split_info);
+	this->construct_time = timer.getElapsedSeconds();
 
-	for (int i = 0; i < MAXSTREAMS; i++) {
-		tempmem[i].reset();
+	for (int i = 0; i < this->MAXSTREAMS; i++) {
+		this->tempmem[i].reset();
 	}
 
 	return;
@@ -229,26 +261,26 @@ TreeNode<T>* DecisionTreeClassifier<T>::grow_tree(T *data, const float colper, i
 	split_info[0] = prev_split_info;
 
 	bool condition = ((depth != 0) && (prev_split_info.best_gini == 0.0f));  // This node is a leaf, no need to search for best split
-	condition = condition || (n_sampled_rows < min_rows_per_node); // Do not split a node with less than min_rows_per_node samples
+	condition = condition || (n_sampled_rows < this->min_rows_per_node); // Do not split a node with less than min_rows_per_node samples
 
 	if (!condition) {
 		find_best_fruit_all(data, labels, colper, ques, gain, rowids, n_sampled_rows, &split_info[0], depth);	//ques and gain are output here
 		condition = condition || (gain == 0.0f);
 	}
 
-	if (treedepth != -1)
-		condition = (condition || (depth == treedepth));
+	if (this->treedepth != -1)
+		condition = (condition || (depth == this->treedepth));
 
-	if (maxleaves != -1)
-		condition = (condition || (leaf_counter >= maxleaves)); // FIXME not fully respecting maxleaves, but >= constrains it more than ==
+	if (this->maxleaves != -1)
+		condition = (condition || (this->leaf_counter >= this->maxleaves)); // FIXME not fully respecting maxleaves, but >= constrains it more than ==
 
 	if (condition) {
 		node->class_predict = get_class_hist(split_info[0].hist);
 		node->gini_val = split_info[0].best_gini;
 
-		leaf_counter++;
-		if (depth > depth_counter)
-			depth_counter = depth;
+		this->leaf_counter++;
+		if (depth > this->depth_counter)
+			this->depth_counter = depth;
 	} else {
 		int nrowsleft, nrowsright;
 		split_branch(data, ques, n_sampled_rows, nrowsleft, nrowsright, rowids); // populates ques.value
@@ -265,40 +297,40 @@ template<typename T>
 void DecisionTreeClassifier<T>::find_best_fruit_all(T *data, int *labels, const float colper, GiniQuestion<T> & ques, float& gain,
 		unsigned int* rowids, const int n_sampled_rows, GiniInfo split_info[3], int depth) {
 
-	std::vector<int>& colselector = feature_selector;
+	std::vector<int>& colselector = this->feature_selector;
 
 	// Optimize ginibefore; no need to compute except for root.
 	if (depth == 0) {
 		CUDA_CHECK(cudaHostRegister(colselector.data(), sizeof(int) * colselector.size(), cudaHostRegisterDefault));
 		// Copy sampled column IDs to device memory
-		CUDA_CHECK(cudaMemcpyAsync(tempmem[0]->d_colids->data(), colselector.data(), sizeof(int) * colselector.size(), cudaMemcpyHostToDevice, tempmem[0]->stream));
-		CUDA_CHECK(cudaStreamSynchronize(tempmem[0]->stream));
-
-		int *labelptr = tempmem[0]->sampledlabels->data();
-		get_sampled_labels(labels, labelptr, rowids, n_sampled_rows, tempmem[0]->stream);
-		gini(labelptr, n_sampled_rows, tempmem[0], split_info[0], n_unique_labels);
+		CUDA_CHECK(cudaMemcpyAsync(this->tempmem[0]->d_colids->data(), colselector.data(), sizeof(int) * colselector.size(), cudaMemcpyHostToDevice, this->tempmem[0]->stream));
+		CUDA_CHECK(cudaStreamSynchronize(this->tempmem[0]->stream));
+
+		int *labelptr = this->tempmem[0]->sampledlabels->data();
+		get_sampled_labels(labels, labelptr, rowids, n_sampled_rows, this->tempmem[0]->stream);
+		gini(labelptr, n_sampled_rows, this->tempmem[0], split_info[0], this->n_unique_labels);
 
 		//Unregister
 		CUDA_CHECK(cudaHostUnregister(colselector.data()));
 	}
-
-	int current_nbins = (n_sampled_rows < nbins) ? n_sampled_rows : nbins;
-	best_split_all_cols(data, rowids, labels, current_nbins, n_sampled_rows, n_unique_labels, dinfo.NLocalrows, colselector,
-		tempmem[0], &split_info[0], ques, gain, split_algo);
+
+	int current_nbins = (n_sampled_rows < this->nbins) ? n_sampled_rows : this->nbins;
+	best_split_all_cols(data, rowids, labels, current_nbins, n_sampled_rows, this->n_unique_labels, this->dinfo.NLocalrows, colselector,
+		this->tempmem[0], &split_info[0], ques, gain, this->split_algo);
 }
 
 template<typename T>
 void DecisionTreeClassifier<T>::split_branch(T *data, GiniQuestion<T> & ques, const int n_sampled_rows, int& nrowsleft,
 		int& nrowsright, unsigned int* rowids) {
 
-	T *temp_data = tempmem[0]->temp_data->data();
+	T *temp_data = this->tempmem[0]->temp_data->data();
 	T *sampledcolumn = &temp_data[n_sampled_rows * ques.bootstrapped_column];
-	make_split(sampledcolumn, ques, n_sampled_rows, nrowsleft, nrowsright, rowids, split_algo, tempmem[0]);
+	make_split(sampledcolumn, ques, n_sampled_rows, nrowsleft, nrowsright, rowids, this->split_algo, this->tempmem[0]);
 }
 
 template<typename T>
 void DecisionTreeClassifier<T>::classify_all(const T * rows, const int n_rows, const int n_cols, int* preds, bool verbose) const {
 	for (int row_id = 0; row_id < n_rows; row_id++) {
-		preds[row_id] = classify(&rows[row_id * n_cols], root, verbose);
+		preds[row_id] = classify(&rows[row_id * n_cols], this->root, verbose);
 	}
 	return;
 }
@@ -322,29 +354,108 @@ int DecisionTreeClassifier<T>::classify(const T * row, const TreeNode<T>* const
 	}
 }
 
+
+/**
+ * @brief Build (i.e., fit, train) Decision Tree regressor for input data.
+ * @tparam T: data type for input data (float or double).
+ * @param[in] handle: cumlHandle
+ * @param[in] data: train data (nrows samples, ncols features) in column major format, excluding labels. Device pointer.
+ * @param[in] ncols: number of features (i.e., columns) excluding target feature.
+ * @param[in] nrows: number of training data samples of the whole unsampled dataset.
+ * @param[in] labels: 1D array of target features (float or double). One label per training sample. Device pointer.
+ * @param[in,out] rowids: array of n_sampled_rows integers in [0, nrows) range. Device pointer.
+		The same array is then rearranged when splits are made, allowing us to construct trees without rearranging the actual dataset.
+ * @param[in] n_sampled_rows: number of training samples, after sampling. If using the decision tree directly over the whole dataset: n_sampled_rows = nrows.
+ * @param[in] tree_params: Decision Tree training hyper parameter struct.
+ */
 template<typename T>
-void DecisionTreeClassifier<T>::print_node(const std::string& prefix, const TreeNode<T>* const node, bool isLeft) const {
+void DecisionTreeRegressor<T>::fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, T *labels,
+		unsigned int *rowids, const int n_sampled_rows, DecisionTreeParams tree_params) {
+	tree_params.validity_check();
+	if (tree_params.n_bins > n_sampled_rows) {
+		std::cout << "Warning! Calling with number of bins > number of rows! ";
+		std::cout << "Resetting n_bins to " << n_sampled_rows << "." << std::endl;
+		tree_params.n_bins = n_sampled_rows;
+	}
+	// TODO FIXME placeholder
+	//plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, unique_labels, tree_params.max_depth,
+	//	tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features);
+}
 
-	if (node != nullptr) {
-		std::cout << prefix;
+template<typename T>
+void DecisionTreeRegressor<T>::fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, int *labels,
+		unsigned int *rowids, const int n_sampled_rows, int unique_labels, DecisionTreeParams tree_params) {
 
-		std::cout << (isLeft ? "├" : "└" );
+	ASSERT(false, "Unsupported fit method for DecisionTreeRegressor");
+}
 
-		// print the value of the node
-		std::cout << node << std::endl;
+/**
+ * @brief Predict target feature for input data; regression for single feature supported. Inference of trees is CPU only for now.
+ * @tparam T: data type for input data (float or double).
+ * @param[in] handle: cumlHandle (currently unused; API placeholder)
+ * @param[in] rows: test data (n_rows samples, n_cols features) in row major format. CPU pointer.
+ * @param[in] n_rows: number of data samples.
+ * @param[in] n_cols: number of features (excluding target feature).
+ * @param[in,out] predictions: n_rows predicted labels. CPU pointer, user allocated.
+ * @param[in] verbose: flag for debugging purposes.
+ */
+template<typename T>
+void DecisionTreeRegressor<T>::predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, T* predictions, bool verbose) const {
+	ASSERT(this->root, "Cannot predict w/ empty tree!");
+	ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows);
+	ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols);
+	predict_all(rows, n_rows, n_cols, predictions, verbose);
+}
 
-		// enter the next tree level - left and right branch
-		print_node( prefix + (isLeft ? "│   " : "    "), node->left, true);
-		print_node( prefix + (isLeft ? "│   " : "    "), node->right, false);
+template<typename T>
+void DecisionTreeRegressor<T>::predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, int* predictions, bool verbose) const {
+	ASSERT(false, "Unsupported predict method for DecisionTreeRegressor");
+}
+
+template<typename T>
+void DecisionTreeRegressor<T>::predict_all(const T * rows, const int n_rows, const int n_cols, T* preds, bool verbose) const {
+	for (int row_id = 0; row_id < n_rows; row_id++) {
+		preds[row_id] = predict(&rows[row_id * n_cols], this->root, verbose);
+	}
+}
+
+template<typename T>
+T DecisionTreeRegressor<T>::predict(const T * row, const TreeNode<T>* const node, bool verbose) const {
+
+	GiniQuestion<T> q = node->question;
+	if (node->left && (row[q.column] <= q.value)) {
+		if (verbose)
+			std::cout << "Descending left @ node w/ column " << q.column << " and value " << q.value << std::endl;
+		return predict(row, node->left, verbose);
+	} else if (node->right && (row[q.column] > q.value)) {
+		if (verbose)
+			std::cout << "Descending right @ node w/ column " << q.column << " and value " << q.value << std::endl;
+		return predict(row, node->right, verbose);
+	} else { // TODO FIXME class_predict should be of type T
+		if (verbose)
+			std::cout << "Leaf node. Predicting " << node->class_predict << std::endl;
+		return node->class_predict;
 	}
 }
 
+// ---------------- Regression end
+
 
 //Class specializations
+template class dt<float>;
+template class dt<double>;
+
 template class DecisionTreeClassifier<float>;
 template class DecisionTreeClassifier<double>;
 
+template class DecisionTreeRegressor<float>;
+template class DecisionTreeRegressor<double>;
+
 } //End namespace DecisionTree
 
+// Stateless API functions
+
+// ----------------------------- Classification ----------------------------------- //
 
 /**
  * @brief Build (i.e., fit, train) Decision Tree classifier for input data.
 * @param[in] handle: cumlHandle
 * @param[in,out] dt_classifier: Pointer to Decision Tree Classifier object. The object holds the trained tree.
 * @param[in] data: train data in float (nrows samples, ncols features) in column major format, excluding labels. Device pointer.
 * @param[in] ncols: number of features (i.e., columns) excluding target feature.
 * @param[in] nrows: number of training data samples of the whole unsampled dataset.
 * @param[in] labels: 1D array of target features (int only). One label per training sample. Device pointer.
 * @param[in,out] rowids: array of n_sampled_rows integers in [0, nrows) range. Device pointer.
 * @param[in] n_sampled_rows: number of training samples, after sampling. If using decision tree directly over the whole dataset: n_sampled_rows = nrows
 * @param[in] n_unique_labels: #unique label values. Number of categories of classification.
 * @param[in] tree_params: Decision Tree training hyper parameter struct
 */
-void fit(const ML::cumlHandle& handle, DecisionTree::DecisionTreeClassifier<float> * dt_classifier, float *data, const int ncols, const int nrows, int *labels, 
+void fit(const ML::cumlHandle& handle, DecisionTree::DecisionTreeClassifier<float> * dt_classifier, float *data, const int ncols, const int nrows, int *labels,
 		unsigned int *rowids, const int n_sampled_rows, int unique_labels, DecisionTree::DecisionTreeParams tree_params) {
 	dt_classifier->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows, unique_labels, tree_params);
 }
@@ -381,8 +492,8 @@ void fit(
 * @brief Build (i.e., fit, train) Decision Tree classifier for input data of type double.
- */
-void fit(const ML::cumlHandle& handle, DecisionTree::DecisionTreeClassifier<double> * dt_classifier, double *data, const int ncols, const int nrows, int *labels, 
+ */
+void fit(const ML::cumlHandle& handle, DecisionTree::DecisionTreeClassifier<double> * dt_classifier, double *data, const int ncols, const int nrows, int *labels,
 		unsigned int *rowids, const int n_sampled_rows, int unique_labels, DecisionTree::DecisionTreeParams tree_params) {
 	dt_classifier->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows, unique_labels, tree_params);
 }
@@ -390,7 +501,7 @@ void predict(const ML::cumlHandle& handle, const DecisionTree::DecisionTreeClassifier<float> * dt_classifier, const float * rows, const int n_rows, const int n_cols, int* predictions, bool verbose) {
 	return dt_classifier->predict(handle, rows, n_rows, n_cols, predictions, verbose);
 }
- 
+
 /**
  * @brief Predict target feature for input data; n-ary classification for single feature supported. Inference of trees is CPU only for now.
 * @param[in] handle: cumlHandle (currently unused; API placeholder)
- * @param[in] dt_classifier: Pointer to decision tree object, which holds the trained tree. 
+ * @param[in] dt_classifier: Pointer to decision tree object, which holds the trained tree.
 * @param[in] rows: test data type double (n_rows samples, n_cols features) in row major format. CPU pointer.
 * @param[in] n_rows: number of data samples.
 * @param[in] n_cols: number of features (excluding target feature).
 * @param[in,out] predictions: n_rows predicted labels. CPU pointer, user allocated.
 * @param[in] verbose: flag for debugging purposes.
 */
@@ -415,5 +526,69 @@ void predict(const ML::cumlHandle& handle, const DecisionTree::DecisionTreeClassifier<double> * dt_classifier, const double * rows, const int n_rows, const int n_cols, int* predictions, bool verbose) {
 	return dt_classifier->predict(handle, rows, n_rows, n_cols, predictions, verbose);
 }
 
+// ----------------------------- Regression ----------------------------------- //
+
+/**
+ * @brief Build (i.e., fit, train) Decision Tree regressor for input data.
+ * @param[in] handle: cumlHandle
+ * @param[in,out] dt_regressor: Pointer to Decision Tree Regressor object. The object holds the trained tree.
+ * @param[in] data: train data in float (nrows samples, ncols features) in column major format, excluding labels. Device pointer.
+ * @param[in] ncols: number of features (i.e., columns) excluding target feature.
+ * @param[in] nrows: number of training data samples of the whole unsampled dataset.
+ * @param[in] labels: 1D array of target features (type float). One label per training sample. Device pointer.
+ * @param[in,out] rowids: array of n_sampled_rows integers in [0, nrows) range; the same array is rearranged when splits are made,
+		which allows us to construct trees without rearranging the actual dataset. Device pointer.
+ * @param[in] n_sampled_rows: number of training samples, after sampling. If using the decision tree directly over the whole dataset: n_sampled_rows = nrows.
+ * @param[in] tree_params: Decision Tree training hyper parameter struct
+ */
+void fit(const ML::cumlHandle& handle, DecisionTree::DecisionTreeRegressor<float> * dt_regressor, float *data, const int ncols, const int nrows, float *labels,
+		unsigned int *rowids, const int n_sampled_rows, DecisionTree::DecisionTreeParams tree_params) {
+	dt_regressor->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows, tree_params);
+}
+
+/**
+ * @brief Build (i.e., fit, train) Decision Tree regressor for input data.
+ * @param[in] handle: cumlHandle
+ * @param[in,out] dt_regressor: Pointer to Decision Tree Regressor object. The object holds the trained tree.
+ * @param[in] data: train data in double (nrows samples, ncols features) in column major format, excluding labels. Device pointer.
+ * @param[in] ncols: number of features (i.e., columns) excluding target feature.
+ * @param[in] nrows: number of training data samples of the whole unsampled dataset.
+ * @param[in] labels: 1D array of target features (type double). One label per training sample. Device pointer.
+ * @param[in,out] rowids: array of n_sampled_rows integers in [0, nrows) range. Device pointer.
+		The same array is then rearranged when splits are made, allowing us to construct trees without rearranging the actual dataset.
+ * @param[in] n_sampled_rows: number of training samples, after sampling. If using the decision tree directly over the whole dataset: n_sampled_rows = nrows.
+ * @param[in] tree_params: Decision Tree training hyper parameter struct.
+ */
+void fit(const ML::cumlHandle& handle, DecisionTree::DecisionTreeRegressor<double> * dt_regressor, double *data, const int ncols, const int nrows, double *labels,
+		unsigned int *rowids, const int n_sampled_rows, DecisionTree::DecisionTreeParams tree_params) {
+	dt_regressor->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows, tree_params);
+}
+
+/**
+ * @brief Predict target feature for input data; regression for single feature supported. Inference of trees is CPU only for now.
+ * @param[in] handle: cumlHandle (currently unused; API placeholder)
+ * @param[in] dt_regressor: Pointer to decision tree object, which holds the trained tree.
+ * @param[in] rows: test data type float (n_rows samples, n_cols features) in row major format. CPU pointer.
+ * @param[in] n_rows: number of data samples.
+ * @param[in] n_cols: number of features (excluding target feature).
+ * @param[in,out] predictions: n_rows predicted labels. CPU pointer, user allocated.
+ * @param[in] verbose: flag for debugging purposes.
+ */
+void predict(const ML::cumlHandle& handle, const DecisionTree::DecisionTreeRegressor<float> * dt_regressor, const float * rows, const int n_rows, const int n_cols, float * predictions, bool verbose) {
+	return dt_regressor->predict(handle, rows, n_rows, n_cols, predictions, verbose);
+}
+
+/**
+ * @brief Predict target feature for input data; regression for single feature supported. Inference of trees is CPU only for now.
+ * @param[in] handle: cumlHandle (currently unused; API placeholder)
+ * @param[in] dt_regressor: Pointer to decision tree object, which holds the trained tree.
+ * @param[in] rows: test data type double (n_rows samples, n_cols features) in row major format. CPU pointer.
+ * @param[in] n_rows: number of data samples.
+ * @param[in] n_cols: number of features (excluding target feature).
+ * @param[in,out] predictions: n_rows predicted labels. CPU pointer, user allocated.
+ * @param[in] verbose: flag for debugging purposes.
+ */
+void predict(const ML::cumlHandle& handle, const DecisionTree::DecisionTreeRegressor<double> * dt_regressor, const double * rows, const int n_rows, const int n_cols, double * predictions, bool verbose) {
+	return dt_regressor->predict(handle, rows, n_rows, n_cols, predictions, verbose);
+}
 
 } //End namespace ML
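
(Editor note between files: the recursive descent in `DecisionTreeRegressor::predict` above follows the usual left-on-`<=`, right-on-`>` convention. A self-contained toy version of that traversal, using a simplified stand-in node type rather than the repository's `TreeNode<T>`:)

```cpp
#include <iostream>

// Simplified stand-in for TreeNode<T>: internal nodes test one feature
// against a threshold; leaves carry the prediction.
struct ToyNode {
	int column;         // feature index tested at this node
	double value;       // split threshold
	double prediction;  // meaningful only at leaves
	ToyNode *left = nullptr, *right = nullptr;
};

double predict_one(const double *row, const ToyNode *node) {
	if (node->left && row[node->column] <= node->value)
		return predict_one(row, node->left);   // descend left: feature <= threshold
	if (node->right && row[node->column] > node->value)
		return predict_one(row, node->right);  // descend right: feature > threshold
	return node->prediction;                   // leaf reached
}

int main() {
	ToyNode leaf_lo{0, 0.0, 1.0}, leaf_hi{0, 0.0, 9.0};
	ToyNode root{0, 0.5, 0.0, &leaf_lo, &leaf_hi};  // split on feature 0 at 0.5
	double row[1] = {0.7};
	std::cout << predict_one(row, &root) << std::endl;  // prints 9 (0.7 > 0.5)
	return 0;
}
```
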
diff --git a/cuML/src/decisiontree/decisiontree.h b/cuML/src/decisiontree/decisiontree.h
index 2f8e12e008..cf2d805989 100644
--- a/cuML/src/decisiontree/decisiontree.h
+++ b/cuML/src/decisiontree/decisiontree.h
@@ -84,7 +84,7 @@ struct DecisionTreeParams {
 	/**
	 * Whether to bootstrap columns with or without replacement.
	 */
 	bool bootstrap_features = false;
-
+
 	DecisionTreeParams();
 	DecisionTreeParams(int cfg_max_depth, int cfg_max_leaves, float cfg_max_features, int cfg_n_bins, int cfg_split_aglo, int cfg_min_rows_per_node, bool cfg_bootstrap_features);
 	void validity_check() const;
@@ -92,28 +92,46 @@ struct DecisionTreeParams {
 };
 
 template<typename T>
-class DecisionTreeClassifier {
+class dt {
+	protected:
+		int split_algo;
+		TreeNode<T> *root = nullptr;
+		int nbins;
+		DataInfo dinfo;
+		int treedepth;
+		int depth_counter = 0;
+		int maxleaves;
+		int leaf_counter = 0;
+		std::vector<std::shared_ptr<TemporaryMemory<T>>> tempmem;
+		size_t total_temp_mem;
+		const int MAXSTREAMS = 1;
+		size_t max_shared_mem;
+		size_t shmem_used = 0;
+		int n_unique_labels = -1; // number of unique labels in dataset
+		double construct_time;
+		int min_rows_per_node;
+		bool bootstrap_features;
+		std::vector<int> feature_selector;
+
+		void print_node(const std::string& prefix, const TreeNode<T>* const node, bool isLeft) const;
+
+	public:
+		// Printing utility for high level tree info.
+		void print_tree_summary() const;
+
+		// Printing utility for debug and looking at nodes and leaves.
+		void print() const;
+
+		virtual void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids,
+				const int n_sampled_rows, const int unique_labels, DecisionTreeParams tree_params) = 0;
+		virtual void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids,
+				const int n_sampled_rows, DecisionTreeParams tree_params) = 0;
+
+		virtual void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, int* predictions, bool verbose=false) const = 0;
+		virtual void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, T* predictions, bool verbose=false) const = 0;
+}; // End dt Class
 
-private:
-	int split_algo;
-	TreeNode<T> *root = nullptr;
-	int nbins;
-	DataInfo dinfo;
-	int treedepth;
-	int depth_counter = 0;
-	int maxleaves;
-	int leaf_counter = 0;
-	std::vector<std::shared_ptr<TemporaryMemory<T>>> tempmem;
-	size_t total_temp_mem;
-	const int MAXSTREAMS = 1;
-	size_t max_shared_mem;
-	size_t shmem_used = 0;
-	int n_unique_labels = -1; // number of unique labels in dataset
-	double construct_time;
-	int min_rows_per_node;
-	bool bootstrap_features;
-	std::vector<int> feature_selector;
-
+template<typename T>
+class DecisionTreeClassifier : public dt<T> {
 public:
 	// Expects column major T dataset, integer labels
 	// data, labels are both device ptr.
 	// Assumption: labels are all mapped to contiguous numbers starting from 0 during preprocessing. Needed for gini hist impl.
 	void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids,
 			const int n_sampled_rows, const int unique_labels, DecisionTreeParams tree_params);
+	void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids,
+			const int n_sampled_rows, DecisionTreeParams tree_params);
 
 	/* Predict labels for n_rows rows, with n_cols features each, for a given tree. rows in row-major format. */
 	void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, int* predictions, bool verbose=false) const;
-
-	// Printing utility for high level tree info.
-	void print_tree_summary() const;
-
-	// Printing utility for debug and looking at nodes and leaves.
-	void print() const;
+	void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, T* predictions, bool verbose=false) const;
 
 private:
 	// Same as above fit, but planting is better for a tree than fitting.
 	void plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids,
 			const int n_sampled_rows, int unique_labels, int maxdepth, int max_leaf_nodes, const float colper,
 			int n_bins, int split_algo_flag, int cfg_min_rows_per_node, bool cfg_bootstrap_features);
 
	TreeNode<T>* grow_tree(T *data, const float colper, int *labels, int depth, unsigned int* rowids, const int n_sampled_rows, GiniInfo prev_split_info);
	void find_best_fruit_all(T *data, int *labels, const float colper, GiniQuestion<T> & ques, float& gain, unsigned int* rowids,
			const int n_sampled_rows, GiniInfo split_info[3], int depth);
 	void split_branch(T *data, GiniQuestion<T> & ques, const int n_sampled_rows, int& nrowsleft, int& nrowsright, unsigned int* rowids);
 	void classify_all(const T * rows, const int n_rows, const int n_cols, int* preds, bool verbose=false) const;
 	int classify(const T * row, const TreeNode<T> * const node, bool verbose=false) const;
-	void print_node(const std::string& prefix, const TreeNode<T>* const node, bool isLeft) const;
-}; // End DecisionTree Class
+}; // End DecisionTreeClassifier Class
+
+template<typename T>
+class DecisionTreeRegressor : public dt<T> {
+public:
+	void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids,
+			const int n_sampled_rows, DecisionTreeParams tree_params);
+
+	void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids,
+			const int n_sampled_rows, const int unique_labels, DecisionTreeParams tree_params);
+
+	/* Predict labels for n_rows rows, with n_cols features each, for a given tree. rows in row-major format. */
+	void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, T* predictions, bool verbose=false) const;
+	void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, int* predictions, bool verbose=false) const;
+
+// TODO FIXME: add private methods from DecisionTreeClassifier as needed
+private:
+	void predict_all(const T * rows, const int n_rows, const int n_cols, T * preds, bool verbose=false) const;
+	T predict(const T * row, const TreeNode<T> * const node, bool verbose=false) const; // TODO FIXME rename so it's not overloaded? Or pull to base class?
+}; // End DecisionTreeRegressor Class
 
 } //End namespace DecisionTree
 
 // Stateless API functions
 
+// ----------------------------- Classification ----------------------------------- //
+
 void fit(const ML::cumlHandle& handle, DecisionTree::DecisionTreeClassifier<float> * dt_classifier, float *data, const int ncols, const int nrows, int *labels,
 		unsigned int *rowids, const int n_sampled_rows, int unique_labels, DecisionTree::DecisionTreeParams tree_params);
 
 void fit(const ML::cumlHandle& handle, DecisionTree::DecisionTreeClassifier<double> * dt_classifier, double *data, const int ncols, const int nrows, int *labels,
 		unsigned int *rowids, const int n_sampled_rows, int unique_labels, DecisionTree::DecisionTreeParams tree_params);
 
 void predict(const ML::cumlHandle& handle, const DecisionTree::DecisionTreeClassifier<float> * dt_classifier, const float * rows,
 		const int n_rows, const int n_cols, int* predictions, bool verbose=false);
 void predict(const ML::cumlHandle& handle, const DecisionTree::DecisionTreeClassifier<double> * dt_classifier, const double * rows,
 		const int n_rows, const int n_cols, int* predictions, bool verbose=false);
 
+// ----------------------------- Regression ----------------------------------- //
+
+void fit(const ML::cumlHandle& handle, DecisionTree::DecisionTreeRegressor<float> * dt_regressor, float *data, const int ncols, const int nrows, float *labels,
+		unsigned int *rowids, const int n_sampled_rows, DecisionTree::DecisionTreeParams tree_params);
+
+void fit(const ML::cumlHandle& handle, DecisionTree::DecisionTreeRegressor<double> * dt_regressor, double *data, const int ncols, const int nrows, double *labels,
+		unsigned int *rowids, const int n_sampled_rows, DecisionTree::DecisionTreeParams tree_params);
+
+void predict(const ML::cumlHandle& handle, const DecisionTree::DecisionTreeRegressor<float> * dt_regressor, const float * rows,
+		const int n_rows, const int n_cols, float * predictions, bool verbose=false);
+void predict(const ML::cumlHandle& handle, const DecisionTree::DecisionTreeRegressor<double> * dt_regressor, const double * rows,
+		const int n_rows, const int n_cols, double * predictions, bool verbose=false);
+
 } //End namespace ML
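
(Review note: a minimal sketch of how the stateless DecisionTreeRegressor entry points declared above would be called. The buffers are hypothetical; `d_data`/`d_labels`/`d_rowids` are assumed device-resident and `h_test` host-resident, per the fit/predict docs.)

```cpp
// Hypothetical usage sketch, assuming a valid ML::cumlHandle `handle`.
DecisionTree::DecisionTreeRegressor<float> dt_regressor;
DecisionTree::DecisionTreeParams tree_params;  // defaults from DecisionTreeParams()

// d_data: column-major (nrows x ncols) device pointer; d_labels, d_rowids: device pointers.
// Training directly on the full dataset here, so n_sampled_rows == nrows.
ML::fit(handle, &dt_regressor, d_data, ncols, nrows, d_labels, d_rowids, nrows, tree_params);

// Inference is CPU-side for now: h_test is row-major on the host.
std::vector<float> preds(n_test);
ML::predict(handle, &dt_regressor, h_test, n_test, ncols, preds.data(), /*verbose=*/false);
```
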
diff --git a/cuML/src/randomforest/randomforest.cu b/cuML/src/randomforest/randomforest.cu
index f2a4b8a1eb..36c98a37c1 100644
--- a/cuML/src/randomforest/randomforest.cu
+++ b/cuML/src/randomforest/randomforest.cu
@@ -382,7 +382,6 @@ void rfRegressor<T>::fit(const cumlHandle& user_handle, T * input, int n_rows, i
 	ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows);
 	ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols);
 
-#if 0 // commented out as DecisionTreeRegressor not implemented
 	rfRegressor<T>::trees = new DecisionTree::DecisionTreeRegressor<T>[this->rf_params.n_trees];
 	int n_sampled_rows = this->rf_params.rows_sample * n_rows;
 
@@ -416,8 +415,6 @@ void rfRegressor<T>::fit(const cumlHandle& user_handle, T * input, int n_rows, i
 		//Cleanup
 		selected_rows.release(stream);
 	}
-#endif
-
 }
 
 /**
@@ -438,7 +435,6 @@ void rfRegressor<T>::predict(const cumlHandle& user_handle, const T * input, int
 	ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols);
 	ASSERT(predictions != nullptr, "Error! User has not allocated memory for predictions.");
 
-#if 0 // commented out as DecisionTreeRegressor not implemented
 	int row_size = n_cols;
 
 	for (int row_id = 0; row_id < n_rows; row_id++) {
@@ -465,7 +461,6 @@ void rfRegressor<T>::predict(const cumlHandle& user_handle, const T * input, int
 		// Random forest's prediction is the arithmetic mean of all its decision tree predictions.
 		predictions[row_id] = sum_predictions / this->rf_params.n_trees;
 	}
-#endif
 }
 
 /**
@@ -484,7 +479,6 @@ RF_metrics rfRegressor<T>::cross_validate(const cumlHandle& user_handle, const T
 
 	predict(user_handle, input, n_rows, n_cols, predictions, verbose);
 
-#if 0 // commented out as DecisionTreeRegressor not implemented
 	double abs_difference_sum = 0;
 	double mse_sum = 0;
 	std::vector<double> abs_diffs;
@@ -512,12 +506,8 @@ RF_metrics rfRegressor<T>::cross_validate(const cumlHandle& user_handle, const T
 	if (verbose) stats.print();
 
 	return stats;
-#endif
-	RF_metrics placeholder(0); // temporary, so that the code compiles. TODO: remove once the code above is uncommented.
-	return placeholder;
 }
 
-//=================== regression end
 
 template class rf<float>;
 template class rf<double>;
diff --git a/cuML/src/randomforest/randomforest.h b/cuML/src/randomforest/randomforest.h
index c612863dc7..37c8b6dff7 100644
--- a/cuML/src/randomforest/randomforest.h
+++ b/cuML/src/randomforest/randomforest.h
@@ -88,7 +88,7 @@ class rf {
 	protected:
 		RF_params rf_params;
 		int rf_type;
-		DecisionTree::DecisionTreeClassifier<T> * trees;
+		DecisionTree::dt<T> * trees;
 
 	public:
 		rf(RF_params cfg_rf_params, int cfg_rf_type=RF_type::CLASSIFICATION);
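
(Editor note between patches: the row sampling that PATCH 02 enables in `rfRegressor::fit` comes in the two flavors shown in the hunks above — with replacement via the device RNG, or without replacement via iota/shuffle/truncate on the host. A standalone CPU-only sketch of the same logic, using `std::mt19937` in place of `MLCommon::Random::Rng`:)

```cpp
#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

// Returns n_sampled_rows row IDs drawn from [0, n_rows).
std::vector<unsigned int> sample_rows(int n_rows, int n_sampled_rows,
                                      bool bootstrap, unsigned int seed) {
	std::mt19937 gen(seed);  // per-tree seed, analogous to Rng(i * 1000)
	std::vector<unsigned int> rows;
	if (bootstrap) {
		// With replacement: mirrors r.uniformInt over [0, n_rows).
		std::uniform_int_distribution<unsigned int> dist(0, n_rows - 1);
		for (int i = 0; i < n_sampled_rows; i++) rows.push_back(dist(gen));
	} else {
		// Without replacement: iota + shuffle + truncate, as in the patch.
		rows.resize(n_rows);
		std::iota(rows.begin(), rows.end(), 0);
		std::shuffle(rows.begin(), rows.end(), gen);
		rows.resize(n_sampled_rows);
	}
	return rows;
}
```
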
From 545582d5dbf9668197119a3ae7ba61d5ca5a5288 Mon Sep 17 00:00:00 2001
From: Myrto Papadopoulou
Date: Wed, 8 May 2019 05:56:26 -0700
Subject: [PATCH 03/51] More class updates.

- Removed the unsupported fit/predict overloads and the pure virtual methods
  from the dt base class.
- rf no longer owns the trees pointer; each subclass allocates and frees its
  own concrete trees and exposes them via get_trees_ptr().
---
 cuML/src/decisiontree/decisiontree.cu | 23 --------
 cuML/src/decisiontree/decisiontree.h  | 17 +----
 cuML/src/randomforest/randomforest.cu | 80 ++++++++++++++++++---------
 cuML/src/randomforest/randomforest.h  | 29 ++++++++--
 4 files changed, 81 insertions(+), 68 deletions(-)

diff --git a/cuML/src/decisiontree/decisiontree.cu b/cuML/src/decisiontree/decisiontree.cu
index 1ad633a6a2..f66eacea83 100644
--- a/cuML/src/decisiontree/decisiontree.cu
+++ b/cuML/src/decisiontree/decisiontree.cu
@@ -157,13 +157,6 @@ void DecisionTreeClassifier<T>::fit(
 		tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features);
 }
 
-template<typename T>
-void DecisionTreeClassifier<T>::fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, T *labels,
-		unsigned int *rowids, const int n_sampled_rows, DecisionTreeParams tree_params) {
-	ASSERT(false, "Unsupported fit method for DecisionTreeClassifier");
-}
-
-
 /**
  * @brief Predict target feature for input data; n-ary classification for single feature supported. Inference of trees is CPU only for now.
  * @tparam T: data type for input data (float or double).
@@ -182,10 +175,6 @@ void DecisionTreeClassifier<T>::predict(
 	classify_all(rows, n_rows, n_cols, predictions, verbose);
 }
 
-template<typename T>
-void DecisionTreeClassifier<T>::predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, T* predictions, bool verbose) const {
-	ASSERT(false, "Unsupported predict method for DecisionTreeClassifier");
-}
 
 template<typename T>
 void DecisionTreeClassifier<T>::plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids, const int n_sampled_rows,
@@ -382,7 +371,6 @@ void DecisionTreeRegressor<T>::fit(
 	//	tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features);
 }
 
-template<typename T>
-void DecisionTreeRegressor<T>::fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, int *labels,
-		unsigned int *rowids, const int n_sampled_rows, int unique_labels, DecisionTreeParams tree_params) {
-
-	ASSERT(false, "Unsupported fit method for DecisionTreeRegressor");
-}
-
 /**
  * @brief Predict target feature for input data; regression for single feature supported. Inference of trees is CPU only for now.
  * @tparam T: data type for input data (float or double).
@@ -407,11 +389,6 @@ void DecisionTreeRegressor<T>::predict(
 	predict_all(rows, n_rows, n_cols, predictions, verbose);
 }
 
-template<typename T>
-void DecisionTreeRegressor<T>::predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, int* predictions, bool verbose) const {
-	ASSERT(false, "Unsupported predict method for DecisionTreeRegressor");
-}
-
 template<typename T>
 void DecisionTreeRegressor<T>::predict_all(const T * rows, const int n_rows, const int n_cols, T* preds, bool verbose) const {
 	for (int row_id = 0; row_id < n_rows; row_id++) {
diff --git a/cuML/src/decisiontree/decisiontree.h b/cuML/src/decisiontree/decisiontree.h
index cf2d805989..5b5ce7bfe7 100644
--- a/cuML/src/decisiontree/decisiontree.h
+++ b/cuML/src/decisiontree/decisiontree.h
@@ -121,13 +121,6 @@ class dt {
 		// Printing utility for debug and looking at nodes and leaves.
 		void print() const;
 
-		virtual void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids,
-				const int n_sampled_rows, const int unique_labels, DecisionTreeParams tree_params) = 0;
-		virtual void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids,
-				const int n_sampled_rows, DecisionTreeParams tree_params) = 0;
-
-		virtual void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, int* predictions, bool verbose=false) const = 0;
-		virtual void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, T* predictions, bool verbose=false) const = 0;
 }; // End dt Class
 
@@ -139,10 +132,8 @@ class DecisionTreeClassifier : public dt<T> {
 	void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids,
 			const int n_sampled_rows, const int unique_labels, DecisionTreeParams tree_params);
-	void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids,
-			const int n_sampled_rows, DecisionTreeParams tree_params);
 
+	/* Predict labels for n_rows rows, with n_cols features each, for a given tree. rows in row-major format. */
 	void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, int* predictions, bool verbose=false) const;
-	void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, T* predictions, bool verbose=false) const;
 
 private:
 	// Same as above fit, but planting is better for a tree than fitting.
@@ -165,17 +156,13 @@ class DecisionTreeRegressor : public dt<T> {
 	void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids,
 			const int n_sampled_rows, DecisionTreeParams tree_params);
 
-	void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids,
-			const int n_sampled_rows, const int unique_labels, DecisionTreeParams tree_params);
-
 	/* Predict labels for n_rows rows, with n_cols features each, for a given tree. rows in row-major format. */
 	void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, T* predictions, bool verbose=false) const;
-	void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, int* predictions, bool verbose=false) const;
 
 // TODO FIXME: add private methods from DecisionTreeClassifier as needed
 private:
 	void predict_all(const T * rows, const int n_rows, const int n_cols, T * preds, bool verbose=false) const;
-	T predict(const T * row, const TreeNode<T> * const node, bool verbose=false) const;
+	T predict(const T * row, const TreeNode<T> * const node, bool verbose=false) const; // TODO FIXME rename so it's not overloaded? Or pull to base class?
 }; // End DecisionTreeRegressor Class
 
 } //End namespace DecisionTree
@@ -194,6 +185,7 @@ void rf::print_rf_summary() { template void rf::print_rf_detailed() { + const DecisionTree::dt * trees = get_trees_ptr(); if (!trees) { std::cout << "Empty forest" << std::endl; } else { @@ -215,6 +207,25 @@ void rf::print_rf_detailed() { template rfClassifier::rfClassifier(RF_params cfg_rf_params): rf::rf(cfg_rf_params, RF_type::CLASSIFICATION) {}; +/** + * @brief Destructor for random forest classifier object. + * @tparam T: data type for input data (float or double). + */ +template +rfClassifier::~rfClassifier() { + delete [] trees; +} + + +/** + * @brief Return a const pointer to decision trees. + * @tparam T: data type for input data (float or double). + */ +template +const DecisionTree::DecisionTreeClassifier * rfClassifier::get_trees_ptr() const { + return trees; +} + /** * @brief Build (i.e., fit, train) random forest classifier for input data. * @tparam T: data type for input data (float or double). @@ -230,11 +241,11 @@ rfClassifier::rfClassifier(RF_params cfg_rf_params): rf::rf(cfg_rf_params, template void rfClassifier::fit(const cumlHandle& user_handle, T * input, int n_rows, int n_cols, int * labels, int n_unique_labels) { - ASSERT(!this->trees, "Cannot fit an existing forest."); + ASSERT(!trees, "Cannot fit an existing forest."); ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows); ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); - rfClassifier::trees = new DecisionTree::DecisionTreeClassifier[this->rf_params.n_trees]; + trees = new DecisionTree::DecisionTreeClassifier[this->rf_params.n_trees]; int n_sampled_rows = this->rf_params.rows_sample * n_rows; const cumlHandle_impl& handle = user_handle.getImpl(); @@ -262,7 +273,7 @@ void rfClassifier::fit(const cumlHandle& user_handle, T * input, int n_rows, - selected_rows: points to a list of row #s (w/ n_sampled_rows elements) used to build the bootstrapped sample. Expectation: Each tree node will contain (a) # n_sampled_rows and (b) a pointer to a list of row numbers w.r.t original data. */ - this->trees[i].fit(user_handle, input, n_cols, n_rows, labels, selected_rows.data(), n_sampled_rows, n_unique_labels, this->rf_params.tree_params); + trees[i].fit(user_handle, input, n_cols, n_rows, labels, selected_rows.data(), n_sampled_rows, n_unique_labels, this->rf_params.tree_params); //Cleanup selected_rows.release(stream); @@ -283,7 +294,7 @@ void rfClassifier::fit(const cumlHandle& user_handle, T * input, int n_rows, template void rfClassifier::predict(const cumlHandle& user_handle, const T * input, int n_rows, int n_cols, int * predictions, bool verbose) const { - ASSERT(this->trees, "Cannot predict! No trees in the forest."); + ASSERT(trees, "Cannot predict! No trees in the forest."); ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows); ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); ASSERT(predictions != nullptr, "Error! User has not allocated memory for predictions."); @@ -308,10 +319,10 @@ void rfClassifier::predict(const cumlHandle& user_handle, const T * input, in //Return prediction for one sample. 
if (verbose) { std::cout << "Printing tree " << i << std::endl; - this->trees[i].print(); + trees[i].print(); } int prediction; - this->trees[i].predict(user_handle, &input[row_id * row_size], 1, n_cols, &prediction, verbose); + trees[i].predict(user_handle, &input[row_id * row_size], 1, n_cols, &prediction, verbose); ret = prediction_to_cnt.insert(std::pair(prediction, 1)); if (!(ret.second)) { ret.first->second += 1; @@ -366,6 +377,24 @@ RF_metrics rfClassifier::cross_validate(const cumlHandle& user_handle, const template rfRegressor::rfRegressor(RF_params cfg_rf_params): rf::rf(cfg_rf_params, RF_type::REGRESSION) {}; +/** + * @brief Destructor for random forest regressor object. + * @tparam T: data type for input data (float or double). + */ +template +rfRegressor::~rfRegressor() { + delete [] trees; +} + +/** + * @brief Return a const pointer to decision trees. + * @tparam T: data type for input data (float or double). + */ +template +const DecisionTree::DecisionTreeRegressor * rfRegressor::get_trees_ptr() const { + return trees; +} + /** * @brief Build (i.e., fit, train) random forest regressor for input data. * @tparam T: data type for input data (float or double). @@ -378,11 +407,11 @@ rfRegressor::rfRegressor(RF_params cfg_rf_params): rf::rf(cfg_rf_params, R template void rfRegressor::fit(const cumlHandle& user_handle, T * input, int n_rows, int n_cols, T * labels) { - ASSERT(!this->trees, "Cannot fit an existing forest."); + ASSERT(!trees, "Cannot fit an existing forest."); ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows); ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); - rfRegressor::trees = new DecisionTree::DecisionTreeRegressor[this->rf_params.n_trees]; + trees = new DecisionTree::DecisionTreeRegressor[this->rf_params.n_trees]; int n_sampled_rows = this->rf_params.rows_sample * n_rows; const cumlHandle_impl& handle = user_handle.getImpl(); @@ -410,7 +439,7 @@ void rfRegressor::fit(const cumlHandle& user_handle, T * input, int n_rows, i - selected_rows: points to a list of row #s (w/ n_sampled_rows elements) used to build the bootstrapped sample. Expectation: Each tree node will contain (a) # n_sampled_rows and (b) a pointer to a list of row numbers w.r.t original data. */ - this->trees[i].fit(user_handle, input, n_cols, n_rows, labels, selected_rows.data(), n_sampled_rows, /*n_unique_labels,*/ this->rf_params.tree_params); + trees[i].fit(user_handle, input, n_cols, n_rows, labels, selected_rows.data(), n_sampled_rows, /*n_unique_labels,*/ this->rf_params.tree_params); //Cleanup selected_rows.release(stream); @@ -430,7 +459,7 @@ void rfRegressor::fit(const cumlHandle& user_handle, T * input, int n_rows, i template void rfRegressor::predict(const cumlHandle& user_handle, const T * input, int n_rows, int n_cols, T * predictions, bool verbose) const { - ASSERT(this->trees, "Cannot predict! No trees in the forest."); + ASSERT(trees, "Cannot predict! No trees in the forest."); ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows); ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); ASSERT(predictions != nullptr, "Error! User has not allocated memory for predictions."); @@ -452,10 +481,10 @@ void rfRegressor::predict(const cumlHandle& user_handle, const T * input, int //Return prediction for one sample. 
if (verbose) { std::cout << "Printing tree " << i << std::endl; - this->trees[i].print(); + trees[i].print(); } T prediction; - this->trees[i].predict(user_handle, &input[row_id * row_size], 1, n_cols, &prediction, verbose); + trees[i].predict(user_handle, &input[row_id * row_size], 1, n_cols, &prediction, verbose); sum_predictions += prediction; } // Random forest's prediction is the arithmetic mean of all its decision tree predictions. @@ -508,7 +537,6 @@ RF_metrics rfRegressor::cross_validate(const cumlHandle& user_handle, const T return stats; } - template class rf; template class rf; diff --git a/cuML/src/randomforest/randomforest.h b/cuML/src/randomforest/randomforest.h index 37c8b6dff7..55d7fd9af1 100644 --- a/cuML/src/randomforest/randomforest.h +++ b/cuML/src/randomforest/randomforest.h @@ -88,11 +88,10 @@ class rf { protected: RF_params rf_params; int rf_type; - DecisionTree::dt * trees; + virtual const DecisionTree::dt * get_trees_ptr() const = 0; public: rf(RF_params cfg_rf_params, int cfg_rf_type=RF_type::CLASSIFICATION); - ~rf(); int get_ntrees(); void print_rf_summary(); @@ -101,29 +100,38 @@ class rf { template class rfClassifier : public rf { + private: + DecisionTree::DecisionTreeClassifier * trees = nullptr; + const DecisionTree::DecisionTreeClassifier * get_trees_ptr() const; public: rfClassifier(RF_params cfg_rf_params); + ~rfClassifier(); void fit(const cumlHandle& user_handle, T * input, int n_rows, int n_cols, int * labels, int n_unique_labels); void predict(const cumlHandle& user_handle, const T * input, int n_rows, int n_cols, int * predictions, bool verbose=false) const; RF_metrics cross_validate(const cumlHandle& user_handle, const T * input, const int * ref_labels, int n_rows, int n_cols, int * predictions, bool verbose=false) const; - }; template class rfRegressor : public rf { + private: + DecisionTree::DecisionTreeRegressor * trees = nullptr; + const DecisionTree::DecisionTreeRegressor * get_trees_ptr() const; public: rfRegressor(RF_params cfg_rf_params); + ~rfRegressor(); void fit(const cumlHandle& user_handle, T * input, int n_rows, int n_cols, T * labels); void predict(const cumlHandle& user_handle, const T * input, int n_rows, int n_cols, T * predictions, bool verbose=false) const; RF_metrics cross_validate(const cumlHandle& user_handle, const T * input, const T * ref_labels, int n_rows, int n_cols, T * predictions, bool verbose=false) const; - }; // Stateless API functions: fit, predict and cross_validate. 
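// Editor's illustration (not part of this header): typical driver code for the
// stateless API declared below; handle, rf_params and the buffer names are
// hypothetical placeholders.
//   ML::rfRegressor<float> rf_reg(rf_params);
//   ML::fit(handle, &rf_reg, d_input, n_rows, n_cols, d_labels);           // device pointers
//   ML::predict(handle, &rf_reg, h_test, n_test_rows, n_cols, h_preds);    // host pointers
//   RF_metrics mets = ML::cross_validate(handle, &rf_reg, h_test, h_ref,
//                                        n_test_rows, n_cols, h_preds);
//   mets.print();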
+
+// ----------------------------- Classification ----------------------------------- //
+
 void fit(const cumlHandle& user_handle, rfClassifier * rf_classifier, float * input, int n_rows, int n_cols, int * labels, int n_unique_labels);
 void fit(const cumlHandle& user_handle, rfClassifier * rf_classifier, double * input, int n_rows, int n_cols, int * labels, int n_unique_labels);
@@ -135,4 +143,17 @@ RF_metrics cross_validate(const cumlHandle& user_handle, const rfClassifier * rf_classifier, const double * input, const int * ref_labels,
 int n_rows, int n_cols, int * predictions, bool verbose=false);
+// ----------------------------- Regression ----------------------------------- //
+
+void fit(const cumlHandle& user_handle, rfRegressor * rf_regressor, float * input, int n_rows, int n_cols, float * labels);
+void fit(const cumlHandle& user_handle, rfRegressor * rf_regressor, double * input, int n_rows, int n_cols, double * labels);
+
+void predict(const cumlHandle& user_handle, const rfRegressor * rf_regressor, const float * input, int n_rows, int n_cols, float * predictions, bool verbose=false);
+void predict(const cumlHandle& user_handle, const rfRegressor * rf_regressor, const double * input, int n_rows, int n_cols, double * predictions, bool verbose=false);
+
+RF_metrics cross_validate(const cumlHandle& user_handle, const rfRegressor * rf_regressor, const float * input, const float * ref_labels,
+ int n_rows, int n_cols, float * predictions, bool verbose=false);
+RF_metrics cross_validate(const cumlHandle& user_handle, const rfRegressor * rf_regressor, const double * input, const double * ref_labels,
+ int n_rows, int n_cols, double * predictions, bool verbose=false);
+
 };

From 8c5dd25e9d606fe6b7bdfd2173f3d9746ef16ef3 Mon Sep 17 00:00:00 2001
From: Myrto Papadopoulou
Date: Wed, 8 May 2019 07:37:56 -0700
Subject: [PATCH 04/51] TreeNode updates & more dt class changes.

-Further templated TreeNode to work for both regression and classification.
-Moved all predict methods to the base dt class.

---
 cuML/src/decisiontree/decisiontree.cu | 189 +++++++++++---------------
 cuML/src/decisiontree/decisiontree.h | 35 ++---
 cuML/src/randomforest/randomforest.cu | 34 +++--
 cuML/src/randomforest/randomforest.h | 8 +-
 4 files changed, 115 insertions(+), 151 deletions(-)

diff --git a/cuML/src/decisiontree/decisiontree.cu b/cuML/src/decisiontree/decisiontree.cu
index f66eacea83..16ffde7946 100644
--- a/cuML/src/decisiontree/decisiontree.cu
+++ b/cuML/src/decisiontree/decisiontree.cu
@@ -31,19 +31,19 @@ void Question::update(const GiniQuestion & ques) {
 value = ques.value;
 }

-template
-void TreeNode::print(std::ostream& os) const {
-
- if (left == nullptr && right == nullptr)
- os << "(leaf, " << class_predict << ", " << gini_val << ")" ;
- else
- os << "(" << question.column << ", " << question.value << ", " << gini_val << ")" ;
+template
+void TreeNode::print(std::ostream& os) const {
+ if (left == nullptr && right == nullptr) {
+ os << "(leaf, " << prediction << ", " << split_metric_val << ")" ;
+ } else {
+ os << "(" << question.column << ", " << question.value << ", " << split_metric_val << ")" ;
+ }
 return;
 }

-template
-std::ostream& operator<<(std::ostream& os, const TreeNode * const node) {
+template
+std::ostream& operator<<(std::ostream& os, const TreeNode * const node) {
 node->print(os);
 return os;
 }
@@ -90,9 +90,10 @@ void DecisionTreeParams::print() const {

 /**
 * @brief Print high-level tree information.
 * @tparam T: data type for input data (float or double).
+ * @tparam L: data type for labels (int type for classification, T type for regression). */ -template -void dt::print_tree_summary() const { +template +void dt::print_tree_summary() const { std::cout << " Decision Tree depth --> " << depth_counter << " and n_leaves --> " << leaf_counter << std::endl; std::cout << " Total temporary memory usage--> "<< ((double)total_temp_mem / (1024*1024)) << " MB" << std::endl; std::cout << " Tree growing time --> " << construct_time << " seconds" << std::endl; @@ -102,15 +103,17 @@ void dt::print_tree_summary() const { /** * @brief Print detailed tree information. * @tparam T: data type for input data (float or double). + * @tparam L: data type for labels (int type for classification, T type for regression). */ -template -void dt::print() const { +template +void dt::print() const { print_tree_summary(); print_node("", this->root, false); } -template -void dt::print_node(const std::string& prefix, const TreeNode* const node, bool isLeft) const { + +template +void dt::print_node(const std::string& prefix, const TreeNode* const node, bool isLeft) const { if (node != nullptr) { std::cout << prefix; @@ -127,6 +130,55 @@ void dt::print_node(const std::string& prefix, const TreeNode* const node, } +/** + * @brief Predict target feature for input data; n-ary classification or regression for single feature supported. Inference of trees is CPU only for now. + * @tparam T: data type for input data (float or double). + * @tparam L: data type for labels (int type for classification, T type for regression). + * @param[in] handle: cumlHandle (currently unused; API placeholder) + * @param[in] rows: test data (n_rows samples, n_cols features) in row major format. CPU pointer. + * @param[in] n_rows: number of data samples. + * @param[in] n_cols: number of features (excluding target feature). + * @param[in,out] predictions: n_rows predicted labels. CPU pointer, user allocated. + * @param[in] verbose: flag for debugging purposes. + */ +template +void dt::predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, L* predictions, bool verbose) const { + ASSERT(root, "Cannot predict w/ empty tree!"); + ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows); + ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); + predict_all(rows, n_rows, n_cols, predictions, verbose); +} + +template +void dt::predict_all(const T * rows, const int n_rows, const int n_cols, L * preds, bool verbose) const { + for (int row_id = 0; row_id < n_rows; row_id++) { + preds[row_id] = predict_one(&rows[row_id * n_cols], this->root, verbose); + } +} + +template +L dt::predict_one(const T * row, const TreeNode* const node, bool verbose) const { + + Question q = node->question; + if (node->left && (row[q.column] <= q.value)) { + if (verbose) { + std::cout << "Classifying Left @ node w/ column " << q.column << " and value " << q.value << std::endl; + } + return predict_one(row, node->left, verbose); + } else if (node->right && (row[q.column] > q.value)) { + if (verbose) { + std::cout << "Classifying Right @ node w/ column " << q.column << " and value " << q.value << std::endl; + } + return predict_one(row, node->right, verbose); + } else { + if (verbose) { + std::cout << "Leaf node. Predicting " << node->prediction << std::endl; + } + return node->prediction; + } +} + + /** * @brief Build (i.e., fit, train) Decision Tree classifier for input data. * @tparam T: data type for input data (float or double). 
@@ -157,25 +209,6 @@ void DecisionTreeClassifier::fit(const ML::cumlHandle& handle, T *data, const tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features); } -/** - * @brief Predict target feature for input data; n-ary classification for single feature supported. Inference of trees is CPU only for now. - * @tparam T: data type for input data (float or double). - * @param[in] handle: cumlHandle (currently unused; API placeholder) - * @param[in] rows: test data (n_rows samples, n_cols features) in row major format. CPU pointer. - * @param[in] n_rows: number of data samples. - * @param[in] n_cols: number of features (excluding target feature). - * @param[in,out] predictions: n_rows predicted labels. CPU pointer, user allocated. - * @param[in] verbose: flag for debugging purposes. - */ -template -void DecisionTreeClassifier::predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, int* predictions, bool verbose) const { - ASSERT(this->root, "Cannot predict w/ empty tree!"); - ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows); - ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); - classify_all(rows, n_rows, n_cols, predictions, verbose); -} - - template void DecisionTreeClassifier::plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels, int maxdepth, int max_leaf_nodes, const float colper, int n_bins, int split_algo_flag, int cfg_min_rows_per_node, bool cfg_bootstrap_features) { @@ -239,10 +272,10 @@ void DecisionTreeClassifier::plant(const cumlHandle_impl& handle, T *data, co } template -TreeNode* DecisionTreeClassifier::grow_tree(T *data, const float colper, int *labels, int depth, unsigned int* rowids, +TreeNode* DecisionTreeClassifier::grow_tree(T *data, const float colper, int *labels, int depth, unsigned int* rowids, const int n_sampled_rows, GiniInfo prev_split_info) { - TreeNode *node = new TreeNode(); + TreeNode *node = new TreeNode(); GiniQuestion ques; Question node_ques; float gain = 0.0; @@ -264,8 +297,8 @@ TreeNode* DecisionTreeClassifier::grow_tree(T *data, const float colper, i condition = (condition || (this->leaf_counter >= this->maxleaves)); // FIXME not fully respecting maxleaves, but >= constraints it more than == if (condition) { - node->class_predict = get_class_hist(split_info[0].hist); - node->gini_val = split_info[0].best_gini; + node->prediction = get_class_hist(split_info[0].hist); + node->split_metric_val = split_info[0].best_gini; this->leaf_counter++; if (depth > this->depth_counter) @@ -277,7 +310,7 @@ TreeNode* DecisionTreeClassifier::grow_tree(T *data, const float colper, i node->question = node_ques; node->left = grow_tree(data, colper, labels, depth+1, &rowids[0], nrowsleft, split_info[1]); node->right = grow_tree(data, colper, labels, depth+1, &rowids[nrowsleft], nrowsright, split_info[2]); - node->gini_val = split_info[0].best_gini; + node->split_metric_val = split_info[0].best_gini; } return node; } @@ -316,34 +349,6 @@ void DecisionTreeClassifier::split_branch(T *data, GiniQuestion & ques, co make_split(sampledcolumn, ques, n_sampled_rows, nrowsleft, nrowsright, rowids, this->split_algo, this->tempmem[0]); } -template -void DecisionTreeClassifier::classify_all(const T * rows, const int n_rows, const int n_cols, int* preds, bool verbose) const { - for (int row_id = 0; row_id < n_rows; row_id++) { - preds[row_id] = 
classify(&rows[row_id * n_cols], this->root, verbose); - } - return; -} - -template -int DecisionTreeClassifier::classify(const T * row, const TreeNode* const node, bool verbose) const { - - Question q = node->question; - if (node->left && (row[q.column] <= q.value)) { - if (verbose) - std::cout << "Classifying Left @ node w/ column " << q.column << " and value " << q.value << std::endl; - return classify(row, node->left, verbose); - } else if (node->right && (row[q.column] > q.value)) { - if (verbose) - std::cout << "Classifying Right @ node w/ column " << q.column << " and value " << q.value << std::endl; - return classify(row, node->right, verbose); - } else { - if (verbose) - std::cout << "Leaf node. Predicting " << node->class_predict << std::endl; - return node->class_predict; - } -} - - /** * @brief Build (i.e., fit, train) Decision Tree regressor for input data. * @tparam T: data type for input data (float or double). @@ -371,56 +376,14 @@ void DecisionTreeRegressor::fit(const ML::cumlHandle& handle, T *data, const // tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features); } -/** - * @brief Predict target feature for input data; regression for single feature supported. Inference of trees is CPU only for now. - * @tparam T: data type for input data (float or double). - * @param[in] handle: cumlHandle (currently unused; API placeholder) - * @param[in] rows: test data (n_rows samples, n_cols features) in row major format. CPU pointer. - * @param[in] n_rows: number of data samples. - * @param[in] n_cols: number of features (excluding target feature). - * @param[in,out] predictions: n_rows predicted labels. CPU pointer, user allocated. - * @param[in] verbose: flag for debugging purposes. - */ -template -void DecisionTreeRegressor::predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, T* predictions, bool verbose) const { - ASSERT(this->root, "Cannot predict w/ empty tree!"); - ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows); - ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); - predict_all(rows, n_rows, n_cols, predictions, verbose); -} - -template -void DecisionTreeRegressor::predict_all(const T * rows, const int n_rows, const int n_cols, T* preds, bool verbose) const { - for (int row_id = 0; row_id < n_rows; row_id++) { - preds[row_id] = predict(&rows[row_id * n_cols], this->root, verbose); - } -} - -template -T DecisionTreeRegressor::predict(const T * row, const TreeNode* const node, bool verbose) const { - - Question q = node->question; - if (node->left && (row[q.column] <= q.value)) { - if (verbose) - std::cout << "Classifying Left @ node w/ column " << q.column << " and value " << q.value << std::endl; - return predict(row, node->left, verbose); - } else if (node->right && (row[q.column] > q.value)) { - if (verbose) - std::cout << "Classifying Right @ node w/ column " << q.column << " and value " << q.value << std::endl; - return predict(row, node->right, verbose); - } else { // TODO FIXME class_predict should be of type T - if (verbose) - std::cout << "Leaf node. 
Predicting " << node->class_predict << std::endl; - return node->class_predict; - } -} - // ---------------- Regression end //Class specializations -template class dt; -template class dt; +template class dt; +template class dt; +template class dt; +template class dt; template class DecisionTreeClassifier; template class DecisionTreeClassifier; diff --git a/cuML/src/decisiontree/decisiontree.h b/cuML/src/decisiontree/decisiontree.h index 5b5ce7bfe7..9c095cf3de 100644 --- a/cuML/src/decisiontree/decisiontree.h +++ b/cuML/src/decisiontree/decisiontree.h @@ -36,13 +36,13 @@ struct Question { void update(const GiniQuestion & ques); }; -template +template struct TreeNode { TreeNode *left = nullptr; TreeNode *right = nullptr; - int class_predict; + L prediction; Question question; - T gini_val; + T split_metric_val; void print(std::ostream& os) const; }; @@ -91,11 +91,11 @@ struct DecisionTreeParams { void print() const; }; -template +template class dt { protected: int split_algo; - TreeNode *root = nullptr; + TreeNode *root = nullptr; int nbins; DataInfo dinfo; int treedepth; @@ -113,7 +113,7 @@ class dt { bool bootstrap_features; std::vector feature_selector; - void print_node(const std::string& prefix, const TreeNode* const node, bool isLeft) const; + void print_node(const std::string& prefix, const TreeNode* const node, bool isLeft) const; public: // Printing utility for high level tree info. void print_tree_summary() const; @@ -121,10 +121,15 @@ class dt { // Printing utility for debug and looking at nodes and leaves. void print() const; + // Predict labels for n_rows rows, with n_cols features each, for a given tree. rows in row-major format. + void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, L * predictions, bool verbose=false) const; + void predict_all(const T * rows, const int n_rows, const int n_cols, L * preds, bool verbose=false) const; + L predict_one(const T * row, const TreeNode * const node, bool verbose=false) const; + }; // End dt Class template -class DecisionTreeClassifier : public dt { +class DecisionTreeClassifier : public dt { public: // Expects column major T dataset, integer labels // data, labels are both device ptr. @@ -132,37 +137,27 @@ class DecisionTreeClassifier : public dt { void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids, const int n_sampled_rows, const int unique_labels, DecisionTreeParams tree_params); - /* Predict labels for n_rows rows, with n_cols features each, for a given tree. rows in row-major format. */ - void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, int* predictions, bool verbose=false) const; - private: // Same as above fit, but planting is better for a tree then fitting. 
void plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels, int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2, bool cfg_bootstrap_features=false); - TreeNode * grow_tree(T *data, const float colper, int *labels, int depth, unsigned int* rowids, const int n_sampled_rows, GiniInfo prev_split_info); + TreeNode * grow_tree(T *data, const float colper, int *labels, int depth, unsigned int* rowids, const int n_sampled_rows, GiniInfo prev_split_info); /* depth is used to distinguish between root and other tree nodes for computations */ void find_best_fruit_all(T *data, int *labels, const float colper, GiniQuestion & ques, float& gain, unsigned int* rowids, const int n_sampled_rows, GiniInfo split_info[3], int depth); void split_branch(T *data, GiniQuestion & ques, const int n_sampled_rows, int& nrowsleft, int& nrowsright, unsigned int* rowids); - void classify_all(const T * rows, const int n_rows, const int n_cols, int* preds, bool verbose=false) const; - int classify(const T * row, const TreeNode * const node, bool verbose=false) const; }; // End DecisionTreeClassifier Class template -class DecisionTreeRegressor : public dt { +class DecisionTreeRegressor : public dt { public: void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids, const int n_sampled_rows, DecisionTreeParams tree_params); - /* Predict labels for n_rows rows, with n_cols features each, for a given tree. rows in row-major format. */ - void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, T* predictions, bool verbose=false) const; - // TODO FIXME: add private methods from DecisionTreeClassifier as needed -private: - void predict_all(const T * rows, const int n_rows, const int n_cols, T * preds, bool verbose=false) const; - T predict(const T * row, const TreeNode * const node, bool verbose=false) const; // TODO FIXME rename so it's not overloaded? Or pull to base class? +//private: }; // End DecisionTreeRegressor Class } //End namespace DecisionTree diff --git a/cuML/src/randomforest/randomforest.cu b/cuML/src/randomforest/randomforest.cu index d12adba5ad..40387f05b0 100644 --- a/cuML/src/randomforest/randomforest.cu +++ b/cuML/src/randomforest/randomforest.cu @@ -141,30 +141,33 @@ void RF_params::print() const { /** * @brief Construct rf (random forest) object. * @tparam T: data type for input data (float or double). + * @tparam L: data type for labels (int type for classification, T type for regression). * @param[in] cfg_rf_params: Random forest hyper-parameter struct. * @param[in] cfg_rf_type: Random forest type. Only CLASSIFICATION is currently supported. */ -template -rf::rf(RF_params cfg_rf_params, int cfg_rf_type):rf_params(cfg_rf_params), rf_type(cfg_rf_type) { +template +rf::rf(RF_params cfg_rf_params, int cfg_rf_type):rf_params(cfg_rf_params), rf_type(cfg_rf_type) { rf_params.validity_check(); } /** * @brief Return number of trees in the forest. * @tparam T: data type for input data (float or double). + * @tparam L: data type for labels (int type for classification, T type for regression). */ -template -int rf::get_ntrees() { +template +int rf::get_ntrees() { return rf_params.n_trees; } /** * @brief Print summary for all trees in the random forest. * @tparam T: data type for input data (float or double). 
+ * @tparam L: data type for labels (int type for classification, T type for regression). */ -template -void rf::print_rf_summary() { - const DecisionTree::dt * trees = get_trees_ptr(); +template +void rf::print_rf_summary() { + const DecisionTree::dt * trees = get_trees_ptr(); if (!trees) { std::cout << "Empty forest" << std::endl; } else { @@ -181,11 +184,12 @@ void rf::print_rf_summary() { /** * @brief Print detailed view of all trees in the random forest. * @tparam T: data type for input data (float or double). + * @tparam L: data type for labels (int type for classification, T type for regression). */ -template -void rf::print_rf_detailed() { +template +void rf::print_rf_detailed() { - const DecisionTree::dt * trees = get_trees_ptr(); + const DecisionTree::dt * trees = get_trees_ptr(); if (!trees) { std::cout << "Empty forest" << std::endl; } else { @@ -205,7 +209,7 @@ void rf::print_rf_detailed() { * @param[in] cfg_rf_params: Random forest hyper-parameter struct. */ template -rfClassifier::rfClassifier(RF_params cfg_rf_params): rf::rf(cfg_rf_params, RF_type::CLASSIFICATION) {}; +rfClassifier::rfClassifier(RF_params cfg_rf_params): rf::rf(cfg_rf_params, RF_type::CLASSIFICATION) {}; /** * @brief Destructor for random forest classifier object. @@ -375,7 +379,7 @@ RF_metrics rfClassifier::cross_validate(const cumlHandle& user_handle, const * @param[in] cfg_rf_params: Random forest hyper-parameter struct. */ template -rfRegressor::rfRegressor(RF_params cfg_rf_params): rf::rf(cfg_rf_params, RF_type::REGRESSION) {}; +rfRegressor::rfRegressor(RF_params cfg_rf_params): rf::rf(cfg_rf_params, RF_type::REGRESSION) {}; /** * @brief Destructor for random forest regressor object. @@ -537,8 +541,10 @@ RF_metrics rfRegressor::cross_validate(const cumlHandle& user_handle, const T return stats; } -template class rf; -template class rf; +template class rf; +template class rf; +template class rf; +template class rf; template class rfClassifier; template class rfClassifier; diff --git a/cuML/src/randomforest/randomforest.h b/cuML/src/randomforest/randomforest.h index 55d7fd9af1..531a321156 100644 --- a/cuML/src/randomforest/randomforest.h +++ b/cuML/src/randomforest/randomforest.h @@ -83,12 +83,12 @@ void preprocess_labels(int n_rows, std::vector & labels, std::map /* Revert preprocessing effect, if needed. 
*/ void postprocess_labels(int n_rows, std::vector & labels, std::map & labels_map, bool verbose=false); -template +template class rf { protected: RF_params rf_params; int rf_type; - virtual const DecisionTree::dt * get_trees_ptr() const = 0; + virtual const DecisionTree::dt * get_trees_ptr() const = 0; public: rf(RF_params cfg_rf_params, int cfg_rf_type=RF_type::CLASSIFICATION); @@ -99,7 +99,7 @@ class rf { }; template -class rfClassifier : public rf { +class rfClassifier : public rf { private: DecisionTree::DecisionTreeClassifier * trees = nullptr; const DecisionTree::DecisionTreeClassifier * get_trees_ptr() const; @@ -114,7 +114,7 @@ class rfClassifier : public rf { }; template -class rfRegressor : public rf { +class rfRegressor : public rf { private: DecisionTree::DecisionTreeRegressor * trees = nullptr; const DecisionTree::DecisionTreeRegressor * get_trees_ptr() const; From b9c5c6f675c5464b06e4c1a57506e37d9344edd4 Mon Sep 17 00:00:00 2001 From: Vishal Mehta Date: Wed, 8 May 2019 17:00:46 +0200 Subject: [PATCH 05/51] added regression kernels, modified naming convention to metric question and metric info --- cuML/src/decisiontree/decisiontree.cu | 22 +- cuML/src/decisiontree/decisiontree.h | 10 +- cuML/src/decisiontree/kernels/evaluate.cuh | 10 +- .../kernels/evaluate_regression.cuh | 230 ++++++++++++++++++ cuML/src/decisiontree/kernels/gini.cuh | 6 +- cuML/src/decisiontree/kernels/gini_def.h | 6 +- .../src/decisiontree/kernels/split_labels.cuh | 2 +- cuML/src/decisiontree/memory.cuh | 11 +- 8 files changed, 266 insertions(+), 31 deletions(-) create mode 100644 cuML/src/decisiontree/kernels/evaluate_regression.cuh diff --git a/cuML/src/decisiontree/decisiontree.cu b/cuML/src/decisiontree/decisiontree.cu index 16ffde7946..0057df5836 100644 --- a/cuML/src/decisiontree/decisiontree.cu +++ b/cuML/src/decisiontree/decisiontree.cu @@ -26,7 +26,7 @@ namespace ML { namespace DecisionTree { template -void Question::update(const GiniQuestion & ques) { +void Question::update(const MetricQuestion & ques) { column = ques.original_column; value = ques.value; } @@ -259,7 +259,7 @@ void DecisionTreeClassifier::plant(const cumlHandle_impl& handle, T *data, co } this->total_temp_mem = this->tempmem[0]->totalmem; this->total_temp_mem *= this->MAXSTREAMS; - GiniInfo split_info; + MetricInfo split_info; MLCommon::TimerCPU timer; this->root = grow_tree(data, colper, labels, 0, rowids, n_sampled_rows, split_info); this->construct_time = timer.getElapsedSeconds(); @@ -273,16 +273,16 @@ void DecisionTreeClassifier::plant(const cumlHandle_impl& handle, T *data, co template TreeNode* DecisionTreeClassifier::grow_tree(T *data, const float colper, int *labels, int depth, unsigned int* rowids, - const int n_sampled_rows, GiniInfo prev_split_info) { + const int n_sampled_rows, MetricInfo prev_split_info) { TreeNode *node = new TreeNode(); - GiniQuestion ques; + MetricQuestion ques; Question node_ques; float gain = 0.0; - GiniInfo split_info[3]; // basis, left, right. Populate this + MetricInfo split_info[3]; // basis, left, right. 
Populate this split_info[0] = prev_split_info; - bool condition = ((depth != 0) && (prev_split_info.best_gini == 0.0f)); // This node is a leaf, no need to search for best split + bool condition = ((depth != 0) && (prev_split_info.best_metric == 0.0f)); // This node is a leaf, no need to search for best split condition = condition || (n_sampled_rows < this->min_rows_per_node); // Do not split a node with less than min_rows_per_node samples if (!condition) { @@ -298,7 +298,7 @@ TreeNode* DecisionTreeClassifier::grow_tree(T *data, const float colp if (condition) { node->prediction = get_class_hist(split_info[0].hist); - node->split_metric_val = split_info[0].best_gini; + node->split_metric_val = split_info[0].best_metric; this->leaf_counter++; if (depth > this->depth_counter) @@ -310,15 +310,15 @@ TreeNode* DecisionTreeClassifier::grow_tree(T *data, const float colp node->question = node_ques; node->left = grow_tree(data, colper, labels, depth+1, &rowids[0], nrowsleft, split_info[1]); node->right = grow_tree(data, colper, labels, depth+1, &rowids[nrowsleft], nrowsright, split_info[2]); - node->split_metric_val = split_info[0].best_gini; + node->split_metric_val = split_info[0].best_metric; } return node; } template -void DecisionTreeClassifier::find_best_fruit_all(T *data, int *labels, const float colper, GiniQuestion & ques, float& gain, - unsigned int* rowids, const int n_sampled_rows, GiniInfo split_info[3], int depth) { +void DecisionTreeClassifier::find_best_fruit_all(T *data, int *labels, const float colper, MetricQuestion & ques, float& gain, + unsigned int* rowids, const int n_sampled_rows, MetricInfo split_info[3], int depth) { std::vector& colselector = this->feature_selector; // Optimize ginibefore; no need to compute except for root. @@ -341,7 +341,7 @@ void DecisionTreeClassifier::find_best_fruit_all(T *data, int *labels, const } template -void DecisionTreeClassifier::split_branch(T *data, GiniQuestion & ques, const int n_sampled_rows, int& nrowsleft, +void DecisionTreeClassifier::split_branch(T *data, MetricQuestion & ques, const int n_sampled_rows, int& nrowsleft, int& nrowsright, unsigned int* rowids) { T *temp_data = this->tempmem[0]->temp_data->data(); diff --git a/cuML/src/decisiontree/decisiontree.h b/cuML/src/decisiontree/decisiontree.h index 9c095cf3de..1263c863f3 100644 --- a/cuML/src/decisiontree/decisiontree.h +++ b/cuML/src/decisiontree/decisiontree.h @@ -33,7 +33,7 @@ template struct Question { int column; T value; - void update(const GiniQuestion & ques); + void update(const MetricQuestion & ques); }; template @@ -142,12 +142,12 @@ class DecisionTreeClassifier : public dt { void plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels, int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2, bool cfg_bootstrap_features=false); - TreeNode * grow_tree(T *data, const float colper, int *labels, int depth, unsigned int* rowids, const int n_sampled_rows, GiniInfo prev_split_info); + TreeNode * grow_tree(T *data, const float colper, int *labels, int depth, unsigned int* rowids, const int n_sampled_rows, MetricInfo prev_split_info); /* depth is used to distinguish between root and other tree nodes for computations */ - void find_best_fruit_all(T *data, int *labels, const float colper, GiniQuestion & ques, float& gain, unsigned int* rowids, - const int n_sampled_rows, GiniInfo 
split_info[3], int depth); - void split_branch(T *data, GiniQuestion & ques, const int n_sampled_rows, int& nrowsleft, int& nrowsright, unsigned int* rowids); + void find_best_fruit_all(T *data, int *labels, const float colper, MetricQuestion & ques, float& gain, unsigned int* rowids, + const int n_sampled_rows, MetricInfo split_info[3], int depth); + void split_branch(T *data, MetricQuestion & ques, const int n_sampled_rows, int& nrowsleft, int& nrowsright, unsigned int* rowids); }; // End DecisionTreeClassifier Class template diff --git a/cuML/src/decisiontree/kernels/evaluate.cuh b/cuML/src/decisiontree/kernels/evaluate.cuh index fb496c1282..527f35dba7 100644 --- a/cuML/src/decisiontree/kernels/evaluate.cuh +++ b/cuML/src/decisiontree/kernels/evaluate.cuh @@ -110,7 +110,7 @@ __global__ void all_cols_histograms_global_quantile_kernel(const T* __restrict__ } template -void find_best_split(const std::shared_ptr> tempmem, const int nbins, const int n_unique_labels, const std::vector& col_selector, GiniInfo split_info[3], const int nrows, GiniQuestion & ques, float & gain, const int split_algo) { +void find_best_split(const std::shared_ptr> tempmem, const int nbins, const int n_unique_labels, const std::vector& col_selector, MetricInfo split_info[3], const int nrows, MetricQuestion & ques, float & gain, const int split_algo) { gain = 0.0f; int best_col_id = -1; @@ -154,7 +154,7 @@ void find_best_split(const std::shared_ptr> tempmem, const in ASSERT((tmp_gini_right >= 0.0f) && (tmp_gini_right <= 1.0f), "gini right value %f not in [0.0, 1.0]", tmp_gini_right); float impurity = (tmp_lnrows * 1.0f/nrows) * tmp_gini_left + (tmp_rnrows * 1.0f/nrows) * tmp_gini_right; - float info_gain = split_info[0].best_gini - impurity; + float info_gain = split_info[0].best_metric - impurity; // Compute best information col_gain so far @@ -162,8 +162,8 @@ void find_best_split(const std::shared_ptr> tempmem, const in gain = info_gain; best_bin_id = i; best_col_id = col_id; - split_info[1].best_gini = tmp_gini_left; - split_info[2].best_gini = tmp_gini_right; + split_info[1].best_metric = tmp_gini_left; + split_info[2].best_metric = tmp_gini_right; } } } @@ -193,7 +193,7 @@ void find_best_split(const std::shared_ptr> tempmem, const in template -void best_split_all_cols(const T *data, const unsigned int* rowids, const int *labels, const int nbins, const int nrows, const int n_unique_labels, const int rowoffset, const std::vector& colselector, const std::shared_ptr> tempmem, GiniInfo split_info[3], GiniQuestion & ques, float & gain, const int split_algo) +void best_split_all_cols(const T *data, const unsigned int* rowids, const int *labels, const int nbins, const int nrows, const int n_unique_labels, const int rowoffset, const std::vector& colselector, const std::shared_ptr> tempmem, MetricInfo split_info[3], MetricQuestion & ques, float & gain, const int split_algo) { int* d_colids = tempmem->d_colids->data(); T* d_globalminmax = tempmem->d_globalminmax->data(); diff --git a/cuML/src/decisiontree/kernels/evaluate_regression.cuh b/cuML/src/decisiontree/kernels/evaluate_regression.cuh new file mode 100644 index 0000000000..cf6d961f45 --- /dev/null +++ b/cuML/src/decisiontree/kernels/evaluate_regression.cuh @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include
+#include
+#include "gini.cuh"
+#include "../memory.cuh"
+#include "col_condenser.cuh"
+#include
+#include "../algo_helper.h"
+
+/*
+ The output of these kernels is a pair of arrays, each of size ncols * nbins:
+ mseout accumulates per-bin label sums, countout per-bin sample counts.
+ Column order is as per colids (bootstrapped random cols); for each col there are nbins entries.
+ */
+template
+__global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const T* __restrict__ globalminmax, T* mseout, int* countout) {
+
+ int tid = threadIdx.x + blockIdx.x * blockDim.x;
+ extern __shared__ char shmem[];
+ T *minmaxshared = (T*)shmem;
+ T *shmemmse = (T*)(shmem + 2*ncols*sizeof(T));
+ int *shmemcount = (int*)(shmem + 2*ncols*sizeof(T) + nbins*ncols*sizeof(T));
+
+ for (int i=threadIdx.x; i < 2*ncols; i += blockDim.x) {
+ minmaxshared[i] = globalminmax[i];
+ }
+
+ for (int i = threadIdx.x; i < nbins*ncols; i += blockDim.x) {
+ shmemmse[i] = 0;
+ shmemcount[i] = 0;
+ }
+
+ __syncthreads();
+
+ for (unsigned int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) {
+ int mycolid = (int)( i / nrows);
+ int coloffset = mycolid*nbins;
+
+ // nbins is # batched bins. Use (batched bins + 1) for delta computation.
+ T delta = (minmaxshared[mycolid + ncols] - minmaxshared[mycolid]) / (nbins);
+ T base_quesval = minmaxshared[mycolid] + delta;
+
+ T localdata = data[i];
+ T label = labels[ rowids[ i % nrows ] ];
+ for (int j=0; j < nbins; j++) {
+ T quesval = base_quesval + j * delta;
+
+ if (localdata <= quesval) {
+ atomicAdd(&shmemcount[j + coloffset], 1);
+ atomicAdd(&shmemmse[j + coloffset], label);
+ }
+ }
+
+ }
+
+ __syncthreads();
+
+ for (int i = threadIdx.x; i < ncols*nbins; i += blockDim.x) {
+ atomicAdd(&mseout[i], shmemmse[i]);
+ atomicAdd(&countout[i], shmemcount[i]);
+ }
+}
+
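+/* Editor's note (explanatory, not in the original patch): because each row passes
+ the test localdata <= quesval for every bin whose threshold clears it, the entry
+ countout[col*nbins + b] ends up as the cumulative count of rows falling left of
+ threshold b, and mseout[col*nbins + b] as the matching cumulative label sum. The
+ host-side find_best_split below therefore reads each bin directly instead of
+ keeping a running total. */
+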
+template
+__global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, T* mseout, int* countout, const T* __restrict__ quantile) {
+
+ int tid = threadIdx.x + blockIdx.x * blockDim.x;
+ extern __shared__ char shmem[];
+ T *shmemmse = (T*) (shmem);
+ int *shmemcount = (int*)(shmem + nbins*ncols*sizeof(T));
+
+ for (int i = threadIdx.x; i < nbins*ncols; i += blockDim.x) {
+ shmemmse[i] = 0;
+ shmemcount[i] = 0;
+ }
+
+ __syncthreads();
+
+ for (unsigned int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) {
+ int mycolid = (int)( i / nrows);
+ int coloffset = mycolid*nbins;
+
+ // nbins is # batched bins.
+ T localdata = data[i];
+ T label = labels[ rowids[ i % nrows ] ];
+ for (int j=0; j < nbins; j++) {
+ int quantile_index = colids[mycolid] * nbins + j;
+ T quesval = quantile[quantile_index];
+ if (localdata <= quesval) {
+ atomicAdd(&shmemcount[j + coloffset], 1);
+ atomicAdd(&shmemmse[j + coloffset], label);
+ }
+ }
+
+ }
+
+ __syncthreads();
+
+ for (int i = threadIdx.x; i < ncols*nbins; i += blockDim.x) {
+ atomicAdd(&mseout[i], shmemmse[i]);
+ atomicAdd(&countout[i], shmemcount[i]);
+ }
+}
+
+template
+void find_best_split(const std::shared_ptr> tempmem, const int nbins, const std::vector& col_selector, MetricInfo split_info[3], const int nrows, MetricQuestion & ques, float & gain, const int split_algo) {
+
+ gain = 0.0f;
+ int best_col_id = -1;
+ int best_bin_id = -1;
+
+ int n_cols = col_selector.size();
+ for (int col_id = 0; col_id < n_cols; col_id++) {
+
+ int col_count_base_index = col_id * nbins;
+ // tempmem->h_histout holds n_cols blocks of nbins cumulative sample counts each.
+ for (int i = 0; i < nbins; i++) {
+
+ int tmp_lnrows = tempmem->h_histout->data()[col_count_base_index + i];
+ int tmp_rnrows = nrows - tmp_lnrows;
+
+ if (tmp_lnrows == 0 || tmp_rnrows == 0)
+ continue;
+
+ // Compute MSE right and MSE left value for each bin.
+ float tmp_mse_left = tempmem->h_mseout->data()[col_count_base_index + i];
+ float tmp_mse_right = (nrows * split_info[0].best_metric) - tmp_mse_left;
+ tmp_mse_left /= tmp_lnrows;
+ tmp_mse_right /= tmp_rnrows;
+
+ float impurity = (tmp_lnrows * 1.0f/nrows) * tmp_mse_left + (tmp_rnrows * 1.0f/nrows) * tmp_mse_right;
+ float info_gain = split_info[0].best_metric - impurity;
+
+ // Compute best information col_gain so far
+ if (info_gain > gain) {
+ gain = info_gain;
+ best_bin_id = i;
+ best_col_id = col_id;
+ split_info[1].best_metric = tmp_mse_left;
+ split_info[2].best_metric = tmp_mse_right;
+ }
+ }
+ }
+
+ if (best_col_id == -1 || best_bin_id == -1)
+ return;
+
+ if (split_algo == ML::SPLIT_ALGO::HIST) {
+ ques.set_question_fields(best_col_id, col_selector[best_col_id], best_bin_id, nbins, n_cols, std::numeric_limits::max(), -std::numeric_limits::max(), (T) 0);
+ } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) {
+ T ques_val;
+ T *d_quantile = tempmem->d_quantile->data();
+ int q_index = col_selector[best_col_id] * nbins + best_bin_id;
+ CUDA_CHECK(cudaMemcpyAsync(&ques_val, &d_quantile[q_index], sizeof(T), cudaMemcpyDeviceToHost, tempmem->stream));
+ CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
+ ques.set_question_fields(best_col_id, col_selector[best_col_id], best_bin_id, nbins, n_cols, std::numeric_limits::max(), -std::numeric_limits::max(), ques_val);
+ }
+ return;
+}
+
+
+template
+void best_split_all_cols(const T *data, const unsigned int* rowids, const T *labels, const int nbins, const int nrows, const int rowoffset, const std::vector& colselector, const std::shared_ptr> tempmem, MetricInfo split_info[3], MetricQuestion & ques, float & gain, const int split_algo)
+{
+ int* d_colids = tempmem->d_colids->data();
+ T* d_globalminmax = tempmem->d_globalminmax->data();
+ int *d_histout = tempmem->d_histout->data();
+ int *h_histout = tempmem->h_histout->data();
+ T* d_mseout = tempmem->d_mseout->data();
+ T* h_mseout = tempmem->h_mseout->data();
+
+ int ncols = colselector.size();
+ int col_minmax_bytes = sizeof(T) * 2 * ncols;
+ int n_mse_bytes = nbins * sizeof(T) * ncols;
+ int n_count_bytes = nbins * ncols * sizeof(int);
+ int n_hist_bytes = n_mse_bytes; // editor's fix: name the mse-buffer byte count under the identifier the memset below uses
+
+ // Editor's note: the *_reg kernels launched below accumulate into both buffers with atomicAdd, so zero the count buffer as well.
+ CUDA_CHECK(cudaMemsetAsync((void*)d_histout, 0, n_count_bytes, tempmem->stream));
+ CUDA_CHECK(cudaMemsetAsync((void*)d_mseout, 0,
n_hist_bytes, tempmem->stream)); + + int threads = 512; + int blocks = MLCommon::ceildiv(nrows * ncols, threads); + if (blocks > 65536) + blocks = 65536; + + /* Kernel allcolsampler_*_kernel: + - populates tempmem->tempdata with the sampled column data, + - and computes min max histograms in tempmem->d_globalminmax *if minmax in name + across all columns. + */ + size_t shmemsize = col_minmax_bytes; + if (split_algo == ML::SPLIT_ALGO::HIST) { // Histograms (min, max) + allcolsampler_minmax_kernel<<stream>>>(data, rowids, d_colids, nrows, ncols, rowoffset, &d_globalminmax[0], &d_globalminmax[colselector.size()], tempmem->temp_data->data(), std::numeric_limits::max()); + } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { // Global quantiles; just col condenser + allcolsampler_kernel<<stream>>>(data, rowids, d_colids, nrows, ncols, rowoffset, tempmem->temp_data->data()); + } + CUDA_CHECK(cudaGetLastError()); + + shmemsize = n_mse_bytes + n_count_bytes; + + if (split_algo == ML::SPLIT_ALGO::HIST) { + shmemsize += col_minmax_bytes; + all_cols_histograms_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_globalminmax, d_mseout, d_histout); + } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { + all_cols_histograms_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_mseout, d_histout, tempmem->d_quantile->data()); + } + CUDA_CHECK(cudaGetLastError()); + + CUDA_CHECK(cudaMemcpyAsync(h_mseout, d_mseout, n_mse_bytes, cudaMemcpyDeviceToHost, tempmem->stream)); + CUDA_CHECK(cudaMemcpyAsync(h_histout, d_histout, n_count_bytes, cudaMemcpyDeviceToHost, tempmem->stream)); + + CUDA_CHECK(cudaStreamSynchronize(tempmem->stream)); + + find_best_split(tempmem, nbins, colselector, &split_info[0], nrows, ques, gain, split_algo); + return; +} + diff --git a/cuML/src/decisiontree/kernels/gini.cuh b/cuML/src/decisiontree/kernels/gini.cuh index 7495f8d404..e3f861ee09 100644 --- a/cuML/src/decisiontree/kernels/gini.cuh +++ b/cuML/src/decisiontree/kernels/gini.cuh @@ -23,7 +23,7 @@ #include "cuda_utils.h" template -void GiniQuestion::set_question_fields(int cfg_bootcolumn, int cfg_column, int cfg_batch_id, int cfg_nbins, int cfg_ncols, T cfg_min, T cfg_max, T cfg_value) { +void MetricQuestion::set_question_fields(int cfg_bootcolumn, int cfg_column, int cfg_batch_id, int cfg_nbins, int cfg_ncols, T cfg_min, T cfg_max, T cfg_value) { bootstrapped_column = cfg_bootcolumn; original_column = cfg_column; batch_id = cfg_batch_id; @@ -57,7 +57,7 @@ __global__ void gini_kernel(const int* __restrict__ labels, const int nrows, con } template -void gini(int *labels_in, const int nrows, const std::shared_ptr> tempmem, GiniInfo & split_info, int & unique_labels) +void gini(int *labels_in, const int nrows, const std::shared_ptr> tempmem, MetricInfo & split_info, int & unique_labels) { int *dhist = tempmem->d_hist->data(); int *hhist = tempmem->h_hist->data(); @@ -76,7 +76,7 @@ void gini(int *labels_in, const int nrows, const std::shared_ptr template -struct GiniQuestion { +struct MetricQuestion { int bootstrapped_column; int original_column; T value; @@ -45,7 +45,7 @@ struct GiniQuestion { void set_question_fields(int cfg_bootcolumn, int cfg_column, int cfg_batch_id, int cfg_nbins, int cfg_ncols, T cfg_min, T cfg_max, T cfg_value); }; -struct GiniInfo { - float best_gini = -1.0f; +struct MetricInfo { + float best_metric = -1.0f; std::vector hist; //Element hist[i] stores # labels 
with label i for a given node. }; diff --git a/cuML/src/decisiontree/kernels/split_labels.cuh b/cuML/src/decisiontree/kernels/split_labels.cuh index 6280f474a9..fbc2dc01a4 100644 --- a/cuML/src/decisiontree/kernels/split_labels.cuh +++ b/cuML/src/decisiontree/kernels/split_labels.cuh @@ -90,7 +90,7 @@ int get_class_hist(std::vector & node_hist) { } template -void make_split(T *column, GiniQuestion & ques, const int nrows, int& nrowsleft, int& nrowsright, unsigned int* rowids, int split_algo, const std::shared_ptr> tempmem) +void make_split(T *column, MetricQuestion & ques, const int nrows, int& nrowsleft, int& nrowsright, unsigned int* rowids, int split_algo, const std::shared_ptr> tempmem) { int *temprowids = tempmem->temprowids->data(); diff --git a/cuML/src/decisiontree/memory.cuh b/cuML/src/decisiontree/memory.cuh index 961b486a6c..8ef833a79b 100644 --- a/cuML/src/decisiontree/memory.cuh +++ b/cuML/src/decisiontree/memory.cuh @@ -31,12 +31,14 @@ struct TemporaryMemory // Used for gini histograms (root tree node) MLCommon::device_buffer *d_hist; MLCommon::host_buffer *h_hist; - + //Host/Device histograms and device minmaxs MLCommon::device_buffer *d_globalminmax; MLCommon::device_buffer *d_histout, *d_colids; MLCommon::host_buffer *h_histout; - + MLCommon::device_buffer *d_mseout; + MLCommon::host_buffer *h_mseout; + //Below pointers are shared for split functions MLCommon::device_buffer *d_flags_left, *d_flags_right; MLCommon::host_buffer *nrowsleftright; @@ -98,9 +100,12 @@ struct TemporaryMemory totalmem += split_temp_storage_bytes + (N + 1)*sizeof(int) + 2*N*sizeof(char) + sizeof(T); h_histout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, n_hist_elements * Ncols); - + h_mseout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, Ncols); + d_globalminmax = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols * 2); d_histout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_hist_elements * Ncols); + d_mseout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); + d_colids = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); totalmem += (n_hist_elements * sizeof(int) + sizeof(int) + 2*sizeof(T))* Ncols; From 04da29ae1882c4e296dcd35b314a0d50eab296b3 Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Wed, 8 May 2019 09:28:27 -0700 Subject: [PATCH 06/51] More decision tree changes - Further templated TemporaryMemory (for regression labels) + other minor fixes. - Moved split_branch to base dt class. - Placeholder methods for DecisionTreeRegressor. 
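[Editor's note, illustration only, not part of this commit message] The thread
running through patches 04-06 is a single <data type, label type> template pair:
classification instantiates with L = int (a class id), regression with L = T (a
real-valued target), so TreeNode, dt, rf and TemporaryMemory each serve both
variants from one definition. A minimal self-contained sketch of the pattern,
with hypothetical names:

    template <class T, class L>
    struct NodeSketch {
        NodeSketch *left = nullptr, *right = nullptr;
        L prediction;       // class id (classification) or mean target (regression)
        T split_metric_val; // gini impurity (classification) or MSE (regression)
    };

    // mirrors the explicit instantiations these patches add:
    template struct NodeSketch<float, int>;      // classification, float features
    template struct NodeSketch<float, float>;    // regression, float features
    template struct NodeSketch<double, int>;     // classification, double features
    template struct NodeSketch<double, double>;  // regression, double features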
--- cuML/src/decisiontree/decisiontree.cu | 143 ++++++++++++++++-- cuML/src/decisiontree/decisiontree.h | 17 ++- cuML/src/decisiontree/kernels/evaluate.cuh | 8 +- cuML/src/decisiontree/kernels/gini.cuh | 2 +- cuML/src/decisiontree/kernels/quantile.cuh | 4 +- .../src/decisiontree/kernels/split_labels.cuh | 4 +- cuML/src/decisiontree/memory.cuh | 18 ++- 7 files changed, 162 insertions(+), 34 deletions(-) diff --git a/cuML/src/decisiontree/decisiontree.cu b/cuML/src/decisiontree/decisiontree.cu index 0057df5836..4418ed7508 100644 --- a/cuML/src/decisiontree/decisiontree.cu +++ b/cuML/src/decisiontree/decisiontree.cu @@ -129,6 +129,15 @@ void dt::print_node(const std::string& prefix, const TreeNode* const } } +template +void dt::split_branch(T *data, MetricQuestion & ques, const int n_sampled_rows, int& nrowsleft, + int& nrowsright, unsigned int* rowids) { + + T *temp_data = this->tempmem[0]->temp_data->data(); + T *sampledcolumn = &temp_data[n_sampled_rows * ques.bootstrapped_column]; + make_split(sampledcolumn, ques, n_sampled_rows, nrowsleft, nrowsright, rowids, this->split_algo, this->tempmem[0]); +} + /** * @brief Predict target feature for input data; n-ary classification or regression for single feature supported. Inference of trees is CPU only for now. @@ -252,7 +261,7 @@ void DecisionTreeClassifier::plant(const cumlHandle_impl& handle, T *data, co ASSERT(this->shmem_used <= this->max_shared_mem, "Shared memory per block limit %zd , requested %zd \n", this->max_shared_mem, this->shmem_used); for (int i = 0; i < this->MAXSTREAMS; i++) { - this->tempmem[i] = std::make_shared>(handle, n_sampled_rows, ncols, this->MAXSTREAMS, unique_labels, n_bins, this->split_algo); + this->tempmem[i] = std::make_shared>(handle, n_sampled_rows, ncols, this->MAXSTREAMS, unique_labels, n_bins, this->split_algo); if (this->split_algo == SPLIT_ALGO::GLOBAL_QUANTILE) { preprocess_quantile(data, rowids, n_sampled_rows, ncols, this->dinfo.NLocalrows, n_bins, this->tempmem[i]); } @@ -305,7 +314,7 @@ TreeNode* DecisionTreeClassifier::grow_tree(T *data, const float colp this->depth_counter = depth; } else { int nrowsleft, nrowsright; - split_branch(data, ques, n_sampled_rows, nrowsleft, nrowsright, rowids); // populates ques.value + this->split_branch(data, ques, n_sampled_rows, nrowsleft, nrowsright, rowids); // populates ques.value node_ques.update(ques); node->question = node_ques; node->left = grow_tree(data, colper, labels, depth+1, &rowids[0], nrowsleft, split_info[1]); @@ -340,15 +349,6 @@ void DecisionTreeClassifier::find_best_fruit_all(T *data, int *labels, const this->tempmem[0], &split_info[0], ques, gain, this->split_algo); } -template -void DecisionTreeClassifier::split_branch(T *data, MetricQuestion & ques, const int n_sampled_rows, int& nrowsleft, - int& nrowsright, unsigned int* rowids) { - - T *temp_data = this->tempmem[0]->temp_data->data(); - T *sampledcolumn = &temp_data[n_sampled_rows * ques.bootstrapped_column]; - make_split(sampledcolumn, ques, n_sampled_rows, nrowsleft, nrowsright, rowids, this->split_algo, this->tempmem[0]); -} - /** * @brief Build (i.e., fit, train) Decision Tree regressor for input data. * @tparam T: data type for input data (float or double). @@ -371,9 +371,124 @@ void DecisionTreeRegressor::fit(const ML::cumlHandle& handle, T *data, const std::cout << "Resetting n_bins to " << n_sampled_rows << "." 
<< std::endl; tree_params.n_bins = n_sampled_rows; } - // TODO FIXME placeholder - //plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, unique_labels, tree_params.max_depth, - // tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features); + plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, 1, tree_params.max_depth, + tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features); +} + +template +void DecisionTreeRegressor::plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids, const int n_sampled_rows, + int unique_labels, int maxdepth, int max_leaf_nodes, const float colper, int n_bins, int split_algo_flag, int cfg_min_rows_per_node, bool cfg_bootstrap_features) { + + //TODO FIXME - method body copied from Classifier as temp placeholder. FIXME + + this->split_algo = split_algo_flag; + this->dinfo.NLocalrows = nrows; + this->dinfo.NGlobalrows = nrows; + this->dinfo.Ncols = ncols; + this->nbins = n_bins; + this->treedepth = maxdepth; + this->maxleaves = max_leaf_nodes; + this->tempmem.resize(this->MAXSTREAMS); + this->n_unique_labels = unique_labels; + this->min_rows_per_node = cfg_min_rows_per_node; + this->bootstrap_features = cfg_bootstrap_features; + + //Bootstrap features + this->feature_selector.resize(this->dinfo.Ncols); + if (this->bootstrap_features) { + srand(n_bins); + for(int i=0; i < this->dinfo.Ncols; i++) { + this->feature_selector.push_back( rand() % this->dinfo.Ncols ); + } + } else { + std::iota(this->feature_selector.begin(), this->feature_selector.end(), 0); + } + + std::random_shuffle(this->feature_selector.begin(), this->feature_selector.end()); + this->feature_selector.resize((int) (colper * this->dinfo.Ncols)); + + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, 0)); + this->max_shared_mem = prop.sharedMemPerBlock; + + if (this->split_algo == SPLIT_ALGO::HIST) { + this->shmem_used += 2 * sizeof(T) * ncols; + this->shmem_used += this->nbins * this->n_unique_labels * sizeof(int) * ncols; + } else { + this->shmem_used += this->nbins * this->n_unique_labels * sizeof(int) * ncols; + } + ASSERT(this->shmem_used <= this->max_shared_mem, "Shared memory per block limit %zd , requested %zd \n", this->max_shared_mem, this->shmem_used); + + for (int i = 0; i < this->MAXSTREAMS; i++) { + this->tempmem[i] = std::make_shared>(handle, n_sampled_rows, ncols, this->MAXSTREAMS, unique_labels, n_bins, this->split_algo); + if (this->split_algo == SPLIT_ALGO::GLOBAL_QUANTILE) { + preprocess_quantile(data, rowids, n_sampled_rows, ncols, this->dinfo.NLocalrows, n_bins, this->tempmem[i]); + } + } + this->total_temp_mem = this->tempmem[0]->totalmem; + this->total_temp_mem *= this->MAXSTREAMS; + MetricInfo split_info; + MLCommon::TimerCPU timer; + this->root = grow_tree(data, colper, labels, 0, rowids, n_sampled_rows, split_info); + this->construct_time = timer.getElapsedSeconds(); + + for (int i = 0; i < this->MAXSTREAMS; i++) { + this->tempmem[i].reset(); + } + + return; +} + +template +TreeNode* DecisionTreeRegressor::grow_tree(T *data, const float colper, T *labels, int depth, unsigned int* rowids, + const int n_sampled_rows, MetricInfo prev_split_info) { + + //TODO FIXME - method body mostly copied from Classifier. 
FIXME + TreeNode *node = new TreeNode(); + MetricQuestion ques; + Question node_ques; + float gain = 0.0; + MetricInfo split_info[3]; // basis, left, right. Populate this + split_info[0] = prev_split_info; + + bool condition = ((depth != 0) && (prev_split_info.best_metric == 0.0f)); // This node is a leaf, no need to search for best split + condition = condition || (n_sampled_rows < this->min_rows_per_node); // Do not split a node with less than min_rows_per_node samples + + if (!condition) { + find_best_fruit_all(data, labels, colper, ques, gain, rowids, n_sampled_rows, &split_info[0], depth); //ques and gain are output here + condition = condition || (gain == 0.0f); + } + + if (this->treedepth != -1) + condition = (condition || (depth == this->treedepth)); + + if (this->maxleaves != -1) + condition = (condition || (this->leaf_counter >= this->maxleaves)); // FIXME not fully respecting maxleaves, but >= constraints it more than == + + if (condition) { + node->prediction = get_class_hist(split_info[0].hist); + node->split_metric_val = split_info[0].best_metric; + + this->leaf_counter++; + if (depth > this->depth_counter) + this->depth_counter = depth; + } else { + int nrowsleft, nrowsright; + this->split_branch(data, ques, n_sampled_rows, nrowsleft, nrowsright, rowids); // populates ques.value + node_ques.update(ques); + node->question = node_ques; + node->left = grow_tree(data, colper, labels, depth+1, &rowids[0], nrowsleft, split_info[1]); + node->right = grow_tree(data, colper, labels, depth+1, &rowids[nrowsleft], nrowsright, split_info[2]); + node->split_metric_val = split_info[0].best_metric; + } + return node; +} + +template +void DecisionTreeRegressor::find_best_fruit_all(T *data, T *labels, const float colper, MetricQuestion & ques, float& gain, + unsigned int* rowids, const int n_sampled_rows, MetricInfo split_info[3], int depth) { + + //TODO FIXME - placeholder } // ---------------- Regression end diff --git a/cuML/src/decisiontree/decisiontree.h b/cuML/src/decisiontree/decisiontree.h index 1263c863f3..35ba197f99 100644 --- a/cuML/src/decisiontree/decisiontree.h +++ b/cuML/src/decisiontree/decisiontree.h @@ -102,7 +102,7 @@ class dt { int depth_counter = 0; int maxleaves; int leaf_counter = 0; - std::vector>> tempmem; + std::vector>> tempmem; size_t total_temp_mem; const int MAXSTREAMS = 1; size_t max_shared_mem; @@ -114,6 +114,8 @@ class dt { std::vector feature_selector; void print_node(const std::string& prefix, const TreeNode* const node, bool isLeft) const; + void split_branch(T *data, MetricQuestion & ques, const int n_sampled_rows, int& nrowsleft, int& nrowsright, unsigned int* rowids); + public: // Printing utility for high level tree info. 
void print_tree_summary() const; @@ -147,7 +149,6 @@ class DecisionTreeClassifier : public dt { /* depth is used to distinguish between root and other tree nodes for computations */ void find_best_fruit_all(T *data, int *labels, const float colper, MetricQuestion & ques, float& gain, unsigned int* rowids, const int n_sampled_rows, MetricInfo split_info[3], int depth); - void split_branch(T *data, MetricQuestion & ques, const int n_sampled_rows, int& nrowsleft, int& nrowsright, unsigned int* rowids); }; // End DecisionTreeClassifier Class template @@ -156,8 +157,16 @@ class DecisionTreeRegressor : public dt { void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids, const int n_sampled_rows, DecisionTreeParams tree_params); -// TODO FIXME: add private methods from DecisionTreeClassifier as needed -//private: +private: + // Same as above fit, but planting is better for a tree then fitting. + void plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels = 1, + int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2, bool cfg_bootstrap_features=false); + + TreeNode * grow_tree(T *data, const float colper, T *labels, int depth, unsigned int* rowids, const int n_sampled_rows, MetricInfo prev_split_info); + + /* depth is used to distinguish between root and other tree nodes for computations */ + void find_best_fruit_all(T *data, T *labels, const float colper, MetricQuestion & ques, float& gain, unsigned int* rowids, + const int n_sampled_rows, MetricInfo split_info[3], int depth); }; // End DecisionTreeRegressor Class } //End namespace DecisionTree diff --git a/cuML/src/decisiontree/kernels/evaluate.cuh b/cuML/src/decisiontree/kernels/evaluate.cuh index 527f35dba7..ddaf8fb4f2 100644 --- a/cuML/src/decisiontree/kernels/evaluate.cuh +++ b/cuML/src/decisiontree/kernels/evaluate.cuh @@ -109,8 +109,8 @@ __global__ void all_cols_histograms_global_quantile_kernel(const T* __restrict__ } } -template -void find_best_split(const std::shared_ptr> tempmem, const int nbins, const int n_unique_labels, const std::vector& col_selector, MetricInfo split_info[3], const int nrows, MetricQuestion & ques, float & gain, const int split_algo) { +template +void find_best_split(const std::shared_ptr> tempmem, const int nbins, const int n_unique_labels, const std::vector& col_selector, MetricInfo split_info[3], const int nrows, MetricQuestion & ques, float & gain, const int split_algo) { gain = 0.0f; int best_col_id = -1; @@ -192,8 +192,8 @@ void find_best_split(const std::shared_ptr> tempmem, const in } -template -void best_split_all_cols(const T *data, const unsigned int* rowids, const int *labels, const int nbins, const int nrows, const int n_unique_labels, const int rowoffset, const std::vector& colselector, const std::shared_ptr> tempmem, MetricInfo split_info[3], MetricQuestion & ques, float & gain, const int split_algo) +template +void best_split_all_cols(const T *data, const unsigned int* rowids, const L *labels, const int nbins, const int nrows, const int n_unique_labels, const int rowoffset, const std::vector& colselector, const std::shared_ptr> tempmem, MetricInfo split_info[3], MetricQuestion & ques, float & gain, const int split_algo) { int* d_colids = tempmem->d_colids->data(); T* d_globalminmax = tempmem->d_globalminmax->data(); diff --git 
a/cuML/src/decisiontree/kernels/gini.cuh b/cuML/src/decisiontree/kernels/gini.cuh index e3f861ee09..4d90af617e 100644 --- a/cuML/src/decisiontree/kernels/gini.cuh +++ b/cuML/src/decisiontree/kernels/gini.cuh @@ -57,7 +57,7 @@ __global__ void gini_kernel(const int* __restrict__ labels, const int nrows, con } template -void gini(int *labels_in, const int nrows, const std::shared_ptr> tempmem, MetricInfo & split_info, int & unique_labels) +void gini(int *labels_in, const int nrows, const std::shared_ptr> tempmem, MetricInfo & split_info, int & unique_labels) { int *dhist = tempmem->d_hist->data(); int *hhist = tempmem->h_hist->data(); diff --git a/cuML/src/decisiontree/kernels/quantile.cuh b/cuML/src/decisiontree/kernels/quantile.cuh index 4d844b978f..bded206e4b 100644 --- a/cuML/src/decisiontree/kernels/quantile.cuh +++ b/cuML/src/decisiontree/kernels/quantile.cuh @@ -39,8 +39,8 @@ __global__ void get_all_quantiles(const T* __restrict__ data, T* quantile, const return; } -template -void preprocess_quantile(const T* data, const unsigned int* rowids, const int n_sampled_rows, const int ncols, const int rowoffset, const int nbins, std::shared_ptr> tempmem) { +template +void preprocess_quantile(const T* data, const unsigned int* rowids, const int n_sampled_rows, const int ncols, const int rowoffset, const int nbins, std::shared_ptr> tempmem) { int threads = 128; int num_items = n_sampled_rows * ncols; // number of items to sort across all segments (i.e., cols) diff --git a/cuML/src/decisiontree/kernels/split_labels.cuh b/cuML/src/decisiontree/kernels/split_labels.cuh index fbc2dc01a4..bb2349b698 100644 --- a/cuML/src/decisiontree/kernels/split_labels.cuh +++ b/cuML/src/decisiontree/kernels/split_labels.cuh @@ -89,8 +89,8 @@ int get_class_hist(std::vector & node_hist) { return classval; } -template -void make_split(T *column, MetricQuestion & ques, const int nrows, int& nrowsleft, int& nrowsright, unsigned int* rowids, int split_algo, const std::shared_ptr> tempmem) +template +void make_split(T *column, MetricQuestion & ques, const int nrows, int& nrowsleft, int& nrowsright, unsigned int* rowids, int split_algo, const std::shared_ptr> tempmem) { int *temprowids = tempmem->temprowids->data(); diff --git a/cuML/src/decisiontree/memory.cuh b/cuML/src/decisiontree/memory.cuh index 8ef833a79b..4c78fd1d25 100644 --- a/cuML/src/decisiontree/memory.cuh +++ b/cuML/src/decisiontree/memory.cuh @@ -22,11 +22,11 @@ #include #include -template +template struct TemporaryMemory { // Labels after boostrapping - MLCommon::device_buffer *sampledlabels; + MLCommon::device_buffer *sampledlabels; // Used for gini histograms (root tree node) MLCommon::device_buffer *d_hist; @@ -84,8 +84,8 @@ struct TemporaryMemory totalmem += (n_bins + N) * extra_elements * sizeof(T); } - sampledlabels = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, N); - totalmem += N*sizeof(int); + sampledlabels = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, N); + totalmem += N*sizeof(L); //Allocate Temporary for split functions d_num_selected_out = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, 1); @@ -100,14 +100,14 @@ struct TemporaryMemory totalmem += split_temp_storage_bytes + (N + 1)*sizeof(int) + 2*N*sizeof(char) + sizeof(T); h_histout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, n_hist_elements * Ncols); - h_mseout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, Ncols); + h_mseout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, Ncols); 
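+	// (Host-side landing buffer: best_split_all_cols copies the per-column MSE
+	// partials back from d_mseout into h_mseout before the CPU-side split search.)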
d_globalminmax = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols * 2); d_histout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_hist_elements * Ncols); - d_mseout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); + d_mseout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); d_colids = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); - totalmem += (n_hist_elements * sizeof(int) + sizeof(int) + 2*sizeof(T))* Ncols; + totalmem += (n_hist_elements * sizeof(int) + sizeof(int) + 3*sizeof(T))* Ncols; } @@ -146,6 +146,7 @@ struct TemporaryMemory temprowids->release(stream); question_value->release(stream); h_histout->release(stream); + h_mseout->release(stream); delete sampledlabels; delete d_split_temp_storage; @@ -155,13 +156,16 @@ struct TemporaryMemory delete temprowids; delete question_value; delete h_histout; + delete h_mseout; d_globalminmax->release(stream); d_histout->release(stream); + d_mseout->release(stream); d_colids->release(stream); delete d_globalminmax; delete d_histout; + delete d_mseout; delete d_colids; } From 14fb0e01febb001d7588c91a5ac9487d55c8c4a5 Mon Sep 17 00:00:00 2001 From: Vishal Mehta Date: Thu, 9 May 2019 14:35:15 +0200 Subject: [PATCH 07/51] added kernels for mean squared error --- .../kernels/evaluate_regression.cuh | 136 +++++++++++++++--- cuML/src/decisiontree/memory.cuh | 3 +- 2 files changed, 118 insertions(+), 21 deletions(-) diff --git a/cuML/src/decisiontree/kernels/evaluate_regression.cuh b/cuML/src/decisiontree/kernels/evaluate_regression.cuh index cf6d961f45..a7db1c6b1f 100644 --- a/cuML/src/decisiontree/kernels/evaluate_regression.cuh +++ b/cuML/src/decisiontree/kernels/evaluate_regression.cuh @@ -23,17 +23,67 @@ #include #include "../algo_helper.h" +template +__global__ void compute_mse_minmax_kernel(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const T* __restrict__ globalminmax, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout) { + + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ char shmem[]; + T *minmaxshared = (T*)shmem; + T *shmempred = (T*)(shmem + 2*ncols*sizeof(T)); + T *shmemmse = (T*)(shmem + 2*ncols*sizeof(T) + nbins*ncols*sizeof(T)); + int *shmemcount = (int*)(shmem + 2*ncols*sizeof(T) + 2*nbins*ncols*sizeof(T)); + + for (int i=threadIdx.x; i < 2*ncols; i += blockDim.x) { + minmaxshared[i] = globalminmax[i]; + } + + for (int i = threadIdx.x; i < nbins*ncols; i += blockDim.x) { + shmemcount[i] = countout[i]; + shmempred[i] = predout[i] / shmemcount[i]; + shmemmse[i] = 0.0; + } + + __syncthreads(); + + for (unsigned int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) { + int mycolid = (int)( i / nrows); + int coloffset = mycolid*nbins; + + // nbins is # batched bins. Use (batched bins + 1) for delta computation. 
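+		// (delta = (max - min) / nbins, so bin j asks "data <= min + (j+1)*delta";
+		// the question for the last bin equals the column max and admits every row.)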
+ T delta = (minmaxshared[mycolid + ncols] - minmaxshared[mycolid]) / (nbins); + T base_quesval = minmaxshared[mycolid] + delta; + + T localdata = data[i]; + T label = labels[ rowids[ i % nrows ] ]; + for (int j=0; j < nbins; j++) { + T quesval = base_quesval + j * delta; + + if (localdata <= quesval) { + T temp = label - shmempred[coloffset +j]; + atomicAdd(&shmemmse[j + coloffset], temp*temp); + } + } + + } + + __syncthreads(); + + for (int i = threadIdx.x; i < ncols*nbins; i += blockDim.x) { + atomicAdd(&mseout[i], shmemmse[i]); + } +} + /* The output of the function is a histogram array, of size ncols * nbins * n_unique_lables column order is as per colids (bootstrapped random cols) for each col there are nbins histograms */ template -__global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const T* __restrict__ globalminmax, T* mseout, int* countout) { +__global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const T* __restrict__ globalminmax, T* predout, int* countout) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; T *minmaxshared = (T*)shmem; - T *shmemmse = (T*)(shmem + 2*ncols*sizeof(T)); + T *shmempred = (T*)(shmem + 2*ncols*sizeof(T)); int *shmemcount = (int*)(shmem + 2*ncols*sizeof(T) + nbins*ncols*sizeof(T)); for (int i=threadIdx.x; i < 2*ncols; i += blockDim.x) { @@ -41,7 +91,7 @@ __global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data } for (int i = threadIdx.x; i < nbins*ncols; i += blockDim.x) { - shmemmse[i] = 0; + shmempred[i] = 0; shmemcount[i] = 0; } @@ -62,7 +112,7 @@ __global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data if (localdata <= quesval) { atomicAdd(&shmemcount[j + coloffset], 1); - atomicAdd(&shmemmse[j + coloffset], label); + atomicAdd(&shmempred[j + coloffset], label); } } @@ -70,22 +120,22 @@ __global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data __syncthreads(); - for (int i = threadIdx.x; i < ncols*n_unique_labels*nbins; i += blockDim.x) { - atomicAdd(&mseout[i], shmemmse[i]); + for (int i = threadIdx.x; i < ncols*nbins; i += blockDim.x) { + atomicAdd(&predout[i], shmempred[i]); atomicAdd(&countout[i], shmemcount[i]); } } template -__global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const int n_unique_labels, T* mseout, int* countout, const T* __restrict__ quantile) { +__global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T* __restrict__ quantile) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; - T *shmemmse = (T*) (shmem); + T *shmempred = (T*) (shmem); int *shmemcount = (int*)(shmem + nbins*ncols*sizeof(T)); - 
for (int i = threadIdx.x; i < n_unique_labels*nbins*ncols; i += blockDim.x) { - shmemmse[i] = 0; + for (int i = threadIdx.x; i < nbins*ncols; i += blockDim.x) { + shmempred[i] = 0; shmemcount[i] = 0; } @@ -103,7 +153,7 @@ __global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restri T quesval = quantile[quantile_index]; if (localdata <= quesval) { atomicAdd(&shmemcount[j + coloffset], 1); - atomicAdd(&shmemmse[j + coloffset], label); + atomicAdd(&shmempred[j + coloffset], label); } } @@ -111,14 +161,56 @@ __global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restri __syncthreads(); - for (int i = threadIdx.x; i < ncols*n_unique_labels*nbins; i += blockDim.x) { - atomicAdd(&mseout[i], shmemmse[i]); + for (int i = threadIdx.x; i < ncols*nbins; i += blockDim.x) { + atomicAdd(&predout[i], shmempred[i]); atomicAdd(&countout[i], shmemcount[i]); } } template -void find_best_split(const std::shared_ptr> tempmem, const int nbins, const int n_unique_labels, const std::vector& col_selector, MetricInfo split_info[3], const int nrows, MetricQuestion & ques, float & gain, const int split_algo) { +__global__ void compute_mse_global_quantile_kernel(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, T* mseout, int* countout, const T* __restrict__ quantile) { + + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ char shmem[]; + T *shmempred = (T*) (shmem); + T *shmemmse = (T*)(shmem + nbins*ncols*sizeof(T)); + int *shmemcount = (int*)(shmem + 2*nbins*ncols*sizeof(T)); + + for (int i = threadIdx.x; i < nbins*ncols; i += blockDim.x) { + shmemcount[i] = countout[i]; + shmempred[i] = predout[i] / shmemcount[i]; + shmemmse[i] = 0.0; + } + + __syncthreads(); + + for (unsigned int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) { + int mycolid = (int)( i / nrows); + int coloffset = mycolid*nbins; + + // nbins is # batched bins. 
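+		// (Second pass: shmempred already holds the per-bin label means, so this loop
+		// accumulates the left-branch squared residuals (label - mean)^2 into shmemmse.)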
+ T localdata = data[i]; + T label = labels[ rowids[ i % nrows ] ]; + for (int j=0; j < nbins; j++) { + int quantile_index = colids[mycolid] * nbins + j; + T quesval = quantile[quantile_index]; + if (localdata <= quesval) { + T temp = label - shmempred[coloffset +j]; + atomicAdd(&shmemmse[j + coloffset], temp*temp); + } + } + + } + + __syncthreads(); + + for (int i = threadIdx.x; i < ncols*nbins; i += blockDim.x) { + atomicAdd(&mseout[i], shmemmse[i]); + } +} + +template +void find_best_split(const std::shared_ptr> tempmem, const int nbins, const std::vector& col_selector, MetricInfo split_info[3], const int nrows, MetricQuestion & ques, float & gain, const int split_algo) { gain = 0.0f; int best_col_id = -1; @@ -183,12 +275,14 @@ void best_split_all_cols(const T *data, const unsigned int* rowids, const T *lab int *h_histout = tempmem->h_histout->data(); T* d_mseout = tempmem->d_mseout->data(); T* h_mseout = tempmem->h_mseout->data(); + T* d_predout = tempmem->d_predout->data(); int ncols = colselector.size(); int col_minmax_bytes = sizeof(T) * 2 * ncols; - int n_mse_bytes = nbins * sizeof(T) * ncols; + int n_pred_bytes = nbins * sizeof(T) * ncols; int n_count_bytes = nbins * ncols * sizeof(int); - + int n_mse_bytes = nbins * sizeof(T) * ncols; + CUDA_CHECK(cudaMemsetAsync((void*)d_mseout, 0, n_hist_bytes, tempmem->stream)); int threads = 512; @@ -208,14 +302,16 @@ void best_split_all_cols(const T *data, const unsigned int* rowids, const T *lab allcolsampler_kernel<<stream>>>(data, rowids, d_colids, nrows, ncols, rowoffset, tempmem->temp_data->data()); } CUDA_CHECK(cudaGetLastError()); - - shmemsize = n_mse_bytes + n_count_bytes; + + shmemsize = n_pred_bytes + n_count_bytes; if (split_algo == ML::SPLIT_ALGO::HIST) { shmemsize += col_minmax_bytes; - all_cols_histograms_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_globalminmax, d_mseout, d_histout); + all_cols_histograms_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_globalminmax, d_predout, d_histout); + compute_mse_minmax_kernel<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_globalminmax, d_mseout, d_predout, d_histout); } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { - all_cols_histograms_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_mseout, d_histout, tempmem->d_quantile->data()); + all_cols_histograms_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_predout, d_histout, tempmem->d_quantile->data()); + compute_mse_global_quantile_kernel<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_mseout, d_predout, d_histout, tempmem->d_quantile->data()); } CUDA_CHECK(cudaGetLastError()); diff --git a/cuML/src/decisiontree/memory.cuh b/cuML/src/decisiontree/memory.cuh index 4c78fd1d25..915d5be0f5 100644 --- a/cuML/src/decisiontree/memory.cuh +++ b/cuML/src/decisiontree/memory.cuh @@ -36,7 +36,7 @@ struct TemporaryMemory MLCommon::device_buffer *d_globalminmax; MLCommon::device_buffer *d_histout, *d_colids; MLCommon::host_buffer *h_histout; - MLCommon::device_buffer *d_mseout; + MLCommon::device_buffer *d_mseout, *d_predout; MLCommon::host_buffer *h_mseout; //Below pointers are shared for split functions @@ -105,6 +105,7 @@ struct TemporaryMemory 
d_globalminmax = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols * 2); d_histout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_hist_elements * Ncols); d_mseout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); + d_predout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); d_colids = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); totalmem += (n_hist_elements * sizeof(int) + sizeof(int) + 3*sizeof(T))* Ncols; From feaac6dd55eee29deb08818f49801838e8d66e92 Mon Sep 17 00:00:00 2001 From: Vishal Mehta Date: Thu, 9 May 2019 17:18:54 +0200 Subject: [PATCH 08/51] added all regression code / kernels, now compiles, next step is testing --- cuML/src/decisiontree/decisiontree.cu | 52 ++++++++---- cuML/src/decisiontree/decisiontree.h | 8 +- .../decisiontree/kernels/col_condenser.cuh | 5 +- .../{evaluate.cuh => evaluate_classifier.cuh} | 14 ++-- ..._regression.cuh => evaluate_regressor.cuh} | 81 +++++++++++-------- cuML/src/decisiontree/kernels/gini.cuh | 75 ++++++++++++++++- cuML/src/decisiontree/kernels/gini_def.h | 4 +- cuML/src/decisiontree/memory.cuh | 13 ++- 8 files changed, 183 insertions(+), 69 deletions(-) rename cuML/src/decisiontree/kernels/{evaluate.cuh => evaluate_classifier.cuh} (80%) rename cuML/src/decisiontree/kernels/{evaluate_regression.cuh => evaluate_regressor.cuh} (76%) diff --git a/cuML/src/decisiontree/decisiontree.cu b/cuML/src/decisiontree/decisiontree.cu index 4418ed7508..f96f2b7b90 100644 --- a/cuML/src/decisiontree/decisiontree.cu +++ b/cuML/src/decisiontree/decisiontree.cu @@ -19,7 +19,8 @@ #include "kernels/gini.cuh" #include "kernels/split_labels.cuh" #include "kernels/col_condenser.cuh" -#include "kernels/evaluate.cuh" +#include "kernels/evaluate_classifier.cuh" +#include "kernels/evaluate_regressor.cuh" #include "kernels/quantile.cuh" namespace ML { @@ -268,7 +269,7 @@ void DecisionTreeClassifier::plant(const cumlHandle_impl& handle, T *data, co } this->total_temp_mem = this->tempmem[0]->totalmem; this->total_temp_mem *= this->MAXSTREAMS; - MetricInfo split_info; + MetricInfo split_info; MLCommon::TimerCPU timer; this->root = grow_tree(data, colper, labels, 0, rowids, n_sampled_rows, split_info); this->construct_time = timer.getElapsedSeconds(); @@ -282,13 +283,13 @@ void DecisionTreeClassifier::plant(const cumlHandle_impl& handle, T *data, co template TreeNode* DecisionTreeClassifier::grow_tree(T *data, const float colper, int *labels, int depth, unsigned int* rowids, - const int n_sampled_rows, MetricInfo prev_split_info) { + const int n_sampled_rows, MetricInfo prev_split_info) { TreeNode *node = new TreeNode(); MetricQuestion ques; Question node_ques; float gain = 0.0; - MetricInfo split_info[3]; // basis, left, right. Populate this + MetricInfo split_info[3]; // basis, left, right. 
Populate this split_info[0] = prev_split_info; bool condition = ((depth != 0) && (prev_split_info.best_metric == 0.0f)); // This node is a leaf, no need to search for best split @@ -327,7 +328,7 @@ TreeNode* DecisionTreeClassifier::grow_tree(T *data, const float colp template void DecisionTreeClassifier::find_best_fruit_all(T *data, int *labels, const float colper, MetricQuestion & ques, float& gain, - unsigned int* rowids, const int n_sampled_rows, MetricInfo split_info[3], int depth) { + unsigned int* rowids, const int n_sampled_rows, MetricInfo split_info[3], int depth) { std::vector& colselector = this->feature_selector; // Optimize ginibefore; no need to compute except for root. @@ -338,14 +339,14 @@ void DecisionTreeClassifier::find_best_fruit_all(T *data, int *labels, const CUDA_CHECK(cudaStreamSynchronize(this->tempmem[0]->stream)); int *labelptr = this->tempmem[0]->sampledlabels->data(); - get_sampled_labels(labels, labelptr, rowids, n_sampled_rows, this->tempmem[0]->stream); + get_sampled_labels(labels, labelptr, rowids, n_sampled_rows, this->tempmem[0]->stream); gini(labelptr, n_sampled_rows, this->tempmem[0], split_info[0], this->n_unique_labels); //Unregister CUDA_CHECK(cudaHostUnregister(colselector.data())); } int current_nbins = (n_sampled_rows < this->nbins) ? n_sampled_rows : this->nbins; - best_split_all_cols(data, rowids, labels, current_nbins, n_sampled_rows, this->n_unique_labels, this->dinfo.NLocalrows, colselector, + best_split_all_cols_classifier(data, rowids, labels, current_nbins, n_sampled_rows, this->n_unique_labels, this->dinfo.NLocalrows, colselector, this->tempmem[0], &split_info[0], ques, gain, this->split_algo); } @@ -413,9 +414,11 @@ void DecisionTreeRegressor::plant(const cumlHandle_impl& handle, T *data, con if (this->split_algo == SPLIT_ALGO::HIST) { this->shmem_used += 2 * sizeof(T) * ncols; - this->shmem_used += this->nbins * this->n_unique_labels * sizeof(int) * ncols; + this->shmem_used += this->nbins * sizeof(T) * ncols * 2; + this->shmem_used += this->nbins * sizeof(int) * ncols; } else { - this->shmem_used += this->nbins * this->n_unique_labels * sizeof(int) * ncols; + this->shmem_used += this->nbins * sizeof(T) * ncols * 2; + this->shmem_used += this->nbins * sizeof(int) * ncols; } ASSERT(this->shmem_used <= this->max_shared_mem, "Shared memory per block limit %zd , requested %zd \n", this->max_shared_mem, this->shmem_used); @@ -427,7 +430,7 @@ void DecisionTreeRegressor::plant(const cumlHandle_impl& handle, T *data, con } this->total_temp_mem = this->tempmem[0]->totalmem; this->total_temp_mem *= this->MAXSTREAMS; - MetricInfo split_info; + MetricInfo split_info; MLCommon::TimerCPU timer; this->root = grow_tree(data, colper, labels, 0, rowids, n_sampled_rows, split_info); this->construct_time = timer.getElapsedSeconds(); @@ -441,14 +444,14 @@ void DecisionTreeRegressor::plant(const cumlHandle_impl& handle, T *data, con template TreeNode* DecisionTreeRegressor::grow_tree(T *data, const float colper, T *labels, int depth, unsigned int* rowids, - const int n_sampled_rows, MetricInfo prev_split_info) { + const int n_sampled_rows, MetricInfo prev_split_info) { //TODO FIXME - method body mostly copied from Classifier. FIXME TreeNode *node = new TreeNode(); MetricQuestion ques; Question node_ques; float gain = 0.0; - MetricInfo split_info[3]; // basis, left, right. Populate this + MetricInfo split_info[3]; // basis, left, right. 
Populate this split_info[0] = prev_split_info; bool condition = ((depth != 0) && (prev_split_info.best_metric == 0.0f)); // This node is a leaf, no need to search for best split @@ -466,7 +469,7 @@ TreeNode* DecisionTreeRegressor::grow_tree(T *data, const float colper, condition = (condition || (this->leaf_counter >= this->maxleaves)); // FIXME not fully respecting maxleaves, but >= constraints it more than == if (condition) { - node->prediction = get_class_hist(split_info[0].hist); + node->prediction = split_info[0].predict; node->split_metric_val = split_info[0].best_metric; this->leaf_counter++; @@ -486,9 +489,28 @@ TreeNode* DecisionTreeRegressor::grow_tree(T *data, const float colper, template void DecisionTreeRegressor::find_best_fruit_all(T *data, T *labels, const float colper, MetricQuestion & ques, float& gain, - unsigned int* rowids, const int n_sampled_rows, MetricInfo split_info[3], int depth) { + unsigned int* rowids, const int n_sampled_rows, MetricInfo split_info[3], int depth) { + + std::vector& colselector = this->feature_selector; + + // Optimize ginibefore; no need to compute except for root. + if (depth == 0) { + CUDA_CHECK(cudaHostRegister(colselector.data(), sizeof(int) * colselector.size(), cudaHostRegisterDefault)); + // Copy sampled column IDs to device memory + CUDA_CHECK(cudaMemcpyAsync(this->tempmem[0]->d_colids->data(), colselector.data(), sizeof(int) * colselector.size(), cudaMemcpyHostToDevice, this->tempmem[0]->stream)); + CUDA_CHECK(cudaStreamSynchronize(this->tempmem[0]->stream)); + + T *labelptr = this->tempmem[0]->sampledlabels->data(); + get_sampled_labels(labels, labelptr, rowids, n_sampled_rows, this->tempmem[0]->stream); + mse(labelptr, n_sampled_rows, this->tempmem[0], split_info[0]); + //Unregister + CUDA_CHECK(cudaHostUnregister(colselector.data())); + } - //TODO FIXME - placeholder + int current_nbins = (n_sampled_rows < this->nbins) ? 
n_sampled_rows : this->nbins; + best_split_all_cols_regressor(data, rowids, labels, current_nbins, n_sampled_rows, this->dinfo.NLocalrows, colselector, + this->tempmem[0], &split_info[0], ques, gain, this->split_algo); + } // ---------------- Regression end diff --git a/cuML/src/decisiontree/decisiontree.h b/cuML/src/decisiontree/decisiontree.h index 35ba197f99..198559d605 100644 --- a/cuML/src/decisiontree/decisiontree.h +++ b/cuML/src/decisiontree/decisiontree.h @@ -144,11 +144,11 @@ class DecisionTreeClassifier : public dt { void plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels, int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2, bool cfg_bootstrap_features=false); - TreeNode * grow_tree(T *data, const float colper, int *labels, int depth, unsigned int* rowids, const int n_sampled_rows, MetricInfo prev_split_info); + TreeNode * grow_tree(T *data, const float colper, int *labels, int depth, unsigned int* rowids, const int n_sampled_rows, MetricInfo prev_split_info); /* depth is used to distinguish between root and other tree nodes for computations */ void find_best_fruit_all(T *data, int *labels, const float colper, MetricQuestion & ques, float& gain, unsigned int* rowids, - const int n_sampled_rows, MetricInfo split_info[3], int depth); + const int n_sampled_rows, MetricInfo split_info[3], int depth); }; // End DecisionTreeClassifier Class template @@ -162,11 +162,11 @@ class DecisionTreeRegressor : public dt { void plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels = 1, int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2, bool cfg_bootstrap_features=false); - TreeNode * grow_tree(T *data, const float colper, T *labels, int depth, unsigned int* rowids, const int n_sampled_rows, MetricInfo prev_split_info); + TreeNode * grow_tree(T *data, const float colper, T *labels, int depth, unsigned int* rowids, const int n_sampled_rows, MetricInfo prev_split_info); /* depth is used to distinguish between root and other tree nodes for computations */ void find_best_fruit_all(T *data, T *labels, const float colper, MetricQuestion & ques, float& gain, unsigned int* rowids, - const int n_sampled_rows, MetricInfo split_info[3], int depth); + const int n_sampled_rows, MetricInfo split_info[3], int depth); }; // End DecisionTreeRegressor Class } //End namespace DecisionTree diff --git a/cuML/src/decisiontree/kernels/col_condenser.cuh b/cuML/src/decisiontree/kernels/col_condenser.cuh index b20bce3c88..462fc57673 100644 --- a/cuML/src/decisiontree/kernels/col_condenser.cuh +++ b/cuML/src/decisiontree/kernels/col_condenser.cuh @@ -28,9 +28,10 @@ __global__ void get_sampled_column_kernel(const T* __restrict__ column, T *outco return; } -void get_sampled_labels(const int *labels, int *outlabels, unsigned int* rowids, const int n_sampled_rows, const cudaStream_t stream) { +template +void get_sampled_labels(const T *labels, T *outlabels, unsigned int* rowids, const int n_sampled_rows, const cudaStream_t stream) { int threads = 128; - get_sampled_column_kernel<<>>(labels, outlabels, rowids, n_sampled_rows); + get_sampled_column_kernel<<>>(labels, outlabels, rowids, n_sampled_rows); 
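+	// (Gathers labels[rowids[i]] into a dense outlabels array; templating the gather
+	// on T lets the same kernel serve int class labels and float/double regression targets.)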
CUDA_CHECK(cudaGetLastError()); return; } diff --git a/cuML/src/decisiontree/kernels/evaluate.cuh b/cuML/src/decisiontree/kernels/evaluate_classifier.cuh similarity index 80% rename from cuML/src/decisiontree/kernels/evaluate.cuh rename to cuML/src/decisiontree/kernels/evaluate_classifier.cuh index ddaf8fb4f2..f9b3d129fd 100644 --- a/cuML/src/decisiontree/kernels/evaluate.cuh +++ b/cuML/src/decisiontree/kernels/evaluate_classifier.cuh @@ -28,7 +28,7 @@ column order is as per colids (bootstrapped random cols) for each col there are nbins histograms */ template -__global__ void all_cols_histograms_kernel(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const int n_unique_labels, const T* __restrict__ globalminmax, int* histout) { +__global__ void all_cols_histograms_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const int n_unique_labels, const T* __restrict__ globalminmax, int* histout) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; @@ -73,7 +73,7 @@ __global__ void all_cols_histograms_kernel(const T* __restrict__ data, const int } template -__global__ void all_cols_histograms_global_quantile_kernel(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const int n_unique_labels, int* histout, const T* __restrict__ quantile) { +__global__ void all_cols_histograms_global_quantile_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const int n_unique_labels, int* histout, const T* __restrict__ quantile) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; @@ -110,7 +110,7 @@ __global__ void all_cols_histograms_global_quantile_kernel(const T* __restrict__ } template -void find_best_split(const std::shared_ptr> tempmem, const int nbins, const int n_unique_labels, const std::vector& col_selector, MetricInfo split_info[3], const int nrows, MetricQuestion & ques, float & gain, const int split_algo) { +void find_best_split_classifier(const std::shared_ptr> tempmem, const int nbins, const int n_unique_labels, const std::vector& col_selector, MetricInfo split_info[3], const int nrows, MetricQuestion & ques, float & gain, const int split_algo) { gain = 0.0f; int best_col_id = -1; @@ -193,7 +193,7 @@ void find_best_split(const std::shared_ptr> tempmem, const template -void best_split_all_cols(const T *data, const unsigned int* rowids, const L *labels, const int nbins, const int nrows, const int n_unique_labels, const int rowoffset, const std::vector& colselector, const std::shared_ptr> tempmem, MetricInfo split_info[3], MetricQuestion & ques, float & gain, const int split_algo) +void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, const L *labels, const int nbins, const int nrows, const int n_unique_labels, const int rowoffset, const std::vector& colselector, const std::shared_ptr> tempmem, MetricInfo split_info[3], MetricQuestion & ques, float & gain, const int split_algo) { int* 
d_colids = tempmem->d_colids->data(); T* d_globalminmax = tempmem->d_globalminmax->data(); @@ -228,16 +228,16 @@ void best_split_all_cols(const T *data, const unsigned int* rowids, const L *lab if (split_algo == ML::SPLIT_ALGO::HIST) { shmemsize += col_minmax_bytes; - all_cols_histograms_kernel<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, n_unique_labels, d_globalminmax, d_histout); + all_cols_histograms_kernel_class<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, n_unique_labels, d_globalminmax, d_histout); } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { - all_cols_histograms_global_quantile_kernel<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, n_unique_labels, d_histout, tempmem->d_quantile->data()); + all_cols_histograms_global_quantile_kernel_class<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, n_unique_labels, d_histout, tempmem->d_quantile->data()); } CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaMemcpyAsync(h_histout, d_histout, n_hist_bytes, cudaMemcpyDeviceToHost, tempmem->stream)); CUDA_CHECK(cudaStreamSynchronize(tempmem->stream)); - find_best_split(tempmem, nbins, n_unique_labels, colselector, &split_info[0], nrows, ques, gain, split_algo); + find_best_split_classifier(tempmem, nbins, n_unique_labels, colselector, &split_info[0], nrows, ques, gain, split_algo); return; } diff --git a/cuML/src/decisiontree/kernels/evaluate_regression.cuh b/cuML/src/decisiontree/kernels/evaluate_regressor.cuh similarity index 76% rename from cuML/src/decisiontree/kernels/evaluate_regression.cuh rename to cuML/src/decisiontree/kernels/evaluate_regressor.cuh index a7db1c6b1f..a22dc9df99 100644 --- a/cuML/src/decisiontree/kernels/evaluate_regression.cuh +++ b/cuML/src/decisiontree/kernels/evaluate_regressor.cuh @@ -24,7 +24,7 @@ #include "../algo_helper.h" template -__global__ void compute_mse_minmax_kernel(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const T* __restrict__ globalminmax, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout) { +__global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const T* __restrict__ globalminmax, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; @@ -36,15 +36,15 @@ __global__ void compute_mse_minmax_kernel(const T* __restrict__ data, const T* _ for (int i=threadIdx.x; i < 2*ncols; i += blockDim.x) { minmaxshared[i] = globalminmax[i]; } - + for (int i = threadIdx.x; i < nbins*ncols; i += blockDim.x) { shmemcount[i] = countout[i]; shmempred[i] = predout[i] / shmemcount[i]; shmemmse[i] = 0.0; } - + __syncthreads(); - + for (unsigned int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) { int mycolid = (int)( i / nrows); int coloffset = mycolid*nbins; @@ -127,7 +127,7 @@ __global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data } template -__global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restrict__ data, const int* __restrict__ 
labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T* __restrict__ quantile) { +__global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, T* predout, int* countout, const T* __restrict__ quantile) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; @@ -160,7 +160,7 @@ __global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restri } __syncthreads(); - + for (int i = threadIdx.x; i < ncols*nbins; i += blockDim.x) { atomicAdd(&predout[i], shmempred[i]); atomicAdd(&countout[i], shmemcount[i]); @@ -168,26 +168,26 @@ __global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restri } template -__global__ void compute_mse_global_quantile_kernel(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, T* mseout, int* countout, const T* __restrict__ quantile) { - +__global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T* __restrict__ quantile) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; T *shmempred = (T*) (shmem); T *shmemmse = (T*)(shmem + nbins*ncols*sizeof(T)); int *shmemcount = (int*)(shmem + 2*nbins*ncols*sizeof(T)); - + for (int i = threadIdx.x; i < nbins*ncols; i += blockDim.x) { shmemcount[i] = countout[i]; shmempred[i] = predout[i] / shmemcount[i]; shmemmse[i] = 0.0; } - + __syncthreads(); - + for (unsigned int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) { int mycolid = (int)( i / nrows); int coloffset = mycolid*nbins; - + // nbins is # batched bins. T localdata = data[i]; T label = labels[ rowids[ i % nrows ] ]; @@ -199,45 +199,50 @@ __global__ void compute_mse_global_quantile_kernel(const T* __restrict__ data, c atomicAdd(&shmemmse[j + coloffset], temp*temp); } } - + } - + __syncthreads(); - + for (int i = threadIdx.x; i < ncols*nbins; i += blockDim.x) { atomicAdd(&mseout[i], shmemmse[i]); } } template -void find_best_split(const std::shared_ptr> tempmem, const int nbins, const std::vector& col_selector, MetricInfo split_info[3], const int nrows, MetricQuestion & ques, float & gain, const int split_algo) { - +void find_best_split_regressor(const std::shared_ptr> tempmem, const int nbins, const std::vector& col_selector, MetricInfo split_info[3], const int nrows, MetricQuestion & ques, float & gain, const int split_algo) { + gain = 0.0f; int best_col_id = -1; int best_bin_id = -1; - + int n_cols = col_selector.size(); for (int col_id = 0; col_id < n_cols; col_id++) { - + int col_count_base_index = col_id * nbins; // tempmem->h_histout holds n_cols histograms of nbins of n_unique_labels each. 
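+		// (Regressor bookkeeping: h_histout holds left-branch row counts per bin here,
+		// while h_predout and h_mseout carry the matching label sums and squared errors.)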
for (int i = 0; i < nbins; i++) { - - int tmp_lnrows += tempmem->h_histout->data()[col_hist_base_index + i]; + + int tmp_lnrows = tempmem->h_histout->data()[col_count_base_index + i]; int tmp_rnrows = nrows - tmp_lnrows; - + if (tmp_lnrows == 0 || tmp_rnrows == 0) continue; + float tmp_pred_left = tempmem->h_predout->data()[col_count_base_index + i]; + float tmp_pred_right = (nrows * split_info[0].predict) - tmp_pred_left; + tmp_pred_left /= tmp_lnrows; + tmp_pred_right /= tmp_rnrows; + // Compute MSE right and MSE left value for each bin. - float tmp_mse_left = tempmem->h_mseout->data()[col_hist_base_index + i]; + float tmp_mse_left = tempmem->h_mseout->data()[col_count_base_index + i]; float tmp_mse_right = (nrows * split_info[0].best_metric) - tmp_mse_left; tmp_mse_left /= tmp_lnrows; tmp_mse_right /= tmp_rnrows; float impurity = (tmp_lnrows * 1.0f/nrows) * tmp_mse_left + (tmp_rnrows * 1.0f/nrows) * tmp_mse_right; float info_gain = split_info[0].best_metric - impurity; - + // Compute best information col_gain so far if (info_gain > gain) { gain = info_gain; @@ -245,6 +250,8 @@ void find_best_split(const std::shared_ptr> tempmem, const in best_col_id = col_id; split_info[1].best_metric = tmp_mse_left; split_info[2].best_metric = tmp_mse_right; + split_info[1].predict = tmp_pred_left; + split_info[2].predict = tmp_pred_right; } } } @@ -267,7 +274,7 @@ void find_best_split(const std::shared_ptr> tempmem, const in template -void best_split_all_cols(const T *data, const unsigned int* rowids, const T *labels, const int nbins, const int nrows, const int rowoffset, const std::vector& colselector, const std::shared_ptr> tempmem, MetricInfo split_info[3], MetricQuestion & ques, float & gain, const int split_algo) +void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, const T *labels, const int nbins, const int nrows, const int rowoffset, const std::vector& colselector, const std::shared_ptr> tempmem, MetricInfo split_info[3], MetricQuestion & ques, float & gain, const int split_algo) { int* d_colids = tempmem->d_colids->data(); T* d_globalminmax = tempmem->d_globalminmax->data(); @@ -276,6 +283,7 @@ void best_split_all_cols(const T *data, const unsigned int* rowids, const T *lab T* d_mseout = tempmem->d_mseout->data(); T* h_mseout = tempmem->h_mseout->data(); T* d_predout = tempmem->d_predout->data(); + T* h_predout = tempmem->h_predout->data(); int ncols = colselector.size(); int col_minmax_bytes = sizeof(T) * 2 * ncols; @@ -283,16 +291,18 @@ void best_split_all_cols(const T *data, const unsigned int* rowids, const T *lab int n_count_bytes = nbins * ncols * sizeof(int); int n_mse_bytes = nbins * sizeof(T) * ncols; - CUDA_CHECK(cudaMemsetAsync((void*)d_mseout, 0, n_hist_bytes, tempmem->stream)); - + CUDA_CHECK(cudaMemsetAsync((void*)d_mseout, 0, n_mse_bytes, tempmem->stream)); + CUDA_CHECK(cudaMemsetAsync((void*)d_predout, 0, n_pred_bytes, tempmem->stream)); + CUDA_CHECK(cudaMemsetAsync((void*)d_histout, 0, n_count_bytes, tempmem->stream)); + int threads = 512; int blocks = MLCommon::ceildiv(nrows * ncols, threads); if (blocks > 65536) blocks = 65536; - + /* Kernel allcolsampler_*_kernel: - - populates tempmem->tempdata with the sampled column data, - - and computes min max histograms in tempmem->d_globalminmax *if minmax in name + - populates tempmem->tempdata with the sampled column data, + - and computes min max histograms in tempmem->d_globalminmax *if minmax in name across all columns. 
*/ size_t shmemsize = col_minmax_bytes; @@ -304,23 +314,24 @@ void best_split_all_cols(const T *data, const unsigned int* rowids, const T *lab CUDA_CHECK(cudaGetLastError()); shmemsize = n_pred_bytes + n_count_bytes; - + if (split_algo == ML::SPLIT_ALGO::HIST) { shmemsize += col_minmax_bytes; all_cols_histograms_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_globalminmax, d_predout, d_histout); - compute_mse_minmax_kernel<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_globalminmax, d_mseout, d_predout, d_histout); + compute_mse_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_globalminmax, d_mseout, d_predout, d_histout); } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { all_cols_histograms_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_predout, d_histout, tempmem->d_quantile->data()); - compute_mse_global_quantile_kernel<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_mseout, d_predout, d_histout, tempmem->d_quantile->data()); + compute_mse_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_mseout, d_predout, d_histout, tempmem->d_quantile->data()); } CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaMemcpyAsync(h_mseout, d_mseout, n_mse_bytes, cudaMemcpyDeviceToHost, tempmem->stream)); CUDA_CHECK(cudaMemcpyAsync(h_histout, d_histout, n_count_bytes, cudaMemcpyDeviceToHost, tempmem->stream)); + CUDA_CHECK(cudaMemcpyAsync(h_predout, d_predout, n_pred_bytes, cudaMemcpyDeviceToHost, tempmem->stream)); CUDA_CHECK(cudaStreamSynchronize(tempmem->stream)); - - find_best_split(tempmem, nbins, colselector, &split_info[0], nrows, ques, gain, split_algo); + + find_best_split_regressor(tempmem, nbins, colselector, &split_info[0], nrows, ques, gain, split_algo); return; } diff --git a/cuML/src/decisiontree/kernels/gini.cuh b/cuML/src/decisiontree/kernels/gini.cuh index 4d90af617e..4c23323376 100644 --- a/cuML/src/decisiontree/kernels/gini.cuh +++ b/cuML/src/decisiontree/kernels/gini.cuh @@ -57,7 +57,56 @@ __global__ void gini_kernel(const int* __restrict__ labels, const int nrows, con } template -void gini(int *labels_in, const int nrows, const std::shared_ptr> tempmem, MetricInfo & split_info, int & unique_labels) +__global__ void pred_kernel(const T* __restrict__ labels, const int nrows, T* predout) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + __shared__ T shmempred; + + if (threadIdx.x == 0) + shmempred = 0; + + __syncthreads(); + + if (tid < nrows) { + T label = labels[tid]; + atomicAdd(&shmempred, label); + } + + __syncthreads(); + + if (threadIdx.x == 0) + atomicAdd(predout, shmempred); + + return; +} + +template +__global__ void mse_kernel(const T* __restrict__ labels, const int nrows, const T* predout, T* mseout) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + __shared__ T shmemmse; + + if (threadIdx.x == 0) { + shmemmse = 0; + } + + __syncthreads(); + + if (tid < nrows) { + T label = labels[tid] - (predout[0]/nrows); + atomicAdd(&shmemmse, label*label); + } + + __syncthreads(); + + if (threadIdx.x == 0) + atomicAdd(mseout, shmemmse); + + return; +} + +template +void gini(int *labels_in, const int nrows, const std::shared_ptr> tempmem, MetricInfo & split_info, int & unique_labels) 
{ int *dhist = tempmem->d_hist->data(); int *hhist = tempmem->h_hist->data(); @@ -81,3 +130,27 @@ void gini(int *labels_in, const int nrows, const std::shared_ptr +void mse(T *labels_in, const int nrows, const std::shared_ptr> tempmem, MetricInfo & split_info) +{ + T *dpred = tempmem->d_predout->data(); + T *dmse = tempmem->d_mseout->data(); + T *hmse = tempmem->h_mseout->data(); + T *hpred = tempmem->h_predout->data(); + + CUDA_CHECK(cudaMemsetAsync(dpred, 0, sizeof(T), tempmem->stream)); + CUDA_CHECK(cudaMemsetAsync(dmse, 0, sizeof(T), tempmem->stream)); + + pred_kernel<<< MLCommon::ceildiv(nrows, 128), 128, 0, tempmem->stream>>>(labels_in, nrows, dpred); + CUDA_CHECK(cudaGetLastError()); + mse_kernel<<< MLCommon::ceildiv(nrows, 128), 128, 0, tempmem->stream>>>(labels_in, nrows, dpred, dmse); + + CUDA_CHECK(cudaMemcpyAsync(hmse, dmse, sizeof(T), cudaMemcpyDeviceToHost, tempmem->stream)); + CUDA_CHECK(cudaMemcpyAsync(hpred, dpred, sizeof(T), cudaMemcpyDeviceToHost, tempmem->stream)); + CUDA_CHECK(cudaStreamSynchronize(tempmem->stream)); + + split_info.best_metric = (float)hmse[0] / (float)nrows; //Update gini val + split_info.predict = hpred[0] / (T)nrows; + return; +} + diff --git a/cuML/src/decisiontree/kernels/gini_def.h b/cuML/src/decisiontree/kernels/gini_def.h index c565853a28..5de23b77e0 100644 --- a/cuML/src/decisiontree/kernels/gini_def.h +++ b/cuML/src/decisiontree/kernels/gini_def.h @@ -45,7 +45,9 @@ struct MetricQuestion { void set_question_fields(int cfg_bootcolumn, int cfg_column, int cfg_batch_id, int cfg_nbins, int cfg_ncols, T cfg_min, T cfg_max, T cfg_value); }; +template struct MetricInfo { float best_metric = -1.0f; - std::vector hist; //Element hist[i] stores # labels with label i for a given node. + T predict = 0; + std::vector hist; //Element hist[i] stores # labels with label i for a given node. 
for classification }; diff --git a/cuML/src/decisiontree/memory.cuh b/cuML/src/decisiontree/memory.cuh index 915d5be0f5..31a4519b68 100644 --- a/cuML/src/decisiontree/memory.cuh +++ b/cuML/src/decisiontree/memory.cuh @@ -37,7 +37,7 @@ struct TemporaryMemory MLCommon::device_buffer *d_histout, *d_colids; MLCommon::host_buffer *h_histout; MLCommon::device_buffer *d_mseout, *d_predout; - MLCommon::host_buffer *h_mseout; + MLCommon::host_buffer *h_mseout, *h_predout; //Below pointers are shared for split functions MLCommon::device_buffer *d_flags_left, *d_flags_right; @@ -101,6 +101,7 @@ struct TemporaryMemory h_histout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, n_hist_elements * Ncols); h_mseout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, Ncols); + h_predout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, Ncols); d_globalminmax = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols * 2); d_histout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_hist_elements * Ncols); @@ -108,7 +109,7 @@ struct TemporaryMemory d_predout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); d_colids = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); - totalmem += (n_hist_elements * sizeof(int) + sizeof(int) + 3*sizeof(T))* Ncols; + totalmem += (n_hist_elements * sizeof(int) + sizeof(int) + 4*sizeof(T))* Ncols; } @@ -148,7 +149,8 @@ struct TemporaryMemory question_value->release(stream); h_histout->release(stream); h_mseout->release(stream); - + h_predout->release(stream); + delete sampledlabels; delete d_split_temp_storage; delete d_num_selected_out; @@ -158,15 +160,18 @@ struct TemporaryMemory delete question_value; delete h_histout; delete h_mseout; - + delete h_predout; + d_globalminmax->release(stream); d_histout->release(stream); d_mseout->release(stream); + d_predout->release(stream); d_colids->release(stream); delete d_globalminmax; delete d_histout; delete d_mseout; + delete d_predout; delete d_colids; } From 09eb47951da82c2ecbe421121911d89e48136a33 Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Thu, 9 May 2019 09:46:03 -0700 Subject: [PATCH 09/51] Code in flux. Regression-related changes. - RFRegressor flat API signature fixes. - Minor tempmem fixes. - Temp addition to rf_test for regression testing. A very simple test. - Debugging ongoing.
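For orientation before the diffs that follow: the flat rfRegressor API whose signatures this patch fixes up is meant to be driven roughly as below. This is a hedged usage sketch based on the declarations in randomforest.h further down, not verbatim cuML code; handle, tree_params and the d_*/h_* buffers are placeholder names for a cumlHandle, a DecisionTreeParams object, and user-allocated device/host arrays.

    // Hedged sketch: train on device-resident data, then score host-side test data.
    ML::RF_params rf_params(false /*bootstrap*/, false /*bootstrap_features*/, 1 /*n_trees*/, 1.0f /*rows_sample*/, tree_params);
    ML::rfRegressor<float> rf_regressor(rf_params);
    ML::fit(handle, &rf_regressor, d_input, n_rows, n_cols, d_labels);              // device pointers, column major
    ML::predict(handle, &rf_regressor, h_test, n_test_rows, n_cols, h_predictions); // CPU pointers, row major
    ML::RF_metrics metrics = ML::cross_validate(handle, &rf_regressor, h_test, h_ref_labels, n_test_rows, n_cols, h_predictions);
    metrics.print(); // regression path reports mean absolute error, MSE and median absolute error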
--- cuML/src/decisiontree/memory.cuh | 11 +-- cuML/src/randomforest/randomforest.h | 17 ++-- cuML/test/rf_test.cu | 130 +++++++++++++++++++++++++-- 3 files changed, 138 insertions(+), 20 deletions(-) diff --git a/cuML/src/decisiontree/memory.cuh b/cuML/src/decisiontree/memory.cuh index 31a4519b68..9b8cce951a 100644 --- a/cuML/src/decisiontree/memory.cuh +++ b/cuML/src/decisiontree/memory.cuh @@ -100,16 +100,17 @@ struct TemporaryMemory totalmem += split_temp_storage_bytes + (N + 1)*sizeof(int) + 2*N*sizeof(char) + sizeof(T); h_histout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, n_hist_elements * Ncols); - h_mseout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, Ncols); - h_predout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, Ncols); + int mse_elements = Ncols * n_bins; + h_mseout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, mse_elements); + h_predout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, mse_elements); d_globalminmax = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols * 2); d_histout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_hist_elements * Ncols); - d_mseout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); - d_predout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); + d_mseout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, mse_elements); + d_predout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, mse_elements); d_colids = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); - totalmem += (n_hist_elements * sizeof(int) + sizeof(int) + 4*sizeof(T))* Ncols; + totalmem += (n_hist_elements * sizeof(int) + sizeof(int) + 2*sizeof(T) + n_bins * sizeof(T))* Ncols; } diff --git a/cuML/src/randomforest/randomforest.h b/cuML/src/randomforest/randomforest.h index 531a321156..b2fcc6feb9 100644 --- a/cuML/src/randomforest/randomforest.h +++ b/cuML/src/randomforest/randomforest.h @@ -89,6 +89,7 @@ class rf { RF_params rf_params; int rf_type; virtual const DecisionTree::dt * get_trees_ptr() const = 0; + ~rf() = default; public: rf(RF_params cfg_rf_params, int cfg_rf_type=RF_type::CLASSIFICATION); @@ -145,15 +146,15 @@ RF_metrics cross_validate(const cumlHandle& user_handle, const rfClassifier * rf_regressor, float * input, int n_rows, int n_cols, int * labels, int n_unique_labels); -void fit(const cumlHandle& user_handle, rfRegressor * rf_regressor, double * input, int n_rows, int n_cols, int * labels, int n_unique_labels); +void fit(const cumlHandle& user_handle, rfRegressor * rf_regressor, float * input, int n_rows, int n_cols, float * labels); +void fit(const cumlHandle& user_handle, rfRegressor * rf_regressor, double * input, int n_rows, int n_cols, double * labels); -void predict(const cumlHandle& user_handle, const rfRegressor * rf_regressor, const float * input, int n_rows, int n_cols, int * predictions, bool verbose=false); -void predict(const cumlHandle& user_handle, const rfRegressor * rf_regressor, const double * input, int n_rows, int n_cols, int * predictions, bool verbose=false); +void predict(const cumlHandle& user_handle, const rfRegressor * rf_regressor, const float * input, int n_rows, int n_cols, float * predictions, bool verbose=false); +void predict(const cumlHandle& user_handle, const rfRegressor * rf_regressor, const double * input, int n_rows, int n_cols, double * predictions, bool verbose=false); -RF_metrics cross_validate(const 
cumlHandle& user_handle, const rfRegressor * rf_regressor, const float * input, const int * ref_labels, - int n_rows, int n_cols, int * predictions, bool verbose=false); -RF_metrics cross_validate(const cumlHandle& user_handle, const rfRegressor * rf_regressor, const double * input, const int * ref_labels, - int n_rows, int n_cols, int * predictions, bool verbose=false); +RF_metrics cross_validate(const cumlHandle& user_handle, const rfRegressor * rf_regressor, const float * input, const float * ref_labels, + int n_rows, int n_cols, float * predictions, bool verbose=false); +RF_metrics cross_validate(const cumlHandle& user_handle, const rfRegressor * rf_regressor, const double * input, const double * ref_labels, + int n_rows, int n_cols, double * predictions, bool verbose=false); }; diff --git a/cuML/test/rf_test.cu b/cuML/test/rf_test.cu index 7b681423d9..682322a026 100644 --- a/cuML/test/rf_test.cu +++ b/cuML/test/rf_test.cu @@ -48,7 +48,7 @@ template template -class RfTest: public ::testing::TestWithParam > { +class RfClassifierTest: public ::testing::TestWithParam > { protected: void basicTest() { @@ -135,6 +135,94 @@ protected: std::vector predicted_labels; }; +//------------------------------------------------------------------------------------------------------------------------------------- + +template +class RfRegressorTest: public ::testing::TestWithParam > { +protected: + void basicTest() { + + params = ::testing::TestWithParam>::GetParam(); + + DecisionTree::DecisionTreeParams tree_params(params.max_depth, params.max_leaves, params.max_features, params.n_bins, + params.split_algo, params.min_rows_per_node, params.bootstrap_features); + RF_params rf_params(params.bootstrap, params.bootstrap_features, params.n_trees, params.rows_sample, tree_params); + //rf_params.print(); + + //-------------------------------------------------------- + // Random Forest + //-------------------------------------------------------- + + int data_len = params.n_rows * params.n_cols; + allocate(data, data_len); + allocate(labels, params.n_rows); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream) ); + + // Populate data (assume Col major) + std::vector data_h = {0.0, 0.0, 0.0, 0.0, 10.0, 20.0, 30.0, 40.0}; + data_h.resize(data_len); + updateDevice(data, data_h.data(), data_len, stream); + + // Populate labels + labels_h = {1.0, 2.0, 3.0, 4.0}; + labels_h.resize(params.n_rows); + //preprocess_labels(params.n_rows, labels_h, labels_map); + updateDevice(labels, labels_h.data(), params.n_rows, stream); + + rf_regressor = new typename rfRegressor::rfRegressor(rf_params); + + cumlHandle handle; + handle.setStream(stream); + + fit(handle, rf_regressor, data, params.n_rows, params.n_cols, labels); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); + + // Inference data: same as train, but row major + int inference_data_len = params.n_inference_rows * params.n_cols; + inference_data_h = {0.0, 10.0, 0.0, 20.0, 0.0, 30.0, 0.0, 40.0}; + inference_data_h.resize(inference_data_len); + + + // Predict and compare against known labels + predicted_labels.resize(params.n_inference_rows); + RF_metrics tmp = cross_validate(handle, rf_regressor, inference_data_h.data(), labels_h.data(), + params.n_inference_rows, params.n_cols, predicted_labels.data(), false); + mse = tmp.mean_squared_error; + std::cout << "MSE is " << mse << std::endl; + } + + void SetUp() override { + basicTest(); + } + + void TearDown() override { + mse = -1.0f; // reset mse + inference_data_h.clear(); + 
labels_h.clear(); + predicted_labels.clear(); + + CUDA_CHECK(cudaFree(labels)); + CUDA_CHECK(cudaFree(data)); + delete rf_regressor; + } + +protected: + + RfInputs params; + T * data; + T * labels; + std::vector inference_data_h; + std::vector labels_h; + + rfRegressor * rf_regressor; + float mse = -1.0f; // overridden in each test SetUp and TearDown + + std::vector predicted_labels; +}; +//------------------------------------------------------------------------------------------------------------------------------------- const std::vector > inputsf2 = { {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2}, // single tree forest, bootstrap false, unlimited depth, 4 bins @@ -153,8 +241,8 @@ const std::vector > inputsd2 = { // Same as inputsf2 }; -typedef RfTest RfTestF; -TEST_P(RfTestF, Fit) { +typedef RfClassifierTest RfClassifierTestF; +TEST_P(RfClassifierTestF, Fit) { //rf_classifier->print_rf_detailed(); // Prints all trees in the forest. Leaf nodes use the remapped values from labels_map. if (!params.bootstrap && (params.max_features == 1.0f)) { ASSERT_TRUE(accuracy == 1.0f); @@ -163,8 +251,8 @@ TEST_P(RfTestF, Fit) { } } -typedef RfTest RfTestD; -TEST_P(RfTestD, Fit) { +typedef RfClassifierTest RfClassifierTestD; +TEST_P(RfClassifierTestD, Fit) { if (!params.bootstrap && (params.max_features == 1.0f)) { ASSERT_TRUE(accuracy == 1.0f); } else { @@ -172,8 +260,36 @@ TEST_P(RfTestD, Fit) { } } -INSTANTIATE_TEST_CASE_P(RfTests, RfTestF, ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_CASE_P(RfClassifierTests, RfClassifierTestF, ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_CASE_P(RfClassifierTests, RfClassifierTestD, ::testing::ValuesIn(inputsd2)); + +typedef RfRegressorTest RfRegressorTestF; +TEST_P(RfRegressorTestF, Fit) { + rf_regressor->print_rf_detailed(); // Prints all trees in the forest. + if (!params.bootstrap && (params.max_features == 1.0f)) { + ASSERT_TRUE(mse == 0.0f); + } else { + ASSERT_TRUE(mse <= 0.1f); // Empirically derived mse range.
TODO FIXME + } +} + +typedef RfRegressorTest RfRegressorTestD; +TEST_P(RfRegressorTestD, Fit) { + if (!params.bootstrap && (params.max_features == 1.0f)) { + ASSERT_TRUE(mse == 0.0f); + } else { + ASSERT_TRUE(mse <= 0.2f); + } +} + +const std::vector > inputsf2_temp = { + {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2}}; // single tree forest, bootstrap false, unlimited depth, 4 bins + +//INSTANTIATE_TEST_CASE_P(RfRegressorTests, RfRegressorTestF, ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_CASE_P(RfRegressorTests, RfRegressorTestF, ::testing::ValuesIn(inputsf2_temp)); + +//INSTANTIATE_TEST_CASE_P(RfRegressorTests, RfRegressorTestD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_CASE_P(RfTests, RfTestD, ::testing::ValuesIn(inputsd2)); } // end namespace ML From 1b2b8d3b9cc0b9256723cd769cf449170e72a953 Mon Sep 17 00:00:00 2001 From: Vishal Mehta Date: Fri, 10 May 2019 14:51:22 +0200 Subject: [PATCH 10/51] Fixed right-child MSE; it needs to be computed in the kernel --- cuML/src/decisiontree/decisiontree.cu | 12 +-- .../kernels/evaluate_regressor.cuh | 99 +++++++++++-------- cuML/src/decisiontree/memory.cuh | 4 +- 3 files changed, 65 insertions(+), 50 deletions(-) diff --git a/cuML/src/decisiontree/decisiontree.cu b/cuML/src/decisiontree/decisiontree.cu index f96f2b7b90..1a123d4143 100644 --- a/cuML/src/decisiontree/decisiontree.cu +++ b/cuML/src/decisiontree/decisiontree.cu @@ -405,7 +405,7 @@ void DecisionTreeRegressor::plant(const cumlHandle_impl& handle, T *data, con std::iota(this->feature_selector.begin(), this->feature_selector.end(), 0); } - std::random_shuffle(this->feature_selector.begin(), this->feature_selector.end()); + //std::random_shuffle(this->feature_selector.begin(), this->feature_selector.end()); this->feature_selector.resize((int) (colper * this->dinfo.Ncols)); cudaDeviceProp prop; @@ -414,10 +414,10 @@ void DecisionTreeRegressor::plant(const cumlHandle_impl& handle, T *data, con if (this->split_algo == SPLIT_ALGO::HIST) { this->shmem_used += 2 * sizeof(T) * ncols; - this->shmem_used += this->nbins * sizeof(T) * ncols * 2; + this->shmem_used += this->nbins * sizeof(T) * ncols * 3; this->shmem_used += this->nbins * sizeof(int) * ncols; } else { - this->shmem_used += this->nbins * sizeof(T) * ncols * 2; + this->shmem_used += this->nbins * sizeof(T) * ncols * 3; this->shmem_used += this->nbins * sizeof(int) * ncols; } ASSERT(this->shmem_used <= this->max_shared_mem, "Shared memory per block limit %zd , requested %zd \n", this->max_shared_mem, this->shmem_used); @@ -458,8 +458,8 @@ TreeNode* DecisionTreeRegressor::grow_tree(T *data, const float colper, condition = condition || (n_sampled_rows < this->min_rows_per_node); // Do not split a node with less than min_rows_per_node samples if (!condition) { - find_best_fruit_all(data, labels, colper, ques, gain, rowids, n_sampled_rows, &split_info[0], depth); //ques and gain are output here - condition = condition || (gain == 0.0f); + find_best_fruit_all(data, labels, colper, ques, gain, rowids, n_sampled_rows, split_info, depth); //ques and gain are output here + condition = condition || (gain == 0.0f); } if (this->treedepth != -1) @@ -509,7 +509,7 @@ void DecisionTreeRegressor::find_best_fruit_all(T *data, T *labels, const flo int current_nbins = (n_sampled_rows < this->nbins) ?
n_sampled_rows : this->nbins; best_split_all_cols_regressor(data, rowids, labels, current_nbins, n_sampled_rows, this->dinfo.NLocalrows, colselector, - this->tempmem[0], &split_info[0], ques, gain, this->split_algo); + this->tempmem[0], split_info, ques, gain, this->split_algo); } diff --git a/cuML/src/decisiontree/kernels/evaluate_regressor.cuh b/cuML/src/decisiontree/kernels/evaluate_regressor.cuh index a22dc9df99..7d1ac05fe5 100644 --- a/cuML/src/decisiontree/kernels/evaluate_regressor.cuh +++ b/cuML/src/decisiontree/kernels/evaluate_regressor.cuh @@ -24,23 +24,24 @@ #include "../algo_helper.h" template -__global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const T* __restrict__ globalminmax, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout) { +__global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const T* __restrict__ globalminmax, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T pred_parent) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; T *minmaxshared = (T*)shmem; - T *shmempred = (T*)(shmem + 2*ncols*sizeof(T)); - T *shmemmse = (T*)(shmem + 2*ncols*sizeof(T) + nbins*ncols*sizeof(T)); - int *shmemcount = (int*)(shmem + 2*ncols*sizeof(T) + 2*nbins*ncols*sizeof(T)); + T *shmem_pred = (T*)(shmem + 2*ncols*sizeof(T)); + T *shmem_mse = (T*)(shmem + 2*ncols*sizeof(T) + nbins*ncols*sizeof(T)); + int *shmem_count = (int*)(shmem + 2*ncols*sizeof(T) + 3*nbins*ncols*sizeof(T)); for (int i=threadIdx.x; i < 2*ncols; i += blockDim.x) { minmaxshared[i] = globalminmax[i]; } for (int i = threadIdx.x; i < nbins*ncols; i += blockDim.x) { - shmemcount[i] = countout[i]; - shmempred[i] = predout[i] / shmemcount[i]; - shmemmse[i] = 0.0; + shmem_count[i] = countout[i]; + shmem_pred[i] = predout[i]; + shmem_mse[i] = 0.0; + shmem_mse[i + ncols*nbins] = 0.0; } __syncthreads(); @@ -59,17 +60,23 @@ __global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const T quesval = base_quesval + j * delta; if (localdata <= quesval) { - T temp = label - shmempred[coloffset +j]; - atomicAdd(&shmemmse[j + coloffset], temp*temp); + T temp = shmem_pred[coloffset +j] / shmem_count[coloffset + j] ; + temp = label - temp; + atomicAdd(&shmem_mse[j + coloffset], temp*temp); + } else { + T temp = ( pred_parent*nrows - shmem_pred[coloffset +j] ) / (nrows - shmem_count[coloffset + j] ); + temp = label - temp; + atomicAdd(&shmem_mse[j + coloffset + ncols*nbins], temp*temp); } + } } __syncthreads(); - for (int i = threadIdx.x; i < ncols*nbins; i += blockDim.x) { - atomicAdd(&mseout[i], shmemmse[i]); + for (int i = threadIdx.x; i < 2*ncols*nbins; i += blockDim.x) { + atomicAdd(&mseout[i], shmem_mse[i]); } } @@ -83,16 +90,16 @@ __global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; T *minmaxshared = (T*)shmem; - T *shmempred = (T*)(shmem + 2*ncols*sizeof(T)); - int *shmemcount = (int*)(shmem + 2*ncols*sizeof(T) + nbins*ncols*sizeof(T)); + T *shmem_pred = (T*)(shmem + 2*ncols*sizeof(T)); + int *shmem_count = (int*)(shmem + 
2*ncols*sizeof(T) + nbins*ncols*sizeof(T)); for (int i=threadIdx.x; i < 2*ncols; i += blockDim.x) { minmaxshared[i] = globalminmax[i]; } for (int i = threadIdx.x; i < nbins*ncols; i += blockDim.x) { - shmempred[i] = 0; - shmemcount[i] = 0; + shmem_pred[i] = 0; + shmem_count[i] = 0; } __syncthreads(); @@ -111,8 +118,8 @@ __global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data T quesval = base_quesval + j * delta; if (localdata <= quesval) { - atomicAdd(&shmemcount[j + coloffset], 1); - atomicAdd(&shmempred[j + coloffset], label); + atomicAdd(&shmem_count[j + coloffset], 1); + atomicAdd(&shmem_pred[j + coloffset], label); } } @@ -121,8 +128,8 @@ __global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data __syncthreads(); for (int i = threadIdx.x; i < ncols*nbins; i += blockDim.x) { - atomicAdd(&predout[i], shmempred[i]); - atomicAdd(&countout[i], shmemcount[i]); + atomicAdd(&predout[i], shmem_pred[i]); + atomicAdd(&countout[i], shmem_count[i]); } } @@ -131,12 +138,12 @@ __global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restri int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; - T *shmempred = (T*) (shmem); - int *shmemcount = (int*)(shmem + nbins*ncols*sizeof(T)); + T *shmem_pred = (T*) (shmem); + int *shmem_count = (int*)(shmem + nbins*ncols*sizeof(T)); for (int i = threadIdx.x; i < nbins*ncols; i += blockDim.x) { - shmempred[i] = 0; - shmemcount[i] = 0; + shmem_pred[i] = 0; + shmem_count[i] = 0; } __syncthreads(); @@ -152,8 +159,8 @@ __global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restri int quantile_index = colids[mycolid] * nbins + j; T quesval = quantile[quantile_index]; if (localdata <= quesval) { - atomicAdd(&shmemcount[j + coloffset], 1); - atomicAdd(&shmempred[j + coloffset], label); + atomicAdd(&shmem_count[j + coloffset], 1); + atomicAdd(&shmem_pred[j + coloffset], label); } } @@ -162,24 +169,25 @@ __global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restri __syncthreads(); for (int i = threadIdx.x; i < ncols*nbins; i += blockDim.x) { - atomicAdd(&predout[i], shmempred[i]); - atomicAdd(&countout[i], shmemcount[i]); + atomicAdd(&predout[i], shmem_pred[i]); + atomicAdd(&countout[i], shmem_count[i]); } } template -__global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T* __restrict__ quantile) { +__global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T* __restrict__ quantile, const T pred_parent) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; - T *shmempred = (T*) (shmem); - T *shmemmse = (T*)(shmem + nbins*ncols*sizeof(T)); - int *shmemcount = (int*)(shmem + 2*nbins*ncols*sizeof(T)); + T *shmem_pred = (T*) (shmem); + T *shmem_mse = (T*)(shmem + nbins*ncols*sizeof(T)); + int *shmem_count = (int*)(shmem + 3*nbins*ncols*sizeof(T)); for (int i = threadIdx.x; i < nbins*ncols; i += blockDim.x) { - shmemcount[i] = countout[i]; - shmempred[i] = 
predout[i] / shmemcount[i]; - shmemmse[i] = 0.0; + shmem_count[i] = countout[i]; + shmem_pred[i] = predout[i]; + shmem_mse[i] = 0.0; + shmem_mse[i + nbins*ncols] = 0.0; } __syncthreads(); @@ -194,18 +202,25 @@ __global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ dat for (int j=0; j < nbins; j++) { int quantile_index = colids[mycolid] * nbins + j; T quesval = quantile[quantile_index]; + if (localdata <= quesval) { - T temp = label - shmempred[coloffset +j]; - atomicAdd(&shmemmse[j + coloffset], temp*temp); + T temp = shmem_pred[coloffset +j] / shmem_count[coloffset + j] ; + temp = label - temp; + atomicAdd(&shmem_mse[j + coloffset], temp*temp); + } else { + T temp = ( pred_parent*nrows - shmem_pred[coloffset +j] ) / (nrows - shmem_count[coloffset + j] ); + temp = label - temp; + atomicAdd(&shmem_mse[j + coloffset + ncols*nbins], temp*temp); } + } } __syncthreads(); - for (int i = threadIdx.x; i < ncols*nbins; i += blockDim.x) { - atomicAdd(&mseout[i], shmemmse[i]); + for (int i = threadIdx.x; i < 2*ncols*nbins; i += blockDim.x) { + atomicAdd(&mseout[i], shmem_mse[i]); } } @@ -236,7 +251,7 @@ void find_best_split_regressor(const std::shared_ptr> tempm // Compute MSE right and MSE left value for each bin. float tmp_mse_left = tempmem->h_mseout->data()[col_count_base_index + i]; - float tmp_mse_right = (nrows * split_info[0].best_metric) - tmp_mse_left; + float tmp_mse_right = tempmem->h_mseout->data()[col_count_base_index + i + n_cols*nbins]; tmp_mse_left /= tmp_lnrows; tmp_mse_right /= tmp_rnrows; @@ -289,7 +304,7 @@ void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, co int col_minmax_bytes = sizeof(T) * 2 * ncols; int n_pred_bytes = nbins * sizeof(T) * ncols; int n_count_bytes = nbins * ncols * sizeof(int); - int n_mse_bytes = nbins * sizeof(T) * ncols; + int n_mse_bytes = 2 * nbins * sizeof(T) * ncols; CUDA_CHECK(cudaMemsetAsync((void*)d_mseout, 0, n_mse_bytes, tempmem->stream)); CUDA_CHECK(cudaMemsetAsync((void*)d_predout, 0, n_pred_bytes, tempmem->stream)); @@ -318,10 +333,10 @@ void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, co if (split_algo == ML::SPLIT_ALGO::HIST) { shmemsize += col_minmax_bytes; all_cols_histograms_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_globalminmax, d_predout, d_histout); - compute_mse_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_globalminmax, d_mseout, d_predout, d_histout); + compute_mse_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_globalminmax, d_mseout, d_predout, d_histout, split_info[0].predict); } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { all_cols_histograms_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_predout, d_histout, tempmem->d_quantile->data()); - compute_mse_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_mseout, d_predout, d_histout, tempmem->d_quantile->data()); + compute_mse_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_mseout, d_predout, d_histout, tempmem->d_quantile->data(), split_info[0].predict); } CUDA_CHECK(cudaGetLastError()); diff --git a/cuML/src/decisiontree/memory.cuh 
b/cuML/src/decisiontree/memory.cuh index 9b8cce951a..e7cc600734 100644 --- a/cuML/src/decisiontree/memory.cuh +++ b/cuML/src/decisiontree/memory.cuh @@ -101,12 +101,12 @@ struct TemporaryMemory h_histout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, n_hist_elements * Ncols); int mse_elements = Ncols * n_bins; - h_mseout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, mse_elements); + h_mseout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, 2*mse_elements); h_predout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, mse_elements); d_globalminmax = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols * 2); d_histout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_hist_elements * Ncols); - d_mseout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, mse_elements); + d_mseout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, 2*mse_elements); d_predout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, mse_elements); d_colids = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); From a6457fcf2f046885cfd3af3b2eca412c90e9a05c Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Mon, 13 May 2019 01:19:11 -0700 Subject: [PATCH 11/51] Added support for MSE or MAE split criterion. - DecisionTreeRegressor: supports MSE (mean squared error) or MAE (mean absolute error) as split criterion. - DecisionTreeClassifier: default and only option is GINI. --- cuML/src/decisiontree/algo_helper.h | 5 +++ cuML/src/decisiontree/decisiontree.cu | 36 ++++++++++++------- cuML/src/decisiontree/decisiontree.h | 11 ++++-- .../kernels/evaluate_regressor.cuh | 18 +++++----- cuML/src/decisiontree/kernels/gini.cuh | 33 +++++++++++++---- cuML/src/decisiontree/memory.cuh | 3 +- cuML/test/rf_test.cu | 26 +++++++------- 7 files changed, 87 insertions(+), 45 deletions(-) diff --git a/cuML/src/decisiontree/algo_helper.h b/cuML/src/decisiontree/algo_helper.h index 0e5b6c1a1f..eea7cc5e52 100644 --- a/cuML/src/decisiontree/algo_helper.h +++ b/cuML/src/decisiontree/algo_helper.h @@ -21,4 +21,9 @@ namespace ML { enum SPLIT_ALGO { HIST, GLOBAL_QUANTILE, SPLIT_ALGO_END, }; + +enum CRITERION { + GINI, MSE, MAE, +}; + }; diff --git a/cuML/src/decisiontree/decisiontree.cu b/cuML/src/decisiontree/decisiontree.cu index 1a123d4143..44899ce553 100644 --- a/cuML/src/decisiontree/decisiontree.cu +++ b/cuML/src/decisiontree/decisiontree.cu @@ -58,9 +58,9 @@ DecisionTreeParams::DecisionTreeParams() {} * @brief Decision tree hyper-parameter object constructor to set all DecisionTreeParams members. */ DecisionTreeParams::DecisionTreeParams(int cfg_max_depth, int cfg_max_leaves, float cfg_max_features, int cfg_n_bins, int cfg_split_algo, - int cfg_min_rows_per_node, bool cfg_bootstrap_features):max_depth(cfg_max_depth), max_leaves(cfg_max_leaves), + int cfg_min_rows_per_node, bool cfg_bootstrap_features, CRITERION cfg_split_criterion):max_depth(cfg_max_depth), max_leaves(cfg_max_leaves), max_features(cfg_max_features), n_bins(cfg_n_bins), split_algo(cfg_split_algo), - min_rows_per_node(cfg_min_rows_per_node), bootstrap_features(cfg_bootstrap_features) {} + min_rows_per_node(cfg_min_rows_per_node), bootstrap_features(cfg_bootstrap_features), split_criterion(cfg_split_criterion) {} /** * @brief Check validity of all decision tree hyper-parameters. 
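As a hedged aside on the dispatch pattern this patch introduces (a standalone host-side analogue with invented names, not cuML code): the split-evaluation kernels are instantiated with a squaring functor when split_criterion is CRITERION::MSE and an absolute-value functor for CRITERION::MAE, so one residual loop serves both metrics.

    #include <cmath>
    #include <vector>

    struct SquareOp { static float exec(float x) { return x * x; } };        // MSE-style residual cost
    struct AbsOp    { static float exec(float x) { return std::fabs(x); } }; // MAE-style residual cost

    // Accumulate F::exec(y - mean) over a node's labels and normalize by count.
    template <typename F>
    float node_metric(const std::vector<float>& labels, float node_mean) {
        float acc = 0.0f;
        for (float y : labels) acc += F::exec(y - node_mean);
        return acc / labels.size(); // node MSE for SquareOp, node MAE for AbsOp
    }

For example, node_metric<SquareOp>({1, 2, 3, 4}, 2.5f) evaluates to 1.25, the hand-computed node MSE of the toy labels used in rf_test.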
@@ -215,6 +215,9 @@ void DecisionTreeClassifier::fit(const ML::cumlHandle& handle, T *data, const std::cout << "Resetting n_bins to " << n_sampled_rows << "." << std::endl; tree_params.n_bins = n_sampled_rows; } + if (tree_params.split_criterion != CRITERION::GINI) { // Only GINI split criterion supported for classification. + tree_params.split_criterion = CRITERION::GINI; + } return plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, unique_labels, tree_params.max_depth, tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features); } @@ -234,12 +237,13 @@ void DecisionTreeClassifier::plant(const cumlHandle_impl& handle, T *data, co this->n_unique_labels = unique_labels; this->min_rows_per_node = cfg_min_rows_per_node; this->bootstrap_features = cfg_bootstrap_features; + this->split_criterion = CRITERION::GINI; //Bootstrap features this->feature_selector.resize(this->dinfo.Ncols); if (this->bootstrap_features) { srand(n_bins); - for(int i=0; i < this->dinfo.Ncols; i++) { + for (int i=0; i < this->dinfo.Ncols; i++) { this->feature_selector.push_back( rand() % this->dinfo.Ncols ); } } else { @@ -373,14 +377,12 @@ void DecisionTreeRegressor::fit(const ML::cumlHandle& handle, T *data, const tree_params.n_bins = n_sampled_rows; } plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, 1, tree_params.max_depth, - tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features); + tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features, tree_params.split_criterion); } template void DecisionTreeRegressor::plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids, const int n_sampled_rows, - int unique_labels, int maxdepth, int max_leaf_nodes, const float colper, int n_bins, int split_algo_flag, int cfg_min_rows_per_node, bool cfg_bootstrap_features) { - - //TODO FIXME - method body copied from Classifier as temp placeholder. 
FIXME this->split_algo = split_algo_flag; this->dinfo.NLocalrows = nrows; @@ -393,19 +395,20 @@ void DecisionTreeRegressor::plant(const cumlHandle_impl& handle, T *data, con this->n_unique_labels = unique_labels; this->min_rows_per_node = cfg_min_rows_per_node; this->bootstrap_features = cfg_bootstrap_features; + this->split_criterion = cfg_split_criterion; //Bootstrap features this->feature_selector.resize(this->dinfo.Ncols); if (this->bootstrap_features) { srand(n_bins); - for(int i=0; i < this->dinfo.Ncols; i++) { + for (int i=0; i < this->dinfo.Ncols; i++) { this->feature_selector.push_back( rand() % this->dinfo.Ncols ); } } else { std::iota(this->feature_selector.begin(), this->feature_selector.end(), 0); } - //std::random_shuffle(this->feature_selector.begin(), this->feature_selector.end()); + std::random_shuffle(this->feature_selector.begin(), this->feature_selector.end()); this->feature_selector.resize((int) (colper * this->dinfo.Ncols)); cudaDeviceProp prop; @@ -446,7 +449,6 @@ template TreeNode* DecisionTreeRegressor::grow_tree(T *data, const float colper, T *labels, int depth, unsigned int* rowids, const int n_sampled_rows, MetricInfo prev_split_info) { - //TODO FIXME - method body mostly copied from Classifier. FIXME TreeNode *node = new TreeNode(); MetricQuestion ques; Question node_ques; @@ -502,15 +504,23 @@ void DecisionTreeRegressor::find_best_fruit_all(T *data, T *labels, const flo T *labelptr = this->tempmem[0]->sampledlabels->data(); get_sampled_labels(labels, labelptr, rowids, n_sampled_rows, this->tempmem[0]->stream); - mse(labelptr, n_sampled_rows, this->tempmem[0], split_info[0]); + if (this->split_criterion == CRITERION::MSE) { + mse<T, SquareFunctor>(labelptr, n_sampled_rows, this->tempmem[0], split_info[0]); + } else { + mse<T, AbsFunctor>(labelptr, n_sampled_rows, this->tempmem[0], split_info[0]); + } //Unregister CUDA_CHECK(cudaHostUnregister(colselector.data())); } int current_nbins = (n_sampled_rows < this->nbins) ? n_sampled_rows : this->nbins; - best_split_all_cols_regressor(data, rowids, labels, current_nbins, n_sampled_rows, this->dinfo.NLocalrows, colselector, + if (this->split_criterion == CRITERION::MSE) { + best_split_all_cols_regressor<T, SquareFunctor>(data, rowids, labels, current_nbins, n_sampled_rows, this->dinfo.NLocalrows, colselector, + this->tempmem[0], split_info, ques, gain, this->split_algo); - + } else { + best_split_all_cols_regressor<T, AbsFunctor>(data, rowids, labels, current_nbins, n_sampled_rows, this->dinfo.NLocalrows, colselector, + this->tempmem[0], split_info, ques, gain, this->split_algo); + } } // ---------------- Regression end diff --git a/cuML/src/decisiontree/decisiontree.h b/cuML/src/decisiontree/decisiontree.h index 198559d605..576e45a18c 100644 --- a/cuML/src/decisiontree/decisiontree.h +++ b/cuML/src/decisiontree/decisiontree.h @@ -81,12 +81,16 @@ struct DecisionTreeParams { */ int min_rows_per_node = 2; /** - * Wheather to bootstarp columns with or without replacement + * Whether to bootstrap columns with or without replacement. */ bool bootstrap_features = false; + /** + * Node split criterion. GINI for classification, MSE or MAE for regression.
+ */ + CRITERION split_criterion = CRITERION::MSE; DecisionTreeParams(); - DecisionTreeParams(int cfg_max_depth, int cfg_max_leaves, float cfg_max_features, int cfg_n_bins, int cfg_split_aglo, int cfg_min_rows_per_node, bool cfg_bootstrap_features); + DecisionTreeParams(int cfg_max_depth, int cfg_max_leaves, float cfg_max_features, int cfg_n_bins, int cfg_split_aglo, int cfg_min_rows_per_node, bool cfg_bootstrap_features, CRITERION cfg_split_criterion); void validity_check() const; void print() const; }; @@ -111,6 +115,7 @@ class dt { double construct_time; int min_rows_per_node; bool bootstrap_features; + CRITERION split_criterion; std::vector feature_selector; void print_node(const std::string& prefix, const TreeNode* const node, bool isLeft) const; @@ -160,7 +165,7 @@ class DecisionTreeRegressor : public dt { private: // Same as above fit, but planting is better for a tree then fitting. void plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels = 1, - int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2, bool cfg_bootstrap_features=false); + int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2, bool cfg_bootstrap_features=false, CRITERION cfg_split_criterion=CRITERION::MSE); TreeNode * grow_tree(T *data, const float colper, T *labels, int depth, unsigned int* rowids, const int n_sampled_rows, MetricInfo prev_split_info); diff --git a/cuML/src/decisiontree/kernels/evaluate_regressor.cuh b/cuML/src/decisiontree/kernels/evaluate_regressor.cuh index 7d1ac05fe5..d796c68d59 100644 --- a/cuML/src/decisiontree/kernels/evaluate_regressor.cuh +++ b/cuML/src/decisiontree/kernels/evaluate_regressor.cuh @@ -23,7 +23,7 @@ #include #include "../algo_helper.h" -template +template __global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const T* __restrict__ globalminmax, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T pred_parent) { int tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -62,11 +62,11 @@ __global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const if (localdata <= quesval) { T temp = shmem_pred[coloffset +j] / shmem_count[coloffset + j] ; temp = label - temp; - atomicAdd(&shmem_mse[j + coloffset], temp*temp); + atomicAdd(&shmem_mse[j + coloffset], F::exec(temp)); } else { T temp = ( pred_parent*nrows - shmem_pred[coloffset +j] ) / (nrows - shmem_count[coloffset + j] ); temp = label - temp; - atomicAdd(&shmem_mse[j + coloffset + ncols*nbins], temp*temp); + atomicAdd(&shmem_mse[j + coloffset + ncols*nbins], F::exec(temp)); } } @@ -174,7 +174,7 @@ __global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restri } } -template +template __global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T* __restrict__ quantile, const T pred_parent) { int tid = 
threadIdx.x + blockIdx.x * blockDim.x; @@ -206,11 +206,11 @@ __global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ dat if (localdata <= quesval) { T temp = shmem_pred[coloffset +j] / shmem_count[coloffset + j] ; temp = label - temp; - atomicAdd(&shmem_mse[j + coloffset], temp*temp); + atomicAdd(&shmem_mse[j + coloffset], F::exec(temp)); } else { T temp = ( pred_parent*nrows - shmem_pred[coloffset +j] ) / (nrows - shmem_count[coloffset + j] ); temp = label - temp; - atomicAdd(&shmem_mse[j + coloffset + ncols*nbins], temp*temp); + atomicAdd(&shmem_mse[j + coloffset + ncols*nbins], F::exec(temp)); } } @@ -288,7 +288,7 @@ void find_best_split_regressor(const std::shared_ptr> tempm } -template +template void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, const T *labels, const int nbins, const int nrows, const int rowoffset, const std::vector& colselector, const std::shared_ptr> tempmem, MetricInfo split_info[3], MetricQuestion & ques, float & gain, const int split_algo) { int* d_colids = tempmem->d_colids->data(); @@ -333,10 +333,10 @@ void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, co if (split_algo == ML::SPLIT_ALGO::HIST) { shmemsize += col_minmax_bytes; all_cols_histograms_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_globalminmax, d_predout, d_histout); - compute_mse_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_globalminmax, d_mseout, d_predout, d_histout, split_info[0].predict); + compute_mse_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_globalminmax, d_mseout, d_predout, d_histout, split_info[0].predict); } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { all_cols_histograms_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_predout, d_histout, tempmem->d_quantile->data()); - compute_mse_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_mseout, d_predout, d_histout, tempmem->d_quantile->data(), split_info[0].predict); + compute_mse_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_mseout, d_predout, d_histout, tempmem->d_quantile->data(), split_info[0].predict); } CUDA_CHECK(cudaGetLastError()); diff --git a/cuML/src/decisiontree/kernels/gini.cuh b/cuML/src/decisiontree/kernels/gini.cuh index 4c23323376..435ac2e035 100644 --- a/cuML/src/decisiontree/kernels/gini.cuh +++ b/cuML/src/decisiontree/kernels/gini.cuh @@ -22,6 +22,23 @@ #include "gini_def.h" #include "cuda_utils.h" +struct SquareFunctor { + + template + static __device__ T exec(T x) { + return MLCommon::myPow(x, (T) 2); + } +}; + +struct AbsFunctor { + + template + static __device__ T exec(T x) { + return MLCommon::myAbs(x); + } +}; + + template void MetricQuestion::set_question_fields(int cfg_bootcolumn, int cfg_column, int cfg_batch_id, int cfg_nbins, int cfg_ncols, T cfg_min, T cfg_max, T cfg_value) { bootstrapped_column = cfg_bootcolumn; @@ -74,13 +91,14 @@ __global__ void pred_kernel(const T* __restrict__ labels, const int nrows, T* pr __syncthreads(); - if (threadIdx.x == 0) + if (threadIdx.x == 0) { atomicAdd(predout, shmempred); + } return; } -template +template __global__ void mse_kernel(const T* 
__restrict__ labels, const int nrows, const T* predout, T* mseout) { int tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -94,13 +112,14 @@ __global__ void mse_kernel(const T* __restrict__ labels, const int nrows, const if (tid < nrows) { T label = labels[tid] - (predout[0]/nrows); - atomicAdd(&shmemmse, label*label); + atomicAdd(&shmemmse, F::exec(label)); } __syncthreads(); - if (threadIdx.x == 0) + if (threadIdx.x == 0) { atomicAdd(mseout, shmemmse); + } return; } @@ -130,7 +149,7 @@ void gini(int *labels_in, const int nrows, const std::shared_ptr +template void mse(T *labels_in, const int nrows, const std::shared_ptr> tempmem, MetricInfo & split_info) { T *dpred = tempmem->d_predout->data(); @@ -143,13 +162,13 @@ void mse(T *labels_in, const int nrows, const std::shared_ptrstream>>>(labels_in, nrows, dpred); CUDA_CHECK(cudaGetLastError()); - mse_kernel<<< MLCommon::ceildiv(nrows, 128), 128, 0, tempmem->stream>>>(labels_in, nrows, dpred, dmse); + mse_kernel<<< MLCommon::ceildiv(nrows, 128), 128, 0, tempmem->stream>>>(labels_in, nrows, dpred, dmse); CUDA_CHECK(cudaMemcpyAsync(hmse, dmse, sizeof(T), cudaMemcpyDeviceToHost, tempmem->stream)); CUDA_CHECK(cudaMemcpyAsync(hpred, dpred, sizeof(T), cudaMemcpyDeviceToHost, tempmem->stream)); CUDA_CHECK(cudaStreamSynchronize(tempmem->stream)); - split_info.best_metric = (float)hmse[0] / (float)nrows; //Update gini val + split_info.best_metric = (float)hmse[0] / (float)nrows; //Update split metric value split_info.predict = hpred[0] / (T)nrows; return; } diff --git a/cuML/src/decisiontree/memory.cuh b/cuML/src/decisiontree/memory.cuh index e7cc600734..e39f637251 100644 --- a/cuML/src/decisiontree/memory.cuh +++ b/cuML/src/decisiontree/memory.cuh @@ -110,7 +110,8 @@ struct TemporaryMemory d_predout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, mse_elements); d_colids = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); - totalmem += (n_hist_elements * sizeof(int) + sizeof(int) + 2*sizeof(T) + n_bins * sizeof(T))* Ncols; + // memory of d_histout + d_colids + d_globalminmax + (d_mseout + d_predout) + totalmem += (n_hist_elements * sizeof(int) + sizeof(int) + 2*sizeof(T) + 3 * n_bins * sizeof(T))* Ncols; } diff --git a/cuML/test/rf_test.cu b/cuML/test/rf_test.cu index 682322a026..09966c7296 100644 --- a/cuML/test/rf_test.cu +++ b/cuML/test/rf_test.cu @@ -39,6 +39,7 @@ struct RfInputs { int n_bins; int split_algo; int min_rows_per_node; + CRITERION split_criterion; }; template @@ -53,9 +54,10 @@ protected: void basicTest() { params = ::testing::TestWithParam>::GetParam(); + params.split_criterion = CRITERION::GINI; // override MSE config, one criterion supported for classification DecisionTree::DecisionTreeParams tree_params(params.max_depth, params.max_leaves, params.max_features, params.n_bins, - params.split_algo, params.min_rows_per_node, params.bootstrap_features); + params.split_algo, params.min_rows_per_node, params.bootstrap_features, params.split_criterion); RF_params rf_params(params.bootstrap, params.bootstrap_features, params.n_trees, params.rows_sample, tree_params); //rf_params.print(); @@ -145,7 +147,7 @@ protected: params = ::testing::TestWithParam>::GetParam(); DecisionTree::DecisionTreeParams tree_params(params.max_depth, params.max_leaves, params.max_features, params.n_bins, - params.split_algo, params.min_rows_per_node, params.bootstrap_features); + params.split_algo, params.min_rows_per_node, params.bootstrap_features, params.split_criterion); RF_params rf_params(params.bootstrap, 
params.bootstrap_features, params.n_trees, params.rows_sample, tree_params); //rf_params.print(); @@ -225,19 +227,19 @@ protected: //------------------------------------------------------------------------------------------------------------------------------------- const std::vector > inputsf2 = { - {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2}, // single tree forest, bootstrap false, unlimited depth, 4 bins - {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2}, // single tree forest, bootstrap false, depth of 8, 4 bins - {4, 2, 10, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2}, //forest with 10 trees, all trees should produce identical predictions (no bootstrapping or column subsampling) - {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::HIST, 2}, //forest with 10 trees, with bootstrap and column subsampling enabled, 3 bins - {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE, 2} //forest with 10 trees, with bootstrap and column subsampling enabled, 3 bins, different split algorithm + {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, // single tree forest, bootstrap false, unlimited depth, 4 bins + {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, // single tree forest, bootstrap false, depth of 8, 4 bins + {4, 2, 10, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, //forest with 10 trees, all trees should produce identical predictions (no bootstrapping or column subsampling) + {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, //forest with 10 trees, with bootstrap and column subsampling enabled, 3 bins + {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE, 2, CRITERION::MSE} //forest with 10 trees, with bootstrap and column subsampling enabled, 3 bins, different split algorithm }; const std::vector > inputsd2 = { // Same as inputsf2 - {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2}, - {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2}, - {4, 2, 10, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2}, - {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::HIST, 2}, - {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE, 2} + {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, + {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, + {4, 2, 10, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, + {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, + {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE, 2, CRITERION::MSE} }; From d8176b721667fc013e5307021c5350681c7d6446 Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Mon, 13 May 2019 01:52:51 -0700 Subject: [PATCH 12/51] Fixed split_criterion config in rf_test. 
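The fix boils down to constructing the test's DecisionTreeParams with an explicitly valid criterion per learner, as the diff below does. A hedged configuration sketch following the constructor signature introduced in PATCH 11 (argument values are illustrative only, not verbatim test code):

    // max_depth, max_leaves, max_features, n_bins, split_algo, min_rows_per_node, bootstrap_features, split_criterion
    DecisionTree::DecisionTreeParams clf_params(8, -1, 1.0f, 4, ML::SPLIT_ALGO::HIST, 2, false, ML::CRITERION::GINI); // classification
    DecisionTree::DecisionTreeParams reg_params(8, -1, 1.0f, 4, ML::SPLIT_ALGO::HIST, 2, false, ML::CRITERION::MSE);  // regression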
--- cuML/src/decisiontree/decisiontree.cu | 2 ++ cuML/test/rf_test.cu | 5 ++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cuML/src/decisiontree/decisiontree.cu b/cuML/src/decisiontree/decisiontree.cu index 44899ce553..78bef43824 100644 --- a/cuML/src/decisiontree/decisiontree.cu +++ b/cuML/src/decisiontree/decisiontree.cu @@ -85,6 +85,7 @@ void DecisionTreeParams::print() const { std::cout << "n_bins: " << n_bins << std::endl; std::cout << "split_algo: " << split_algo << std::endl; std::cout << "min_rows_per_node: " << min_rows_per_node << std::endl; + std::cout << "split_criterion: " << split_criterion << std::endl; } @@ -376,6 +377,7 @@ void DecisionTreeRegressor::fit(const ML::cumlHandle& handle, T *data, const std::cout << "Resetting n_bins to " << n_sampled_rows << "." << std::endl; tree_params.n_bins = n_sampled_rows; } + ASSERT(tree_params.split_criterion != CRITERION::GINI, "GINI is an invalid split criterion for regression"); plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, 1, tree_params.max_depth, tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features, tree_params.split_criterion); } diff --git a/cuML/test/rf_test.cu b/cuML/test/rf_test.cu index 09966c7296..726a96383e 100644 --- a/cuML/test/rf_test.cu +++ b/cuML/test/rf_test.cu @@ -54,10 +54,9 @@ protected: void basicTest() { params = ::testing::TestWithParam>::GetParam(); - params.split_criterion = CRITERION::GINI; // override MSE config, one criterion supported for classification DecisionTree::DecisionTreeParams tree_params(params.max_depth, params.max_leaves, params.max_features, params.n_bins, - params.split_algo, params.min_rows_per_node, params.bootstrap_features, params.split_criterion); + params.split_algo, params.min_rows_per_node, params.bootstrap_features, CRITERION::GINI); RF_params rf_params(params.bootstrap, params.bootstrap_features, params.n_trees, params.rows_sample, tree_params); //rf_params.print(); @@ -286,7 +285,7 @@ TEST_P(RfRegressorTestD, Fit) { } const std::vector > inputsf2_temp = { - {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2}}; // single tree forest, bootstrap false, unlimited depth, 4 bins + {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}}; // single tree forest, bootstrap false, unlimited depth, 4 bins //INSTANTIATE_TEST_CASE_P(RfRegressorTests, RfRegressorTestF, ::testing::ValuesIn(inputsf2)); INSTANTIATE_TEST_CASE_P(RfRegressorTests, RfRegressorTestF, ::testing::ValuesIn(inputsf2_temp)); From 229bd038b691a47e2eca9392065705d1729a942c Mon Sep 17 00:00:00 2001 From: Vishal Mehta Date: Mon, 13 May 2019 14:17:27 +0200 Subject: [PATCH 13/51] Relocating functors, adding inline to device functors, adding entropy for classification --- cuML/src/decisiontree/algo_helper.h | 2 +- cuML/src/decisiontree/decisiontree.cu | 30 +++++++++++--- cuML/src/decisiontree/decisiontree.h | 4 +- .../kernels/evaluate_classifier.cuh | 28 ++++++------- cuML/src/decisiontree/kernels/gini.cuh | 28 ++----------- cuML/src/decisiontree/kernels/gini_def.h | 40 +++++++++++++++++++ 6 files changed, 85 insertions(+), 47 deletions(-) diff --git a/cuML/src/decisiontree/algo_helper.h b/cuML/src/decisiontree/algo_helper.h index eea7cc5e52..88ce399f67 100644 --- a/cuML/src/decisiontree/algo_helper.h +++ b/cuML/src/decisiontree/algo_helper.h @@ -23,7 +23,7 @@ enum SPLIT_ALGO { }; enum CRITERION { - GINI, MSE, MAE, + GINI, ENTROPY,
MSE, MAE, CRITERION_END, }; }; diff --git a/cuML/src/decisiontree/decisiontree.cu b/cuML/src/decisiontree/decisiontree.cu index 78bef43824..68396087cc 100644 --- a/cuML/src/decisiontree/decisiontree.cu +++ b/cuML/src/decisiontree/decisiontree.cu @@ -216,9 +216,12 @@ void DecisionTreeClassifier::fit(const ML::cumlHandle& handle, T *data, const std::cout << "Resetting n_bins to " << n_sampled_rows << "." << std::endl; tree_params.n_bins = n_sampled_rows; } - if (tree_params.split_criterion != CRITERION::GINI) { // Only GINI split criterion supported for classification. + + if (tree_params.split_criterion == CRITERION::CRITERION_END) { // Set default to GINI tree_params.split_criterion = CRITERION::GINI; } + ASSERT( (tree_params.split_criterion == CRITERION::GINI || tree_params.split_criterion == CRITERION::ENTROPY ) , "Decision Tree Classifier split criteria should be Gini or Entropy\n"); + return plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, unique_labels, tree_params.max_depth, tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features); } @@ -342,17 +345,29 @@ void DecisionTreeClassifier::find_best_fruit_all(T *data, int *labels, const // Copy sampled column IDs to device memory CUDA_CHECK(cudaMemcpyAsync(this->tempmem[0]->d_colids->data(), colselector.data(), sizeof(int) * colselector.size(), cudaMemcpyHostToDevice, this->tempmem[0]->stream)); CUDA_CHECK(cudaStreamSynchronize(this->tempmem[0]->stream)); - + int *labelptr = this->tempmem[0]->sampledlabels->data(); get_sampled_labels(labels, labelptr, rowids, n_sampled_rows, this->tempmem[0]->stream); - gini(labelptr, n_sampled_rows, this->tempmem[0], split_info[0], this->n_unique_labels); + + if (this->split_criterion == CRITERION::GINI) { + gini<T, GiniFunctor>(labelptr, n_sampled_rows, this->tempmem[0], split_info[0], this->n_unique_labels); + } else { + gini<T, EntropyFunctor>(labelptr, n_sampled_rows, this->tempmem[0], split_info[0], this->n_unique_labels); + } + //Unregister CUDA_CHECK(cudaHostUnregister(colselector.data())); } int current_nbins = (n_sampled_rows < this->nbins) ? n_sampled_rows : this->nbins; - best_split_all_cols_classifier(data, rowids, labels, current_nbins, n_sampled_rows, this->n_unique_labels, this->dinfo.NLocalrows, colselector, - this->tempmem[0], &split_info[0], ques, gain, this->split_algo); + + if (this->split_criterion == CRITERION::GINI) { + best_split_all_cols_classifier<T, int, GiniFunctor>(data, rowids, labels, current_nbins, n_sampled_rows, this->n_unique_labels, this->dinfo.NLocalrows, colselector, + this->tempmem[0], &split_info[0], ques, gain, this->split_algo); + } else { + best_split_all_cols_classifier<T, int, EntropyFunctor>(data, rowids, labels, current_nbins, n_sampled_rows, this->n_unique_labels, this->dinfo.NLocalrows, colselector, + this->tempmem[0], &split_info[0], ques, gain, this->split_algo); + } } /** @@ -377,7 +392,10 @@ void DecisionTreeRegressor::fit(const ML::cumlHandle& handle, T *data, const std::cout << "Resetting n_bins to " << n_sampled_rows << "."
<< std::endl; tree_params.n_bins = n_sampled_rows; } - ASSERT(tree_params.split_criterion != CRITERION::GINI, "GINI is an invalid split criterion for regression"); + if (tree_params.split_criterion == CRITERION::CRITERION_END) { // Set default to MSE + tree_params.split_criterion = CRITERION::MSE; + } + ASSERT( (tree_params.split_criterion == CRITERION::MSE || tree_params.split_criterion == CRITERION::MAE ) , " Decision Tree Regressor split criteria, should be MSE or MAE\n"); plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, 1, tree_params.max_depth, tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features, tree_params.split_criterion); } diff --git a/cuML/src/decisiontree/decisiontree.h b/cuML/src/decisiontree/decisiontree.h index 576e45a18c..3c09d58dfe 100644 --- a/cuML/src/decisiontree/decisiontree.h +++ b/cuML/src/decisiontree/decisiontree.h @@ -85,9 +85,9 @@ struct DecisionTreeParams { */ bool bootstrap_features = false; /** - * Node split criterion. GINI for classification, MSE or MAE for regression. + * Node split criterion. GINI and Entropy for classification, MSE or MAE for regression. */ - CRITERION split_criterion = CRITERION::MSE; + CRITERION split_criterion = CRITERION_END; DecisionTreeParams(); DecisionTreeParams(int cfg_max_depth, int cfg_max_leaves, float cfg_max_features, int cfg_n_bins, int cfg_split_aglo, int cfg_min_rows_per_node, bool cfg_bootstrap_features, CRITERION cfg_split_criterion); void validity_check() const; void print() const; }; @@ -111,6 +115,7 @@ class dt { double construct_time; int min_rows_per_node; bool bootstrap_features; + CRITERION split_criterion; std::vector feature_selector; void print_node(const std::string& prefix, const TreeNode* const node, bool isLeft) const; @@ -160,7 +165,7 @@ class DecisionTreeRegressor : public dt { private: // Same as above fit, but planting is better for a tree than fitting. void plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels = 1, - int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2, bool cfg_bootstrap_features=false); + int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2, bool cfg_bootstrap_features=false, CRITERION cfg_split_criterion=CRITERION::MSE); TreeNode * grow_tree(T *data, const float colper, T *labels, int depth, unsigned int* rowids, const int n_sampled_rows, MetricInfo prev_split_info); diff --git a/cuML/src/decisiontree/kernels/evaluate_classifier.cuh b/cuML/src/decisiontree/kernels/evaluate_classifier.cuh index f9b3d129fd..f8132bc8c2 100644 --- a/cuML/src/decisiontree/kernels/evaluate_classifier.cuh +++ b/cuML/src/decisiontree/kernels/evaluate_classifier.cuh @@ -109,7 +109,7 @@ __global__ void all_cols_histograms_global_quantile_kernel_class(const T* __rest } } -template +template void find_best_split_classifier(const std::shared_ptr> tempmem, const int nbins, const int n_unique_labels, const std::vector& col_selector, MetricInfo split_info[3], const int nrows, MetricQuestion & ques, float & gain, const int split_algo) { gain = 0.0f; @@ -125,31 +125,31 @@ void find_best_split_classifier(const std::shared_ptr> tem // if tmp_lnrows or tmp_rnrows is 0, the corresponding gini will be 1 but that doesn't // matter as it won't count in the info_gain computation. - float tmp_gini_left = 1.0f; - float tmp_gini_right = 1.0f; int tmp_lnrows = 0; - + //separate loop for now to avoid overflow. for (int j = 0; j < n_unique_labels; j++) { int hist_index = i * n_unique_labels + j; tmp_lnrows += tempmem->h_histout->data()[col_hist_base_index + hist_index]; } int tmp_rnrows = nrows - tmp_lnrows; - + if (tmp_lnrows == 0 || tmp_rnrows == 0) continue; + std::vector tmp_histleft(n_unique_labels); + std::vector tmp_histright(n_unique_labels); + // Compute gini right and gini left value for each bin.
 		for (int j = 0; j < n_unique_labels; j++) {
 			int hist_index = i * n_unique_labels + j;
-
-			float prob_left = (float) (tempmem->h_histout->data()[col_hist_base_index + hist_index]) / tmp_lnrows;
-			tmp_gini_left -= prob_left * prob_left;
-
-			float prob_right = (float) (split_info[0].hist[j] - tempmem->h_histout->data()[col_hist_base_index + hist_index]) / tmp_rnrows;
-			tmp_gini_right -= prob_right * prob_right;
+			tmp_histleft[j] = tempmem->h_histout->data()[col_hist_base_index + hist_index];
+			tmp_histright[j] = split_info[0].hist[j] - tmp_histleft[j];
 		}
 
+		float tmp_gini_left = F::exec(tmp_histleft, tmp_lnrows);
+		float tmp_gini_right = F::exec(tmp_histright, tmp_rnrows);
+
 		ASSERT((tmp_gini_left >= 0.0f) && (tmp_gini_left <= 1.0f), "gini left value %f not in [0.0, 1.0]", tmp_gini_left);
 		ASSERT((tmp_gini_right >= 0.0f) && (tmp_gini_right <= 1.0f), "gini right value %f not in [0.0, 1.0]", tmp_gini_right);
@@ -192,7 +192,7 @@ void find_best_split_classifier(const std::shared_ptr<TemporaryMemory<T>> tem
 }
 
-template<typename T, typename L>
+template<typename T, typename L, typename F>
 void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, const L *labels, const int nbins, const int nrows, const int n_unique_labels, const int rowoffset, const std::vector<int>& colselector, const std::shared_ptr<TemporaryMemory<T>> tempmem, MetricInfo<int> split_info[3], MetricQuestion<T> & ques, float & gain, const int split_algo)
 {
 	int* d_colids = tempmem->d_colids->data();
@@ -236,8 +236,8 @@ void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, c
 	CUDA_CHECK(cudaMemcpyAsync(h_histout, d_histout, n_hist_bytes, cudaMemcpyDeviceToHost, tempmem->stream));
 	CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
-
-	find_best_split_classifier(tempmem, nbins, n_unique_labels, colselector, &split_info[0], nrows, ques, gain, split_algo);
+
+	find_best_split_classifier<T, F>(tempmem, nbins, n_unique_labels, colselector, &split_info[0], nrows, ques, gain, split_algo);
 	return;
 }
diff --git a/cuML/src/decisiontree/kernels/gini.cuh b/cuML/src/decisiontree/kernels/gini.cuh
index 435ac2e035..75c7a24311 100644
--- a/cuML/src/decisiontree/kernels/gini.cuh
+++ b/cuML/src/decisiontree/kernels/gini.cuh
@@ -22,23 +22,6 @@
 #include "gini_def.h"
 #include "cuda_utils.h"
 
-struct SquareFunctor {
-
-	template<typename T>
-	static __device__ T exec(T x) {
-		return MLCommon::myPow(x, (T) 2);
-	}
-};
-
-struct AbsFunctor {
-
-	template<typename T>
-	static __device__ T exec(T x) {
-		return MLCommon::myAbs(x);
-	}
-};
-
-
 template<typename T>
 void MetricQuestion<T>::set_question_fields(int cfg_bootcolumn, int cfg_column, int cfg_batch_id, int cfg_nbins, int cfg_ncols, T cfg_min, T cfg_max, T cfg_value) {
 	bootstrapped_column = cfg_bootcolumn;
@@ -124,12 +107,11 @@ __global__ void mse_kernel(const T* __restrict__ labels, const int nrows, const
 	return;
 }
 
-template<typename T>
+template<typename T, typename F>
 void gini(int *labels_in, const int nrows, const std::shared_ptr<TemporaryMemory<T>> tempmem, MetricInfo<int> & split_info, int & unique_labels)
 {
 	int *dhist = tempmem->d_hist->data();
 	int *hhist = tempmem->h_hist->data();
-	float gval = 1.0;
 
 	CUDA_CHECK(cudaMemsetAsync(dhist, 0, sizeof(int)*unique_labels, tempmem->stream));
 	gini_kernel<<< MLCommon::ceildiv(nrows, 128), 128, sizeof(int)*unique_labels, tempmem->stream>>>(labels_in, nrows, unique_labels, dhist);
@@ -139,12 +121,10 @@ void gini(int *labels_in, const int nrows, const std::shared_ptr
diff --git a/cuML/src/decisiontree/kernels/gini_def.h b/cuML/src/decisiontree/kernels/gini_def.h
--- a/cuML/src/decisiontree/kernels/gini_def.h
+++ b/cuML/src/decisiontree/kernels/gini_def.h
+#include "cuda_utils.h"
+#include <math.h>
 
 template<typename T>
 struct MetricQuestion {
@@ -51,3 +53,41 @@ struct MetricInfo {
 	T predict = 0;
 	std::vector<int> hist; //Element hist[i] stores # labels with label i for a given node, for classification.
 };
+
+struct SquareFunctor {
+
+	template<typename T>
+	static __device__ __forceinline__ T exec(T x) {
+		return MLCommon::myPow(x, (T) 2);
+	}
+};
+
+struct AbsFunctor {
+
+	template<typename T>
+	static __device__ __forceinline__ T exec(T x) {
+		return MLCommon::myAbs(x);
+	}
+};
+
+struct GiniFunctor {
+	static float exec(std::vector<int>& hist, int nrows) {
+		float gval = 1.0f;
+		for (int i = 0; i < hist.size(); i++) {
+			float prob = ((float)hist[i]) / nrows;
+			gval -= prob * prob;
+		}
+		return gval;
+	}
+};
+
+struct EntropyFunctor {
+	static float exec(std::vector<int>& hist, int nrows) {
+		float eval = 0.0f;
+		for (int i = 0; i < hist.size(); i++) {
+			float prob = ((float)hist[i]) / nrows;
+			if (prob > 0.0f) { // guard: logf(0) is -inf and would poison eval with NaN
+				eval += prob * logf(prob);
+			}
+		}
+		return (-1 * eval);
+	}
+};

From f2a8336b731079e28d3a514fb23128f7a6ffcc82 Mon Sep 17 00:00:00 2001
From: Myrto Papadopoulou
Date: Mon, 13 May 2019 09:09:53 -0700
Subject: [PATCH 14/51] Removed useless mem alloc + added tmp testing script

- Removed a useless memory allocation from memory.cuh.
- Added a temporary testing folder in randomforest w/ a testing script for
  both regression and classification. This directory will be removed prior
  to any merge request.
---
 cuML/src/decisiontree/decisiontree.cu     |   2 +-
 cuML/src/decisiontree/kernels/gini.cuh    |   1 +
 cuML/src/decisiontree/memory.cuh          |   9 +-
 .../tmp_testing_dir/launch_rf_testing     |  42 +++
 .../tmp_testing_dir/rf_testing.cu         | 338 ++++++++++++++++++
 5 files changed, 384 insertions(+), 8 deletions(-)
 create mode 100755 cuML/src/randomforest/tmp_testing_dir/launch_rf_testing
 create mode 100644 cuML/src/randomforest/tmp_testing_dir/rf_testing.cu

diff --git a/cuML/src/decisiontree/decisiontree.cu b/cuML/src/decisiontree/decisiontree.cu
index 68396087cc..5b806212c1 100644
--- a/cuML/src/decisiontree/decisiontree.cu
+++ b/cuML/src/decisiontree/decisiontree.cu
@@ -395,7 +395,7 @@ void DecisionTreeRegressor::fit(const ML::cumlHandle& handle, T *data, const
 	if (tree_params.split_criterion == CRITERION::CRITERION_END) { // Set default to MSE
 		tree_params.split_criterion = CRITERION::MSE;
 	}
-	ASSERT( (tree_params.split_criterion == CRITERION::MSE || tree_params.split_criterion == CRITERION::MAE ) , " Decision Tree Regressor split criteria, should be MSE or MAE\n");
+	ASSERT( (tree_params.split_criterion == CRITERION::MSE || tree_params.split_criterion == CRITERION::MAE ) , "Decision Tree Regressor split criteria should be MSE or MAE\n");
 	plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, 1, tree_params.max_depth, tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features, tree_params.split_criterion);
 }
diff --git a/cuML/src/decisiontree/kernels/gini.cuh b/cuML/src/decisiontree/kernels/gini.cuh
index 75c7a24311..bac8dd1336 100644
--- a/cuML/src/decisiontree/kernels/gini.cuh
+++ b/cuML/src/decisiontree/kernels/gini.cuh
@@ -143,6 +143,7 @@ void mse(T *labels_in, const int nrows, const std::shared_ptr<TemporaryMemory<T
 	pred_kernel<<< MLCommon::ceildiv(nrows, 128), 128, 0, tempmem->stream>>>(labels_in, nrows, dpred);
 	CUDA_CHECK(cudaGetLastError());
 	mse_kernel<<< MLCommon::ceildiv(nrows, 128), 128, 0, tempmem->stream>>>(labels_in, nrows, dpred, dmse);
+	CUDA_CHECK(cudaGetLastError());
 	CUDA_CHECK(cudaMemcpyAsync(hmse, dmse, sizeof(T), cudaMemcpyDeviceToHost, tempmem->stream));
 	CUDA_CHECK(cudaMemcpyAsync(hpred, dpred, sizeof(T), cudaMemcpyDeviceToHost, tempmem->stream));
diff --git a/cuML/src/decisiontree/memory.cuh b/cuML/src/decisiontree/memory.cuh
index e39f637251..e81d455ca9 100644
--- a/cuML/src/decisiontree/memory.cuh
+++ b/cuML/src/decisiontree/memory.cuh
@@ -56,7 +56,6 @@ struct TemporaryMemory
 
 	//For quantiles
 	MLCommon::device_buffer<T> *d_quantile = nullptr;
-	MLCommon::device_buffer<T> *d_temp_sampledcolumn = nullptr;
 
 	const ML::cumlHandle_impl& ml_handle;
 
@@ -80,8 +79,7 @@ struct TemporaryMemory
 
 		if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) {
 			d_quantile = new MLCommon::device_buffer<T>(handle.getDeviceAllocator(), stream, n_bins * quantile_elements);
-			d_temp_sampledcolumn = new MLCommon::device_buffer<T>(handle.getDeviceAllocator(), stream, N * extra_elements);
-			totalmem += (n_bins + N) * extra_elements * sizeof(T);
+			totalmem += n_bins * extra_elements * sizeof(T);
 		}
 
 		sampledlabels = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, N);
@@ -113,6 +111,7 @@ struct TemporaryMemory
 
 		// memory of d_histout + d_colids + d_globalminmax + (d_mseout + d_predout)
 		totalmem += (n_hist_elements * sizeof(int) + sizeof(int) + 2*sizeof(T) + 3 * n_bins * sizeof(T))* Ncols;
+		this->print_info();
 	}
 
 	void print_info()
@@ -137,10 +136,6 @@ struct TemporaryMemory
 		d_quantile->release(stream);
 		delete d_quantile;
 	}
-	if (d_temp_sampledcolumn != nullptr) {
-		d_temp_sampledcolumn->release(stream);
-		delete d_temp_sampledcolumn;
-	}
 
 	sampledlabels->release(stream);
 	d_split_temp_storage->release(stream);
diff --git a/cuML/src/randomforest/tmp_testing_dir/launch_rf_testing b/cuML/src/randomforest/tmp_testing_dir/launch_rf_testing
new file mode 100755
index 0000000000..c462fb3f61
--- /dev/null
+++ b/cuML/src/randomforest/tmp_testing_dir/launch_rf_testing
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+#FIXME Remove this folder before merge
+
+RF_BASE_DIR="/gpfs/fs1/myrtop/rapids_repos/regression_random_forest/cuml"
+#Compile
+nvcc -g -std=c++11 -arch=sm_70 rf_testing.cu ../../decisiontree/decisiontree.cu ../../common/cuml_api.cpp ../../common/cumlHandle.cpp -I$RF_BASE_DIR/ml-prims/src/ -I$RF_BASE_DIR/cuML/external/ml-prims/external/cub -I$RF_BASE_DIR/cuML/src -I$RF_BASE_DIR/thirdparty/cuml/ml-prims/src/ -lcublas -lcudart -lcusolver -lcusparse --expt-extended-lambda -o rf_testing
+
+if [ ! -e tmp_rf_testing ]; then
+	echo "Error in building tmp_rf_testing"
+	exit
+fi
+
+# --------- Year Dataset ------------ #
+dataset="year"
+cols=90 #Note: value doesn't matter; it is overridden in rf_testing
+rows=515345
+
+# --------- Airline Dataset ------------ #
+#dataset="airline_regression"
+#dataset="airline"
+#cols=13 #Note: value doesn't matter; it is overridden in rf_testing
+#rows=115069017
+
+trees=1
+col_per=1.0
+row_per=1.0
+train_ratio=0.8
+
+max_depth=8
+#max_depth=-1
+max_leaves=-1
+bootstrap=false
+test_is_train=false
+#test_is_train=true
+n_bins=8
+split_algo=1 #0 means hist, 1 means global quantiles
+split_criterion=4 #0 is GINI, 1 is ENTROPY, 2 is MSE, 3 is MAE, 4 enforces default (GINI or MSE respectively).
+
+echo "./rf_testing $rows $cols $trees $col_per $row_per $train_ratio $max_depth $max_leaves $bootstrap $test_is_train $n_bins $dataset $split_algo $split_criterion"
+./rf_testing $rows $cols $trees $col_per $row_per $train_ratio $max_depth $max_leaves $bootstrap $test_is_train $n_bins $dataset $split_algo $split_criterion
+
diff --git a/cuML/src/randomforest/tmp_testing_dir/rf_testing.cu b/cuML/src/randomforest/tmp_testing_dir/rf_testing.cu
new file mode 100644
index 0000000000..6dc2173d59
--- /dev/null
+++ b/cuML/src/randomforest/tmp_testing_dir/rf_testing.cu
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "../randomforest.cu"
+#include "../../../../thirdparty/cuml/ml-prims/src/utils.h"
+#include "ml_utils.h"
+#include "cuda_utils.h"
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <map>
+
+//Modified version of TIMEIT_LOOP from test_utils.h
+#define TIMEIT_LOOP(ms, count, func)                        \
+	do {                                                \
+		cudaEvent_t start, stop;                    \
+		CUDA_CHECK(cudaEventCreate(&start));        \
+		CUDA_CHECK(cudaEventCreate(&stop));         \
+		CUDA_CHECK(cudaEventRecord(start));         \
+		for (int i = 0; i < count; ++i) {           \
+			func;                               \
+		}                                           \
+		CUDA_CHECK(cudaEventRecord(stop));          \
+		CUDA_CHECK(cudaEventSynchronize(stop));     \
+		ms = 0.f;                                   \
+		CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop)); \
+		ms /= count;                                \
+	} while (0)
+
+using namespace MLCommon;
+using namespace std;
+
+template<typename T, typename L>
+void parse_csv(string dataset_name, int n_cols, std::vector<T> & data, std::vector<L> & labels, int train_cnt,
+	       std::vector<T> & test_data, std::vector<L> & test_labels, int test_cnt, bool test_is_train) {
+
+	string data_file;
+	int col_offset = 0;
+	int label_id = 0; // column that is the label (i.e., target feature)
+	if (dataset_name == "higgs") {
+		data_file = "/gpfs/fs1/myrtop/rapids_repos/HIGGS.csv";
+		col_offset = 1; //because the first column in higgs is the label
+	} else if (dataset_name == "year") {
+		data_file = "/gpfs/fs1/myrtop/rapids_repos/year.csv";
+		col_offset = 1; //because the first column in year is the label
+		label_id = 0;
+	} else if ((dataset_name == "airline_regression") || (dataset_name == "airline")) {
+		data_file = "/gpfs/fs1/myrtop/rapids_repos/airline_14col.data_modified";
+		label_id = n_cols;
+	}
+
+	cout << "train_cnt " << train_cnt << " test_cnt " << test_cnt << endl;
+	ifstream myfile;
+	myfile.open(data_file);
+	string line;
+
+	int counter = 0;
+	data.resize(train_cnt * n_cols);
+	labels.resize(train_cnt);
+
+	test_data.resize(test_cnt * n_cols);
+	test_labels.resize(test_cnt);
+
+	int break_cnt = (test_is_train) ? train_cnt : train_cnt + test_cnt;
+
+	while (getline(myfile, line) && (counter < break_cnt)) {
+		stringstream str(line);
+		vector<float> row;
+		float i;
+		while (str >> i) {
+			row.push_back(i);
+			if (str.peek() == ',')
+				str.ignore();
+		}
+		for (int col = 0; col < n_cols; col++) {
+			if (counter < train_cnt) {
+				data[counter + col * train_cnt] = row[col + col_offset]; //train data should be col major
+				if (test_is_train)
+					test_data[counter*n_cols + col] = row[col + col_offset]; // test data should be row major
+			} else if (!test_is_train)
+				test_data[(counter - train_cnt)*n_cols + col] = row[col + col_offset]; // test data should be row major
+		}
+
+		if (counter < train_cnt) {
+			labels[counter] = (dataset_name == "airline") ? (int) (row[label_id] > 0) : row[label_id];
+			if (test_is_train) test_labels[counter] = labels[counter];
+		} else if (!test_is_train) {
+			test_labels[counter - train_cnt] = (dataset_name == "airline") ? (int) (row[label_id] > 0) : row[label_id];
+		}
+		counter++;
+	}
+	cout << "Lines processed " << counter << endl;
+	myfile.close();
+}
+
+struct RF_inputs {
+	int n_rows, n_cols, n_inference_rows;
+	int n_trees, max_depth, max_leaves, n_bins, split_algo;
+	float max_features, rows_sample, train_ratio;
+	bool bootstrap, test_is_train, bootstrap_features;
+	string dataset;
+	int min_rows_per_node;
+	ML::CRITERION split_criterion;
+
+	RF_inputs(int cfg_n_rows, int cfg_n_cols, int cfg_n_trees, float cfg_max_features,
+		  float cfg_rows_sample, float cfg_train_ratio, int cfg_max_depth,
+		  int cfg_max_leaves, bool cfg_bootstrap, bool cfg_test_is_train, int cfg_n_bins,
+		  string cfg_dataset, int cfg_split_algo, int cfg_min_rows_per_node, bool cfg_bootstrap_features, ML::CRITERION cfg_split_criterion) {
+
+		train_ratio = cfg_train_ratio;
+		test_is_train = cfg_test_is_train;
+		dataset = cfg_dataset;
+
+		n_cols = cfg_n_cols; // Will be overwritten based on dataset
+		n_trees = cfg_n_trees;
+		max_features = cfg_max_features;
+		rows_sample = cfg_rows_sample;
+		max_depth = cfg_max_depth;
+		max_leaves = cfg_max_leaves;
+		bootstrap = cfg_bootstrap;
+		if (dataset == "year") {
+			// Year has hard requirements on train/test examples.
+			// The dataset has 515K lines (515345 to be exact). Train: first 463,715 examples; test: last 51,630 examples.
+			n_rows = 463715; //hard-coded - train_ratio plays no role
+			n_inference_rows = test_is_train ? n_rows : 51630; //hard-coded
+		} else {
+			n_rows = test_is_train ? cfg_n_rows : train_ratio * cfg_n_rows;
+			n_inference_rows = test_is_train ? cfg_n_rows : (1.0f - train_ratio) * cfg_n_rows;
+		}
+		n_bins = cfg_n_bins;
+		split_algo = cfg_split_algo;
+		min_rows_per_node = cfg_min_rows_per_node;
+		bootstrap_features = cfg_bootstrap_features;
+		split_criterion = cfg_split_criterion;
+
+		if (dataset == "higgs") {
+			n_cols = 28;
+		} else if ((dataset == "airline") || (dataset == "airline_regression")) {
+			n_cols = 13;
+		} else if (dataset == "year") {
+			n_cols = 90;
+		} else {
+			cerr << "Invalid dataset " << dataset << endl;
+			exit(1);
+		}
+
+		ASSERT((split_algo >= 0) && (split_algo < 3), "Unsupported split_algo %d option. Not in [0, 2].", split_algo);
+
+		cout << "Dataset " << dataset << ", train ratio " << train_ratio << " test_is_train " << test_is_train << ", n_rows " << n_rows << ", n_cols " << n_cols << " n_trees " << n_trees << " col_per " << max_features << " row_per " << rows_sample << " max_depth " << max_depth << " max_leaves " << max_leaves << " bootstrap " << bootstrap << " n_inference_rows " << n_inference_rows << " n_bins " << n_bins << " split_algo " << split_algo << endl;
+	}
+
+};
+
+template<typename T>
+void solve_classification_problem(RF_inputs & params);
+
+template<typename T>
+void solve_regression_problem(RF_inputs & params);
+
+int main(int argc, char **argv) {
+
+	/* Command line args:
+	   - # rows
+	   - # cols (fixed per dataset)
+	   - # trees
+	   - col_per
+	   - row_per
+	   - train_ratio (e.g., 0.8 will use 80% of rows for training and 20% for testing)
+	   - max_depth
+	   - max_leaves
+	   - bootstrap
+	   - test_is_train (otherwise 80% of rows is used for training and 20% for testing)
+	   - n_bins
+	   - dataset name
+	   - split_algo
+	   - split_criterion
+	*/
+
+	const int expected_args_cnt = 15;
+	if (argc != expected_args_cnt) {
+		cout << "Error! " << expected_args_cnt - 1 << " args are needed\n";
" << expected_args_cnt - 1 << " args are needed\n"; + return 0; + } + RF_inputs params(stoi(argv[1]), stoi(argv[2]), stoi(argv[3]), stof(argv[4]), stof(argv[5]), stof(argv[6]), stoi(argv[7]), stoi(argv[8]), (strcmp(argv[9], "true") == 0), (strcmp(argv[10], "true") == 0), stoi(argv[11]), argv[12], stoi(argv[13]), 2, false, (ML::CRITERION) stoi(argv[14])); + + bool is_regression = (params.dataset == "year") || (params.dataset == "airline_regression"); + if (is_regression) { + std::cout << "Regression problem\n"; + solve_regression_problem(params); + } else { + std::cout << "Classification problem\n"; + solve_classification_problem(params); + } + return 0; +} + +template +void solve_classification_problem(RF_inputs & params) { + T * input_data; + int * input_labels; + + std::map labels_map; //unique map of labels to int vals starting from 0 + + int input_data_len = params.n_rows * params.n_cols; + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream) ); + allocate(input_data, input_data_len); + allocate(input_labels, params.n_rows); + + std::vector h_input_data, inference_data; + std::vector h_input_labels, inference_labels; + + // Populate labels and data + parse_csv(params.dataset, params.n_cols, h_input_data, h_input_labels, params.n_rows, inference_data, inference_labels, params.n_inference_rows, params.test_is_train); //last arg makes test same as training + + //Preprocess labels + ML::preprocess_labels(params.n_rows, h_input_labels, labels_map); + int n_unique_labels = labels_map.size(); + std::cout << "Dataset has " << n_unique_labels << " labels." << std::endl; + + updateDevice(input_data, h_input_data.data(), input_data_len, stream); + updateDevice(input_labels, h_input_labels.data(), params.n_rows, stream); + cout << "Finished populating device labels and data\n"; + + // Fit input_dataset + ML::rfClassifier * my_rf; + ML::cumlHandle handle; + handle.setStream(stream); + + + ML::DecisionTree::DecisionTreeParams tree_params(params.max_depth, params.max_leaves, params.max_features, params.n_bins, params.split_algo, params.min_rows_per_node, params.bootstrap_features, params.split_criterion); + ML::RF_params rf_params(params.bootstrap, params.bootstrap_features, params.n_trees, params.rows_sample, tree_params); + + my_rf = new typename ML::rfClassifier::rfClassifier(rf_params); + cout << "Called RF constructor\n"; + + float ms; + TIMEIT_LOOP(ms, 1, fit(handle, my_rf, input_data, params.n_rows, params.n_cols, input_labels, n_unique_labels)); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); + + + my_rf->print_rf_detailed(); + cout << "Planted the random forest in " << ms << " ms, " << ms /1000.0 << " s." 
+
+	std::vector<int> predicted_labels;
+	predicted_labels.resize(params.n_inference_rows);
+
+	ML::postprocess_labels(params.n_rows, h_input_labels, labels_map);
+	ML::preprocess_labels(params.n_inference_rows, inference_labels, labels_map); //use same map as labels
+
+	cout << "Will start testing\n";
+	ML::RF_metrics metrics = cross_validate(handle, my_rf, inference_data.data(), inference_labels.data(), params.n_inference_rows, params.n_cols, predicted_labels.data(), false);
+	metrics.print();
+	ML::postprocess_labels(params.n_inference_rows, inference_labels, labels_map);
+
+	cout << "Free memory\n";
+	CUDA_CHECK(cudaFree(input_data));
+	CUDA_CHECK(cudaFree(input_labels));
+	delete my_rf;
+}
+
+template<typename T>
+void solve_regression_problem(RF_inputs & params) {
+	T * input_data;
+	T * input_labels;
+
+	int input_data_len = params.n_rows * params.n_cols;
+
+	cudaStream_t stream;
+	CUDA_CHECK(cudaStreamCreate(&stream));
+	allocate(input_data, input_data_len);
+	allocate(input_labels, params.n_rows);
+
+	std::vector<T> h_input_data, inference_data;
+	std::vector<T> h_input_labels, inference_labels;
+
+	// Populate labels and data
+	parse_csv(params.dataset, params.n_cols, h_input_data, h_input_labels, params.n_rows, inference_data, inference_labels, params.n_inference_rows, params.test_is_train); //last arg makes test same as training
+
+	updateDevice(input_data, h_input_data.data(), input_data_len, stream);
+	updateDevice(input_labels, h_input_labels.data(), params.n_rows, stream);
+	cout << "Finished populating device labels and data\n";
+
+	// Fit input_dataset
+	ML::rfRegressor<T> * my_rf;
+	ML::cumlHandle handle;
+	handle.setStream(stream);
+
+
+	ML::DecisionTree::DecisionTreeParams tree_params(params.max_depth, params.max_leaves, params.max_features, params.n_bins, params.split_algo, params.min_rows_per_node, params.bootstrap_features, params.split_criterion);
+	ML::RF_params rf_params(params.bootstrap, params.bootstrap_features, params.n_trees, params.rows_sample, tree_params);
+
+	my_rf = new typename ML::rfRegressor<T>::rfRegressor(rf_params);
+	cout << "Called RF constructor\n";
+
+	float ms;
+	TIMEIT_LOOP(ms, 1, fit(handle, my_rf, input_data, params.n_rows, params.n_cols, input_labels));
+
+	CUDA_CHECK(cudaStreamSynchronize(stream));
+	CUDA_CHECK(cudaStreamDestroy(stream));
+
+
+	my_rf->print_rf_detailed();
+	cout << "Planted the random forest in " << ms << " ms, " << ms / 1000.0 << " s." << endl;
+
+	std::vector<T> predicted_labels;
+	predicted_labels.resize(params.n_inference_rows);
+
+	cout << "Will start testing\n";
+	ML::RF_metrics metrics = cross_validate(handle, my_rf, inference_data.data(), inference_labels.data(), params.n_inference_rows, params.n_cols, predicted_labels.data(), false);
+	metrics.print();
+
+	cout << "Free memory\n";
+	CUDA_CHECK(cudaFree(input_data));
+	CUDA_CHECK(cudaFree(input_labels));
+	delete my_rf;
+}
+

From 76d2e7ed8e9514d6338e5df266b3263b40686176 Mon Sep 17 00:00:00 2001
From: Vishal Mehta
Date: Tue, 14 May 2019 15:24:36 +0200
Subject: [PATCH 15/51] Added iota and permute on GPU using thrust and ml-prims

---
 cuML/src/randomforest/randomforest.cu | 17 +++++++++++------
 cuML/src/randomforest/randomforest.h  |  1 +
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/cuML/src/randomforest/randomforest.cu b/cuML/src/randomforest/randomforest.cu
index 40387f05b0..b8b7d3410e 100644
--- a/cuML/src/randomforest/randomforest.cu
+++ b/cuML/src/randomforest/randomforest.cu
@@ -425,16 +425,21 @@ void rfRegressor<T>::fit(const cumlHandle& user_handle, T * input, int n_rows, i
 		// Select n_sampled_rows (with replacement) numbers from [0, n_rows) per tree.
 		// selected_rows: randomly generated IDs for bootstrapped samples (w/ replacement); a device ptr.
 		MLCommon::device_buffer<unsigned int> selected_rows(handle.getDeviceAllocator(), stream, n_sampled_rows);
-
+
 		if (this->rf_params.bootstrap) {
 			MLCommon::Random::Rng r(i * 1000); // Ensure the seed for each tree is different and meaningful.
 			r.uniformInt(selected_rows.data(), n_sampled_rows, (unsigned int) 0, (unsigned int) n_rows, stream);
 		} else {
-			std::vector<unsigned int> h_selected_rows(n_rows);
-			std::iota(h_selected_rows.begin(), h_selected_rows.end(), 0);
-			std::random_shuffle(h_selected_rows.begin(), h_selected_rows.end());
-			h_selected_rows.resize(n_sampled_rows);
-			MLCommon::updateDevice(selected_rows.data(), h_selected_rows.data(), n_sampled_rows, stream);
+			MLCommon::device_buffer<unsigned int> *inkeys = new MLCommon::device_buffer<unsigned int>(handle.getDeviceAllocator(), stream, n_rows);
+			MLCommon::device_buffer<unsigned int> *outkeys = new MLCommon::device_buffer<unsigned int>(handle.getDeviceAllocator(), stream, n_rows);
+			thrust::sequence(thrust::cuda::par.on(stream), inkeys->data(), inkeys->data() + n_rows);
+			int *perms = nullptr;
+			MLCommon::Random::permute(perms, outkeys->data(), inkeys->data(), 1, n_rows, false, stream);
+			CUDA_CHECK(cudaMemcpyAsync(selected_rows.data(), outkeys->data(), n_sampled_rows * sizeof(unsigned int), cudaMemcpyDeviceToDevice, stream));
+			inkeys->release(stream);
+			outkeys->release(stream);
+			delete inkeys;
+			delete outkeys;
 		}
 
 		/* Build individual tree in the forest.
diff --git a/cuML/src/randomforest/randomforest.h b/cuML/src/randomforest/randomforest.h
index b2fcc6feb9..4b341572ce 100644
--- a/cuML/src/randomforest/randomforest.h
+++ b/cuML/src/randomforest/randomforest.h
@@ -19,6 +19,7 @@
 #include
 #include
 #include "random/rng.h"
+#include "random/permute.h"
 #include
 #include
 #include

From 9a5f32c2b163815cbaeae15713c71990f0b97f0f Mon Sep 17 00:00:00 2001
From: Myrto Papadopoulou
Date: Tue, 14 May 2019 07:07:26 -0700
Subject: [PATCH 16/51] Preprocess quantiles in batches.

- Preprocess quantiles in batches of batch_cols if there isn't enough device
  memory to process all ncols at once.
- The number of columns per batch is dynamically determined based on the
  available device memory.
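The batch bookkeeping is the subtle part of this change. The host-side sketch below is illustrative only (plan_quantile_batches and QuantileBatchPlan are hypothetical names, not part of the patch); it mirrors how batch_cols, batch_cnt, last_batch_size, and the per-batch offsets are derived in preprocess_quantile in the diff that follows:

    #include <cstddef>

    // Hypothetical helper mirroring the batching arithmetic in preprocess_quantile.
    struct QuantileBatchPlan {
        int batch_cols;      // columns handled per loop iteration
        int batch_cnt;       // number of iterations over all ncols
        int last_batch_size; // columns left for the final (possibly smaller) batch
    };

    inline QuantileBatchPlan plan_quantile_batches(size_t free_mem, int n_sampled_rows,
                                                   int ncols, size_t elem_size) {
        // Two sort-key buffers (input and output) of n_sampled_rows elements are
        // live per column, hence the divisor 2 * n_sampled_rows * elem_size.
        // The real code ASSERTs that max_ncols != 0 before using it.
        int max_ncols = (int)(free_mem / (2 * (size_t)n_sampled_rows * elem_size));
        QuantileBatchPlan p;
        p.batch_cols = (max_ncols > ncols) ? ncols : max_ncols;
        p.batch_cnt = (ncols + p.batch_cols - 1) / p.batch_cols; // same as MLCommon::ceildiv
        p.last_batch_size = ncols - p.batch_cols * (p.batch_cnt - 1);
        return p;
    }

    // Batch b then reads sort keys at offset b * n_sampled_rows * batch_cols and
    // writes its quantiles at offset b * nbins * batch_cols.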
---
 cuML/src/decisiontree/decisiontree.cu      |  2 +-
 cuML/src/decisiontree/kernels/quantile.cuh | 53 +++++++++++++------
 cuML/src/decisiontree/memory.cuh           |  2 +-
 .../tmp_testing_dir/launch_rf_testing      |  4 +-
 4 files changed, 42 insertions(+), 19 deletions(-)

diff --git a/cuML/src/decisiontree/decisiontree.cu b/cuML/src/decisiontree/decisiontree.cu
index 5b806212c1..de5f63fd3c 100644
--- a/cuML/src/decisiontree/decisiontree.cu
+++ b/cuML/src/decisiontree/decisiontree.cu
@@ -220,7 +220,7 @@ void DecisionTreeClassifier::fit(const ML::cumlHandle& handle, T *data, const
 	if (tree_params.split_criterion == CRITERION::CRITERION_END) { // Set default to GINI
 		tree_params.split_criterion = CRITERION::GINI;
 	}
-	ASSERT( (tree_params.split_criterion == CRITERION::GINI || tree_params.split_criterion == CRITERION::ENTROPY ) , " Decision Tree Classifier split criteria, should be Gini or Entropy\n");
+	ASSERT( (tree_params.split_criterion == CRITERION::GINI || tree_params.split_criterion == CRITERION::ENTROPY ), " Decision Tree Classifier split criteria should be Gini or Entropy\n");
 
 	return plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, unique_labels, tree_params.max_depth, tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features);
diff --git a/cuML/src/decisiontree/kernels/quantile.cuh b/cuML/src/decisiontree/kernels/quantile.cuh
index bded206e4b..27a9c27b74 100644
--- a/cuML/src/decisiontree/kernels/quantile.cuh
+++ b/cuML/src/decisiontree/kernels/quantile.cuh
@@ -42,41 +42,64 @@ __global__ void get_all_quantiles(const T* __restrict__ data, T* quantile, const
 
 template<typename T>
 void preprocess_quantile(const T* data, const unsigned int* rowids, const int n_sampled_rows, const int ncols, const int rowoffset, const int nbins, std::shared_ptr<TemporaryMemory<T>> tempmem) {
 
+	// Dynamically determine batch_cols (number of columns processed per loop iteration) from the available device memory.
+	size_t free_mem, total_mem;
+	CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem));
+	int max_ncols = free_mem / (2 * n_sampled_rows * sizeof(T));
+	int batch_cols = (max_ncols > ncols) ? ncols : max_ncols;
+	ASSERT(max_ncols != 0, "Cannot preprocess quantiles due to insufficient device memory.");
+	//std::cout << "Preprocess quantiles w/ " << batch_cols << " batch columns (default " << ncols << " columns)." << std::endl;
<< std::endl; + int threads = 128; - int num_items = n_sampled_rows * ncols; // number of items to sort across all segments (i.e., cols) - int num_segments = ncols; MLCommon::device_buffer *d_offsets; MLCommon::device_buffer *d_keys_out; T *d_keys_in = tempmem->temp_data->data(); int *colids = nullptr; - d_offsets = new MLCommon::device_buffer(tempmem->ml_handle.getDeviceAllocator(), tempmem->stream, num_segments + 1); - d_keys_out = new MLCommon::device_buffer(tempmem->ml_handle.getDeviceAllocator(), tempmem->stream, num_items); + d_offsets = new MLCommon::device_buffer(tempmem->ml_handle.getDeviceAllocator(), tempmem->stream, batch_cols + 1); int blocks = MLCommon::ceildiv(ncols * n_sampled_rows, threads); - allcolsampler_kernel<<< blocks , threads, 0, tempmem->stream >>>( data, rowids, colids, n_sampled_rows, ncols, rowoffset, d_keys_in); + allcolsampler_kernel<<< blocks , threads, 0, tempmem->stream >>>( data, rowids, colids, n_sampled_rows, ncols, rowoffset, d_keys_in); // d_keys_in already allocated for all ncols CUDA_CHECK(cudaGetLastError()); - blocks = MLCommon::ceildiv(ncols + 1, threads); - set_sorting_offset<<< blocks, threads, 0, tempmem->stream >>>(n_sampled_rows, ncols, d_offsets->data()); + + blocks = MLCommon::ceildiv(batch_cols + 1, threads); + set_sorting_offset<<< blocks, threads, 0, tempmem->stream >>>(n_sampled_rows, batch_cols, d_offsets->data()); CUDA_CHECK(cudaGetLastError()); // Determine temporary device storage requirements MLCommon::device_buffer *d_temp_storage = nullptr; size_t temp_storage_bytes = 0; + + int batch_cnt = MLCommon::ceildiv(ncols, batch_cols); // number of loop iterations + int last_batch_size = ncols - batch_cols * (batch_cnt - 1); // number of columns in last batch + int batch_items = n_sampled_rows * batch_cols; // used to determine d_temp_storage size + + d_keys_out = new MLCommon::device_buffer(tempmem->ml_handle.getDeviceAllocator(), tempmem->stream, batch_items); CUDA_CHECK(cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out->data(), - num_items, num_segments, d_offsets->data(), d_offsets->data() + 1, 0, 8*sizeof(T), tempmem->stream)); + batch_items, batch_cols, d_offsets->data(), d_offsets->data() + 1, 0, 8*sizeof(T), tempmem->stream)); // Allocate temporary storage d_temp_storage = new MLCommon::device_buffer(tempmem->ml_handle.getDeviceAllocator(), tempmem->stream, temp_storage_bytes); - // Run sorting operation - CUDA_CHECK(cub::DeviceSegmentedRadixSort::SortKeys((void *)d_temp_storage->data(), temp_storage_bytes, d_keys_in, d_keys_out->data(), - num_items, num_segments, d_offsets->data(), d_offsets->data() + 1, 0, 8*sizeof(T), tempmem->stream)); + // Compute quantiles for cur_batch_cols columns per loop iteration. + for (int batch = 0; batch < batch_cnt; batch++) { - blocks = MLCommon::ceildiv(ncols * nbins, threads); - get_all_quantiles<<< blocks, threads, 0, tempmem->stream >>>(d_keys_out->data(), tempmem->d_quantile->data(), n_sampled_rows, ncols, nbins); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaStreamSynchronize(tempmem->stream)); + int cur_batch_cols = (batch == batch_cnt - 1) ? 
+
+		int batch_offset = batch * n_sampled_rows * batch_cols;
+		int quantile_offset = batch * nbins * batch_cols;
+
+		// Run sorting operation
+		CUDA_CHECK(cub::DeviceSegmentedRadixSort::SortKeys((void *)d_temp_storage->data(), temp_storage_bytes, &d_keys_in[batch_offset], d_keys_out->data(),
+								n_sampled_rows * batch_cols, cur_batch_cols, d_offsets->data(), d_offsets->data() + 1, 0, 8*sizeof(T), tempmem->stream));
+
+		blocks = MLCommon::ceildiv(cur_batch_cols * nbins, threads);
+		get_all_quantiles<<< blocks, threads, 0, tempmem->stream >>>(d_keys_out->data(), &tempmem->d_quantile->data()[quantile_offset], n_sampled_rows, cur_batch_cols, nbins);
+
+		CUDA_CHECK(cudaGetLastError());
+		CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
+	}
 
 	d_keys_out->release(tempmem->stream);
 	d_offsets->release(tempmem->stream);
diff --git a/cuML/src/decisiontree/memory.cuh b/cuML/src/decisiontree/memory.cuh
index e81d455ca9..c9744079ab 100644
--- a/cuML/src/decisiontree/memory.cuh
+++ b/cuML/src/decisiontree/memory.cuh
@@ -111,7 +111,7 @@ struct TemporaryMemory
 
 		// memory of d_histout + d_colids + d_globalminmax + (d_mseout + d_predout)
 		totalmem += (n_hist_elements * sizeof(int) + sizeof(int) + 2*sizeof(T) + 3 * n_bins * sizeof(T))* Ncols;
-		this->print_info();
+		//this->print_info();
 	}
 
 	void print_info()
diff --git a/cuML/src/randomforest/tmp_testing_dir/launch_rf_testing b/cuML/src/randomforest/tmp_testing_dir/launch_rf_testing
index c462fb3f61..2bc7208d6a 100755
--- a/cuML/src/randomforest/tmp_testing_dir/launch_rf_testing
+++ b/cuML/src/randomforest/tmp_testing_dir/launch_rf_testing
@@ -4,9 +4,9 @@
 
 RF_BASE_DIR="/gpfs/fs1/myrtop/rapids_repos/regression_random_forest/cuml"
 #Compile
-nvcc -g -std=c++11 -arch=sm_70 rf_testing.cu ../../decisiontree/decisiontree.cu ../../common/cuml_api.cpp ../../common/cumlHandle.cpp -I$RF_BASE_DIR/ml-prims/src/ -I$RF_BASE_DIR/cuML/external/ml-prims/external/cub -I$RF_BASE_DIR/cuML/src -I$RF_BASE_DIR/thirdparty/cuml/ml-prims/src/ -lcublas -lcudart -lcusolver -lcusparse --expt-extended-lambda -o rf_testing
+nvcc -std=c++11 -arch=sm_70 rf_testing.cu ../../decisiontree/decisiontree.cu ../../common/cuml_api.cpp ../../common/cumlHandle.cpp -I$RF_BASE_DIR/ml-prims/src/ -I$RF_BASE_DIR/cuML/external/ml-prims/external/cub -I$RF_BASE_DIR/cuML/src -I$RF_BASE_DIR/thirdparty/cuml/ml-prims/src/ -lcublas -lcudart -lcusolver -lcusparse --expt-extended-lambda -o rf_testing
 
-if [ ! -e tmp_rf_testing ]; then
-	echo "Error in building tmp_rf_testing"
+if [ ! -e rf_testing ]; then
+	echo "Error in building rf_testing"
 	exit
 fi

From b472bbd2adef77beeda41bfea8d6c753ec58010a Mon Sep 17 00:00:00 2001
From: Myrto Papadopoulou
Date: Wed, 15 May 2019 06:27:30 -0700
Subject: [PATCH 17/51] Swapped cudaMemcpy w/ updateDevice/Host/Async.

- Updated all our cudaMemcpy calls to updateDevice, updateHost, or copyAsync
  as appropriate.
- Copied the recent iota -> permute change from rfRegressor to rfClassifier
  fit too.
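For readers unfamiliar with these ml-prims helpers: they are thin, element-count based wrappers over cudaMemcpyAsync. The sketch below is assumed from the call sites in the diff that follows, not copied from ml-prims (the real implementations also wrap the call in CUDA_CHECK and may differ in detail):

    #include <cuda_runtime.h>

    // Element counts instead of byte counts: sizeof(T) is applied in exactly one
    // place, which removes the class of mistake (wrong sizeof in a byte count)
    // that this commit is cleaning up.
    template <typename T>
    void updateDevice(T *dst, const T *src, size_t len, cudaStream_t stream) { // host -> device
        cudaMemcpyAsync(dst, src, len * sizeof(T), cudaMemcpyHostToDevice, stream);
    }

    template <typename T>
    void updateHost(T *dst, const T *src, size_t len, cudaStream_t stream) { // device -> host
        cudaMemcpyAsync(dst, src, len * sizeof(T), cudaMemcpyDeviceToHost, stream);
    }

    template <typename T>
    void copyAsync(T *dst, const T *src, size_t len, cudaStream_t stream) { // device -> device
        cudaMemcpyAsync(dst, src, len * sizeof(T), cudaMemcpyDeviceToDevice, stream);
    }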
---
 cpp/src/decisiontree/decisiontree.cu          |  4 +--
 .../kernels/evaluate_classifier.cuh           |  7 +++---
 .../kernels/evaluate_regressor.cuh            |  9 +++----
 cpp/src/decisiontree/kernels/gini.cuh         |  6 ++---
 cpp/src/decisiontree/kernels/split_labels.cuh | 13 +++++-----
 cpp/src/decisiontree/memory.cuh               |  5 ++--
 cpp/src/randomforest/randomforest.cu          | 25 ++++++++++++-------
 7 files changed, 39 insertions(+), 30 deletions(-)

diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu
index de5f63fd3c..f0a7fd72b2 100644
--- a/cpp/src/decisiontree/decisiontree.cu
+++ b/cpp/src/decisiontree/decisiontree.cu
@@ -343,7 +343,7 @@ void DecisionTreeClassifier::find_best_fruit_all(T *data, int *labels, const
 	if (depth == 0) {
 		CUDA_CHECK(cudaHostRegister(colselector.data(), sizeof(int) * colselector.size(), cudaHostRegisterDefault));
 		// Copy sampled column IDs to device memory
-		CUDA_CHECK(cudaMemcpyAsync(this->tempmem[0]->d_colids->data(), colselector.data(), sizeof(int) * colselector.size(), cudaMemcpyHostToDevice, this->tempmem[0]->stream));
+		MLCommon::updateDevice(this->tempmem[0]->d_colids->data(), colselector.data(), colselector.size(), this->tempmem[0]->stream);
 		CUDA_CHECK(cudaStreamSynchronize(this->tempmem[0]->stream));
 
 		int *labelptr = this->tempmem[0]->sampledlabels->data();
@@ -519,7 +519,7 @@ void DecisionTreeRegressor::find_best_fruit_all(T *data, T *labels, const flo
 	if (depth == 0) {
 		CUDA_CHECK(cudaHostRegister(colselector.data(), sizeof(int) * colselector.size(), cudaHostRegisterDefault));
 		// Copy sampled column IDs to device memory
-		CUDA_CHECK(cudaMemcpyAsync(this->tempmem[0]->d_colids->data(), colselector.data(), sizeof(int) * colselector.size(), cudaMemcpyHostToDevice, this->tempmem[0]->stream));
+		MLCommon::updateDevice(this->tempmem[0]->d_colids->data(), colselector.data(), colselector.size(), this->tempmem[0]->stream);
 		CUDA_CHECK(cudaStreamSynchronize(this->tempmem[0]->stream));
 
 		T *labelptr = this->tempmem[0]->sampledlabels->data();
diff --git a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
index f8132bc8c2..3226785b8f 100644
--- a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
+++ b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
@@ -184,7 +184,7 @@ void find_best_split_classifier(const std::shared_ptr<TemporaryMemory<T>> tem
 		T ques_val;
 		T *d_quantile = tempmem->d_quantile->data();
 		int q_index = col_selector[best_col_id] * nbins + best_bin_id;
-		CUDA_CHECK(cudaMemcpyAsync(&ques_val, &d_quantile[q_index], sizeof(T), cudaMemcpyDeviceToHost, tempmem->stream));
+		MLCommon::updateHost(&ques_val, &d_quantile[q_index], 1, tempmem->stream);
 		CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
 		ques.set_question_fields(best_col_id, col_selector[best_col_id], best_bin_id, nbins, n_cols, std::numeric_limits<T>::max(), -std::numeric_limits<T>::max(), ques_val);
 	}
@@ -202,7 +202,8 @@ void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, c
 	int ncols = colselector.size();
 	int col_minmax_bytes = sizeof(T) * 2 * ncols;
-	int n_hist_bytes = n_unique_labels * nbins * sizeof(int) * ncols;
+	int n_hist_elements = n_unique_labels * nbins * ncols;
+	int n_hist_bytes = n_hist_elements * sizeof(int);
 
 	CUDA_CHECK(cudaMemsetAsync((void*)d_histout, 0, n_hist_bytes, tempmem->stream));
@@ -234,7 +235,7 @@ void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, c
 	}
 	CUDA_CHECK(cudaGetLastError());
 
-	CUDA_CHECK(cudaMemcpyAsync(h_histout, d_histout, n_hist_bytes, cudaMemcpyDeviceToHost, tempmem->stream));
+	MLCommon::updateHost(h_histout, d_histout, n_hist_elements, tempmem->stream);
 	CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
 
 	find_best_split_classifier<T, F>(tempmem, nbins, n_unique_labels, colselector, &split_info[0], nrows, ques, gain, split_algo);
diff --git a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh
index d796c68d59..c0251dd836 100644
--- a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh
+++ b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh
@@ -280,7 +280,7 @@ void find_best_split_regressor(const std::shared_ptr<TemporaryMemory<T>> tempm
 		T ques_val;
 		T *d_quantile = tempmem->d_quantile->data();
 		int q_index = col_selector[best_col_id] * nbins + best_bin_id;
-		CUDA_CHECK(cudaMemcpyAsync(&ques_val, &d_quantile[q_index], sizeof(T), cudaMemcpyDeviceToHost, tempmem->stream));
+		MLCommon::updateHost(&ques_val, &d_quantile[q_index], 1, tempmem->stream);
 		CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
 		ques.set_question_fields(best_col_id, col_selector[best_col_id], best_bin_id, nbins, n_cols, std::numeric_limits<T>::max(), -std::numeric_limits<T>::max(), ques_val);
 	}
@@ -340,10 +340,9 @@ void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, co
 	}
 	CUDA_CHECK(cudaGetLastError());
 
-	CUDA_CHECK(cudaMemcpyAsync(h_mseout, d_mseout, n_mse_bytes, cudaMemcpyDeviceToHost, tempmem->stream));
-	CUDA_CHECK(cudaMemcpyAsync(h_histout, d_histout, n_count_bytes, cudaMemcpyDeviceToHost, tempmem->stream));
-	CUDA_CHECK(cudaMemcpyAsync(h_predout, d_predout, n_pred_bytes, cudaMemcpyDeviceToHost, tempmem->stream));
-
+	MLCommon::updateHost(h_mseout, d_mseout, n_mse_bytes / sizeof(T), tempmem->stream);
+	MLCommon::updateHost(h_histout, d_histout, n_count_bytes / sizeof(int), tempmem->stream);
+	MLCommon::updateHost(h_predout, d_predout, n_pred_bytes / sizeof(T), tempmem->stream);
 	CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
 
 	find_best_split_regressor(tempmem, nbins, colselector, &split_info[0], nrows, ques, gain, split_algo);
diff --git a/cpp/src/decisiontree/kernels/gini.cuh b/cpp/src/decisiontree/kernels/gini.cuh
index bac8dd1336..788d8565f9 100644
--- a/cpp/src/decisiontree/kernels/gini.cuh
+++ b/cpp/src/decisiontree/kernels/gini.cuh
@@ -116,7 +116,7 @@ void gini(int *labels_in, const int nrows, const std::shared_ptr<TemporaryMemory
 	CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
 	gini_kernel<<< MLCommon::ceildiv(nrows, 128), 128, sizeof(int)*unique_labels, tempmem->stream>>>(labels_in, nrows, unique_labels, dhist);
 	CUDA_CHECK(cudaGetLastError());
-	CUDA_CHECK(cudaMemcpyAsync(hhist, dhist, sizeof(int)*unique_labels, cudaMemcpyDeviceToHost, tempmem->stream));
+	MLCommon::updateHost(hhist, dhist, unique_labels, tempmem->stream);
 	CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
 
 	split_info.hist.resize(unique_labels, 0);
@@ -145,8 +145,8 @@ void mse(T *labels_in, const int nrows, const std::shared_ptr<TemporaryMemory<T
 	mse_kernel<<< MLCommon::ceildiv(nrows, 128), 128, 0, tempmem->stream>>>(labels_in, nrows, dpred, dmse);
 	CUDA_CHECK(cudaGetLastError());
 
-	CUDA_CHECK(cudaMemcpyAsync(hmse, dmse, sizeof(T), cudaMemcpyDeviceToHost, tempmem->stream));
-	CUDA_CHECK(cudaMemcpyAsync(hpred, dpred, sizeof(T), cudaMemcpyDeviceToHost, tempmem->stream));
+	MLCommon::updateHost(hmse, dmse, 1, tempmem->stream);
+	MLCommon::updateHost(hpred, dpred, 1, tempmem->stream);
 	CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
 
 	split_info.best_metric = (float)hmse[0] / (float)nrows; //Update split metric value
diff --git a/cpp/src/decisiontree/kernels/split_labels.cuh b/cpp/src/decisiontree/kernels/split_labels.cuh
index bb2349b698..b8f92553c4 100644
--- a/cpp/src/decisiontree/kernels/split_labels.cuh
+++ b/cpp/src/decisiontree/kernels/split_labels.cuh
@@ -93,7 +93,7 @@ template
 void make_split(T *column, MetricQuestion<T> & ques, const int nrows, int& nrowsleft, int& nrowsright, unsigned int* rowids, int split_algo, const std::shared_ptr<TemporaryMemory<T>> tempmem) {
 
-	int *temprowids = tempmem->temprowids->data();
+	unsigned int *temprowids = tempmem->temprowids->data();
 	char *d_flags_left = tempmem->d_flags_left->data();
 	char *d_flags_right = tempmem->d_flags_right->data();
 	T *question_value = tempmem->question_value->data();
@@ -112,17 +112,18 @@ void make_split(T *column, MetricQuestion<T> & ques, const int nrows, int& nrows
 	cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, rowids, d_flags_left, temprowids, d_num_selected_out, nrows);
-	CUDA_CHECK(cudaMemcpyAsync(&nrowsleftright[0], d_num_selected_out, sizeof(int), cudaMemcpyDeviceToHost, tempmem->stream));
+	MLCommon::updateHost(&nrowsleftright[0], d_num_selected_out, 1, tempmem->stream);
 	CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
 	nrowsleft = nrowsleftright[0];
 	cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, rowids, d_flags_right, &temprowids[nrowsleft], d_num_selected_out, nrows);
-	CUDA_CHECK(cudaMemcpyAsync(&nrowsleftright[1], d_num_selected_out, sizeof(int), cudaMemcpyDeviceToHost, tempmem->stream));
-	CUDA_CHECK(cudaMemcpyAsync(rowids, temprowids, nrows*sizeof(int), cudaMemcpyDeviceToDevice, tempmem->stream));
+	MLCommon::updateHost(&nrowsleftright[1], d_num_selected_out, 1, tempmem->stream);
+	MLCommon::copyAsync(rowids, temprowids, nrows, tempmem->stream);
 
 	// Copy GPU-computed question value to tree node.
-	if (split_algo == ML::SPLIT_ALGO::HIST)
-		CUDA_CHECK(cudaMemcpyAsync(&(ques.value), question_value, sizeof(T), cudaMemcpyDeviceToHost, tempmem->stream));
+	if (split_algo == ML::SPLIT_ALGO::HIST) {
+		MLCommon::updateHost(&(ques.value), question_value, 1, tempmem->stream);
+	}
 	CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
 
 	nrowsright = nrowsleftright[1];
diff --git a/cpp/src/decisiontree/memory.cuh b/cpp/src/decisiontree/memory.cuh
index c9744079ab..fed383821b 100644
--- a/cpp/src/decisiontree/memory.cuh
+++ b/cpp/src/decisiontree/memory.cuh
@@ -45,7 +45,8 @@ struct TemporaryMemory
 	MLCommon::device_buffer<char> *d_split_temp_storage = nullptr;
 	size_t split_temp_storage_bytes = 0;
 
-	MLCommon::device_buffer<int> *d_num_selected_out, *temprowids;
+	MLCommon::device_buffer<int> *d_num_selected_out;
+	MLCommon::device_buffer<unsigned int> *temprowids;
 	MLCommon::device_buffer<T> *question_value, *temp_data;
 
 	//Total temp mem
@@ -89,7 +90,7 @@ struct TemporaryMemory
 	d_num_selected_out = new MLCommon::device_buffer<int>(handle.getDeviceAllocator(), stream, 1);
 	d_flags_left = new MLCommon::device_buffer<char>(handle.getDeviceAllocator(), stream, N);
 	d_flags_right = new MLCommon::device_buffer<char>(handle.getDeviceAllocator(), stream, N);
-	temprowids = new MLCommon::device_buffer<int>(handle.getDeviceAllocator(), stream, N);
+	temprowids = new MLCommon::device_buffer<unsigned int>(handle.getDeviceAllocator(), stream, N);
 	question_value = new MLCommon::device_buffer<T>(handle.getDeviceAllocator(), stream, 1);
 
 	cub::DeviceSelect::Flagged(d_split_temp_storage, split_temp_storage_bytes, temprowids->data(), d_flags_left->data(), temprowids->data(), d_num_selected_out->data(), N);
diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu
index b8b7d3410e..26f24e9450 100644
--- a/cpp/src/randomforest/randomforest.cu
+++ b/cpp/src/randomforest/randomforest.cu
@@ -263,12 +263,18 @@ void rfClassifier<T>::fit(const cumlHandle& user_handle, T * input, int n_rows,
 	if (this->rf_params.bootstrap) {
 		MLCommon::Random::Rng r(i * 1000); // Ensure the seed for each tree is different and meaningful.
 		r.uniformInt(selected_rows.data(), n_sampled_rows, (unsigned int) 0, (unsigned int) n_rows, stream);
-	} else {
-		std::vector<unsigned int> h_selected_rows(n_rows);
-		std::iota(h_selected_rows.begin(), h_selected_rows.end(), 0);
-		std::random_shuffle(h_selected_rows.begin(), h_selected_rows.end());
-		h_selected_rows.resize(n_sampled_rows);
-		MLCommon::updateDevice(selected_rows.data(), h_selected_rows.data(), n_sampled_rows, stream);
+	} else { // Sampling w/o replacement
+		MLCommon::device_buffer<unsigned int> *inkeys = new MLCommon::device_buffer<unsigned int>(handle.getDeviceAllocator(), stream, n_rows);
+		MLCommon::device_buffer<unsigned int> *outkeys = new MLCommon::device_buffer<unsigned int>(handle.getDeviceAllocator(), stream, n_rows);
+		thrust::sequence(thrust::cuda::par.on(stream), inkeys->data(), inkeys->data() + n_rows);
+		int *perms = nullptr;
+		MLCommon::Random::permute(perms, outkeys->data(), inkeys->data(), 1, n_rows, false, stream);
+		// outkeys has more rows than selected_rows; doing the shuffling before the resize to differentiate the per-tree rows sample.
+		MLCommon::copyAsync(selected_rows.data(), outkeys->data(), n_sampled_rows, stream);
+		inkeys->release(stream);
+		outkeys->release(stream);
+		delete inkeys;
+		delete outkeys;
 	}
 
 	/* Build individual tree in the forest.
@@ -429,13 +435,14 @@ void rfRegressor<T>::fit(const cumlHandle& user_handle, T * input, int n_rows, i
 	if (this->rf_params.bootstrap) {
 		MLCommon::Random::Rng r(i * 1000); // Ensure the seed for each tree is different and meaningful.
 		r.uniformInt(selected_rows.data(), n_sampled_rows, (unsigned int) 0, (unsigned int) n_rows, stream);
-	} else {
+	} else { // Sampling w/o replacement
 		MLCommon::device_buffer<unsigned int> *inkeys = new MLCommon::device_buffer<unsigned int>(handle.getDeviceAllocator(), stream, n_rows);
 		MLCommon::device_buffer<unsigned int> *outkeys = new MLCommon::device_buffer<unsigned int>(handle.getDeviceAllocator(), stream, n_rows);
 		thrust::sequence(thrust::cuda::par.on(stream), inkeys->data(), inkeys->data() + n_rows);
 		int *perms = nullptr;
 		MLCommon::Random::permute(perms, outkeys->data(), inkeys->data(), 1, n_rows, false, stream);
-		CUDA_CHECK(cudaMemcpyAsync(selected_rows.data(), outkeys->data(), n_sampled_rows * sizeof(unsigned int), cudaMemcpyDeviceToDevice, stream));
+		// outkeys has more rows than selected_rows; doing the shuffling before the resize to differentiate the per-tree rows sample.
+		MLCommon::copyAsync(selected_rows.data(), outkeys->data(), n_sampled_rows, stream);
 		inkeys->release(stream);
 		outkeys->release(stream);
 		delete inkeys;
@@ -448,7 +455,7 @@ void rfRegressor<T>::fit(const cumlHandle& user_handle, T * input, int n_rows, i
 		- selected_rows: points to a list of row #s (w/ n_sampled_rows elements) used to build the bootstrapped sample.
 		Expectation: Each tree node will contain (a) # n_sampled_rows and (b) a pointer to a list of row numbers w.r.t original data.
 		*/
-		trees[i].fit(user_handle, input, n_cols, n_rows, labels, selected_rows.data(), n_sampled_rows, /*n_unique_labels,*/ this->rf_params.tree_params);
+		trees[i].fit(user_handle, input, n_cols, n_rows, labels, selected_rows.data(), n_sampled_rows, this->rf_params.tree_params);
 
 		//Cleanup
 		selected_rows.release(stream);

From d6027a90b8ba61bf933d85138f81cb917038dfbe Mon Sep 17 00:00:00 2001
From: Myrto Papadopoulou
Date: Thu, 16 May 2019 04:55:37 -0700
Subject: [PATCH 18/51] Made rowids and colids unsigned int.
- Also a minor update to the minmax prim code.
- The minmax prim use in dt is commented out.
---
 cpp/src/decisiontree/decisiontree.cu           |  8 ++++----
 cpp/src/decisiontree/decisiontree.h            |  2 +-
 cpp/src/decisiontree/kernels/col_condenser.cuh |  9 +++++----
 .../kernels/evaluate_classifier.cuh            | 14 ++++++++------
 .../kernels/evaluate_regressor.cuh             | 18 ++++++++++--------
 cpp/src/decisiontree/kernels/quantile.cuh      |  2 +-
 cpp/src/decisiontree/memory.cuh                |  7 ++++---
 cpp/src_prims/stats/minmax.h                   |  9 +++++----
 8 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu
index f0a7fd72b2..5efdd085d7 100644
--- a/cpp/src/decisiontree/decisiontree.cu
+++ b/cpp/src/decisiontree/decisiontree.cu
@@ -337,11 +337,11 @@ TreeNode* DecisionTreeClassifier::grow_tree(T *data, const float colp
 
 template<typename T>
 void DecisionTreeClassifier<T>::find_best_fruit_all(T *data, int *labels, const float colper, MetricQuestion<T> & ques, float& gain, unsigned int* rowids, const int n_sampled_rows, MetricInfo<int> split_info[3], int depth) {
 
-	std::vector<int>& colselector = this->feature_selector;
+	std::vector<unsigned int>& colselector = this->feature_selector;
 
 	// Optimize ginibefore; no need to compute except for root.
 	if (depth == 0) {
-		CUDA_CHECK(cudaHostRegister(colselector.data(), sizeof(int) * colselector.size(), cudaHostRegisterDefault));
+		CUDA_CHECK(cudaHostRegister(colselector.data(), sizeof(unsigned int) * colselector.size(), cudaHostRegisterDefault));
 		// Copy sampled column IDs to device memory
 		MLCommon::updateDevice(this->tempmem[0]->d_colids->data(), colselector.data(), colselector.size(), this->tempmem[0]->stream);
 		CUDA_CHECK(cudaStreamSynchronize(this->tempmem[0]->stream));
@@ -513,11 +513,11 @@ template<typename T>
 void DecisionTreeRegressor<T>::find_best_fruit_all(T *data, T *labels, const float colper, MetricQuestion<T> & ques, float& gain, unsigned int* rowids, const int n_sampled_rows, MetricInfo<T> split_info[3], int depth) {
 
-	std::vector<int>& colselector = this->feature_selector;
+	std::vector<unsigned int>& colselector = this->feature_selector;
 
 	// Optimize ginibefore; no need to compute except for root.
 	if (depth == 0) {
-		CUDA_CHECK(cudaHostRegister(colselector.data(), sizeof(int) * colselector.size(), cudaHostRegisterDefault));
+		CUDA_CHECK(cudaHostRegister(colselector.data(), sizeof(unsigned int) * colselector.size(), cudaHostRegisterDefault));
 		// Copy sampled column IDs to device memory
 		MLCommon::updateDevice(this->tempmem[0]->d_colids->data(), colselector.data(), colselector.size(), this->tempmem[0]->stream);
 		CUDA_CHECK(cudaStreamSynchronize(this->tempmem[0]->stream));
diff --git a/cpp/src/decisiontree/decisiontree.h b/cpp/src/decisiontree/decisiontree.h
index 3c09d58dfe..98a40ee374 100644
--- a/cpp/src/decisiontree/decisiontree.h
+++ b/cpp/src/decisiontree/decisiontree.h
@@ -116,7 +116,7 @@ class dt {
 	int min_rows_per_node;
 	bool bootstrap_features;
 	CRITERION split_criterion;
-	std::vector<int> feature_selector;
+	std::vector<unsigned int> feature_selector;
 
 	void print_node(const std::string& prefix, const TreeNode<T>* const node, bool isLeft) const;
 	void split_branch(T *data, MetricQuestion<T> & ques, const int n_sampled_rows, int& nrowsleft, int& nrowsright, unsigned int* rowids);
diff --git a/cpp/src/decisiontree/kernels/col_condenser.cuh b/cpp/src/decisiontree/kernels/col_condenser.cuh
index 462fc57673..7d9431fb36 100644
--- a/cpp/src/decisiontree/kernels/col_condenser.cuh
+++ b/cpp/src/decisiontree/kernels/col_condenser.cuh
@@ -37,17 +37,18 @@ void get_sampled_labels(const T *labels, T *outlabels, unsigned int* rowids, con
 }
 
 template<typename T>
-__global__ void allcolsampler_kernel(const T* __restrict__ data, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nrows, const int ncols, const int rowoffset, T* sampledcols)
+__global__ void allcolsampler_kernel(const T* __restrict__ data, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nrows, const int ncols, const int rowoffset, T* sampledcols)
 {
 	int tid = threadIdx.x + blockIdx.x * blockDim.x;
 
 	for (unsigned int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) {
 		int newcolid = (int)(i / nrows);
 		int myrowstart;
-		if( colids != nullptr)
+		if (colids != nullptr) {
 			myrowstart = colids[ newcolid ] * rowoffset;
-		else
+		} else {
 			myrowstart = newcolid * rowoffset;
+		}
 
 		int index = rowids[ i % nrows] + myrowstart;
 		sampledcols[i] = data[index];
@@ -56,7 +57,7 @@ __global__ void allcolsampler_kernel(const T* __restrict__ data, const unsigned
 }
 
 template<typename T>
-__global__ void allcolsampler_minmax_kernel(const T* __restrict__ data, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nrows, const int ncols, const int rowoffset, T* globalmin, T* globalmax, T* sampledcols, T init_min_val)
+__global__ void allcolsampler_minmax_kernel(const T* __restrict__ data, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nrows, const int ncols, const int rowoffset, T* globalmin, T* globalmax, T* sampledcols, T init_min_val)
 {
 	int tid = threadIdx.x + blockIdx.x * blockDim.x;
 	extern __shared__ char shmem[];
diff --git a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
index f8132bc8c2..18e052b8cb 100644
--- a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
+++ b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
@@ -22,13 +22,14 @@
 #include "col_condenser.cuh"
 #include
 #include "../algo_helper.h"
+#include "stats/minmax.h"
 
 /* The output of the function is a histogram array, of size ncols * nbins * n_unique_labels
    column order is as per colids (bootstrapped random cols) for each col there are nbins histograms */
 template<typename T>
-__global__ void all_cols_histograms_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const int n_unique_labels, const T* __restrict__ globalminmax, int* histout) {
+__global__ void all_cols_histograms_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const int n_unique_labels, const T* __restrict__ globalminmax, int* histout) {
 
 	int tid = threadIdx.x + blockIdx.x * blockDim.x;
 	extern __shared__ char shmem[];
@@ -73,7 +74,7 @@ __global__ void all_cols_histograms_kernel_class(const T* __restrict__ data, con
 }
 
 template<typename T>
-__global__ void all_cols_histograms_global_quantile_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const int n_unique_labels, int* histout, const T* __restrict__ quantile) {
+__global__ void all_cols_histograms_global_quantile_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const int n_unique_labels, int* histout, const T* __restrict__ quantile) {
 
 	int tid = threadIdx.x + blockIdx.x * blockDim.x;
 	extern __shared__ char shmem[];
@@ -110,7 +111,7 @@ __global__ void all_cols_histograms_global_quantile_kernel_class(const T* __rest
 }
 
 template<typename T, typename F>
-void find_best_split_classifier(const std::shared_ptr<TemporaryMemory<T>> tempmem, const int nbins, const int n_unique_labels, const std::vector<int>& col_selector, MetricInfo<int> split_info[3], const int nrows, MetricQuestion<T> & ques, float & gain, const int split_algo) {
+void find_best_split_classifier(const std::shared_ptr<TemporaryMemory<T>> tempmem, const int nbins, const int n_unique_labels, const std::vector<unsigned int>& col_selector, MetricInfo<int> split_info[3], const int nrows, MetricQuestion<T> & ques, float & gain, const int split_algo) {
 
 	gain = 0.0f;
 	int best_col_id = -1;
@@ -193,9 +194,9 @@ void find_best_split_classifier(const std::shared_ptr<TemporaryMemory<T>> tem
 }
 
 template<typename T, typename L, typename F>
-void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, const L *labels, const int nbins, const int nrows, const int n_unique_labels, const int rowoffset, const std::vector<int>& colselector, const std::shared_ptr<TemporaryMemory<T>> tempmem, MetricInfo<int> split_info[3], MetricQuestion<T> & ques, float & gain, const int split_algo)
+void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, const L *labels, const int nbins, const int nrows, const int n_unique_labels, const int rowoffset, const std::vector<unsigned int>& colselector, const std::shared_ptr<TemporaryMemory<T>> tempmem, MetricInfo<int> split_info[3], MetricQuestion<T> & ques, float & gain, const int split_algo)
 {
-	int* d_colids = tempmem->d_colids->data();
+	unsigned int* d_colids = tempmem->d_colids->data();
 	T* d_globalminmax = tempmem->d_globalminmax->data();
 	int *d_histout = tempmem->d_histout->data();
 	int *h_histout = tempmem->h_histout->data();
@@ -207,7 +208,7 @@ void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, c
 
 	CUDA_CHECK(cudaMemsetAsync((void*)d_histout, 0, n_hist_bytes, tempmem->stream));
 
-	int threads = 512;
+	const int threads = 512;
 	int blocks = MLCommon::ceildiv(nrows * ncols, threads);
MLCommon::ceildiv(nrows * ncols, threads); if (blocks > 65536) blocks = 65536; @@ -220,6 +221,7 @@ void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, c size_t shmemsize = col_minmax_bytes; if (split_algo == ML::SPLIT_ALGO::HIST) { // Histograms (min, max) allcolsampler_minmax_kernel<<<blocks, threads, shmemsize, tempmem->stream>>>(data, rowids, d_colids, nrows, ncols, rowoffset, &d_globalminmax[0], &d_globalminmax[colselector.size()], tempmem->temp_data->data(), std::numeric_limits<T>::max()); + //MLCommon::Stats::minmax(data, rowids, d_colids, nrows, ncols, rowoffset, &d_globalminmax[0], &d_globalminmax[colselector.size()], tempmem->temp_data->data(), tempmem->stream); } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { // Global quantiles; just col condenser allcolsampler_kernel<<<blocks, threads, 0, tempmem->stream>>>(data, rowids, d_colids, nrows, ncols, rowoffset, tempmem->temp_data->data()); } diff --git a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh index c0251dd836..4f4df6826b 100644 --- a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh +++ b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh @@ -22,9 +22,10 @@ #include "col_condenser.cuh" #include #include "../algo_helper.h" +#include "stats/minmax.h" template<typename T> -__global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const T* __restrict__ globalminmax, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T pred_parent) { +__global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const T* __restrict__ globalminmax, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T pred_parent) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; @@ -85,7 +86,7 @@ __global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const column order is as per colids (bootstrapped random cols) for each col there are nbins histograms */ template<typename T> -__global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const T* __restrict__ globalminmax, T* predout, int* countout) { +__global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const T* __restrict__ globalminmax, T* predout, int* countout) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; @@ -134,7 +135,7 @@ __global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data } template<typename T> -__global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, T* predout, int* countout, const T* __restrict__ quantile) { +__global__ void
all_cols_histograms_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, T* predout, int* countout, const T* __restrict__ quantile) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; @@ -175,7 +176,7 @@ __global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restri } template<typename T> -__global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T* __restrict__ quantile, const T pred_parent) { +__global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T* __restrict__ quantile, const T pred_parent) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; @@ -225,7 +226,7 @@ __global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ dat } template -void find_best_split_regressor(const std::shared_ptr> tempmem, const int nbins, const std::vector<int>& col_selector, MetricInfo<T> split_info[3], const int nrows, MetricQuestion<T> & ques, float & gain, const int split_algo) { +void find_best_split_regressor(const std::shared_ptr> tempmem, const int nbins, const std::vector<unsigned int>& col_selector, MetricInfo<T> split_info[3], const int nrows, MetricQuestion<T> & ques, float & gain, const int split_algo) { gain = 0.0f; int best_col_id = -1; @@ -289,9 +290,9 @@ void find_best_split_regressor(const std::shared_ptr> tempm } template -void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, const T *labels, const int nbins, const int nrows, const int rowoffset, const std::vector<int>& colselector, const std::shared_ptr> tempmem, MetricInfo<T> split_info[3], MetricQuestion<T> & ques, float & gain, const int split_algo) +void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, const T *labels, const int nbins, const int nrows, const int rowoffset, const std::vector<unsigned int>& colselector, const std::shared_ptr> tempmem, MetricInfo<T> split_info[3], MetricQuestion<T> & ques, float & gain, const int split_algo) { - int* d_colids = tempmem->d_colids->data(); + unsigned int* d_colids = tempmem->d_colids->data(); T* d_globalminmax = tempmem->d_globalminmax->data(); int *d_histout = tempmem->d_histout->data(); int *h_histout = tempmem->h_histout->data(); @@ -310,7 +311,7 @@ void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, co CUDA_CHECK(cudaMemsetAsync((void*)d_predout, 0, n_pred_bytes, tempmem->stream)); CUDA_CHECK(cudaMemsetAsync((void*)d_histout, 0, n_count_bytes, tempmem->stream)); - int threads = 512; + const int threads = 512; int blocks = MLCommon::ceildiv(nrows * ncols, threads); if (blocks > 65536) blocks = 65536; @@ -323,6 +324,7 @@ void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, co size_t shmemsize = col_minmax_bytes; if (split_algo == ML::SPLIT_ALGO::HIST) { // Histograms (min, max) allcolsampler_minmax_kernel<<<blocks, threads, shmemsize, tempmem->stream>>>(data,
rowids, d_colids, nrows, ncols, rowoffset, &d_globalminmax[0], &d_globalminmax[colselector.size()], tempmem->temp_data->data(), std::numeric_limits<T>::max()); + //MLCommon::Stats::minmax(data, rowids, d_colids, nrows, ncols, rowoffset, &d_globalminmax[0], &d_globalminmax[colselector.size()], tempmem->temp_data->data(), tempmem->stream); } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { // Global quantiles; just col condenser allcolsampler_kernel<<<blocks, threads, 0, tempmem->stream>>>(data, rowids, d_colids, nrows, ncols, rowoffset, tempmem->temp_data->data()); } diff --git a/cpp/src/decisiontree/kernels/quantile.cuh b/cpp/src/decisiontree/kernels/quantile.cuh index 27a9c27b74..85523df3f4 100644 --- a/cpp/src/decisiontree/kernels/quantile.cuh +++ b/cpp/src/decisiontree/kernels/quantile.cuh @@ -54,7 +54,7 @@ void preprocess_quantile(const T* data, const unsigned int* rowids, const int n_ MLCommon::device_buffer<int> *d_offsets; MLCommon::device_buffer<T> *d_keys_out; T *d_keys_in = tempmem->temp_data->data(); - int *colids = nullptr; + unsigned int *colids = nullptr; d_offsets = new MLCommon::device_buffer<int>(tempmem->ml_handle.getDeviceAllocator(), tempmem->stream, batch_cols + 1); diff --git a/cpp/src/decisiontree/memory.cuh b/cpp/src/decisiontree/memory.cuh index fed383821b..2bf083fad4 100644 --- a/cpp/src/decisiontree/memory.cuh +++ b/cpp/src/decisiontree/memory.cuh @@ -34,7 +34,8 @@ struct TemporaryMemory //Host/Device histograms and device minmaxs MLCommon::device_buffer<T> *d_globalminmax; - MLCommon::device_buffer<int> *d_histout, *d_colids; + MLCommon::device_buffer<int> *d_histout; + MLCommon::device_buffer<unsigned int> *d_colids; MLCommon::host_buffer<int> *h_histout; MLCommon::device_buffer<T> *d_mseout, *d_predout; MLCommon::host_buffer<T> *h_mseout, *h_predout; @@ -108,9 +109,9 @@ struct TemporaryMemory d_mseout = new MLCommon::device_buffer<T>(handle.getDeviceAllocator(), stream, 2*mse_elements); d_predout = new MLCommon::device_buffer<T>(handle.getDeviceAllocator(), stream, mse_elements); - d_colids = new MLCommon::device_buffer<int>(handle.getDeviceAllocator(), stream, Ncols); + d_colids = new MLCommon::device_buffer<unsigned int>(handle.getDeviceAllocator(), stream, Ncols); // memory of d_histout + d_colids + d_globalminmax + (d_mseout + d_predout) - totalmem += (n_hist_elements * sizeof(int) + sizeof(int) + 2*sizeof(T) + 3 * n_bins * sizeof(T))* Ncols; + totalmem += (n_hist_elements * sizeof(int) + sizeof(unsigned int) + 2*sizeof(T) + 3 * n_bins * sizeof(T))* Ncols; //this->print_info(); } diff --git a/cpp/src_prims/stats/minmax.h b/cpp/src_prims/stats/minmax.h index 725f0e76da..96b263eb91 100644 --- a/cpp/src_prims/stats/minmax.h +++ b/cpp/src_prims/stats/minmax.h @@ -35,8 +35,8 @@ __global__ void minmaxInitKernel(int ncols, T* globalmin, T* globalmax, } template <typename T> -__global__ void minmaxKernel(const T* data, const int* rowids, - const int* colids, int nrows, int ncols, +__global__ void minmaxKernel(const T* data, const unsigned int* rowids, + const unsigned int* colids, int nrows, int ncols, int row_stride, T* g_min, T* g_max, T* sampledcols, T init_min_val) { int tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -100,7 +100,7 @@ __global__ void minmaxKernel(const T* data, const int* rowids, * in shared memory */ template <typename T, int TPB = 512> -void minmax(const T* data, const int* rowids, const int* colids, int nrows, +void minmax(const T* data, const unsigned int* rowids, const unsigned int* colids, int nrows, int ncols, int row_stride, T* globalmin, T* globalmax, T* sampledcols, cudaStream_t stream) { int nblks = ceildiv(ncols, TPB); @@ -109,7 +109,8 @@ void minmax(const T* data,
const int* rowids, const int* colids, int nrows, globalmax, init_val); CUDA_CHECK(cudaPeekAtLastError()); nblks = ceildiv(nrows * ncols, TPB); - nblks = max(nblks, 65536); + //nblks = max(nblks, 65536); + nblks = min(nblks, 65536); size_t smemSize = sizeof(T) * 2 * ncols; minmaxKernel<<<nblks, TPB, smemSize, stream>>>( data, rowids, colids, nrows, ncols, row_stride, globalmin, globalmax, From 29ac9eee80d12d6c169bd9828450a361acb3fe4a Mon Sep 17 00:00:00 2001 From: Vishal Mehta Date: Thu, 16 May 2019 14:28:28 +0200 Subject: [PATCH 19/51] now using minmax primitive with column sampler --- cpp/src/decisiontree/kernels/evaluate_classifier.cuh | 3 +-- cpp/src/decisiontree/kernels/evaluate_regressor.cuh | 3 +-- cpp/src_prims/stats/minmax.h | 8 ++++---- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh index 18e052b8cb..e5976f2de3 100644 --- a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh +++ b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh @@ -220,8 +220,7 @@ void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, c */ size_t shmemsize = col_minmax_bytes; if (split_algo == ML::SPLIT_ALGO::HIST) { // Histograms (min, max) - allcolsampler_minmax_kernel<<<blocks, threads, shmemsize, tempmem->stream>>>(data, rowids, d_colids, nrows, ncols, rowoffset, &d_globalminmax[0], &d_globalminmax[colselector.size()], tempmem->temp_data->data(), std::numeric_limits<T>::max()); - //MLCommon::Stats::minmax(data, rowids, d_colids, nrows, ncols, rowoffset, &d_globalminmax[0], &d_globalminmax[colselector.size()], tempmem->temp_data->data(), tempmem->stream); + MLCommon::Stats::minmax(data, rowids, d_colids, nrows, ncols, rowoffset, &d_globalminmax[0], &d_globalminmax[colselector.size()], tempmem->temp_data->data(), tempmem->stream); } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { // Global quantiles; just col condenser allcolsampler_kernel<<<blocks, threads, 0, tempmem->stream>>>(data, rowids, d_colids, nrows, ncols, rowoffset, tempmem->temp_data->data()); } diff --git a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh index 4f4df6826b..06cddbafde 100644 --- a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh +++ b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh @@ -323,8 +323,7 @@ void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, co */ size_t shmemsize = col_minmax_bytes; if (split_algo == ML::SPLIT_ALGO::HIST) { // Histograms (min, max) - allcolsampler_minmax_kernel<<<blocks, threads, shmemsize, tempmem->stream>>>(data, rowids, d_colids, nrows, ncols, rowoffset, &d_globalminmax[0], &d_globalminmax[colselector.size()], tempmem->temp_data->data(), std::numeric_limits<T>::max()); - //MLCommon::Stats::minmax(data, rowids, d_colids, nrows, ncols, rowoffset, &d_globalminmax[0], &d_globalminmax[colselector.size()], tempmem->temp_data->data(), tempmem->stream); + MLCommon::Stats::minmax(data, rowids, d_colids, nrows, ncols, rowoffset, &d_globalminmax[0], &d_globalminmax[colselector.size()], tempmem->temp_data->data(), tempmem->stream); } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { // Global quantiles; just col condenser allcolsampler_kernel<<<blocks, threads, 0, tempmem->stream>>>(data, rowids, d_colids, nrows, ncols, rowoffset, tempmem->temp_data->data()); } diff --git a/cpp/src_prims/stats/minmax.h b/cpp/src_prims/stats/minmax.h index 96b263eb91..b17b0ab00b 100644 --- a/cpp/src_prims/stats/minmax.h +++ b/cpp/src_prims/stats/minmax.h @@ -49,7 +49,7 @@ __global__ void minmaxKernel(const T* data, const unsigned int* rowids,
} __syncthreads(); for (int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) { - int col = i / nrows; + int col = i / nrows; int row = i % nrows; if(colids != nullptr) { col = colids[col]; } @@ -59,8 +59,9 @@ __global__ void minmaxKernel(const T* data, const unsigned int* rowids, } int index = row + col * row_stride; T coldata = data[index]; - myAtomicMin(&s_min[col], coldata); - myAtomicMax(&s_max[col], coldata); + // Min/max values are saved in shared memory and global memory as per the shuffled colids. + myAtomicMin(&s_min[(int)(i / nrows)], coldata); + myAtomicMax(&s_max[(int)(i / nrows)], coldata); if(sampledcols != nullptr) { sampledcols[i] = coldata; } @@ -109,7 +110,6 @@ void minmax(const T* data, const unsigned int* rowids, const unsigned int* colid globalmax, init_val); CUDA_CHECK(cudaPeekAtLastError()); nblks = ceildiv(nrows * ncols, TPB); - //nblks = max(nblks, 65536); nblks = min(nblks, 65536); size_t smemSize = sizeof(T) * 2 * ncols; minmaxKernel<<<nblks, TPB, smemSize, stream>>>( From 8c5ed32c6217aeeed33c52b26fd8363de1b74b19 Mon Sep 17 00:00:00 2001 From: "Vishal Mehta (Compute DevTech)" Date: Thu, 16 May 2019 08:02:29 -0700 Subject: [PATCH 20/51] deleted col_minmax kernel, now using ml-prims --- .../decisiontree/kernels/col_condenser.cuh | 43 ------------------- 1 file changed, 43 deletions(-) diff --git a/cpp/src/decisiontree/kernels/col_condenser.cuh b/cpp/src/decisiontree/kernels/col_condenser.cuh index 7d9431fb36..d9ae01bb94 100644 --- a/cpp/src/decisiontree/kernels/col_condenser.cuh +++ b/cpp/src/decisiontree/kernels/col_condenser.cuh @@ -55,46 +55,3 @@ __global__ void allcolsampler_kernel(const T* __restrict__ data, const unsigned } return; }
make_split(T *column, MetricQuestion & ques, const int nrows, int& nrows int *d_num_selected_out = tempmem->d_num_selected_out->data(); - cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, rowids, d_flags_left, temprowids, d_num_selected_out, nrows); + cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, rowids, d_flags_left, temprowids, d_num_selected_out, nrows, tempmem->stream); MLCommon::updateHost(&nrowsleftright[0], d_num_selected_out, 1, tempmem->stream); CUDA_CHECK(cudaStreamSynchronize(tempmem->stream)); nrowsleft = nrowsleftright[0]; - cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, rowids, d_flags_right, &temprowids[nrowsleft], d_num_selected_out, nrows); + cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, rowids, d_flags_right, &temprowids[nrowsleft], d_num_selected_out, nrows, tempmem->stream); MLCommon::updateHost(&nrowsleftright[1], d_num_selected_out, 1, tempmem->stream); MLCommon::copyAsync(rowids, temprowids, nrows, tempmem->stream); From 86483637b44c15506fb0564c134f26c95183c320 Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Tue, 21 May 2019 09:02:11 -0700 Subject: [PATCH 22/51] Reordered call to find_best_fruit_all function. - Calling find_best_fruit_all is unnecessary when a node will be considered a leaf due to depth (or max leaves) constraints. - Added stream to cub call in memory.cuh --- cpp/src/decisiontree/decisiontree.cu | 23 ++++++++++------------- cpp/src/decisiontree/decisiontree.h | 4 ++-- cpp/src/decisiontree/memory.cuh | 2 +- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu index 5efdd085d7..1b13c3603c 100644 --- a/cpp/src/decisiontree/decisiontree.cu +++ b/cpp/src/decisiontree/decisiontree.cu @@ -303,17 +303,17 @@ TreeNode* DecisionTreeClassifier::grow_tree(T *data, const float colp bool condition = ((depth != 0) && (prev_split_info.best_metric == 0.0f)); // This node is a leaf, no need to search for best split condition = condition || (n_sampled_rows < this->min_rows_per_node); // Do not split a node with less than min_rows_per_node samples - if (!condition) { - find_best_fruit_all(data, labels, colper, ques, gain, rowids, n_sampled_rows, &split_info[0], depth); //ques and gain are output here - condition = condition || (gain == 0.0f); - } - if (this->treedepth != -1) condition = (condition || (depth == this->treedepth)); if (this->maxleaves != -1) condition = (condition || (this->leaf_counter >= this->maxleaves)); // FIXME not fully respecting maxleaves, but >= constraints it more than == + if (!condition) { + find_best_fruit_all(data, labels, colper, ques, gain, rowids, n_sampled_rows, &split_info[0], depth); //ques and gain are output here + condition = condition || (gain == 0.0f); + } + if (condition) { node->prediction = get_class_hist(split_info[0].hist); node->split_metric_val = split_info[0].best_metric; @@ -479,17 +479,17 @@ TreeNode* DecisionTreeRegressor::grow_tree(T *data, const float colper, bool condition = ((depth != 0) && (prev_split_info.best_metric == 0.0f)); // This node is a leaf, no need to search for best split condition = condition || (n_sampled_rows < this->min_rows_per_node); // Do not split a node with less than min_rows_per_node samples - if (!condition) { - find_best_fruit_all(data, labels, colper, ques, gain, rowids, n_sampled_rows, split_info, depth); //ques and gain are output here - condition = condition || (gain == 0.0f); - } - if (this->treedepth != -1) condition = (condition 
|| (depth == this->treedepth)); if (this->maxleaves != -1) condition = (condition || (this->leaf_counter >= this->maxleaves)); // FIXME not fully respecting maxleaves, but >= constraints it more than == + if (!condition) { + find_best_fruit_all(data, labels, colper, ques, gain, rowids, n_sampled_rows, split_info, depth); //ques and gain are output here + condition = condition || (gain == 0.0f); + } + if (condition) { node->prediction = split_info[0].predict; node->split_metric_val = split_info[0].best_metric; @@ -543,9 +543,6 @@ void DecisionTreeRegressor::find_best_fruit_all(T *data, T *labels, const flo } } -// ---------------- Regression end - - //Class specializations template class dt; template class dt; diff --git a/cpp/src/decisiontree/decisiontree.h b/cpp/src/decisiontree/decisiontree.h index 98a40ee374..68a16da420 100644 --- a/cpp/src/decisiontree/decisiontree.h +++ b/cpp/src/decisiontree/decisiontree.h @@ -145,7 +145,7 @@ class DecisionTreeClassifier : public dt { const int n_sampled_rows, const int unique_labels, DecisionTreeParams tree_params); private: - // Same as above fit, but planting is better for a tree then fitting. + // Same as above fit, but planting is better for a tree than fitting. void plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels, int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2, bool cfg_bootstrap_features=false); @@ -163,7 +163,7 @@ class DecisionTreeRegressor : public dt { const int n_sampled_rows, DecisionTreeParams tree_params); private: - // Same as above fit, but planting is better for a tree then fitting. + // Same as above fit, but planting is better for a tree than fitting. void plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels = 1, int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2, bool cfg_bootstrap_features=false, CRITERION cfg_split_criterion=CRITERION::MSE); diff --git a/cpp/src/decisiontree/memory.cuh b/cpp/src/decisiontree/memory.cuh index 2bf083fad4..fca96ccbc9 100644 --- a/cpp/src/decisiontree/memory.cuh +++ b/cpp/src/decisiontree/memory.cuh @@ -94,7 +94,7 @@ struct TemporaryMemory temprowids = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, N); question_value = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, 1); - cub::DeviceSelect::Flagged(d_split_temp_storage, split_temp_storage_bytes, temprowids->data(), d_flags_left->data(), temprowids->data(), d_num_selected_out->data(), N); + cub::DeviceSelect::Flagged(d_split_temp_storage, split_temp_storage_bytes, temprowids->data(), d_flags_left->data(), temprowids->data(), d_num_selected_out->data(), N, stream); d_split_temp_storage = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, split_temp_storage_bytes); totalmem += split_temp_storage_bytes + (N + 1)*sizeof(int) + 2*N*sizeof(char) + sizeof(T); From d48e9c2b5721055e2525dc10f1a59ca53561da9a Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Mon, 27 May 2019 07:24:03 -0700 Subject: [PATCH 23/51] Fixed nbins bug for GLOBAL_QUANTILE. - Do not update # bins for GLOBAL_QUANTILE when # rows per node < # bins. 
--- cpp/src/decisiontree/decisiontree.cu | 7 ++- cpp/src/decisiontree/kernels/quantile.cuh | 1 - cpp/test/sg/rf_test.cu | 67 +++++++++++++++-------- 3 files changed, 49 insertions(+), 26 deletions(-) diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu index 1b13c3603c..dd32332a54 100644 --- a/cpp/src/decisiontree/decisiontree.cu +++ b/cpp/src/decisiontree/decisiontree.cu @@ -359,7 +359,8 @@ void DecisionTreeClassifier::find_best_fruit_all(T *data, int *labels, const CUDA_CHECK(cudaHostUnregister(colselector.data())); } - int current_nbins = (n_sampled_rows < this->nbins) ? n_sampled_rows : this->nbins; + // Do not update bin count for the GLOBAL_QUANTILE split algorithm, as all potential split points were precomputed. + int current_nbins = ((this->split_algo != SPLIT_ALGO::GLOBAL_QUANTILE) && (n_sampled_rows < this->nbins)) ? n_sampled_rows : this->nbins; if (this->split_criterion == CRITERION::GINI) { best_split_all_cols_classifier(data, rowids, labels, current_nbins, n_sampled_rows, this->n_unique_labels, this->dinfo.NLocalrows, colselector, @@ -533,7 +534,9 @@ void DecisionTreeRegressor::find_best_fruit_all(T *data, T *labels, const flo CUDA_CHECK(cudaHostUnregister(colselector.data())); } - int current_nbins = (n_sampled_rows < this->nbins) ? n_sampled_rows : this->nbins; + // Do not update bin count for the GLOBAL_QUANTILE split algorithm, as all potential split points were precomputed. + int current_nbins = ((this->split_algo != SPLIT_ALGO::GLOBAL_QUANTILE) && (n_sampled_rows < this->nbins)) ? n_sampled_rows : this->nbins; + if (this->split_criterion == CRITERION::MSE) { best_split_all_cols_regressor(data, rowids, labels, current_nbins, n_sampled_rows, this->dinfo.NLocalrows, colselector, this->tempmem[0], split_info, ques, gain, this->split_algo); diff --git a/cpp/src/decisiontree/kernels/quantile.cuh b/cpp/src/decisiontree/kernels/quantile.cuh index 85523df3f4..6d8dbda6aa 100644 --- a/cpp/src/decisiontree/kernels/quantile.cuh +++ b/cpp/src/decisiontree/kernels/quantile.cuh @@ -48,7 +48,6 @@ void preprocess_quantile(const T* data, const unsigned int* rowids, const int n_ int max_ncols = free_mem / (2 * n_sampled_rows * sizeof(T)); int batch_cols = (max_ncols > ncols) ? ncols : max_ncols; ASSERT(max_ncols != 0, "Cannot preprocess quantiles due to insufficient device memory."); - //std::cout << "Preprocess quantiles w/ " << batch_cols << " batch columns (default " << ncols << " columns)." 
<< std::endl; int threads = 128; MLCommon::device_buffer *d_offsets; diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index 726a96383e..65c898fbf6 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -168,7 +168,6 @@ protected: // Populate labels labels_h = {1.0, 2.0, 3.0, 4.0}; labels_h.resize(params.n_rows); - //preprocess_labels(params.n_rows, labels_h, labels_map); updateDevice(labels, labels_h.data(), params.n_rows, stream); rf_regressor = new typename rfRegressor::rfRegressor(rf_params); @@ -192,7 +191,6 @@ protected: RF_metrics tmp = cross_validate(handle, rf_regressor, inference_data_h.data(), labels_h.data(), params.n_inference_rows, params.n_cols, predicted_labels.data(), false); mse = tmp.mean_squared_error; - std::cout << "MSE is " << mse << std::endl; } void SetUp() override { @@ -225,20 +223,30 @@ protected: }; //------------------------------------------------------------------------------------------------------------------------------------- -const std::vector > inputsf2 = { - {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, // single tree forest, bootstrap false, unlimited depth, 4 bins - {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, // single tree forest, bootstrap false, depth of 8, 4 bins - {4, 2, 10, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, //forest with 10 trees, all trees should produce identical predictions (no bootstrapping or column subsampling) - {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, //forest with 10 trees, with bootstrap and column subsampling enabled, 3 bins - {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE, 2, CRITERION::MSE} //forest with 10 trees, with bootstrap and column subsampling enabled, 3 bins, different split algorithm +const std::vector > inputsf2_clf = { + {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::GINI}, // single tree forest, bootstrap false, unlimited depth, 4 bins + {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::GINI}, // single tree forest, bootstrap false, depth of 8, 4 bins + {4, 2, 10, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::GINI}, //forest with 10 trees, all trees should produce identical predictions (no bootstrapping or column subsampling) + {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::HIST, 2, CRITERION::GINI}, //forest with 10 trees, with bootstrap and column subsampling enabled, 3 bins + {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE, 2, CRITERION::CRITERION_END}, //forest with 10 trees, with bootstrap and column subsampling enabled, 3 bins, different split algorithm + {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::ENTROPY}, + {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::ENTROPY}, + {4, 2, 10, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::ENTROPY}, + {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::HIST, 2, CRITERION::ENTROPY}, + {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE, 2, CRITERION::ENTROPY}, }; -const std::vector > inputsd2 = { // Same as inputsf2 - {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, - {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, - 
{4, 2, 10, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, - {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, - {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE, 2, CRITERION::MSE} +const std::vector > inputsd2_clf = { // Same as inputsf2_clf + {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::GINI}, + {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::GINI}, + {4, 2, 10, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::GINI}, + {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::HIST, 2, CRITERION::GINI}, + {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE, 2, CRITERION::CRITERION_END}, + {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::ENTROPY}, + {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::ENTROPY}, + {4, 2, 10, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::ENTROPY}, + {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::HIST, 2, CRITERION::ENTROPY}, + {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE, 2, CRITERION::ENTROPY} }; @@ -261,17 +269,17 @@ TEST_P(RfClassifierTestD, Fit) { } } -INSTANTIATE_TEST_CASE_P(RfClassifierTests, RfClassifierTestF, ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_CASE_P(RfClassifierTests, RfClassifierTestF, ::testing::ValuesIn(inputsf2_clf)); -INSTANTIATE_TEST_CASE_P(RfClassifierTests, RfClassifierTestD, ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_CASE_P(RfClassifierTests, RfClassifierTestD, ::testing::ValuesIn(inputsd2_clf)); typedef RfRegressorTest RfRegressorTestF; TEST_P(RfRegressorTestF, Fit) { - rf_regressor->print_rf_detailed(); // Prints all trees in the forest. + //rf_regressor->print_rf_detailed(); // Prints all trees in the forest. if (!params.bootstrap && (params.max_features == 1.0f)) { ASSERT_TRUE(mse == 0.0f); } else { - ASSERT_TRUE(mse <= 0.1f); // Empirically derived mse range. 
TODO FIXME + ASSERT_TRUE(mse <= 0.2f); } } @@ -284,13 +292,26 @@ TEST_P(RfRegressorTestD, Fit) { } } -const std::vector > inputsf2_temp = { - {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}}; // single tree forest, bootstrap false, unlimited depth, 4 bins +const std::vector > inputsf2_reg = { + {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, + {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, + {4, 2, 5, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::CRITERION_END}, // CRITERION_END uses the default criterion (GINI for classification, MSE for regression) + {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MAE}, + {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::GLOBAL_QUANTILE, 2, CRITERION::MAE}, + {4, 2, 5, 1.0f, 1.0f, 4, 8, -1, true, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::CRITERION_END} +}; -//INSTANTIATE_TEST_CASE_P(RfRegressorTests, RfRegressorTestF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_CASE_P(RfRegressorTests, RfRegressorTestF, ::testing::ValuesIn(inputsf2_temp)); +const std::vector > inputsd2_reg = { // Same as inputsf2_reg + {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, + {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MSE}, + {4, 2, 5, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::CRITERION_END}, + {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::MAE}, + {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::GLOBAL_QUANTILE, 2, CRITERION::MAE}, + {4, 2, 5, 1.0f, 1.0f, 4, 8, -1, true, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::CRITERION_END} +}; -//INSTANTIATE_TEST_CASE_P(RfRegressorTests, RfRegressorTestD, ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_CASE_P(RfRegressorTests, RfRegressorTestF, ::testing::ValuesIn(inputsf2_reg)); +INSTANTIATE_TEST_CASE_P(RfRegressorTests, RfRegressorTestD, ::testing::ValuesIn(inputsd2_reg)); } // end namespace ML From 21d30e926fc4a388f3910a56fbceb293efd38c8c Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Mon, 27 May 2019 09:14:14 -0700 Subject: [PATCH 24/51] Changelog update. 
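The RfRegressorTest assertions above bound the mean_squared_error field reported by cross_validate. For reference, a minimal host-side sketch of the three regression metrics RF_metrics carries (illustrative only; the function name and the median tie-breaking are assumptions, not the library's implementation):

#include <algorithm>
#include <cmath>
#include <vector>

// Illustrative sketch: mean absolute error, mean squared error and median
// absolute error over n predictions, i.e. the three regression fields of
// RF_metrics.
template <typename T>
void regression_metrics(const T* predictions, const T* ref_labels, int n,
                        double& mean_abs_error, double& mean_squared_error,
                        double& median_abs_error) {
    std::vector<double> abs_errors(n);
    mean_abs_error = 0.0;
    mean_squared_error = 0.0;
    for (int i = 0; i < n; i++) {
        double err = static_cast<double>(predictions[i]) - static_cast<double>(ref_labels[i]);
        abs_errors[i] = std::abs(err);
        mean_abs_error += abs_errors[i] / n;
        mean_squared_error += err * err / n;
    }
    std::sort(abs_errors.begin(), abs_errors.end());
    median_abs_error = (n % 2 == 1) ? abs_errors[n / 2]
                                    : 0.5 * (abs_errors[n / 2 - 1] + abs_errors[n / 2]);
}

With the four-row fixtures used in these tests, a forest trained without bootstrapping and with max_features == 1.0f is deterministic and reproduces its training labels, which is why those configurations assert mse == 0.0f.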
--- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef87c13d1c..6d22e7c2cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## New Features - PR #515: Added Random Projection feature - PR #504: Contingency matrix ml-prim +- PR #635: Random Forest & Decision Tree Regression (Single-GPU) ## Improvements From 6d374b38ec1954f74ffe062377b3381ff90101fa Mon Sep 17 00:00:00 2001 From: Vishal Mehta Date: Tue, 28 May 2019 11:29:43 +0200 Subject: [PATCH 25/51] removing unused function parameters and some comments --- .../kernels/evaluate_classifier.cuh | 12 +++++------ .../kernels/evaluate_regressor.cuh | 20 ++++++++----------- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh index e5976f2de3..e8fae6baa8 100644 --- a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh +++ b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh @@ -25,11 +25,11 @@ #include "stats/minmax.h" /* - The output of the function is a histogram array, of size ncols * nbins * n_unique_lables + The output of the function is a histogram array, of size ncols * nbins * n_unique_labels column order is as per colids (bootstrapped random cols) for each col there are nbins histograms */ template -__global__ void all_cols_histograms_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const int n_unique_labels, const T* __restrict__ globalminmax, int* histout) { +__global__ void all_cols_histograms_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const int nbins, const int nrows, const int ncols, const int n_unique_labels, const T* __restrict__ globalminmax, int* histout) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; @@ -50,7 +50,6 @@ __global__ void all_cols_histograms_kernel_class(const T* __restrict__ data, con int mycolid = (int)( i / nrows); int coloffset = mycolid*n_unique_labels*nbins; - // nbins is # batched bins. Use (batched bins + 1) for delta computation. T delta = (minmaxshared[mycolid + ncols] - minmaxshared[mycolid]) / (nbins); T base_quesval = minmaxshared[mycolid] + delta; @@ -74,7 +73,7 @@ __global__ void all_cols_histograms_kernel_class(const T* __restrict__ data, con } template -__global__ void all_cols_histograms_global_quantile_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const int n_unique_labels, int* histout, const T* __restrict__ quantile) { +__global__ void all_cols_histograms_global_quantile_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int n_unique_labels, int* histout, const T* __restrict__ quantile) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; @@ -90,7 +89,6 @@ __global__ void all_cols_histograms_global_quantile_kernel_class(const T* __rest int mycolid = (int)( i / nrows); int coloffset = mycolid*n_unique_labels*nbins; - // nbins is # batched bins. 
T localdata = data[i]; int label = labels[ rowids[ i % nrows ] ]; for (int j=0; j < nbins; j++) { @@ -230,9 +228,9 @@ void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, c if (split_algo == ML::SPLIT_ALGO::HIST) { shmemsize += col_minmax_bytes; - all_cols_histograms_kernel_class<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, n_unique_labels, d_globalminmax, d_histout); + all_cols_histograms_kernel_class<<stream>>>(tempmem->temp_data->data(), labels, rowids, nbins, nrows, ncols, n_unique_labels, d_globalminmax, d_histout); } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { - all_cols_histograms_global_quantile_kernel_class<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, n_unique_labels, d_histout, tempmem->d_quantile->data()); + all_cols_histograms_global_quantile_kernel_class<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, n_unique_labels, d_histout, tempmem->d_quantile->data()); } CUDA_CHECK(cudaGetLastError()); diff --git a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh index 06cddbafde..15639de4af 100644 --- a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh +++ b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh @@ -25,7 +25,7 @@ #include "stats/minmax.h" template -__global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const T* __restrict__ globalminmax, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T pred_parent) { +__global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int nbins, const int nrows, const int ncols, const T* __restrict__ globalminmax, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T pred_parent) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; @@ -51,7 +51,6 @@ __global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const int mycolid = (int)( i / nrows); int coloffset = mycolid*nbins; - // nbins is # batched bins. Use (batched bins + 1) for delta computation. 
T delta = (minmaxshared[mycolid + ncols] - minmaxshared[mycolid]) / (nbins); T base_quesval = minmaxshared[mycolid] + delta; @@ -86,7 +85,7 @@ __global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const column order is as per colids (bootstrapped random cols) for each col there are nbins histograms */ template -__global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, const T* __restrict__ globalminmax, T* predout, int* countout) { +__global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int nbins, const int nrows, const int ncols, const T* __restrict__ globalminmax, T* predout, int* countout) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; @@ -109,7 +108,6 @@ __global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data int mycolid = (int)( i / nrows); int coloffset = mycolid*nbins; - // nbins is # batched bins. Use (batched bins + 1) for delta computation. T delta = (minmaxshared[mycolid + ncols] - minmaxshared[mycolid]) / (nbins); T base_quesval = minmaxshared[mycolid] + delta; @@ -135,7 +133,7 @@ __global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data } template -__global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, T* predout, int* countout, const T* __restrict__ quantile) { +__global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, T* predout, int* countout, const T* __restrict__ quantile) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; @@ -153,7 +151,6 @@ __global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restri int mycolid = (int)( i / nrows); int coloffset = mycolid*nbins; - // nbins is # batched bins. 
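// For context, a sketch of the two-pass MSE evaluation on the regression path
// (inferred from the launch sequence in best_split_all_cols_regressor; the
// formulas below are illustrative, not lines from this patch). This histogram
// pass only accumulates, per (sampled column, bin), a running label sum in
// predout and a sample count in countout for the rows answering the bin's
// question; compute_mse_minmax_kernel_reg then derives the child means from
// those sums, roughly as
//   pred_left  = predout[bin] / countout[bin];
//   pred_right = (nrows * pred_parent - predout[bin]) / (nrows - countout[bin]);
// which is why the parent mean (pred_parent) is an argument of the MSE kernel
// but not of this pass.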
T localdata = data[i]; T label = labels[ rowids[ i % nrows ] ]; for (int j=0; j < nbins; j++) { @@ -176,7 +173,7 @@ __global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restri } template -__global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int rowoffset, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T* __restrict__ quantile, const T pred_parent) { +__global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T* __restrict__ quantile, const T pred_parent) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; @@ -197,7 +194,6 @@ __global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ dat int mycolid = (int)( i / nrows); int coloffset = mycolid*nbins; - // nbins is # batched bins. T localdata = data[i]; T label = labels[ rowids[ i % nrows ] ]; for (int j=0; j < nbins; j++) { @@ -333,11 +329,11 @@ void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, co if (split_algo == ML::SPLIT_ALGO::HIST) { shmemsize += col_minmax_bytes; - all_cols_histograms_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_globalminmax, d_predout, d_histout); - compute_mse_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_globalminmax, d_mseout, d_predout, d_histout, split_info[0].predict); + all_cols_histograms_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, nbins, nrows, ncols, d_globalminmax, d_predout, d_histout); + compute_mse_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, nbins, nrows, ncols, d_globalminmax, d_mseout, d_predout, d_histout, split_info[0].predict); } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { - all_cols_histograms_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_predout, d_histout, tempmem->d_quantile->data()); - compute_mse_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, rowoffset, d_mseout, d_predout, d_histout, tempmem->d_quantile->data(), split_info[0].predict); + all_cols_histograms_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, d_predout, d_histout, tempmem->d_quantile->data()); + compute_mse_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, d_mseout, d_predout, d_histout, tempmem->d_quantile->data(), split_info[0].predict); } CUDA_CHECK(cudaGetLastError()); From 507601d8079472aed8db66d1eb68ea78aa8b6963 Mon Sep 17 00:00:00 2001 From: Vishal Mehta Date: Wed, 29 May 2019 11:09:58 +0200 Subject: [PATCH 26/51] adding label sampler in tree build --- .../decisiontree/kernels/col_condenser.cuh | 2 +- .../kernels/evaluate_classifier.cuh | 14 +++++---- .../kernels/evaluate_regressor.cuh | 29 ++++++++++--------- 3 files changed, 25 insertions(+), 20 
deletions(-) diff --git a/cpp/src/decisiontree/kernels/col_condenser.cuh b/cpp/src/decisiontree/kernels/col_condenser.cuh index d9ae01bb94..a89b1abd9e 100644 --- a/cpp/src/decisiontree/kernels/col_condenser.cuh +++ b/cpp/src/decisiontree/kernels/col_condenser.cuh @@ -29,7 +29,7 @@ __global__ void get_sampled_column_kernel(const T* __restrict__ column, T *outco } template -void get_sampled_labels(const T *labels, T *outlabels, unsigned int* rowids, const int n_sampled_rows, const cudaStream_t stream) { +void get_sampled_labels(const T *labels, T *outlabels, const unsigned int* rowids, const int n_sampled_rows, const cudaStream_t stream) { int threads = 128; get_sampled_column_kernel<<>>(labels, outlabels, rowids, n_sampled_rows); CUDA_CHECK(cudaGetLastError()); diff --git a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh index e8fae6baa8..d058a66c49 100644 --- a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh +++ b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh @@ -29,7 +29,7 @@ column order is as per colids (bootstrapped random cols) for each col there are nbins histograms */ template -__global__ void all_cols_histograms_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const int nbins, const int nrows, const int ncols, const int n_unique_labels, const T* __restrict__ globalminmax, int* histout) { +__global__ void all_cols_histograms_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const int nbins, const int nrows, const int ncols, const int n_unique_labels, const T* __restrict__ globalminmax, int* histout) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; @@ -54,7 +54,7 @@ __global__ void all_cols_histograms_kernel_class(const T* __restrict__ data, con T base_quesval = minmaxshared[mycolid] + delta; T localdata = data[i]; - int label = labels[ rowids[ i % nrows ] ]; + int label = labels[ i % nrows ]; for (int j=0; j < nbins; j++) { T quesval = base_quesval + j * delta; @@ -73,7 +73,7 @@ __global__ void all_cols_histograms_kernel_class(const T* __restrict__ data, con } template -__global__ void all_cols_histograms_global_quantile_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int n_unique_labels, int* histout, const T* __restrict__ quantile) { +__global__ void all_cols_histograms_global_quantile_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int n_unique_labels, int* histout, const T* __restrict__ quantile) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; @@ -90,7 +90,7 @@ __global__ void all_cols_histograms_global_quantile_kernel_class(const T* __rest int coloffset = mycolid*n_unique_labels*nbins; T localdata = data[i]; - int label = labels[ rowids[ i % nrows ] ]; + int label = labels[ i % nrows ]; for (int j=0; j < nbins; j++) { int quantile_index = colids[mycolid] * nbins + j; T quesval = quantile[quantile_index]; @@ -223,14 +223,16 @@ void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, c allcolsampler_kernel<<stream>>>(data, rowids, d_colids, nrows, ncols, rowoffset, tempmem->temp_data->data()); } CUDA_CHECK(cudaGetLastError()); + L *labelptr = 
tempmem->sampledlabels->data();
+	get_sampled_labels(labels, labelptr, rowids, nrows, tempmem->stream);
 	shmemsize = n_hist_bytes;

 	if (split_algo == ML::SPLIT_ALGO::HIST) {
 		shmemsize += col_minmax_bytes;
-		all_cols_histograms_kernel_class<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labels, rowids, nbins, nrows, ncols, n_unique_labels, d_globalminmax, d_histout);
+		all_cols_histograms_kernel_class<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, n_unique_labels, d_globalminmax, d_histout);
 	} else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) {
-		all_cols_histograms_global_quantile_kernel_class<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, n_unique_labels, d_histout, tempmem->d_quantile->data());
+		all_cols_histograms_global_quantile_kernel_class<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols, n_unique_labels, d_histout, tempmem->d_quantile->data());
 	}
 	CUDA_CHECK(cudaGetLastError());
diff --git a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh
index 15639de4af..9963e660ea 100644
--- a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh
+++ b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh
@@ -25,7 +25,7 @@
 #include "stats/minmax.h"

 template<typename T>
-__global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int nbins, const int nrows, const int ncols, const T* __restrict__ globalminmax, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T pred_parent) {
+__global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const int nbins, const int nrows, const int ncols, const T* __restrict__ globalminmax, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T pred_parent) {

 	int tid = threadIdx.x + blockIdx.x * blockDim.x;
 	extern __shared__ char shmem[];
@@ -55,7 +55,7 @@ __global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const
 		T base_quesval = minmaxshared[mycolid] + delta;
 		T localdata = data[i];
-		T label = labels[ rowids[ i % nrows ] ];
+		T label = labels[ i % nrows];

 		for (int j=0; j < nbins; j++) {
 			T quesval = base_quesval + j * delta;
@@ -85,7 +85,7 @@ __global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const
    column order is as per colids (bootstrapped random cols) for each col there are nbins histograms */
 template<typename T>
-__global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const int nbins, const int nrows, const int ncols, const T* __restrict__ globalminmax, T* predout, int* countout) {
+__global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const int nbins, const int nrows, const int ncols, const T* __restrict__ globalminmax, T* predout, int* countout) {

 	int tid = threadIdx.x + blockIdx.x * blockDim.x;
 	extern __shared__ char shmem[];
@@ -112,7 +112,7 @@ __global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data
 		T base_quesval = minmaxshared[mycolid] + delta;
 		T localdata = data[i];
-		T label = labels[ rowids[ i % nrows ] ];
+		T label = labels[ i % nrows ];

 		for (int j=0; j < nbins; j++) {
 			T quesval = base_quesval + j * delta;
@@ -133,7 +133,7 @@ __global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data
 }

 template<typename T>
-__global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, T* predout, int* countout, const T* __restrict__ quantile) {
+__global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, T* predout, int* countout, const T* __restrict__ quantile) {

 	int tid = threadIdx.x + blockIdx.x * blockDim.x;
 	extern __shared__ char shmem[];
@@ -152,7 +152,7 @@ __global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restri
 		int coloffset = mycolid*nbins;
 		T localdata = data[i];
-		T label = labels[ rowids[ i % nrows ] ];
+		T label = labels[ i % nrows ];

 		for (int j=0; j < nbins; j++) {
 			int quantile_index = colids[mycolid] * nbins + j;
 			T quesval = quantile[quantile_index];
@@ -173,7 +173,7 @@ __global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restri
 }

 template<typename T>
-__global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T* __restrict__ quantile, const T pred_parent) {
+__global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T* __restrict__ quantile, const T pred_parent) {

 	int tid = threadIdx.x + blockIdx.x * blockDim.x;
 	extern __shared__ char shmem[];
@@ -195,7 +195,7 @@ __global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ dat
 		int coloffset = mycolid*nbins;
 		T localdata = data[i];
-		T label = labels[ rowids[ i % nrows ] ];
+		T label = labels[ i % nrows ];

 		for (int j=0; j < nbins; j++) {
 			int quantile_index = colids[mycolid] * nbins + j;
 			T quesval = quantile[quantile_index];
@@ -326,14 +326,17 @@ void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, co
 	CUDA_CHECK(cudaGetLastError());

 	shmemsize = n_pred_bytes + n_count_bytes;
-
+
+	T *labelptr = tempmem->sampledlabels->data();
+	get_sampled_labels(labels, labelptr, rowids, nrows, tempmem->stream);
+
 	if (split_algo == ML::SPLIT_ALGO::HIST) {
 		shmemsize += col_minmax_bytes;
-		all_cols_histograms_minmax_kernel_reg<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labels, rowids, nbins, nrows, ncols, d_globalminmax, d_predout, d_histout);
-		compute_mse_minmax_kernel_reg<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labels, rowids, nbins, nrows, ncols, d_globalminmax, d_mseout, d_predout, d_histout, split_info[0].predict);
+		all_cols_histograms_minmax_kernel_reg<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, d_globalminmax, d_predout, d_histout);
+		compute_mse_minmax_kernel_reg<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, d_globalminmax, d_mseout, d_predout, d_histout, split_info[0].predict);
 	} else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) {
-		all_cols_histograms_global_quantile_kernel_reg<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, d_predout, d_histout, tempmem->d_quantile->data());
-		compute_mse_global_quantile_kernel_reg<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labels, rowids, d_colids, nbins, nrows, ncols, d_mseout, d_predout, d_histout, tempmem->d_quantile->data(), split_info[0].predict);
+		all_cols_histograms_global_quantile_kernel_reg<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols, d_predout, d_histout, tempmem->d_quantile->data());
+		compute_mse_global_quantile_kernel_reg<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols, d_mseout, d_predout, d_histout, tempmem->d_quantile->data(), split_info[0].predict);
 	}

 	CUDA_CHECK(cudaGetLastError());

From 76a5d3bde2dab3d99a1a47a309da459b0ed47b03 Mon Sep 17 00:00:00 2001
From: Vishal Mehta
Date: Wed, 29 May 2019 17:56:22 +0200
Subject: [PATCH 27/51] name change for gini/mse metric files

---
 cpp/src/decisiontree/decisiontree.cu                      | 2 +-
 cpp/src/decisiontree/decisiontree.h                       | 2 +-
 cpp/src/decisiontree/kernels/evaluate_classifier.cuh      | 2 +-
 cpp/src/decisiontree/kernels/evaluate_regressor.cuh       | 2 +-
 cpp/src/decisiontree/kernels/{gini.cuh => metric.cuh}     | 2 +-
 cpp/src/decisiontree/kernels/{gini_def.h => metric_def.h} | 0
 cpp/src/decisiontree/kernels/split_labels.cuh             | 2 +-
 7 files changed, 6 insertions(+), 6 deletions(-)
 rename cpp/src/decisiontree/kernels/{gini.cuh => metric.cuh} (99%)
 rename cpp/src/decisiontree/kernels/{gini_def.h => metric_def.h} (100%)

diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu
index dd32332a54..b1617d4f97 100644
--- a/cpp/src/decisiontree/decisiontree.cu
+++ b/cpp/src/decisiontree/decisiontree.cu
@@ -16,7 +16,7 @@
 #include
 #include "decisiontree.h"
-#include "kernels/gini.cuh"
+#include "kernels/metric.cuh"
 #include "kernels/split_labels.cuh"
 #include "kernels/col_condenser.cuh"
 #include "kernels/evaluate_classifier.cuh"
diff --git a/cpp/src/decisiontree/decisiontree.h b/cpp/src/decisiontree/decisiontree.h
index 68a16da420..fa3bf90973 100644
--- a/cpp/src/decisiontree/decisiontree.h
+++ b/cpp/src/decisiontree/decisiontree.h
@@ -16,7 +16,7 @@
 #pragma once
 #include "algo_helper.h"
-#include "kernels/gini_def.h"
+#include "kernels/metric_def.h"
 #include "memory.cuh"
 #include
 #include
diff --git a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
index d058a66c49..9ed4c38444 100644
--- a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
+++ b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
@@ -17,7 +17,7 @@
 #pragma once
 #include
 #include
-#include "gini.cuh"
+#include "metric.cuh"
 #include "../memory.cuh"
 #include "col_condenser.cuh"
 #include
diff --git a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh
index 9963e660ea..7a21ddb878 100644
--- a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh
+++ b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh
@@ -17,7 +17,7 @@
 #pragma once
 #include
 #include
-#include "gini.cuh"
+#include "metric.cuh"
 #include "../memory.cuh"
 #include "col_condenser.cuh"
 #include
diff --git a/cpp/src/decisiontree/kernels/gini.cuh b/cpp/src/decisiontree/kernels/metric.cuh
similarity index 99%
rename from cpp/src/decisiontree/kernels/gini.cuh
rename to cpp/src/decisiontree/kernels/metric.cuh
index 788d8565f9..66711070b6 100644
--- a/cpp/src/decisiontree/kernels/gini.cuh
+++ b/cpp/src/decisiontree/kernels/metric.cuh
@@ -19,7 +19,7 @@
 #include "cub/cub.cuh"
 #include "../memory.cuh"
 #include
-#include "gini_def.h"
+#include "metric_def.h"
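The point of the label-gathering change in the evaluate kernels above: each kernel previously chased `labels[rowids[i % nrows]]` through two dependent global loads in its innermost bin loop; gathering the sampled labels once into `tempmem->sampledlabels` turns that into a single direct load. A minimal sketch of such a gather, assuming the `CUDA_CHECK` macro used throughout this series (the real helper is `get_sampled_labels` in the decision tree kernels; the kernel name and launch configuration here are illustrative):

template <typename L>
__global__ void gather_labels_kernel(const L* __restrict__ labels, L* sampled_labels, const unsigned int* __restrict__ rowids, const int n_sampled_rows) {
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	if (tid < n_sampled_rows) {
		// One indirect load here replaces an indirect load per bin in every evaluate kernel.
		sampled_labels[tid] = labels[rowids[tid]];
	}
}

template <typename L>
void gather_labels(const L* labels, L* sampled_labels, const unsigned int* rowids, const int n_sampled_rows, cudaStream_t stream) {
	const int threads = 128;
	const int blocks = (n_sampled_rows + threads - 1) / threads;
	gather_labels_kernel<L><<<blocks, threads, 0, stream>>>(labels, sampled_labels, rowids, n_sampled_rows);
	CUDA_CHECK(cudaGetLastError());
}

The gather is paid once per node evaluation, while the indirection it removes sat in the hottest histogram loops.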
#include "cuda_utils.h" template diff --git a/cpp/src/decisiontree/kernels/gini_def.h b/cpp/src/decisiontree/kernels/metric_def.h similarity index 100% rename from cpp/src/decisiontree/kernels/gini_def.h rename to cpp/src/decisiontree/kernels/metric_def.h diff --git a/cpp/src/decisiontree/kernels/split_labels.cuh b/cpp/src/decisiontree/kernels/split_labels.cuh index 2982899225..2cb1718442 100644 --- a/cpp/src/decisiontree/kernels/split_labels.cuh +++ b/cpp/src/decisiontree/kernels/split_labels.cuh @@ -18,7 +18,7 @@ #include #include "cub/cub.cuh" #include -#include "gini.cuh" +#include "metric.cuh" #include "../algo_helper.h" template From c5abd2614b6bbe4e674d20017efae320e5844fc9 Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Thu, 30 May 2019 00:50:00 -0700 Subject: [PATCH 28/51] Renamed dt class; sorted rf rowIDs; del MemGetInfo - Renamed dt base class to DecisionTreeBase - Commented out cudaMemGetInfo call until an appropriate MemGetInfo API is provided. - Sorted row IDs in randomforest.cu to improve access patterns. --- cpp/src/decisiontree/decisiontree.cu | 22 +++++------ cpp/src/decisiontree/decisiontree.h | 8 ++-- cpp/src/decisiontree/kernels/quantile.cuh | 3 ++ cpp/src/randomforest/randomforest.cu | 48 ++++++++++++++++++----- cpp/src/randomforest/randomforest.h | 2 +- 5 files changed, 58 insertions(+), 25 deletions(-) diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu index b1617d4f97..5289f9519a 100644 --- a/cpp/src/decisiontree/decisiontree.cu +++ b/cpp/src/decisiontree/decisiontree.cu @@ -95,7 +95,7 @@ void DecisionTreeParams::print() const { * @tparam L: data type for labels (int type for classification, T type for regression). */ template -void dt::print_tree_summary() const { +void DecisionTreeBase::print_tree_summary() const { std::cout << " Decision Tree depth --> " << depth_counter << " and n_leaves --> " << leaf_counter << std::endl; std::cout << " Total temporary memory usage--> "<< ((double)total_temp_mem / (1024*1024)) << " MB" << std::endl; std::cout << " Tree growing time --> " << construct_time << " seconds" << std::endl; @@ -108,14 +108,14 @@ void dt::print_tree_summary() const { * @tparam L: data type for labels (int type for classification, T type for regression). */ template -void dt::print() const { +void DecisionTreeBase::print() const { print_tree_summary(); print_node("", this->root, false); } template -void dt::print_node(const std::string& prefix, const TreeNode* const node, bool isLeft) const { +void DecisionTreeBase::print_node(const std::string& prefix, const TreeNode* const node, bool isLeft) const { if (node != nullptr) { std::cout << prefix; @@ -132,7 +132,7 @@ void dt::print_node(const std::string& prefix, const TreeNode* const } template -void dt::split_branch(T *data, MetricQuestion & ques, const int n_sampled_rows, int& nrowsleft, +void DecisionTreeBase::split_branch(T *data, MetricQuestion & ques, const int n_sampled_rows, int& nrowsleft, int& nrowsright, unsigned int* rowids) { T *temp_data = this->tempmem[0]->temp_data->data(); @@ -153,7 +153,7 @@ void dt::split_branch(T *data, MetricQuestion & ques, const int n_sampl * @param[in] verbose: flag for debugging purposes. 
*/ template -void dt::predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, L* predictions, bool verbose) const { +void DecisionTreeBase::predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, L* predictions, bool verbose) const { ASSERT(root, "Cannot predict w/ empty tree!"); ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows); ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); @@ -161,14 +161,14 @@ void dt::predict(const ML::cumlHandle& handle, const T * rows, const int n } template -void dt::predict_all(const T * rows, const int n_rows, const int n_cols, L * preds, bool verbose) const { +void DecisionTreeBase::predict_all(const T * rows, const int n_rows, const int n_cols, L * preds, bool verbose) const { for (int row_id = 0; row_id < n_rows; row_id++) { preds[row_id] = predict_one(&rows[row_id * n_cols], this->root, verbose); } } template -L dt::predict_one(const T * row, const TreeNode* const node, bool verbose) const { +L DecisionTreeBase::predict_one(const T * row, const TreeNode* const node, bool verbose) const { Question q = node->question; if (node->left && (row[q.column] <= q.value)) { @@ -547,10 +547,10 @@ void DecisionTreeRegressor::find_best_fruit_all(T *data, T *labels, const flo } //Class specializations -template class dt; -template class dt; -template class dt; -template class dt; +template class DecisionTreeBase; +template class DecisionTreeBase; +template class DecisionTreeBase; +template class DecisionTreeBase; template class DecisionTreeClassifier; template class DecisionTreeClassifier; diff --git a/cpp/src/decisiontree/decisiontree.h b/cpp/src/decisiontree/decisiontree.h index fa3bf90973..29f7d3fdc9 100644 --- a/cpp/src/decisiontree/decisiontree.h +++ b/cpp/src/decisiontree/decisiontree.h @@ -96,7 +96,7 @@ struct DecisionTreeParams { }; template -class dt { +class DecisionTreeBase { protected: int split_algo; TreeNode *root = nullptr; @@ -133,10 +133,10 @@ class dt { void predict_all(const T * rows, const int n_rows, const int n_cols, L * preds, bool verbose=false) const; L predict_one(const T * row, const TreeNode * const node, bool verbose=false) const; -}; // End dt Class +}; // End DecisionTreeBase Class template -class DecisionTreeClassifier : public dt { +class DecisionTreeClassifier : public DecisionTreeBase { public: // Expects column major T dataset, integer labels // data, labels are both device ptr. @@ -157,7 +157,7 @@ class DecisionTreeClassifier : public dt { }; // End DecisionTreeClassifier Class template -class DecisionTreeRegressor : public dt { +class DecisionTreeRegressor : public DecisionTreeBase { public: void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids, const int n_sampled_rows, DecisionTreeParams tree_params); diff --git a/cpp/src/decisiontree/kernels/quantile.cuh b/cpp/src/decisiontree/kernels/quantile.cuh index 6d8dbda6aa..9ec7cb58a7 100644 --- a/cpp/src/decisiontree/kernels/quantile.cuh +++ b/cpp/src/decisiontree/kernels/quantile.cuh @@ -42,12 +42,15 @@ __global__ void get_all_quantiles(const T* __restrict__ data, T* quantile, const template void preprocess_quantile(const T* data, const unsigned int* rowids, const int n_sampled_rows, const int ncols, const int rowoffset, const int nbins, std::shared_ptr> tempmem) { + /* // Dynamically determine batch_cols (number of columns processed per loop iteration) from the available device memory. 
size_t free_mem, total_mem; CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem)); int max_ncols = free_mem / (2 * n_sampled_rows * sizeof(T)); int batch_cols = (max_ncols > ncols) ? ncols : max_ncols; ASSERT(max_ncols != 0, "Cannot preprocess quantiles due to insufficient device memory."); + */ + int batch_cols = 1; // Processing one column at a time, for now, until an appropriate getMemInfo function is provided for the deviceAllocator interface. int threads = 128; MLCommon::device_buffer *d_offsets; diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu index 26f24e9450..17e86ca733 100644 --- a/cpp/src/randomforest/randomforest.cu +++ b/cpp/src/randomforest/randomforest.cu @@ -167,7 +167,7 @@ int rf::get_ntrees() { */ template void rf::print_rf_summary() { - const DecisionTree::dt * trees = get_trees_ptr(); + const DecisionTree::DecisionTreeBase * trees = get_trees_ptr(); if (!trees) { std::cout << "Empty forest" << std::endl; } else { @@ -189,7 +189,7 @@ void rf::print_rf_summary() { template void rf::print_rf_detailed() { - const DecisionTree::dt * trees = get_trees_ptr(); + const DecisionTree::DecisionTreeBase * trees = get_trees_ptr(); if (!trees) { std::cout << "Empty forest" << std::endl; } else { @@ -259,10 +259,21 @@ void rfClassifier::fit(const cumlHandle& user_handle, T * input, int n_rows, // Select n_sampled_rows (with replacement) numbers from [0, n_rows) per tree. // selected_rows: randomly generated IDs for bootstrapped samples (w/ replacement); a device ptr. MLCommon::device_buffer selected_rows(handle.getDeviceAllocator(), stream, n_sampled_rows); + MLCommon::device_buffer sorted_selected_rows(handle.getDeviceAllocator(), stream, n_sampled_rows); + + // Will sort selected_rows (row IDs), prior to fit, to improve access patterns + MLCommon::device_buffer * rows_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + CUDA_CHECK(cub::DeviceRadixSort::SortKeys(rows_temp_storage, temp_storage_bytes, selected_rows.data(), sorted_selected_rows.data(), + n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); + // Allocate temporary storage + rows_temp_storage = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, temp_storage_bytes); if (this->rf_params.bootstrap) { MLCommon::Random::Rng r(i * 1000); // Ensure the seed for each tree is different and meaningful. r.uniformInt(selected_rows.data(), n_sampled_rows, (unsigned int) 0, (unsigned int) n_rows, stream); + CUDA_CHECK(cub::DeviceRadixSort::SortKeys((void *)rows_temp_storage->data(), temp_storage_bytes, selected_rows.data(), sorted_selected_rows.data(), + n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); } else { // Sampling w/o replacement MLCommon::device_buffer *inkeys = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_rows); MLCommon::device_buffer *outkeys = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_rows); @@ -270,7 +281,8 @@ void rfClassifier::fit(const cumlHandle& user_handle, T * input, int n_rows, int *perms = nullptr; MLCommon::Random::permute(perms, outkeys->data(), inkeys->data(), 1, n_rows, false, stream); // outkeys has more rows than selected_rows; doing the shuffling before the resize to differentiate the per-tree rows sample. 
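The `cub::DeviceRadixSort::SortKeys` calls above follow CUB's two-pass protocol, which is easy to misread in a diff: when the temporary-storage pointer is null, the call performs no sorting and only writes the required scratch size into `temp_storage_bytes`; the second call, with storage allocated, does the work. The same pattern in isolation, assuming `d_keys_in`, `d_keys_out`, `num_items`, and `stream` are already set up (the fit code uses `MLCommon::device_buffer` rather than raw `cudaMalloc`):

void *d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
// Pass 1: null storage pointer, so CUB only computes temp_storage_bytes.
CUDA_CHECK(cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, 0, 8 * sizeof(unsigned int), stream));
CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes));
// Pass 2: identical arguments with real scratch space; sorts ascending on the stream.
CUDA_CHECK(cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, 0, 8 * sizeof(unsigned int), stream));
CUDA_CHECK(cudaFree(d_temp_storage));

Sorting the bootstrapped row IDs costs one extra radix sort per tree in exchange for mostly in-order global-memory accesses during tree construction.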
- MLCommon::copyAsync(selected_rows.data(), outkeys->data(), n_sampled_rows, stream); + CUDA_CHECK(cub::DeviceRadixSort::SortKeys((void *)rows_temp_storage->data(), temp_storage_bytes, outkeys->data(), sorted_selected_rows.data(), + n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); inkeys->release(stream); outkeys->release(stream); delete inkeys; @@ -280,13 +292,16 @@ void rfClassifier::fit(const cumlHandle& user_handle, T * input, int n_rows, /* Build individual tree in the forest. - input is a pointer to orig data that have n_cols features and n_rows rows. - n_sampled_rows: # rows sampled for tree's bootstrap sample. - - selected_rows: points to a list of row #s (w/ n_sampled_rows elements) used to build the bootstrapped sample. + - sorted_selected_rows: points to a list of row #s (w/ n_sampled_rows elements) used to build the bootstrapped sample. Expectation: Each tree node will contain (a) # n_sampled_rows and (b) a pointer to a list of row numbers w.r.t original data. */ - trees[i].fit(user_handle, input, n_cols, n_rows, labels, selected_rows.data(), n_sampled_rows, n_unique_labels, this->rf_params.tree_params); + trees[i].fit(user_handle, input, n_cols, n_rows, labels, sorted_selected_rows.data(), n_sampled_rows, n_unique_labels, this->rf_params.tree_params); //Cleanup + rows_temp_storage->release(stream); selected_rows.release(stream); + sorted_selected_rows.release(stream); + delete rows_temp_storage; } } @@ -431,10 +446,21 @@ void rfRegressor::fit(const cumlHandle& user_handle, T * input, int n_rows, i // Select n_sampled_rows (with replacement) numbers from [0, n_rows) per tree. // selected_rows: randomly generated IDs for bootstrapped samples (w/ replacement); a device ptr. MLCommon::device_buffer selected_rows(handle.getDeviceAllocator(), stream, n_sampled_rows); - + MLCommon::device_buffer sorted_selected_rows(handle.getDeviceAllocator(), stream, n_sampled_rows); + + // Will sort selected_rows (row IDs), prior to fit, to improve access patterns + MLCommon::device_buffer *rows_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + CUDA_CHECK(cub::DeviceRadixSort::SortKeys(rows_temp_storage, temp_storage_bytes, selected_rows.data(), sorted_selected_rows.data(), + n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); + // Allocate temporary storage + rows_temp_storage = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, temp_storage_bytes); + if (this->rf_params.bootstrap) { MLCommon::Random::Rng r(i * 1000); // Ensure the seed for each tree is different and meaningful. r.uniformInt(selected_rows.data(), n_sampled_rows, (unsigned int) 0, (unsigned int) n_rows, stream); + CUDA_CHECK(cub::DeviceRadixSort::SortKeys((void *)rows_temp_storage->data(), temp_storage_bytes, selected_rows.data(), sorted_selected_rows.data(), + n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); } else { // Sampling w/o replacement MLCommon::device_buffer *inkeys = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_rows); MLCommon::device_buffer *outkeys = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_rows); @@ -442,7 +468,8 @@ void rfRegressor::fit(const cumlHandle& user_handle, T * input, int n_rows, i int *perms = nullptr; MLCommon::Random::permute(perms, outkeys->data(), inkeys->data(), 1, n_rows, false, stream); // outkeys has more rows than selected_rows; doing the shuffling before the resize to differentiate the per-tree rows sample. 
- MLCommon::copyAsync(selected_rows.data(), outkeys->data(), n_sampled_rows, stream); + CUDA_CHECK(cub::DeviceRadixSort::SortKeys((void *)rows_temp_storage->data(), temp_storage_bytes, outkeys->data(), sorted_selected_rows.data(), + n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); inkeys->release(stream); outkeys->release(stream); delete inkeys; @@ -452,13 +479,16 @@ void rfRegressor::fit(const cumlHandle& user_handle, T * input, int n_rows, i /* Build individual tree in the forest. - input is a pointer to orig data that have n_cols features and n_rows rows. - n_sampled_rows: # rows sampled for tree's bootstrap sample. - - selected_rows: points to a list of row #s (w/ n_sampled_rows elements) used to build the bootstrapped sample. + - sorted_selected_rows: points to a list of row #s (w/ n_sampled_rows elements) used to build the bootstrapped sample. Expectation: Each tree node will contain (a) # n_sampled_rows and (b) a pointer to a list of row numbers w.r.t original data. */ - trees[i].fit(user_handle, input, n_cols, n_rows, labels, selected_rows.data(), n_sampled_rows, this->rf_params.tree_params); + trees[i].fit(user_handle, input, n_cols, n_rows, labels, sorted_selected_rows.data(), n_sampled_rows, this->rf_params.tree_params); //Cleanup + rows_temp_storage->release(stream); selected_rows.release(stream); + sorted_selected_rows.release(stream); + delete rows_temp_storage; } } diff --git a/cpp/src/randomforest/randomforest.h b/cpp/src/randomforest/randomforest.h index 4b341572ce..18dc85ad96 100644 --- a/cpp/src/randomforest/randomforest.h +++ b/cpp/src/randomforest/randomforest.h @@ -89,7 +89,7 @@ class rf { protected: RF_params rf_params; int rf_type; - virtual const DecisionTree::dt * get_trees_ptr() const = 0; + virtual const DecisionTree::DecisionTreeBase * get_trees_ptr() const = 0; ~rf() = default; public: From 35f1b594ca5bc3fefd4acc23598701022a9b753a Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Thu, 30 May 2019 03:51:15 -0700 Subject: [PATCH 29/51] Moved plant, grow_tree methods to DecisionTreeBase class. 
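The shape of this refactor, reduced to a compilable sketch before the diffs below (names abbreviated and bodies elided to comments; the real signatures appear in decisiontree.h): tree construction becomes a template method on the base class, parameterized on the label type, and only the metric-specific split search stays virtual.

template <class T, class L>
struct DecisionTreeBaseSketch {
	// Label-type-agnostic construction loop: plant()/grow_tree() in the real class.
	void grow() { find_best_fruit_all(); /* then recurse into left/right children */ }
	// Metric-specific split search: GINI/ENTROPY when L = int, MSE/MAE when L = T.
	virtual void find_best_fruit_all() = 0;
	virtual ~DecisionTreeBaseSketch() {}
};

template <class T>
struct ClassifierSketch : DecisionTreeBaseSketch<T, int> {
	void find_best_fruit_all() override { /* histogram labels, score with GINI/ENTROPY */ }
};

template <class T>
struct RegressorSketch : DecisionTreeBaseSketch<T, T> {
	void find_best_fruit_all() override { /* accumulate sums and counts, score with MSE/MAE */ }
};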
--- cpp/src/decisiontree/decisiontree.cu | 357 ++++++++++----------------- cpp/src/decisiontree/decisiontree.h | 20 +- 2 files changed, 137 insertions(+), 240 deletions(-) diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu index 5289f9519a..b054ac59f5 100644 --- a/cpp/src/decisiontree/decisiontree.cu +++ b/cpp/src/decisiontree/decisiontree.cu @@ -110,7 +110,7 @@ void DecisionTreeBase::print_tree_summary() const { template void DecisionTreeBase::print() const { print_tree_summary(); - print_node("", this->root, false); + print_node("", root, false); } @@ -135,11 +135,126 @@ template void DecisionTreeBase::split_branch(T *data, MetricQuestion & ques, const int n_sampled_rows, int& nrowsleft, int& nrowsright, unsigned int* rowids) { - T *temp_data = this->tempmem[0]->temp_data->data(); + T *temp_data = tempmem[0]->temp_data->data(); T *sampledcolumn = &temp_data[n_sampled_rows * ques.bootstrapped_column]; - make_split(sampledcolumn, ques, n_sampled_rows, nrowsleft, nrowsright, rowids, this->split_algo, this->tempmem[0]); + make_split(sampledcolumn, ques, n_sampled_rows, nrowsleft, nrowsright, rowids, split_algo, tempmem[0]); } +template +void DecisionTreeBase::plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, L *labels, unsigned int *rowids, + const int n_sampled_rows, int unique_labels, int maxdepth, int max_leaf_nodes, const float colper, int n_bins, int split_algo_flag, + int cfg_min_rows_per_node, bool cfg_bootstrap_features, CRITERION cfg_split_criterion) { + + split_algo = split_algo_flag; + dinfo.NLocalrows = nrows; + dinfo.NGlobalrows = nrows; + dinfo.Ncols = ncols; + nbins = n_bins; + treedepth = maxdepth; + maxleaves = max_leaf_nodes; + tempmem.resize(MAXSTREAMS); + n_unique_labels = unique_labels; + min_rows_per_node = cfg_min_rows_per_node; + bootstrap_features = cfg_bootstrap_features; + split_criterion = cfg_split_criterion; + + //Bootstrap features + feature_selector.resize(dinfo.Ncols); + if (bootstrap_features) { + srand(n_bins); + for (int i=0; i < dinfo.Ncols; i++) { + feature_selector.push_back( rand() % dinfo.Ncols ); + } + } else { + std::iota(feature_selector.begin(), feature_selector.end(), 0); + } + + std::random_shuffle(feature_selector.begin(), feature_selector.end()); + feature_selector.resize((int) (colper * dinfo.Ncols)); + + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, 0)); + max_shared_mem = prop.sharedMemPerBlock; + + if (split_algo == SPLIT_ALGO::HIST) { + shmem_used += 2 * sizeof(T) * ncols; + } + if (typeid(L) == typeid(int)) { // Classification + shmem_used += nbins * n_unique_labels * sizeof(int) * ncols; + } else { // Regression + shmem_used += nbins * sizeof(T) * ncols * 3; + shmem_used += nbins * sizeof(int) * ncols; + } + ASSERT(shmem_used <= max_shared_mem, "Shared memory per block limit %zd , requested %zd \n", max_shared_mem, shmem_used); + + for (int i = 0; i < MAXSTREAMS; i++) { + tempmem[i] = std::make_shared>(handle, n_sampled_rows, ncols, MAXSTREAMS, unique_labels, n_bins, split_algo); + if (split_algo == SPLIT_ALGO::GLOBAL_QUANTILE) { + preprocess_quantile(data, rowids, n_sampled_rows, ncols, dinfo.NLocalrows, n_bins, tempmem[i]); + } + } + total_temp_mem = tempmem[0]->totalmem; + total_temp_mem *= MAXSTREAMS; + MetricInfo split_info; + MLCommon::TimerCPU timer; + root = grow_tree(data, colper, labels, 0, rowids, n_sampled_rows, split_info); + construct_time = timer.getElapsedSeconds(); + + for (int i = 0; i < MAXSTREAMS; i++) { + tempmem[i].reset(); + } 
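To make the shared-memory guard in the moved plant() concrete: per sampled column, the HIST path reserves a min/max pair, classification adds nbins * n_unique_labels histogram counters, and regression adds three value accumulators plus a count per bin. The same budget as a free function, with one worked example (figures illustrative):

size_t shmem_needed(bool classification, bool hist_split, int ncols, int nbins, int n_unique_labels, size_t value_size) {
	size_t shmem = hist_split ? 2 * value_size * ncols : 0;    // per-column min/max, HIST split only
	if (classification) {
		shmem += nbins * n_unique_labels * sizeof(int) * ncols;  // per-class histograms
	} else {
		shmem += nbins * value_size * ncols * 3;                 // prediction/MSE accumulators
		shmem += nbins * sizeof(int) * ncols;                    // per-bin sample counts
	}
	return shmem;
}
// Example: float regression, HIST split, 20 sampled columns, 8 bins:
//   2*4*20 + 8*4*20*3 + 8*4*20 = 160 + 1920 + 640 = 2720 bytes,
// far below a typical 48 KB sharedMemPerBlock, but growing linearly in ncols and nbins.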
+} + +template +TreeNode* DecisionTreeBase::grow_tree(T *data, const float colper, L *labels, int depth, unsigned int* rowids, + const int n_sampled_rows, MetricInfo prev_split_info) { + + TreeNode *node = new TreeNode(); + MetricQuestion ques; + Question node_ques; + float gain = 0.0; + MetricInfo split_info[3]; // basis, left, right. Populate this + split_info[0] = prev_split_info; + + bool condition = ((depth != 0) && (prev_split_info.best_metric == 0.0f)); // This node is a leaf, no need to search for best split + condition = condition || (n_sampled_rows < min_rows_per_node); // Do not split a node with less than min_rows_per_node samples + + if (treedepth != -1) { + condition = (condition || (depth == treedepth)); + } + + if (maxleaves != -1) { + condition = (condition || (leaf_counter >= maxleaves)); // FIXME not fully respecting maxleaves, but >= constraints it more than == + } + + if (!condition) { + find_best_fruit_all(data, labels, colper, ques, gain, rowids, n_sampled_rows, &split_info[0], depth); //ques and gain are output here + condition = condition || (gain == 0.0f); + } + + if (condition) { + if (typeid(L) == typeid(int)) { // classification + node->prediction = get_class_hist(split_info[0].hist); + } else { // regression (typeid(L) == typeid(T)) + node->prediction = split_info[0].predict; + } + node->split_metric_val = split_info[0].best_metric; + + leaf_counter++; + if (depth > depth_counter) { + depth_counter = depth; + } + } else { + int nrowsleft, nrowsright; + split_branch(data, ques, n_sampled_rows, nrowsleft, nrowsright, rowids); // populates ques.value + node_ques.update(ques); + node->question = node_ques; + node->left = grow_tree(data, colper, labels, depth+1, &rowids[0], nrowsleft, split_info[1]); + node->right = grow_tree(data, colper, labels, depth+1, &rowids[nrowsleft], nrowsright, split_info[2]); + node->split_metric_val = split_info[0].best_metric; + } + return node; +} /** * @brief Predict target feature for input data; n-ary classification or regression for single feature supported. Inference of trees is CPU only for now. 
@@ -163,7 +278,7 @@ void DecisionTreeBase::predict(const ML::cumlHandle& handle, const T * row template void DecisionTreeBase::predict_all(const T * rows, const int n_rows, const int n_cols, L * preds, bool verbose) const { for (int row_id = 0; row_id < n_rows; row_id++) { - preds[row_id] = predict_one(&rows[row_id * n_cols], this->root, verbose); + preds[row_id] = predict_one(&rows[row_id * n_cols], root, verbose); } } @@ -220,120 +335,14 @@ void DecisionTreeClassifier::fit(const ML::cumlHandle& handle, T *data, const if (tree_params.split_criterion == CRITERION::CRITERION_END) { // Set default to GINI tree_params.split_criterion = CRITERION::GINI; } - ASSERT( (tree_params.split_criterion == CRITERION::GINI || tree_params.split_criterion == CRITERION::ENTROPY ), " Decision Tree Classifer split criteria, should be Gini or Entropy\n"); - - return plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, unique_labels, tree_params.max_depth, - tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features); -} - -template -void DecisionTreeClassifier::plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids, const int n_sampled_rows, - int unique_labels, int maxdepth, int max_leaf_nodes, const float colper, int n_bins, int split_algo_flag, int cfg_min_rows_per_node, bool cfg_bootstrap_features) { - - this->split_algo = split_algo_flag; - this->dinfo.NLocalrows = nrows; - this->dinfo.NGlobalrows = nrows; - this->dinfo.Ncols = ncols; - this->nbins = n_bins; - this->treedepth = maxdepth; - this->maxleaves = max_leaf_nodes; - this->tempmem.resize(this->MAXSTREAMS); - this->n_unique_labels = unique_labels; - this->min_rows_per_node = cfg_min_rows_per_node; - this->bootstrap_features = cfg_bootstrap_features; - this->split_criterion = CRITERION::GINI; - - //Bootstrap features - this->feature_selector.resize(this->dinfo.Ncols); - if (this->bootstrap_features) { - srand(n_bins); - for (int i=0; i < this->dinfo.Ncols; i++) { - this->feature_selector.push_back( rand() % this->dinfo.Ncols ); - } - } else { - std::iota(this->feature_selector.begin(), this->feature_selector.end(), 0); - } - - std::random_shuffle(this->feature_selector.begin(), this->feature_selector.end()); - this->feature_selector.resize((int) (colper * this->dinfo.Ncols)); - - cudaDeviceProp prop; - CUDA_CHECK(cudaGetDeviceProperties(&prop, 0)); - this->max_shared_mem = prop.sharedMemPerBlock; - - if (this->split_algo == SPLIT_ALGO::HIST) { - this->shmem_used += 2 * sizeof(T) * ncols; - this->shmem_used += this->nbins * this->n_unique_labels * sizeof(int) * ncols; - } else { - this->shmem_used += this->nbins * this->n_unique_labels * sizeof(int) * ncols; - } - ASSERT(this->shmem_used <= this->max_shared_mem, "Shared memory per block limit %zd , requested %zd \n", this->max_shared_mem, this->shmem_used); - - for (int i = 0; i < this->MAXSTREAMS; i++) { - this->tempmem[i] = std::make_shared>(handle, n_sampled_rows, ncols, this->MAXSTREAMS, unique_labels, n_bins, this->split_algo); - if (this->split_algo == SPLIT_ALGO::GLOBAL_QUANTILE) { - preprocess_quantile(data, rowids, n_sampled_rows, ncols, this->dinfo.NLocalrows, n_bins, this->tempmem[i]); - } - } - this->total_temp_mem = this->tempmem[0]->totalmem; - this->total_temp_mem *= this->MAXSTREAMS; - MetricInfo split_info; - MLCommon::TimerCPU timer; - this->root = grow_tree(data, colper, labels, 0, rowids, n_sampled_rows, 
split_info); - this->construct_time = timer.getElapsedSeconds(); - - for (int i = 0; i < this->MAXSTREAMS; i++) { - this->tempmem[i].reset(); - } - - return; -} - -template -TreeNode* DecisionTreeClassifier::grow_tree(T *data, const float colper, int *labels, int depth, unsigned int* rowids, - const int n_sampled_rows, MetricInfo prev_split_info) { - - TreeNode *node = new TreeNode(); - MetricQuestion ques; - Question node_ques; - float gain = 0.0; - MetricInfo split_info[3]; // basis, left, right. Populate this - split_info[0] = prev_split_info; - - bool condition = ((depth != 0) && (prev_split_info.best_metric == 0.0f)); // This node is a leaf, no need to search for best split - condition = condition || (n_sampled_rows < this->min_rows_per_node); // Do not split a node with less than min_rows_per_node samples - - if (this->treedepth != -1) - condition = (condition || (depth == this->treedepth)); - - if (this->maxleaves != -1) - condition = (condition || (this->leaf_counter >= this->maxleaves)); // FIXME not fully respecting maxleaves, but >= constraints it more than == - - if (!condition) { - find_best_fruit_all(data, labels, colper, ques, gain, rowids, n_sampled_rows, &split_info[0], depth); //ques and gain are output here - condition = condition || (gain == 0.0f); - } + ASSERT((tree_params.split_criterion == CRITERION::GINI || tree_params.split_criterion == CRITERION::ENTROPY ), + " Decision Tree Classifer split criteria, should be GINI or ENTROPY\n"); - if (condition) { - node->prediction = get_class_hist(split_info[0].hist); - node->split_metric_val = split_info[0].best_metric; - - this->leaf_counter++; - if (depth > this->depth_counter) - this->depth_counter = depth; - } else { - int nrowsleft, nrowsright; - this->split_branch(data, ques, n_sampled_rows, nrowsleft, nrowsright, rowids); // populates ques.value - node_ques.update(ques); - node->question = node_ques; - node->left = grow_tree(data, colper, labels, depth+1, &rowids[0], nrowsleft, split_info[1]); - node->right = grow_tree(data, colper, labels, depth+1, &rowids[nrowsleft], nrowsright, split_info[2]); - node->split_metric_val = split_info[0].best_metric; - } - return node; + this->plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, unique_labels, tree_params.max_depth, + tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, + tree_params.min_rows_per_node, tree_params.bootstrap_features, tree_params.split_criterion); } - template void DecisionTreeClassifier::find_best_fruit_all(T *data, int *labels, const float colper, MetricQuestion & ques, float& gain, unsigned int* rowids, const int n_sampled_rows, MetricInfo split_info[3], int depth) { @@ -396,118 +405,11 @@ void DecisionTreeRegressor::fit(const ML::cumlHandle& handle, T *data, const if (tree_params.split_criterion == CRITERION::CRITERION_END) { // Set default to MSE tree_params.split_criterion = CRITERION::MSE; } - ASSERT( (tree_params.split_criterion == CRITERION::MSE || tree_params.split_criterion == CRITERION::MAE ) , "Decision Tree Regressor split creteria should be MSE or MAE\n"); - plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, 1, tree_params.max_depth, - tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, tree_params.min_rows_per_node, tree_params.bootstrap_features, tree_params.split_criterion); -} - -template -void DecisionTreeRegressor::plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, T *labels, 
unsigned int *rowids, const int n_sampled_rows, - int unique_labels, int maxdepth, int max_leaf_nodes, const float colper, int n_bins, int split_algo_flag, int cfg_min_rows_per_node, bool cfg_bootstrap_features, CRITERION cfg_split_criterion) { - - this->split_algo = split_algo_flag; - this->dinfo.NLocalrows = nrows; - this->dinfo.NGlobalrows = nrows; - this->dinfo.Ncols = ncols; - this->nbins = n_bins; - this->treedepth = maxdepth; - this->maxleaves = max_leaf_nodes; - this->tempmem.resize(this->MAXSTREAMS); - this->n_unique_labels = unique_labels; - this->min_rows_per_node = cfg_min_rows_per_node; - this->bootstrap_features = cfg_bootstrap_features; - this->split_criterion = cfg_split_criterion; - - //Bootstrap features - this->feature_selector.resize(this->dinfo.Ncols); - if (this->bootstrap_features) { - srand(n_bins); - for (int i=0; i < this->dinfo.Ncols; i++) { - this->feature_selector.push_back( rand() % this->dinfo.Ncols ); - } - } else { - std::iota(this->feature_selector.begin(), this->feature_selector.end(), 0); - } - - std::random_shuffle(this->feature_selector.begin(), this->feature_selector.end()); - this->feature_selector.resize((int) (colper * this->dinfo.Ncols)); - - cudaDeviceProp prop; - CUDA_CHECK(cudaGetDeviceProperties(&prop, 0)); - this->max_shared_mem = prop.sharedMemPerBlock; - - if (this->split_algo == SPLIT_ALGO::HIST) { - this->shmem_used += 2 * sizeof(T) * ncols; - this->shmem_used += this->nbins * sizeof(T) * ncols * 3; - this->shmem_used += this->nbins * sizeof(int) * ncols; - } else { - this->shmem_used += this->nbins * sizeof(T) * ncols * 3; - this->shmem_used += this->nbins * sizeof(int) * ncols; - } - ASSERT(this->shmem_used <= this->max_shared_mem, "Shared memory per block limit %zd , requested %zd \n", this->max_shared_mem, this->shmem_used); - - for (int i = 0; i < this->MAXSTREAMS; i++) { - this->tempmem[i] = std::make_shared>(handle, n_sampled_rows, ncols, this->MAXSTREAMS, unique_labels, n_bins, this->split_algo); - if (this->split_algo == SPLIT_ALGO::GLOBAL_QUANTILE) { - preprocess_quantile(data, rowids, n_sampled_rows, ncols, this->dinfo.NLocalrows, n_bins, this->tempmem[i]); - } - } - this->total_temp_mem = this->tempmem[0]->totalmem; - this->total_temp_mem *= this->MAXSTREAMS; - MetricInfo split_info; - MLCommon::TimerCPU timer; - this->root = grow_tree(data, colper, labels, 0, rowids, n_sampled_rows, split_info); - this->construct_time = timer.getElapsedSeconds(); - - for (int i = 0; i < this->MAXSTREAMS; i++) { - this->tempmem[i].reset(); - } - - return; -} - -template -TreeNode* DecisionTreeRegressor::grow_tree(T *data, const float colper, T *labels, int depth, unsigned int* rowids, - const int n_sampled_rows, MetricInfo prev_split_info) { - - TreeNode *node = new TreeNode(); - MetricQuestion ques; - Question node_ques; - float gain = 0.0; - MetricInfo split_info[3]; // basis, left, right. 
Populate this - split_info[0] = prev_split_info; - - bool condition = ((depth != 0) && (prev_split_info.best_metric == 0.0f)); // This node is a leaf, no need to search for best split - condition = condition || (n_sampled_rows < this->min_rows_per_node); // Do not split a node with less than min_rows_per_node samples - - if (this->treedepth != -1) - condition = (condition || (depth == this->treedepth)); - - if (this->maxleaves != -1) - condition = (condition || (this->leaf_counter >= this->maxleaves)); // FIXME not fully respecting maxleaves, but >= constraints it more than == - - if (!condition) { - find_best_fruit_all(data, labels, colper, ques, gain, rowids, n_sampled_rows, split_info, depth); //ques and gain are output here - condition = condition || (gain == 0.0f); - } - - if (condition) { - node->prediction = split_info[0].predict; - node->split_metric_val = split_info[0].best_metric; - - this->leaf_counter++; - if (depth > this->depth_counter) - this->depth_counter = depth; - } else { - int nrowsleft, nrowsright; - this->split_branch(data, ques, n_sampled_rows, nrowsleft, nrowsright, rowids); // populates ques.value - node_ques.update(ques); - node->question = node_ques; - node->left = grow_tree(data, colper, labels, depth+1, &rowids[0], nrowsleft, split_info[1]); - node->right = grow_tree(data, colper, labels, depth+1, &rowids[nrowsleft], nrowsright, split_info[2]); - node->split_metric_val = split_info[0].best_metric; - } - return node; + ASSERT((tree_params.split_criterion == CRITERION::MSE || tree_params.split_criterion == CRITERION::MAE), + "Decision Tree Regressor split criteria should be MSE or MAE\n"); + this->plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, 1, tree_params.max_depth, + tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, + tree_params.min_rows_per_node, tree_params.bootstrap_features, tree_params.split_criterion); } template @@ -516,7 +418,6 @@ void DecisionTreeRegressor::find_best_fruit_all(T *data, T *labels, const flo std::vector& colselector = this->feature_selector; - // Optimize ginibefore; no need to compute except for root. 
if (depth == 0) { CUDA_CHECK(cudaHostRegister(colselector.data(), sizeof(unsigned int) * colselector.size(), cudaHostRegisterDefault)); // Copy sampled column IDs to device memory diff --git a/cpp/src/decisiontree/decisiontree.h b/cpp/src/decisiontree/decisiontree.h index 29f7d3fdc9..13419c55c0 100644 --- a/cpp/src/decisiontree/decisiontree.h +++ b/cpp/src/decisiontree/decisiontree.h @@ -121,6 +121,14 @@ class DecisionTreeBase { void print_node(const std::string& prefix, const TreeNode* const node, bool isLeft) const; void split_branch(T *data, MetricQuestion & ques, const int n_sampled_rows, int& nrowsleft, int& nrowsright, unsigned int* rowids); + void plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, L *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels, + int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2, + bool cfg_bootstrap_features=false, CRITERION cfg_split_criterion=CRITERION::CRITERION_END); + + TreeNode * grow_tree(T *data, const float colper, L *labels, int depth, unsigned int* rowids, const int n_sampled_rows, MetricInfo prev_split_info); + virtual void find_best_fruit_all(T *data, L *labels, const float colper, MetricQuestion & ques, float& gain, unsigned int* rowids, + const int n_sampled_rows, MetricInfo split_info[3], int depth) = 0; + public: // Printing utility for high level tree info. void print_tree_summary() const; @@ -145,12 +153,6 @@ class DecisionTreeClassifier : public DecisionTreeBase { const int n_sampled_rows, const int unique_labels, DecisionTreeParams tree_params); private: - // Same as above fit, but planting is better for a tree than fitting. - void plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels, - int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2, bool cfg_bootstrap_features=false); - - TreeNode * grow_tree(T *data, const float colper, int *labels, int depth, unsigned int* rowids, const int n_sampled_rows, MetricInfo prev_split_info); - /* depth is used to distinguish between root and other tree nodes for computations */ void find_best_fruit_all(T *data, int *labels, const float colper, MetricQuestion & ques, float& gain, unsigned int* rowids, const int n_sampled_rows, MetricInfo split_info[3], int depth); @@ -163,12 +165,6 @@ class DecisionTreeRegressor : public DecisionTreeBase { const int n_sampled_rows, DecisionTreeParams tree_params); private: - // Same as above fit, but planting is better for a tree than fitting. 
- void plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels = 1, - int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2, bool cfg_bootstrap_features=false, CRITERION cfg_split_criterion=CRITERION::MSE); - - TreeNode * grow_tree(T *data, const float colper, T *labels, int depth, unsigned int* rowids, const int n_sampled_rows, MetricInfo prev_split_info); - /* depth is used to distinguish between root and other tree nodes for computations */ void find_best_fruit_all(T *data, T *labels, const float colper, MetricQuestion & ques, float& gain, unsigned int* rowids, const int n_sampled_rows, MetricInfo split_info[3], int depth); From f8cb923c8817b70a341f2d8d640513f60516e879 Mon Sep 17 00:00:00 2001 From: Vishal Mehta Date: Wed, 5 Jun 2019 16:43:55 +0200 Subject: [PATCH 30/51] added depth zero helper function --- cpp/src/decisiontree/decisiontree.cu | 38 +++++++++++++++------------- cpp/src/decisiontree/decisiontree.h | 4 +-- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu index b054ac59f5..b312dc75ed 100644 --- a/cpp/src/decisiontree/decisiontree.cu +++ b/cpp/src/decisiontree/decisiontree.cu @@ -255,6 +255,22 @@ TreeNode* DecisionTreeBase::grow_tree(T *data, const float colper, L } return node; } + +template +void DecisionTreeBase::init_depth_zero(const L* labels, std::vector& colselector, const unsigned int* rowids, const int n_sampled_rows, const std::shared_ptr> tempmem) { + + CUDA_CHECK(cudaHostRegister(colselector.data(), sizeof(unsigned int) * colselector.size(), cudaHostRegisterDefault)); + // Copy sampled column IDs to device memory + MLCommon::updateDevice(tempmem->d_colids->data(), colselector.data(), colselector.size(), tempmem->stream); + CUDA_CHECK(cudaStreamSynchronize(tempmem->stream)); + + L *labelptr = tempmem->sampledlabels->data(); + get_sampled_labels(labels, labelptr, rowids, n_sampled_rows, tempmem->stream); + + //Unregister + CUDA_CHECK(cudaHostUnregister(colselector.data())); + +} /** * @brief Predict target feature for input data; n-ary classification or regression for single feature supported. Inference of trees is CPU only for now. @@ -350,22 +366,14 @@ void DecisionTreeClassifier::find_best_fruit_all(T *data, int *labels, const // Optimize ginibefore; no need to compute except for root. 
if (depth == 0) { - CUDA_CHECK(cudaHostRegister(colselector.data(), sizeof(unsigned int) * colselector.size(), cudaHostRegisterDefault)); - // Copy sampled column IDs to device memory - MLCommon::updateDevice(this->tempmem[0]->d_colids->data(), colselector.data(), colselector.size(), this->tempmem[0]->stream); - CUDA_CHECK(cudaStreamSynchronize(this->tempmem[0]->stream)); - + this->init_depth_zero(labels, colselector, rowids, n_sampled_rows, this->tempmem[0]); int *labelptr = this->tempmem[0]->sampledlabels->data(); - get_sampled_labels(labels, labelptr, rowids, n_sampled_rows, this->tempmem[0]->stream); - if (this->split_criterion == CRITERION::GINI) { gini(labelptr, n_sampled_rows, this->tempmem[0], split_info[0], this->n_unique_labels); } else { gini(labelptr, n_sampled_rows, this->tempmem[0], split_info[0], this->n_unique_labels); } - - //Unregister - CUDA_CHECK(cudaHostUnregister(colselector.data())); + } // Do not update bin count for the GLOBAL_QUANTILE split algorithm, as all potential split points were precomputed. @@ -419,20 +427,14 @@ void DecisionTreeRegressor::find_best_fruit_all(T *data, T *labels, const flo std::vector& colselector = this->feature_selector; if (depth == 0) { - CUDA_CHECK(cudaHostRegister(colselector.data(), sizeof(unsigned int) * colselector.size(), cudaHostRegisterDefault)); - // Copy sampled column IDs to device memory - MLCommon::updateDevice(this->tempmem[0]->d_colids->data(), colselector.data(), colselector.size(), this->tempmem[0]->stream); - CUDA_CHECK(cudaStreamSynchronize(this->tempmem[0]->stream)); - + this->init_depth_zero(labels, colselector, rowids, n_sampled_rows, this->tempmem[0]); T *labelptr = this->tempmem[0]->sampledlabels->data(); - get_sampled_labels(labels, labelptr, rowids, n_sampled_rows, this->tempmem[0]->stream); if (this->split_criterion == CRITERION::MSE) { mse(labelptr, n_sampled_rows, this->tempmem[0], split_info[0]); } else { mse(labelptr, n_sampled_rows, this->tempmem[0], split_info[0]); } - //Unregister - CUDA_CHECK(cudaHostUnregister(colselector.data())); + } // Do not update bin count for the GLOBAL_QUANTILE split algorithm, as all potential split points were precomputed. 
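The init_depth_zero() helper factored out above bundles a standard CUDA staging idiom that both find_best_fruit_all() implementations previously spelled out inline: pin the pageable host vector with cudaHostRegister so the upload can proceed as a true asynchronous DMA transfer, synchronize the stream, then unpin. In isolation, assuming d_cols is a device allocation of n_cols unsigned ints and stream is a valid cudaStream_t (MLCommon::updateDevice plays the role of the cudaMemcpyAsync here):

std::vector<unsigned int> h_cols(n_cols);
std::iota(h_cols.begin(), h_cols.end(), 0);

// Pin the vector's storage; copies from unpinned memory fall back to a staged, slower path.
CUDA_CHECK(cudaHostRegister(h_cols.data(), n_cols * sizeof(unsigned int), cudaHostRegisterDefault));
CUDA_CHECK(cudaMemcpyAsync(d_cols, h_cols.data(), n_cols * sizeof(unsigned int), cudaMemcpyHostToDevice, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));  // the copy must finish before unpinning
CUDA_CHECK(cudaHostUnregister(h_cols.data()));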
diff --git a/cpp/src/decisiontree/decisiontree.h b/cpp/src/decisiontree/decisiontree.h index 13419c55c0..9ca622a75a 100644 --- a/cpp/src/decisiontree/decisiontree.h +++ b/cpp/src/decisiontree/decisiontree.h @@ -118,13 +118,13 @@ class DecisionTreeBase { CRITERION split_criterion; std::vector feature_selector; - void print_node(const std::string& prefix, const TreeNode* const node, bool isLeft) const; + void print_node(const std::string& prefix, const TreeNode* const node, bool isLeft) const; void split_branch(T *data, MetricQuestion & ques, const int n_sampled_rows, int& nrowsleft, int& nrowsright, unsigned int* rowids); void plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, L *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels, int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2, bool cfg_bootstrap_features=false, CRITERION cfg_split_criterion=CRITERION::CRITERION_END); - + void init_depth_zero(const L* labels, std::vector& colselector, const unsigned int* rowids, const int n_sampled_rows, const std::shared_ptr> tempmem); TreeNode * grow_tree(T *data, const float colper, L *labels, int depth, unsigned int* rowids, const int n_sampled_rows, MetricInfo prev_split_info); virtual void find_best_fruit_all(T *data, L *labels, const float colper, MetricQuestion & ques, float& gain, unsigned int* rowids, const int n_sampled_rows, MetricInfo split_info[3], int depth) = 0; From 8417d376ee2dc09b8d1dbe829db4c25e59eb6d9d Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Thu, 6 Jun 2019 03:00:42 -0700 Subject: [PATCH 31/51] Copied RF predictions on device. Added metrics too. - Moved RF predictions to device memory. DT preds change will follow. - Implemented accuracy, MSE, etc. computations on the device (under src_prims/score). - Added helper method. --- cpp/src/randomforest/randomforest.cu | 196 ++++++++++++++------------- cpp/src/randomforest/randomforest.h | 2 + cpp/src_prims/score/scores.h | 116 ++++++++++++++++ cpp/test/sg/rf_test.cu | 41 +++--- 4 files changed, 240 insertions(+), 115 deletions(-) diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu index 17e86ca733..dddabea673 100644 --- a/cpp/src/randomforest/randomforest.cu +++ b/cpp/src/randomforest/randomforest.cu @@ -14,8 +14,8 @@ * limitations under the License. */ - #include "randomforest.h" +#include "score/scores.h" namespace ML { @@ -202,6 +202,45 @@ void rf::print_rf_detailed() { } } +/** + * @brief Sample row IDs for tree fitting and bootstrap if requested. + * @tparam T: data type for input data (float or double). + * @tparam L: data type for labels (int type for classification, T type for regression). + * @param[in] handle: cumlHandle + * @param[in] tree_id: unique tree ID + * @param[in] n_rows: total number of data samples. + * @param[in] n_sampled_rows: number of rows used for training + * @param[in, out] selected_rows: already allocated array w/ row IDs + * @param[in, out] sorted_selected_rows: already allocated array. Will contain sorted row IDs. + * @param[in, out] rows_temp_storage: temp. storage used for sorting (previously allocated). + * @param[in] temp_storage_bytes: size in bytes of rows_temp_storage. 
+ */ +template +void rf::prepare_fit_per_tree(const ML::cumlHandle_impl& handle, int tree_id, int n_rows, int n_sampled_rows, unsigned int * selected_rows, + unsigned int * sorted_selected_rows, char * rows_temp_storage, size_t temp_storage_bytes) { + + cudaStream_t stream = handle.getStream(); + + if (rf_params.bootstrap) { + MLCommon::Random::Rng r(tree_id * 1000); // Ensure the seed for each tree is different and meaningful. + r.uniformInt(selected_rows, n_sampled_rows, (unsigned int) 0, (unsigned int) n_rows, stream); + CUDA_CHECK(cub::DeviceRadixSort::SortKeys((void *)rows_temp_storage, temp_storage_bytes, selected_rows, sorted_selected_rows, + n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); + } else { // Sampling w/o replacement + MLCommon::device_buffer *inkeys = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_rows); + MLCommon::device_buffer *outkeys = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_rows); + thrust::sequence(thrust::cuda::par.on(stream), inkeys->data(), inkeys->data() + n_rows); + int *perms = nullptr; + MLCommon::Random::permute(perms, outkeys->data(), inkeys->data(), 1, n_rows, false, stream); + // outkeys has more rows than selected_rows; doing the shuffling before the resize to differentiate the per-tree rows sample. + CUDA_CHECK(cub::DeviceRadixSort::SortKeys((void *)rows_temp_storage, temp_storage_bytes, outkeys->data(), sorted_selected_rows, + n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); + inkeys->release(stream); + outkeys->release(stream); + delete inkeys; + delete outkeys; + } +} /** * @brief Construct rfClassifier object. @@ -250,6 +289,7 @@ void rfClassifier::fit(const cumlHandle& user_handle, T * input, int n_rows, ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); trees = new DecisionTree::DecisionTreeClassifier[this->rf_params.n_trees]; + int n_sampled_rows = this->rf_params.rows_sample * n_rows; const cumlHandle_impl& handle = user_handle.getImpl(); @@ -262,32 +302,15 @@ void rfClassifier::fit(const cumlHandle& user_handle, T * input, int n_rows, MLCommon::device_buffer sorted_selected_rows(handle.getDeviceAllocator(), stream, n_sampled_rows); // Will sort selected_rows (row IDs), prior to fit, to improve access patterns - MLCommon::device_buffer * rows_temp_storage = nullptr; + MLCommon::device_buffer *rows_temp_storage = nullptr; size_t temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceRadixSort::SortKeys(rows_temp_storage, temp_storage_bytes, selected_rows.data(), sorted_selected_rows.data(), n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); // Allocate temporary storage rows_temp_storage = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, temp_storage_bytes); - if (this->rf_params.bootstrap) { - MLCommon::Random::Rng r(i * 1000); // Ensure the seed for each tree is different and meaningful. 
- r.uniformInt(selected_rows.data(), n_sampled_rows, (unsigned int) 0, (unsigned int) n_rows, stream); - CUDA_CHECK(cub::DeviceRadixSort::SortKeys((void *)rows_temp_storage->data(), temp_storage_bytes, selected_rows.data(), sorted_selected_rows.data(), - n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); - } else { // Sampling w/o replacement - MLCommon::device_buffer *inkeys = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_rows); - MLCommon::device_buffer *outkeys = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_rows); - thrust::sequence(thrust::cuda::par.on(stream), inkeys->data(), inkeys->data() + n_rows); - int *perms = nullptr; - MLCommon::Random::permute(perms, outkeys->data(), inkeys->data(), 1, n_rows, false, stream); - // outkeys has more rows than selected_rows; doing the shuffling before the resize to differentiate the per-tree rows sample. - CUDA_CHECK(cub::DeviceRadixSort::SortKeys((void *)rows_temp_storage->data(), temp_storage_bytes, outkeys->data(), sorted_selected_rows.data(), - n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); - inkeys->release(stream); - outkeys->release(stream); - delete inkeys; - delete outkeys; - } + this->prepare_fit_per_tree(handle, i, n_rows, n_sampled_rows, selected_rows.data(), + sorted_selected_rows.data(), rows_temp_storage->data(), temp_storage_bytes); /* Build individual tree in the forest. - input is a pointer to orig data that have n_cols features and n_rows rows. @@ -295,6 +318,7 @@ void rfClassifier::fit(const cumlHandle& user_handle, T * input, int n_rows, - sorted_selected_rows: points to a list of row #s (w/ n_sampled_rows elements) used to build the bootstrapped sample. Expectation: Each tree node will contain (a) # n_sampled_rows and (b) a pointer to a list of row numbers w.r.t original data. */ + trees[i].fit(user_handle, input, n_cols, n_rows, labels, sorted_selected_rows.data(), n_sampled_rows, n_unique_labels, this->rf_params.tree_params); //Cleanup @@ -305,15 +329,14 @@ void rfClassifier::fit(const cumlHandle& user_handle, T * input, int n_rows, } } - /** * @brief Predict target feature for input data; n-ary classification for single feature supported. * @tparam T: data type for input data (float or double). - * @param[in] user_handle: cumlHandle (currently unused; API placeholder) + * @param[in] user_handle: cumlHandle. * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). - * @param[in, out] predictions: n_rows predicted labels. CPU pointer, user allocated. + * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. * @param[in] verbose: flag for debugging purposes. */ template @@ -324,6 +347,10 @@ void rfClassifier::predict(const cumlHandle& user_handle, const T * input, in ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); ASSERT(predictions != nullptr, "Error! 
User has not allocated memory for predictions."); + std::vector h_predictions(n_rows); + const cumlHandle_impl& handle = user_handle.getImpl(); + cudaStream_t stream = user_handle.getStream(); + int row_size = n_cols; for (int row_id = 0; row_id < n_rows; row_id++) { @@ -358,20 +385,23 @@ void rfClassifier::predict(const cumlHandle& user_handle, const T * input, in } } - predictions[row_id] = majority_prediction; + h_predictions[row_id] = majority_prediction; } + + MLCommon::updateDevice(predictions, h_predictions.data(), n_rows, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); } /** * @brief Predict target feature for input data and validate against ref_labels. * @tparam T: data type for input data (float or double). - * @param[in] user_handle: cumlHandle (currently unused; API placeholder) + * @param[in] user_handle: cumlHandle. * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. - * @param[in] ref_labels: label values for cross validation (n_rows elements); CPU pointer. + * @param[in] ref_labels: label values for cross validation (n_rows elements); GPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). - * @param[in, out] predictions: n_rows predicted labels. CPU pointer, user allocated. + * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. * @param[in] verbose: flag for debugging purposes. */ template @@ -379,12 +409,9 @@ RF_metrics rfClassifier::cross_validate(const cumlHandle& user_handle, const predict(user_handle, input, n_rows, n_cols, predictions, verbose); - unsigned long long correctly_predicted = 0ULL; - for (int i = 0; i < n_rows; i++) { - correctly_predicted += (predictions[i] == ref_labels[i]); - } - - float accuracy = correctly_predicted * 1.0f/n_rows; + cudaStream_t stream = user_handle.getImpl().getStream(); + auto d_alloc = user_handle.getDeviceAllocator(); + float accuracy = MLCommon::Score::accuracy_score(predictions, ref_labels, n_rows, d_alloc, stream); RF_metrics stats(accuracy); if (verbose) stats.print(); @@ -437,6 +464,7 @@ void rfRegressor::fit(const cumlHandle& user_handle, T * input, int n_rows, i ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); trees = new DecisionTree::DecisionTreeRegressor[this->rf_params.n_trees]; + int n_sampled_rows = this->rf_params.rows_sample * n_rows; const cumlHandle_impl& handle = user_handle.getImpl(); @@ -456,25 +484,8 @@ void rfRegressor::fit(const cumlHandle& user_handle, T * input, int n_rows, i // Allocate temporary storage rows_temp_storage = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, temp_storage_bytes); - if (this->rf_params.bootstrap) { - MLCommon::Random::Rng r(i * 1000); // Ensure the seed for each tree is different and meaningful. 
- r.uniformInt(selected_rows.data(), n_sampled_rows, (unsigned int) 0, (unsigned int) n_rows, stream); - CUDA_CHECK(cub::DeviceRadixSort::SortKeys((void *)rows_temp_storage->data(), temp_storage_bytes, selected_rows.data(), sorted_selected_rows.data(), - n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); - } else { // Sampling w/o replacement - MLCommon::device_buffer *inkeys = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_rows); - MLCommon::device_buffer *outkeys = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_rows); - thrust::sequence(thrust::cuda::par.on(stream), inkeys->data(), inkeys->data() + n_rows); - int *perms = nullptr; - MLCommon::Random::permute(perms, outkeys->data(), inkeys->data(), 1, n_rows, false, stream); - // outkeys has more rows than selected_rows; doing the shuffling before the resize to differentiate the per-tree rows sample. - CUDA_CHECK(cub::DeviceRadixSort::SortKeys((void *)rows_temp_storage->data(), temp_storage_bytes, outkeys->data(), sorted_selected_rows.data(), - n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); - inkeys->release(stream); - outkeys->release(stream); - delete inkeys; - delete outkeys; - } + this->prepare_fit_per_tree(handle, i, n_rows, n_sampled_rows, selected_rows.data(), + sorted_selected_rows.data(), rows_temp_storage->data(), temp_storage_bytes); /* Build individual tree in the forest. - input is a pointer to orig data that have n_cols features and n_rows rows. @@ -482,6 +493,7 @@ void rfRegressor::fit(const cumlHandle& user_handle, T * input, int n_rows, i - sorted_selected_rows: points to a list of row #s (w/ n_sampled_rows elements) used to build the bootstrapped sample. Expectation: Each tree node will contain (a) # n_sampled_rows and (b) a pointer to a list of row numbers w.r.t original data. */ + trees[i].fit(user_handle, input, n_cols, n_rows, labels, sorted_selected_rows.data(), n_sampled_rows, this->rf_params.tree_params); //Cleanup @@ -492,14 +504,15 @@ void rfRegressor::fit(const cumlHandle& user_handle, T * input, int n_rows, i } } + /** * @brief Predict target feature for input data; regression for single feature supported. * @tparam T: data type for input data (float or double). - * @param[in] user_handle: cumlHandle (currently unused; API placeholder) + * @param[in] user_handle: cumlHandle. * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). - * @param[in, out] predictions: n_rows predicted labels. CPU pointer, user allocated. + * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. * @param[in] verbose: flag for debugging purposes. */ template @@ -510,6 +523,10 @@ void rfRegressor::predict(const cumlHandle& user_handle, const T * input, int ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); ASSERT(predictions != nullptr, "Error! User has not allocated memory for predictions."); + std::vector h_predictions(n_rows); + const cumlHandle_impl& handle = user_handle.getImpl(); + cudaStream_t stream = user_handle.getStream(); + int row_size = n_cols; for (int row_id = 0; row_id < n_rows; row_id++) { @@ -534,19 +551,22 @@ void rfRegressor::predict(const cumlHandle& user_handle, const T * input, int sum_predictions += prediction; } // Random forest's prediction is the arithmetic mean of all its decision tree predictions. 
- predictions[row_id] = sum_predictions / this->rf_params.n_trees; + h_predictions[row_id] = sum_predictions / this->rf_params.n_trees; } + + MLCommon::updateDevice(predictions, h_predictions.data(), n_rows, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); } /** * @brief Predict target feature for input data and validate against ref_labels. * @tparam T: data type for input data (float or double). - * @param[in] user_handle: cumlHandle (currently unused; API placeholder) + * @param[in] user_handle: cumlHandle. * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. - * @param[in] ref_labels: label values for cross validation (n_rows elements); CPU pointer. + * @param[in] ref_labels: label values for cross validation (n_rows elements); GPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). - * @param[in, out] predictions: n_rows predicted labels. CPU pointer, user allocated. + * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. * @param[in] verbose: flag for debugging purposes. */ template @@ -554,29 +574,11 @@ RF_metrics rfRegressor::cross_validate(const cumlHandle& user_handle, const T predict(user_handle, input, n_rows, n_cols, predictions, verbose); - double abs_difference_sum = 0; - double mse_sum = 0; - std::vector abs_diffs; - - for (int i = 0; i < n_rows; i++) { - double abs_diff = abs(predictions[i] - ref_labels[i]); - abs_difference_sum += abs_diff; - mse_sum += pow(predictions[i] - ref_labels[i], 2); - abs_diffs.push_back(abs_diff); - } - - double mean_abs_error = abs_difference_sum / n_rows; - double mean_squared_error = mse_sum / n_rows; - - std::sort(abs_diffs.begin(), abs_diffs.end()); - double median_abs_error = 0; - int middle = n_rows / 2; - if (n_rows % 2 == 1) { - median_abs_error = abs_diffs[middle]; - } else { - median_abs_error = (abs_diffs[middle] + abs_diffs[middle - 1]) / 2; - } + cudaStream_t stream = user_handle.getImpl().getStream(); + auto d_alloc = user_handle.getDeviceAllocator(); + double mean_abs_error, mean_squared_error, median_abs_error; + MLCommon::Score::regression_metrics(predictions, ref_labels, n_rows, d_alloc, stream, mean_abs_error, mean_squared_error, median_abs_error); RF_metrics stats(mean_abs_error, mean_squared_error, median_abs_error); if (verbose) stats.print(); @@ -632,12 +634,12 @@ void fit(const cumlHandle& user_handle, rfClassifier * rf_classifier, do /** * @brief Predict target feature for input data of type float; n-ary classification for single feature supported. - * @param[in] user_handle: cumlHandle (currently unused; API placeholder) + * @param[in] user_handle: cumlHandle. * @param[in] rf_classifier: pointer to the rfClassifier object. The user should have previously called fit to build the random forest. * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). - * @param[in, out] predictions: n_rows predicted labels. CPU pointer, user allocated. + * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. * @param[in] verbose: flag for debugging purposes. 
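cross_validate now delegates to MLCommon::Score::regression_metrics instead of computing the three error metrics inline on the host. For testing, the same definitions (matching scikit-learn's) can be restated on the CPU; this reference version is a sketch and not part of the patch:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // CPU oracle for mean absolute error, mean squared error, and
    // median absolute error over n predictions.
    template <typename T>
    void regression_metrics_ref(const std::vector<T> &pred, const std::vector<T> &ref,
                                double &mae, double &mse, double &medae) {
        const int n = static_cast<int>(pred.size());
        std::vector<double> abs_diffs(n);
        mae = mse = 0.0;
        for (int i = 0; i < n; i++) {
            double diff = static_cast<double>(pred[i]) - static_cast<double>(ref[i]);
            abs_diffs[i] = std::abs(diff);
            mae += abs_diffs[i];
            mse += diff * diff;
        }
        mae /= n;
        mse /= n;
        std::sort(abs_diffs.begin(), abs_diffs.end());
        int mid = n / 2;
        medae = (n % 2 == 1) ? abs_diffs[mid]
                             : 0.5 * (abs_diffs[mid] + abs_diffs[mid - 1]);
    }
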
*/ void predict(const cumlHandle& user_handle, const rfClassifier * rf_classifier, const float * input, int n_rows, int n_cols, int * predictions, bool verbose) { @@ -646,12 +648,12 @@ void predict(const cumlHandle& user_handle, const rfClassifier * rf_class /** * @brief Predict target feature for input data of type double; n-ary classification for single feature supported. - * @param[in] user_handle: cumlHandle (currently unused; API placeholder) + * @param[in] user_handle: cumlHandle. * @param[in] rf_classifier: pointer to the rfClassifier object. The user should have previously called fit to build the random forest. * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). - * @param[in, out] predictions: n_rows predicted labels. CPU pointer, user allocated. + * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. * @param[in] verbose: flag for debugging purposes. */ void predict(const cumlHandle& user_handle, const rfClassifier * rf_classifier, const double * input, int n_rows, int n_cols, int * predictions, bool verbose) { @@ -660,13 +662,13 @@ void predict(const cumlHandle& user_handle, const rfClassifier * rf_clas /** * @brief Predict target feature for input data of type float and validate against ref_labels. - * @param[in] user_handle: cumlHandle (currently unused; API placeholder) + * @param[in] user_handle: cumlHandle. * @param[in] rf_classifier: pointer to the rfClassifier object. The user should have previously called fit to build the random forest. * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. * @param[in] ref_labels: label values for cross validation (n_rows elements); CPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). - * @param[in, out] predictions: n_rows predicted labels. CPU pointer, user allocated. + * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. * @param[in] verbose: flag for debugging purposes. */ RF_metrics cross_validate(const cumlHandle& user_handle, const rfClassifier * rf_classifier, const float * input, const int * ref_labels, @@ -676,13 +678,13 @@ RF_metrics cross_validate(const cumlHandle& user_handle, const rfClassifier * rf_classifier, const double * input, const int * ref_labels, @@ -720,12 +722,12 @@ void fit(const cumlHandle& user_handle, rfRegressor * rf_regressor, doub /** * @brief Predict target feature for input data of type float; regression for single feature supported. - * @param[in] user_handle: cumlHandle (currently unused; API placeholder) + * @param[in] user_handle: cumlHandle. * @param[in] rf_regressor: pointer to the rfRegressor object. The user should have previously called fit to build the random forest. * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). - * @param[in, out] predictions: n_rows predicted labels. CPU pointer, user allocated. + * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. * @param[in] verbose: flag for debugging purposes. 
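A minimal usage sketch of the stateless classification API after this change; the include path, buffer names, and sizes are illustrative, and error handling is omitted. The key point is that train data, labels, reference labels, and predictions must all be device pointers now:

    #include <cuda_runtime.h>
    #include "randomforest/randomforest.h"  // assumed include path

    // d_train, d_labels, d_test, d_ref_labels, d_preds are device allocations
    // the caller owns (e.g., via cudaMalloc), sized n_rows*n_cols, n_rows,
    // n_test_rows*n_cols, n_test_rows, and n_test_rows respectively.
    void run_rf_classifier(ML::cumlHandle &handle, ML::RF_params rf_params,
                           float *d_train, int *d_labels, int n_rows, int n_cols,
                           float *d_test, int *d_ref_labels, int *d_preds,
                           int n_test_rows, int n_unique_labels) {
        ML::rfClassifier<float> *rf = new ML::rfClassifier<float>(rf_params);
        ML::fit(handle, rf, d_train, n_rows, n_cols, d_labels, n_unique_labels);
        // cross_validate calls predict internally, then scores on the GPU.
        ML::RF_metrics m = ML::cross_validate(handle, rf, d_test, d_ref_labels,
                                              n_test_rows, n_cols, d_preds, false);
        m.print();  // prints accuracy for a classification forest
        delete rf;
    }
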
*/ void predict(const cumlHandle& user_handle, const rfRegressor * rf_regressor, const float * input, int n_rows, int n_cols, float * predictions, bool verbose) { @@ -734,12 +736,12 @@ void predict(const cumlHandle& user_handle, const rfRegressor * rf_regres /** * @brief Predict target feature for input data of type double; regression for single feature supported. - * @param[in] user_handle: cumlHandle (currently unused; API placeholder) + * @param[in] user_handle: cumlHandle. * @param[in] rf_regressor: pointer to the rfRegressor object. The user should have previously called fit to build the random forest. * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). - * @param[in, out] predictions: n_rows predicted labels. CPU pointer, user allocated. + * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. * @param[in] verbose: flag for debugging purposes. */ void predict(const cumlHandle& user_handle, const rfRegressor * rf_regressor, const double * input, int n_rows, int n_cols, double * predictions, bool verbose) { @@ -748,13 +750,13 @@ void predict(const cumlHandle& user_handle, const rfRegressor * rf_regre /** * @brief Predict target feature for input data of type float and validate against ref_labels. - * @param[in] user_handle: cumlHandle (currently unused; API placeholder) + * @param[in] user_handle: cumlHandle. * @param[in] rf_regressor: pointer to the rfRegressor object. The user should have previously called fit to build the random forest. * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. * @param[in] ref_labels: label values for cross validation (n_rows elements); CPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). - * @param[in, out] predictions: n_rows predicted labels. CPU pointer, user allocated. + * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. * @param[in] verbose: flag for debugging purposes. */ RF_metrics cross_validate(const cumlHandle& user_handle, const rfRegressor * rf_regressor, const float * input, const float * ref_labels, @@ -764,13 +766,13 @@ RF_metrics cross_validate(const cumlHandle& user_handle, const rfRegressor * rf_regressor, const double * input, const double * ref_labels, diff --git a/cpp/src/randomforest/randomforest.h b/cpp/src/randomforest/randomforest.h index 18dc85ad96..b914373d23 100644 --- a/cpp/src/randomforest/randomforest.h +++ b/cpp/src/randomforest/randomforest.h @@ -91,6 +91,8 @@ class rf { int rf_type; virtual const DecisionTree::DecisionTreeBase * get_trees_ptr() const = 0; ~rf() = default; + void prepare_fit_per_tree(const ML::cumlHandle_impl& handle, int tree_id, int n_rows, int n_sampled_rows, + unsigned int * selected_rows, unsigned int * sorted_selected_rows, char * rows_temp_storage, size_t temp_storage_bytes); public: rf(RF_params cfg_rf_params, int cfg_rf_type=RF_type::CLASSIFICATION); diff --git a/cpp/src_prims/score/scores.h b/cpp/src_prims/score/scores.h index ca2227ad38..f45f7d2b3d 100644 --- a/cpp/src_prims/score/scores.h +++ b/cpp/src_prims/score/scores.h @@ -78,6 +78,122 @@ namespace MLCommon { return 1.0 - sse/ssto; } + + /** + * @brief Compute accuracy of predictions. Useful for classification. 
+ * @tparam math_t: data type for predictions (e.g., int for classification) + * @param[in] predictions: array of predictions (GPU pointer). + * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer). + * @param[in] n: number of elements in each of predictions, ref_predictions. + * @param[in] d_alloc: device allocator. + * @param[in] stream: cuda stream. + * @return: Accuracy score in [0, 1]; higher is better. + */ + template + float accuracy_score(const math_t * predictions, const math_t * ref_predictions, int n, + std::shared_ptr d_alloc, cudaStream_t stream) { + + unsigned long long correctly_predicted = 0ULL; + math_t * diffs_array = (math_t *)d_alloc->allocate(n * sizeof(math_t), stream); + + //TODO could write a kernel instead + MLCommon::LinAlg::eltwiseSub(diffs_array, predictions, ref_predictions, n, stream); + CUDA_CHECK(cudaGetLastError()); + correctly_predicted = thrust::count(thrust::cuda::par.on(stream), diffs_array, diffs_array + n, 0); + d_alloc->deallocate(diffs_array, n * sizeof(math_t), stream); + + float accuracy = correctly_predicted * 1.0f/n; + return accuracy; + } + + template + __global__ void reg_metrics_kernel(const T * predictions, const T * ref_predictions, int n, + double * abs_diffs, double * tmp_sums) { + + int tid = threadIdx.x + blockIdx.x * blockDim.x; + __shared__ double shmem[2]; // {abs_difference_sum, squared difference sum} + + for (int i = threadIdx.x; i < 2; i += blockDim.x) { + shmem[i] = 0; + } + __syncthreads(); + + for (int i = tid; i < n; i += blockDim.x*gridDim.x) { + double diff = predictions[i] - ref_predictions[i]; + double abs_diff = abs(diff); + atomicAdd(&shmem[0], abs_diff); + atomicAdd(&shmem[1], diff * diff); + + // update absolute difference in global memory for subsequent abs. median computation + abs_diffs[i] = abs_diff; + } + __syncthreads(); + + // Update tmp_sum w/ total abs_difference_sum and squared difference sum. + for (int i = threadIdx.x; i < 2; i += blockDim.x) { + atomicAdd(&tmp_sums[i], shmem[i]); + } + } + + /** + * @brief Compute regression metrics mean absolute error, mean squared error, median absolute error + * @tparam T: data type for predictions (e.g., float or double for regression). + * @param[in] predictions: array of predictions (GPU pointer). + * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer). + * @param[in] n: number of elements in each of predictions, ref_predictions. Should be > 0. + * @param[in] d_alloc: device allocator. + * @param[in] stream: cuda stream. + * @param[out] mean_abs_error: Mean Absolute Error. Sum over n of (|predictions[i] - ref_predictions[i]|) / n. + * @param[out] mean_squared_error: Mean Squared Error. Sum over n of ((predictions[i] - ref_predictions[i])^2) / n. + * @param[out] median_abs_error: Median Absolute Error. Median of |predictions[i] - ref_predictions[i]| for i in [0, n). 
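The TODO in accuracy_score notes that a kernel could replace the eltwiseSub-plus-count pair. One fused alternative, sketched here and not what the patch does, is thrust::inner_product, which counts matching positions in a single pass and needs no temporary diffs buffer:

    #include <thrust/inner_product.h>
    #include <thrust/functional.h>
    #include <thrust/execution_policy.h>

    // Buffer-free accuracy sketch: count positions where the two device
    // arrays agree, then normalize by n.
    template <typename math_t>
    float accuracy_score_fused(const math_t *predictions,
                               const math_t *ref_predictions,
                               int n, cudaStream_t stream) {
        int correct = thrust::inner_product(thrust::cuda::par.on(stream),
                                            predictions, predictions + n,
                                            ref_predictions,
                                            0,                            // initial count
                                            thrust::plus<int>(),          // reduction
                                            thrust::equal_to<math_t>());  // elementwise match
        return static_cast<float>(correct) / n;
    }
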
+ */ + template + void regression_metrics(const T * predictions, const T * ref_predictions, int n, std::shared_ptr d_alloc, cudaStream_t stream, + double & mean_abs_error, double & mean_squared_error, double & median_abs_error) { + + std::vector mean_errors(2); + std::vector h_sorted_abs_diffs(n); + int thread_cnt = 256; + int block_cnt = ceildiv(n, thread_cnt); + + int array_size = n * sizeof(double); + double * abs_diffs_array = (double *)d_alloc->allocate(array_size, stream); + double * sorted_abs_diffs = (double *)d_alloc->allocate(array_size, stream); + double * tmp_sums = (double *)d_alloc->allocate(2 * sizeof(double), stream); + CUDA_CHECK(cudaMemsetAsync(tmp_sums, 0, 2 * sizeof(double), stream)); + + reg_metrics_kernel<<>>(predictions, ref_predictions, n, abs_diffs_array, tmp_sums); + CUDA_CHECK(cudaGetLastError()); + MLCommon::updateHost(&mean_errors[0], tmp_sums, 2, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + mean_abs_error = mean_errors[0] / n; + mean_squared_error = mean_errors[1] / n; + + // Compute median error. Sort diffs_array and pick median value + char * temp_storage = nullptr; + size_t temp_storage_bytes; + CUDA_CHECK(cub::DeviceRadixSort::SortKeys((void *)temp_storage, temp_storage_bytes, abs_diffs_array, sorted_abs_diffs, + n, 0, 8*sizeof(double), stream)); + temp_storage = (char *)d_alloc->allocate(temp_storage_bytes, stream); + CUDA_CHECK(cub::DeviceRadixSort::SortKeys((void *)temp_storage, temp_storage_bytes, abs_diffs_array, sorted_abs_diffs, + n, 0, 8*sizeof(double), stream)); + + MLCommon::updateHost(h_sorted_abs_diffs.data(), sorted_abs_diffs, n, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + int middle = n / 2; + if (n % 2 == 1) { + median_abs_error = h_sorted_abs_diffs[middle]; + } else { + median_abs_error = (h_sorted_abs_diffs[middle] + h_sorted_abs_diffs[middle - 1]) / 2; + } + + d_alloc->deallocate(abs_diffs_array, array_size, stream); + d_alloc->deallocate(sorted_abs_diffs, array_size, stream); + d_alloc->deallocate(temp_storage, temp_storage_bytes, stream); + d_alloc->deallocate(tmp_sums, 2 * sizeof(double), stream); + } } } diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index 65c898fbf6..c9a65fa750 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -67,8 +67,10 @@ protected: int data_len = params.n_rows * params.n_cols; allocate(data, data_len); allocate(labels, params.n_rows); - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream) ); + allocate(predicted_labels, params.n_inference_rows); + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream) ); // Populate data (assume Col major) std::vector data_h = {30.0, 1.0, 2.0, 0.0, 10.0, 20.0, 10.0, 40.0}; @@ -89,19 +91,19 @@ protected: fit(handle, rf_classifier, data, params.n_rows, params.n_cols, labels, labels_map.size()); CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaStreamDestroy(stream)); // Inference data: same as train, but row major int inference_data_len = params.n_inference_rows * params.n_cols; inference_data_h = {30.0, 10.0, 1.0, 20.0, 2.0, 10.0, 0.0, 40.0}; inference_data_h.resize(inference_data_len); - // Predict and compare against known labels - predicted_labels.resize(params.n_inference_rows); - RF_metrics tmp = cross_validate(handle, rf_classifier, inference_data_h.data(), labels_h.data(), - params.n_inference_rows, params.n_cols, predicted_labels.data(), false); + RF_metrics tmp = cross_validate(handle, rf_classifier, inference_data_h.data(), labels, + params.n_inference_rows, params.n_cols, 
predicted_labels, false); accuracy = tmp.accuracy; + + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); } void SetUp() override { @@ -114,9 +116,9 @@ protected: inference_data_h.clear(); labels_h.clear(); labels_map.clear(); - predicted_labels.clear(); CUDA_CHECK(cudaFree(labels)); + CUDA_CHECK(cudaFree(predicted_labels)); CUDA_CHECK(cudaFree(data)); delete rf_classifier; } @@ -133,7 +135,7 @@ protected: rfClassifier * rf_classifier; float accuracy = -1.0f; // overriden in each test SetUp and TearDown - std::vector predicted_labels; + int * predicted_labels; }; //------------------------------------------------------------------------------------------------------------------------------------- @@ -157,8 +159,9 @@ protected: int data_len = params.n_rows * params.n_cols; allocate(data, data_len); allocate(labels, params.n_rows); - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream) ); + allocate(predicted_labels, params.n_inference_rows); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream) ); // Populate data (assume Col major) std::vector data_h = {0.0, 0.0, 0.0, 0.0, 10.0, 20.0, 30.0, 40.0}; @@ -173,24 +176,26 @@ protected: rf_regressor = new typename rfRegressor::rfRegressor(rf_params); cumlHandle handle; - handle.setStream(stream); + handle.setStream(stream); fit(handle, rf_regressor, data, params.n_rows, params.n_cols, labels); CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaStreamDestroy(stream)); // Inference data: same as train, but row major int inference_data_len = params.n_inference_rows * params.n_cols; inference_data_h = {0.0, 10.0, 0.0, 20.0, 0.0, 30.0, 0.0, 40.0}; inference_data_h.resize(inference_data_len); + //TODO FIXME stream // Predict and compare against known labels - predicted_labels.resize(params.n_inference_rows); - RF_metrics tmp = cross_validate(handle, rf_regressor, inference_data_h.data(), labels_h.data(), - params.n_inference_rows, params.n_cols, predicted_labels.data(), false); + RF_metrics tmp = cross_validate(handle, rf_regressor, inference_data_h.data(), labels, + params.n_inference_rows, params.n_cols, predicted_labels, false); mse = tmp.mean_squared_error; + + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); } void SetUp() override { @@ -201,9 +206,9 @@ protected: mse = -1.0f; // reset mse inference_data_h.clear(); labels_h.clear(); - predicted_labels.clear(); CUDA_CHECK(cudaFree(labels)); + CUDA_CHECK(cudaFree(predicted_labels)); CUDA_CHECK(cudaFree(data)); delete rf_regressor; } @@ -219,7 +224,7 @@ protected: rfRegressor * rf_regressor; float mse = -1.0f; // overriden in each test SetUp and TearDown - std::vector predicted_labels; + T * predicted_labels; }; //------------------------------------------------------------------------------------------------------------------------------------- From 895ccf6dabf3204a72bef1d4cdcef2eedebd6ee9 Mon Sep 17 00:00:00 2001 From: Vishal Mehta Date: Thu, 6 Jun 2019 13:44:41 +0200 Subject: [PATCH 32/51] moving allocations outside the loop --- cpp/src/randomforest/randomforest.cu | 93 ++++++++++++++-------------- 1 file changed, 48 insertions(+), 45 deletions(-) diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu index dddabea673..b1ebbe65eb 100644 --- a/cpp/src/randomforest/randomforest.cu +++ b/cpp/src/randomforest/randomforest.cu @@ -295,38 +295,40 @@ void rfClassifier::fit(const cumlHandle& user_handle, T * input, int n_rows, const cumlHandle_impl& handle = user_handle.getImpl(); 
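The idea of this commit: the sampling buffers have loop-invariant sizes, so they are allocated once before the tree loop and released once after it, instead of once per tree. Schematic illustration with raw CUDA calls (the real code uses MLCommon::device_buffer, which is stream-ordered but has the same shape):

    #include <cuda_runtime.h>

    void fit_all_trees(int n_trees, int n_sampled_rows) {
        unsigned int *d_rows = nullptr;
        // Allocate once: the buffer size does not depend on the tree index.
        cudaMalloc(&d_rows, n_sampled_rows * sizeof(unsigned int));
        for (int i = 0; i < n_trees; i++) {
            // Sample row IDs into d_rows, sort them, fit tree i (omitted);
            // each iteration simply overwrites the same buffer.
        }
        cudaFree(d_rows);  // Release once, after the loop.
    }
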
cudaStream_t stream = user_handle.getStream(); + // Select n_sampled_rows (with replacement) numbers from [0, n_rows) per tree. + // selected_rows: randomly generated IDs for bootstrapped samples (w/ replacement); a device ptr. + MLCommon::device_buffer selected_rows(handle.getDeviceAllocator(), stream, n_sampled_rows); + MLCommon::device_buffer sorted_selected_rows(handle.getDeviceAllocator(), stream, n_sampled_rows); + + // Will sort selected_rows (row IDs), prior to fit, to improve access patterns + MLCommon::device_buffer *rows_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + CUDA_CHECK(cub::DeviceRadixSort::SortKeys(rows_temp_storage, temp_storage_bytes, selected_rows.data(), sorted_selected_rows.data(), + n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); + // Allocate temporary storage + rows_temp_storage = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, temp_storage_bytes); + for (int i = 0; i < this->rf_params.n_trees; i++) { - // Select n_sampled_rows (with replacement) numbers from [0, n_rows) per tree. - // selected_rows: randomly generated IDs for bootstrapped samples (w/ replacement); a device ptr. - MLCommon::device_buffer selected_rows(handle.getDeviceAllocator(), stream, n_sampled_rows); - MLCommon::device_buffer sorted_selected_rows(handle.getDeviceAllocator(), stream, n_sampled_rows); - - // Will sort selected_rows (row IDs), prior to fit, to improve access patterns - MLCommon::device_buffer *rows_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceRadixSort::SortKeys(rows_temp_storage, temp_storage_bytes, selected_rows.data(), sorted_selected_rows.data(), - n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); - // Allocate temporary storage - rows_temp_storage = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, temp_storage_bytes); this->prepare_fit_per_tree(handle, i, n_rows, n_sampled_rows, selected_rows.data(), - sorted_selected_rows.data(), rows_temp_storage->data(), temp_storage_bytes); - + sorted_selected_rows.data(), rows_temp_storage->data(), temp_storage_bytes); + /* Build individual tree in the forest. - input is a pointer to orig data that have n_cols features and n_rows rows. - n_sampled_rows: # rows sampled for tree's bootstrap sample. - sorted_selected_rows: points to a list of row #s (w/ n_sampled_rows elements) used to build the bootstrapped sample. - Expectation: Each tree node will contain (a) # n_sampled_rows and (b) a pointer to a list of row numbers w.r.t original data. + Expectation: Each tree node will contain (a) # n_sampled_rows and (b) a pointer to a list of row numbers w.r.t original data. */ - + trees[i].fit(user_handle, input, n_cols, n_rows, labels, sorted_selected_rows.data(), n_sampled_rows, n_unique_labels, this->rf_params.tree_params); - - //Cleanup - rows_temp_storage->release(stream); - selected_rows.release(stream); - sorted_selected_rows.release(stream); - delete rows_temp_storage; } + + //Cleanup + rows_temp_storage->release(stream); + selected_rows.release(stream); + sorted_selected_rows.release(stream); + delete rows_temp_storage; + } /** @@ -470,38 +472,39 @@ void rfRegressor::fit(const cumlHandle& user_handle, T * input, int n_rows, i const cumlHandle_impl& handle = user_handle.getImpl(); cudaStream_t stream = user_handle.getStream(); - for (int i = 0; i < this->rf_params.n_trees; i++) { - // Select n_sampled_rows (with replacement) numbers from [0, n_rows) per tree. 
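The SortKeys call pair above uses CUB's standard size-query idiom: the first call, made with a null temp-storage pointer, only reports the scratch size; the second call, with storage allocated, does the sort. With the buffers hoisted, the query now also runs once per fit rather than once per tree. A self-contained sketch of the idiom:

    #include <cuda_runtime.h>
    #include <cub/cub.cuh>

    void sort_row_ids(const unsigned int *d_in, unsigned int *d_out, int n,
                      cudaStream_t stream) {
        void *d_temp = nullptr;
        size_t temp_bytes = 0;
        // Pass 1: d_temp == nullptr, so CUB only writes the required size.
        cub::DeviceRadixSort::SortKeys(d_temp, temp_bytes, d_in, d_out, n,
                                       0, 8 * sizeof(unsigned int), stream);
        cudaMalloc(&d_temp, temp_bytes);
        // Pass 2: same arguments, now with real scratch space.
        cub::DeviceRadixSort::SortKeys(d_temp, temp_bytes, d_in, d_out, n,
                                       0, 8 * sizeof(unsigned int), stream);
        cudaFree(d_temp);
    }
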
- // selected_rows: randomly generated IDs for bootstrapped samples (w/ replacement); a device ptr. - MLCommon::device_buffer selected_rows(handle.getDeviceAllocator(), stream, n_sampled_rows); - MLCommon::device_buffer sorted_selected_rows(handle.getDeviceAllocator(), stream, n_sampled_rows); - - // Will sort selected_rows (row IDs), prior to fit, to improve access patterns - MLCommon::device_buffer *rows_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceRadixSort::SortKeys(rows_temp_storage, temp_storage_bytes, selected_rows.data(), sorted_selected_rows.data(), - n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); - // Allocate temporary storage - rows_temp_storage = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, temp_storage_bytes); + // Select n_sampled_rows (with replacement) numbers from [0, n_rows) per tree. + // selected_rows: randomly generated IDs for bootstrapped samples (w/ replacement); a device ptr. + MLCommon::device_buffer selected_rows(handle.getDeviceAllocator(), stream, n_sampled_rows); + MLCommon::device_buffer sorted_selected_rows(handle.getDeviceAllocator(), stream, n_sampled_rows); + + // Will sort selected_rows (row IDs), prior to fit, to improve access patterns + MLCommon::device_buffer *rows_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + CUDA_CHECK(cub::DeviceRadixSort::SortKeys(rows_temp_storage, temp_storage_bytes, selected_rows.data(), sorted_selected_rows.data(), + n_sampled_rows, 0, 8*sizeof(unsigned int), stream)); + // Allocate temporary storage + rows_temp_storage = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, temp_storage_bytes); + for (int i = 0; i < this->rf_params.n_trees; i++) { + this->prepare_fit_per_tree(handle, i, n_rows, n_sampled_rows, selected_rows.data(), - sorted_selected_rows.data(), rows_temp_storage->data(), temp_storage_bytes); - + sorted_selected_rows.data(), rows_temp_storage->data(), temp_storage_bytes); + /* Build individual tree in the forest. - input is a pointer to orig data that have n_cols features and n_rows rows. - n_sampled_rows: # rows sampled for tree's bootstrap sample. - sorted_selected_rows: points to a list of row #s (w/ n_sampled_rows elements) used to build the bootstrapped sample. - Expectation: Each tree node will contain (a) # n_sampled_rows and (b) a pointer to a list of row numbers w.r.t original data. + Expectation: Each tree node will contain (a) # n_sampled_rows and (b) a pointer to a list of row numbers w.r.t original data. */ - + trees[i].fit(user_handle, input, n_cols, n_rows, labels, sorted_selected_rows.data(), n_sampled_rows, this->rf_params.tree_params); - - //Cleanup - rows_temp_storage->release(stream); - selected_rows.release(stream); - sorted_selected_rows.release(stream); - delete rows_temp_storage; + } + //Cleanup + rows_temp_storage->release(stream); + selected_rows.release(stream); + sorted_selected_rows.release(stream); + delete rows_temp_storage; } From 5250be8cfc03f7c5062eb92835c8498db8c5dae7 Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Thu, 6 Jun 2019 07:39:25 -0700 Subject: [PATCH 33/51] Added helper for tree fit. - Also split memory.cuh to header file (useful for python bindings later on). 
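The memory.cuh split follows the usual pattern for CUDA template classes: the declaration moves into a plain header that C++-only translation units (such as the python bindings mentioned above) can include, while the member definitions stay in a .cuh that only .cu files include. Schematically (member list abbreviated):

    // --- memory.h: declaration only. Including it does not drag in
    // --- cub/cub.cuh, so it stays usable from plain C++ code.
    #pragma once

    template <class T, class L>
    struct TemporaryMemory {
        size_t totalmem = 0;
        TemporaryMemory(int n_elements);
        ~TemporaryMemory();
        void print_info();
    };

    // --- memory.cuh: member definitions. Because TemporaryMemory is a
    // --- template, its definitions must still be visible wherever it is
    // --- instantiated, so the .cu files include memory.cuh (which itself
    // --- includes memory.h) rather than linking separately compiled code.
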
--- cpp/src/decisiontree/decisiontree.cu | 58 ++-- cpp/src/decisiontree/decisiontree.h | 3 +- .../kernels/evaluate_classifier.cuh | 2 +- .../kernels/evaluate_regressor.cuh | 2 +- cpp/src/decisiontree/kernels/metric.cuh | 3 +- cpp/src/decisiontree/kernels/metric_def.h | 3 +- cpp/src/decisiontree/memory.cuh | 252 ++++++++---------- cpp/src/decisiontree/memory.h | 66 +++++ 8 files changed, 204 insertions(+), 185 deletions(-) create mode 100644 cpp/src/decisiontree/memory.h diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu index b312dc75ed..4174ba9793 100644 --- a/cpp/src/decisiontree/decisiontree.cu +++ b/cpp/src/decisiontree/decisiontree.cu @@ -16,6 +16,7 @@ #include #include "decisiontree.h" +#include "memory.cuh" #include "kernels/metric.cuh" #include "kernels/split_labels.cuh" #include "kernels/col_condenser.cuh" @@ -320,6 +321,31 @@ L DecisionTreeBase::predict_one(const T * row, const TreeNode* const } } +template +void DecisionTreeBase::base_fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, L *labels, + unsigned int *rowids, const int n_sampled_rows, int unique_labels, DecisionTreeParams& tree_params, + ML::CRITERION default_criterion, ML::CRITERION other_criterion, const std::string & dt_name) { + + const char * CRITERION_NAME[]={"GINI", "ENTROPY", "MSE", "MAE", "END"}; + + tree_params.validity_check(); + if (tree_params.n_bins > n_sampled_rows) { + std::cout << "Warning! Calling with number of bins > number of rows! "; + std::cout << "Resetting n_bins to " << n_sampled_rows << "." << std::endl; + tree_params.n_bins = n_sampled_rows; + } + + if (tree_params.split_criterion == CRITERION::CRITERION_END) { // Set default to GINI (classification) or MSE (regression) + tree_params.split_criterion = default_criterion; + } + ASSERT((tree_params.split_criterion == default_criterion || tree_params.split_criterion == other_criterion), + "Decision Tree %s split criteria should be %s or %s\n", dt_name.c_str(), CRITERION_NAME[default_criterion], CRITERION_NAME[other_criterion]); + + plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, unique_labels, tree_params.max_depth, + tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, + tree_params.min_rows_per_node, tree_params.bootstrap_features, tree_params.split_criterion); + +} /** * @brief Build (i.e., fit, train) Decision Tree classifier for input data. @@ -341,22 +367,7 @@ template void DecisionTreeClassifier::fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels, DecisionTreeParams tree_params) { - tree_params.validity_check(); - if (tree_params.n_bins > n_sampled_rows) { - std::cout << "Warning! Calling with number of bins > number of rows! "; - std::cout << "Resetting n_bins to " << n_sampled_rows << "." 
<< std::endl; - tree_params.n_bins = n_sampled_rows; - } - - if (tree_params.split_criterion == CRITERION::CRITERION_END) { // Set default to GINI - tree_params.split_criterion = CRITERION::GINI; - } - ASSERT((tree_params.split_criterion == CRITERION::GINI || tree_params.split_criterion == CRITERION::ENTROPY ), - " Decision Tree Classifer split criteria, should be GINI or ENTROPY\n"); - - this->plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, unique_labels, tree_params.max_depth, - tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, - tree_params.min_rows_per_node, tree_params.bootstrap_features, tree_params.split_criterion); + this->base_fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows, unique_labels, tree_params, CRITERION::GINI, CRITERION::ENTROPY, "Classifier"); } template @@ -404,20 +415,7 @@ void DecisionTreeClassifier::find_best_fruit_all(T *data, int *labels, const template void DecisionTreeRegressor::fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids, const int n_sampled_rows, DecisionTreeParams tree_params) { - tree_params.validity_check(); - if (tree_params.n_bins > n_sampled_rows) { - std::cout << "Warning! Calling with number of bins > number of rows! "; - std::cout << "Resetting n_bins to " << n_sampled_rows << "." << std::endl; - tree_params.n_bins = n_sampled_rows; - } - if (tree_params.split_criterion == CRITERION::CRITERION_END) { // Set default to MSE - tree_params.split_criterion = CRITERION::MSE; - } - ASSERT((tree_params.split_criterion == CRITERION::MSE || tree_params.split_criterion == CRITERION::MAE), - "Decision Tree Regressor split criteria should be MSE or MAE\n"); - this->plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, 1, tree_params.max_depth, - tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, - tree_params.min_rows_per_node, tree_params.bootstrap_features, tree_params.split_criterion); + this->base_fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows, 1, tree_params, CRITERION::MSE, CRITERION::MAE, "Regressor"); } template diff --git a/cpp/src/decisiontree/decisiontree.h b/cpp/src/decisiontree/decisiontree.h index 9ca622a75a..e77d5c8049 100644 --- a/cpp/src/decisiontree/decisiontree.h +++ b/cpp/src/decisiontree/decisiontree.h @@ -17,7 +17,6 @@ #pragma once #include "algo_helper.h" #include "kernels/metric_def.h" -#include "memory.cuh" #include #include #include @@ -128,6 +127,8 @@ class DecisionTreeBase { TreeNode * grow_tree(T *data, const float colper, L *labels, int depth, unsigned int* rowids, const int n_sampled_rows, MetricInfo prev_split_info); virtual void find_best_fruit_all(T *data, L *labels, const float colper, MetricQuestion & ques, float& gain, unsigned int* rowids, const int n_sampled_rows, MetricInfo split_info[3], int depth) = 0; + void base_fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, L *labels, unsigned int *rowids, + const int n_sampled_rows, int unique_labels, DecisionTreeParams & tree_params, CRITERION default_criterion, CRITERION other_criterion, const std::string & name); public: // Printing utility for high level tree info. 
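The two fit methods previously duplicated the parameter validation and criterion defaulting; base_fit now owns both. Its criterion handling, restated as a standalone sketch (enum values mirror the CRITERION_NAME table above):

    #include <cassert>

    enum CRITERION { GINI, ENTROPY, MSE, MAE, CRITERION_END };

    // An unset criterion (CRITERION_END) falls back to the task's default,
    // and only the task's two criteria pass validation.
    CRITERION resolve_criterion(CRITERION requested, bool is_classifier) {
        CRITERION def   = is_classifier ? GINI : MSE;
        CRITERION other = is_classifier ? ENTROPY : MAE;
        if (requested == CRITERION_END) requested = def;
        assert((requested == def || requested == other) &&
               "unsupported split criterion for this tree type");
        return requested;
    }
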
diff --git a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh index 9ed4c38444..133cdc9d37 100644 --- a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh +++ b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh @@ -18,7 +18,7 @@ #include #include #include "metric.cuh" -#include "../memory.cuh" +#include "../memory.h" #include "col_condenser.cuh" #include #include "../algo_helper.h" diff --git a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh index 7a21ddb878..f7402ec7fa 100644 --- a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh +++ b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh @@ -18,7 +18,7 @@ #include #include #include "metric.cuh" -#include "../memory.cuh" +#include "../memory.h" #include "col_condenser.cuh" #include #include "../algo_helper.h" diff --git a/cpp/src/decisiontree/kernels/metric.cuh b/cpp/src/decisiontree/kernels/metric.cuh index 66711070b6..56469664c3 100644 --- a/cpp/src/decisiontree/kernels/metric.cuh +++ b/cpp/src/decisiontree/kernels/metric.cuh @@ -16,8 +16,7 @@ #pragma once #include -#include "cub/cub.cuh" -#include "../memory.cuh" +#include "../memory.h" #include #include "metric_def.h" #include "cuda_utils.h" diff --git a/cpp/src/decisiontree/kernels/metric_def.h b/cpp/src/decisiontree/kernels/metric_def.h index 8cbc04cacd..4ec56ea8b8 100644 --- a/cpp/src/decisiontree/kernels/metric_def.h +++ b/cpp/src/decisiontree/kernels/metric_def.h @@ -16,8 +16,7 @@ #pragma once #include -#include "cub/cub.cuh" -#include "../memory.cuh" +#include "../memory.h" #include #include "cuda_utils.h" #include diff --git a/cpp/src/decisiontree/memory.cuh b/cpp/src/decisiontree/memory.cuh index fca96ccbc9..376f888f59 100644 --- a/cpp/src/decisiontree/memory.cuh +++ b/cpp/src/decisiontree/memory.cuh @@ -15,164 +15,120 @@ */ #pragma once +#include "memory.h" #include #include "cub/cub.cuh" #include -#include "common/cumlHandle.hpp" -#include -#include template -struct TemporaryMemory -{ - // Labels after boostrapping - MLCommon::device_buffer *sampledlabels; - - // Used for gini histograms (root tree node) - MLCommon::device_buffer *d_hist; - MLCommon::host_buffer *h_hist; +TemporaryMemory::TemporaryMemory(const ML::cumlHandle_impl& handle, int N, int Ncols, int maxstr, int n_unique, int n_bins, const int split_algo):ml_handle(handle) { + + //Assign Stream from cumlHandle + stream = ml_handle.getStream(); - //Host/Device histograms and device minmaxs - MLCommon::device_buffer *d_globalminmax; - MLCommon::device_buffer *d_histout; - MLCommon::device_buffer *d_colids; - MLCommon::host_buffer *h_histout; - MLCommon::device_buffer *d_mseout, *d_predout; - MLCommon::host_buffer *h_mseout, *h_predout; + int n_hist_elements = n_unique * n_bins; + + h_hist = new MLCommon::host_buffer(handle.getHostAllocator(), stream, n_hist_elements); + d_hist = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_hist_elements); + nrowsleftright = new MLCommon::host_buffer(handle.getHostAllocator(), stream, 2); - //Below pointers are shared for split functions - MLCommon::device_buffer *d_flags_left, *d_flags_right; - MLCommon::host_buffer *nrowsleftright; - MLCommon::device_buffer *d_split_temp_storage = nullptr; - size_t split_temp_storage_bytes = 0; - - MLCommon::device_buffer *d_num_selected_out; - MLCommon::device_buffer *temprowids; - MLCommon::device_buffer *question_value, *temp_data; - - //Total temp mem - size_t totalmem = 0; - - //CUDA stream - 
cudaStream_t stream; - - //For quantiles - MLCommon::device_buffer *d_quantile = nullptr; - - const ML::cumlHandle_impl& ml_handle; - - TemporaryMemory(const ML::cumlHandle_impl& handle, int N, int Ncols, int maxstr, int n_unique, int n_bins, const int split_algo):ml_handle(handle) - { - - //Assign Stream from cumlHandle - stream = ml_handle.getStream(); - - int n_hist_elements = n_unique * n_bins; - - h_hist = new MLCommon::host_buffer(handle.getHostAllocator(), stream, n_hist_elements); - d_hist = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_hist_elements); - nrowsleftright = new MLCommon::host_buffer(handle.getHostAllocator(), stream, 2); - - int extra_elements = Ncols; - int quantile_elements = (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) ? extra_elements : 1; - - temp_data = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, N * extra_elements); - totalmem += n_hist_elements * sizeof(int) + N * extra_elements * sizeof(T); - - if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { - d_quantile = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_bins * quantile_elements); - totalmem += n_bins * extra_elements * sizeof(T); - } - - sampledlabels = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, N); - totalmem += N*sizeof(L); - - //Allocate Temporary for split functions - d_num_selected_out = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, 1); - d_flags_left = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, N); - d_flags_right = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, N); - temprowids = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, N); - question_value = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, 1); - - cub::DeviceSelect::Flagged(d_split_temp_storage, split_temp_storage_bytes, temprowids->data(), d_flags_left->data(), temprowids->data(), d_num_selected_out->data(), N, stream); - d_split_temp_storage = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, split_temp_storage_bytes); - - totalmem += split_temp_storage_bytes + (N + 1)*sizeof(int) + 2*N*sizeof(char) + sizeof(T); - - h_histout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, n_hist_elements * Ncols); - int mse_elements = Ncols * n_bins; - h_mseout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, 2*mse_elements); - h_predout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, mse_elements); - - d_globalminmax = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols * 2); - d_histout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_hist_elements * Ncols); - d_mseout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, 2*mse_elements); - d_predout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, mse_elements); - - d_colids = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); - // memory of d_histout + d_colids + d_globalminmax + (d_mseout + d_predout) - totalmem += (n_hist_elements * sizeof(int) + sizeof(unsigned int) + 2*sizeof(T) + 3 * n_bins * sizeof(T))* Ncols; - - //this->print_info(); - } + int extra_elements = Ncols; + int quantile_elements = (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) ? 
extra_elements : 1; - void print_info() - { - std::cout << " Total temporary memory usage--> "<< ((double)totalmem/ (1024*1024)) << " MB" << std::endl; - return; + temp_data = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, N * extra_elements); + totalmem += n_hist_elements * sizeof(int) + N * extra_elements * sizeof(T); + + if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { + d_quantile = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_bins * quantile_elements); + totalmem += n_bins * extra_elements * sizeof(T); } - ~TemporaryMemory() - { - - h_hist->release(stream); - d_hist->release(stream); - nrowsleftright->release(stream); - temp_data->release(stream); - - delete h_hist; - delete d_hist; - delete temp_data; - - if (d_quantile != nullptr) { - d_quantile->release(stream); - delete d_quantile; - } - - sampledlabels->release(stream); - d_split_temp_storage->release(stream); - d_num_selected_out->release(stream); - d_flags_left->release(stream); - d_flags_right->release(stream); - temprowids->release(stream); - question_value->release(stream); - h_histout->release(stream); - h_mseout->release(stream); - h_predout->release(stream); - - delete sampledlabels; - delete d_split_temp_storage; - delete d_num_selected_out; - delete d_flags_left; - delete d_flags_right; - delete temprowids; - delete question_value; - delete h_histout; - delete h_mseout; - delete h_predout; - - d_globalminmax->release(stream); - d_histout->release(stream); - d_mseout->release(stream); - d_predout->release(stream); - d_colids->release(stream); - - delete d_globalminmax; - delete d_histout; - delete d_mseout; - delete d_predout; - delete d_colids; + sampledlabels = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, N); + totalmem += N*sizeof(L); + + //Allocate Temporary for split functions + d_num_selected_out = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, 1); + d_flags_left = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, N); + d_flags_right = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, N); + temprowids = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, N); + question_value = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, 1); + + cub::DeviceSelect::Flagged(d_split_temp_storage, split_temp_storage_bytes, temprowids->data(), d_flags_left->data(), temprowids->data(), d_num_selected_out->data(), N, stream); + d_split_temp_storage = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, split_temp_storage_bytes); + + totalmem += split_temp_storage_bytes + (N + 1)*sizeof(int) + 2*N*sizeof(char) + sizeof(T); + + h_histout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, n_hist_elements * Ncols); + int mse_elements = Ncols * n_bins; + h_mseout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, 2*mse_elements); + h_predout = new MLCommon::host_buffer(handle.getHostAllocator(), stream, mse_elements); + + d_globalminmax = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols * 2); + d_histout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, n_hist_elements * Ncols); + d_mseout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, 2*mse_elements); + d_predout = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, mse_elements); + + d_colids = new MLCommon::device_buffer(handle.getDeviceAllocator(), stream, Ncols); + // memory of d_histout + d_colids + d_globalminmax + (d_mseout + d_predout) 
+ totalmem += (n_hist_elements * sizeof(int) + sizeof(unsigned int) + 2*sizeof(T) + 3 * n_bins * sizeof(T))* Ncols; + + //this->print_info(); +} + +template +void TemporaryMemory::print_info() { + std::cout << " Total temporary memory usage--> "<< ((double)totalmem/ (1024*1024)) << " MB" << std::endl; +} + +template +TemporaryMemory::~TemporaryMemory() { + + h_hist->release(stream); + d_hist->release(stream); + nrowsleftright->release(stream); + temp_data->release(stream); + + delete h_hist; + delete d_hist; + delete temp_data; + if (d_quantile != nullptr) { + d_quantile->release(stream); + delete d_quantile; } -}; + sampledlabels->release(stream); + d_split_temp_storage->release(stream); + d_num_selected_out->release(stream); + d_flags_left->release(stream); + d_flags_right->release(stream); + temprowids->release(stream); + question_value->release(stream); + h_histout->release(stream); + h_mseout->release(stream); + h_predout->release(stream); + + delete sampledlabels; + delete d_split_temp_storage; + delete d_num_selected_out; + delete d_flags_left; + delete d_flags_right; + delete temprowids; + delete question_value; + delete h_histout; + delete h_mseout; + delete h_predout; + + d_globalminmax->release(stream); + d_histout->release(stream); + d_mseout->release(stream); + d_predout->release(stream); + d_colids->release(stream); + + delete d_globalminmax; + delete d_histout; + delete d_mseout; + delete d_predout; + delete d_colids; + +} diff --git a/cpp/src/decisiontree/memory.h b/cpp/src/decisiontree/memory.h new file mode 100644 index 0000000000..c2700a9375 --- /dev/null +++ b/cpp/src/decisiontree/memory.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include +#include "common/cumlHandle.hpp" +#include +#include + +template +struct TemporaryMemory +{ + // Labels after boostrapping + MLCommon::device_buffer *sampledlabels; + + // Used for gini histograms (root tree node) + MLCommon::device_buffer *d_hist; + MLCommon::host_buffer *h_hist; + + //Host/Device histograms and device minmaxs + MLCommon::device_buffer *d_globalminmax; + MLCommon::device_buffer *d_histout; + MLCommon::device_buffer *d_colids; + MLCommon::host_buffer *h_histout; + MLCommon::device_buffer *d_mseout, *d_predout; + MLCommon::host_buffer *h_mseout, *h_predout; + + //Below pointers are shared for split functions + MLCommon::device_buffer *d_flags_left, *d_flags_right; + MLCommon::host_buffer *nrowsleftright; + MLCommon::device_buffer *d_split_temp_storage = nullptr; + size_t split_temp_storage_bytes = 0; + + MLCommon::device_buffer *d_num_selected_out; + MLCommon::device_buffer *temprowids; + MLCommon::device_buffer *question_value, *temp_data; + + //Total temp mem + size_t totalmem = 0; + + //CUDA stream + cudaStream_t stream; + + //For quantiles + MLCommon::device_buffer *d_quantile = nullptr; + + const ML::cumlHandle_impl& ml_handle; + + TemporaryMemory(const ML::cumlHandle_impl& handle, int N, int Ncols, int maxstr, int n_unique, int n_bins, const int split_algo); + + void print_info(); + ~TemporaryMemory(); +}; From 1dd63f4397265bf8decdeae73a0a665465c67546 Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Fri, 7 Jun 2019 06:41:52 -0700 Subject: [PATCH 34/51] Made RF's data input for predictions a GPU ptr. - DT's predict method expects data input and predictions to be CPU ptrs. Added relevant error checking. --- cpp/src/decisiontree/decisiontree.cu | 32 ++++++++++++---- cpp/src/decisiontree/decisiontree.h | 5 ++- cpp/src/randomforest/randomforest.cu | 55 +++++++++++++++++++--------- cpp/src/randomforest/randomforest.h | 4 +- cpp/test/sg/rf_test.cu | 16 +++++--- 5 files changed, 79 insertions(+), 33 deletions(-) diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu index 4174ba9793..167e557d02 100644 --- a/cpp/src/decisiontree/decisiontree.cu +++ b/cpp/src/decisiontree/decisiontree.cu @@ -25,6 +25,18 @@ #include "kernels/quantile.cuh" namespace ML { + +bool is_dev_ptr(const void *p) { + cudaPointerAttributes pointer_attr; + cudaError_t err = cudaPointerGetAttributes(&pointer_attr, p); + if (err == cudaSuccess) { + return pointer_attr.devicePointer; + } else { + err = cudaGetLastError(); + return false; + } +} + namespace DecisionTree { template @@ -278,17 +290,21 @@ void DecisionTreeBase::init_depth_zero(const L* labels, std::vector void DecisionTreeBase::predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, L* predictions, bool verbose) const { + + ASSERT(!is_dev_ptr(rows) && !is_dev_ptr(predictions), "DT Error: Current impl. 
expects both input and predictions to be CPU pointers.\n"); + ASSERT(root, "Cannot predict w/ empty tree!"); ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows); ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); + predict_all(rows, n_rows, n_cols, predictions, verbose); } @@ -324,9 +340,11 @@ L DecisionTreeBase::predict_one(const T * row, const TreeNode* const template void DecisionTreeBase::base_fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, L *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels, DecisionTreeParams& tree_params, - ML::CRITERION default_criterion, ML::CRITERION other_criterion, const std::string & dt_name) { + bool is_classifier) { const char * CRITERION_NAME[]={"GINI", "ENTROPY", "MSE", "MAE", "END"}; + CRITERION default_criterion = (is_classifier) ? CRITERION::GINI : CRITERION::MSE; + CRITERION last_criterion = (is_classifier) ? CRITERION::ENTROPY : CRITERION::MAE; tree_params.validity_check(); if (tree_params.n_bins > n_sampled_rows) { @@ -338,8 +356,8 @@ void DecisionTreeBase::base_fit(const ML::cumlHandle& handle, T *data, con if (tree_params.split_criterion == CRITERION::CRITERION_END) { // Set default to GINI (classification) or MSE (regression) tree_params.split_criterion = default_criterion; } - ASSERT((tree_params.split_criterion == default_criterion || tree_params.split_criterion == other_criterion), - "Decision Tree %s split criteria should be %s or %s\n", dt_name.c_str(), CRITERION_NAME[default_criterion], CRITERION_NAME[other_criterion]); + ASSERT((tree_params.split_criterion >= default_criterion) && (tree_params.split_criterion <= last_criterion), + "Unsupported criterion %s\n", CRITERION_NAME[tree_params.split_criterion]); plant(handle.getImpl(), data, ncols, nrows, labels, rowids, n_sampled_rows, unique_labels, tree_params.max_depth, tree_params.max_leaves, tree_params.max_features, tree_params.n_bins, tree_params.split_algo, @@ -367,7 +385,7 @@ template void DecisionTreeClassifier::fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels, DecisionTreeParams tree_params) { - this->base_fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows, unique_labels, tree_params, CRITERION::GINI, CRITERION::ENTROPY, "Classifier"); + this->base_fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows, unique_labels, tree_params, true); } template @@ -415,7 +433,7 @@ void DecisionTreeClassifier::find_best_fruit_all(T *data, int *labels, const template void DecisionTreeRegressor::fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids, const int n_sampled_rows, DecisionTreeParams tree_params) { - this->base_fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows, 1, tree_params, CRITERION::MSE, CRITERION::MAE, "Regressor"); + this->base_fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows, 1, tree_params, false); } template diff --git a/cpp/src/decisiontree/decisiontree.h b/cpp/src/decisiontree/decisiontree.h index e77d5c8049..6f23a42f9c 100644 --- a/cpp/src/decisiontree/decisiontree.h +++ b/cpp/src/decisiontree/decisiontree.h @@ -26,6 +26,9 @@ #include namespace ML { + +bool is_dev_ptr(const void *p); + namespace DecisionTree { template @@ -128,7 +131,7 @@ class DecisionTreeBase { virtual void find_best_fruit_all(T *data, L *labels, const float colper, MetricQuestion & ques, float& gain, unsigned int* rowids, const int n_sampled_rows, 
MetricInfo split_info[3], int depth) = 0; void base_fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, L *labels, unsigned int *rowids, - const int n_sampled_rows, int unique_labels, DecisionTreeParams & tree_params, CRITERION default_criterion, CRITERION other_criterion, const std::string & name); + const int n_sampled_rows, int unique_labels, DecisionTreeParams & tree_params, bool is_classifier); public: // Printing utility for high level tree info. diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu index b1ebbe65eb..b5c914065d 100644 --- a/cpp/src/randomforest/randomforest.cu +++ b/cpp/src/randomforest/randomforest.cu @@ -242,6 +242,26 @@ void rf::prepare_fit_per_tree(const ML::cumlHandle_impl& handle, int tree_ } } +template +void rf::error_checking(const T * input, L * predictions, int n_rows, int n_cols, bool predict) const { + + if (predict) { + ASSERT(get_trees_ptr(), "Cannot predict! No trees in the forest."); + ASSERT(predictions != nullptr, "Error! User has not allocated memory for predictions."); + } else { + ASSERT(!get_trees_ptr(), "Cannot fit an existing forest."); + } + ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows); + ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); + + bool input_is_dev_ptr = is_dev_ptr(input); + bool preds_is_dev_ptr = is_dev_ptr(predictions); + + if (!input_is_dev_ptr || (input_is_dev_ptr != preds_is_dev_ptr)) { + ASSERT(false, "RF Error: Expected both input and labels/predictions to be GPU pointers"); + } +} + /** * @brief Construct rfClassifier object. * @tparam T: data type for input data (float or double). @@ -284,9 +304,7 @@ const DecisionTree::DecisionTreeClassifier * rfClassifier::get_trees_ptr() template void rfClassifier::fit(const cumlHandle& user_handle, T * input, int n_rows, int n_cols, int * labels, int n_unique_labels) { - ASSERT(!trees, "Cannot fit an existing forest."); - ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows); - ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); + this->error_checking(input, labels, n_rows, n_cols, false); trees = new DecisionTree::DecisionTreeClassifier[this->rf_params.n_trees]; @@ -344,15 +362,17 @@ void rfClassifier::fit(const cumlHandle& user_handle, T * input, int n_rows, template void rfClassifier::predict(const cumlHandle& user_handle, const T * input, int n_rows, int n_cols, int * predictions, bool verbose) const { - ASSERT(trees, "Cannot predict! No trees in the forest."); - ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows); - ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); - ASSERT(predictions != nullptr, "Error! 
User has not allocated memory for predictions."); + this->error_checking(input, predictions, n_rows, n_cols, true); std::vector h_predictions(n_rows); const cumlHandle_impl& handle = user_handle.getImpl(); cudaStream_t stream = user_handle.getStream(); + std::vector h_input(n_rows * n_cols); + MLCommon::updateHost(h_input.data(), input, n_rows * n_cols, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + int row_size = n_cols; for (int row_id = 0; row_id < n_rows; row_id++) { @@ -360,7 +380,7 @@ void rfClassifier::predict(const cumlHandle& user_handle, const T * input, in if (verbose) { std::cout << "\n\n"; std::cout << "Predict for sample: "; - for (int i = 0; i < n_cols; i++) std::cout << input[row_id*row_size + i] << ", "; + for (int i = 0; i < n_cols; i++) std::cout << h_input[row_id*row_size + i] << ", "; std::cout << std::endl; } @@ -376,7 +396,7 @@ void rfClassifier::predict(const cumlHandle& user_handle, const T * input, in trees[i].print(); } int prediction; - trees[i].predict(user_handle, &input[row_id * row_size], 1, n_cols, &prediction, verbose); + trees[i].predict(user_handle, &h_input[row_id * row_size], 1, n_cols, &prediction, verbose); ret = prediction_to_cnt.insert(std::pair(prediction, 1)); if (!(ret.second)) { ret.first->second += 1; @@ -461,9 +481,7 @@ const DecisionTree::DecisionTreeRegressor * rfRegressor::get_trees_ptr() c template void rfRegressor::fit(const cumlHandle& user_handle, T * input, int n_rows, int n_cols, T * labels) { - ASSERT(!trees, "Cannot fit an existing forest."); - ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows); - ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); + this->error_checking(input, labels, n_rows, n_cols, false); trees = new DecisionTree::DecisionTreeRegressor[this->rf_params.n_trees]; @@ -521,15 +539,16 @@ void rfRegressor::fit(const cumlHandle& user_handle, T * input, int n_rows, i template void rfRegressor::predict(const cumlHandle& user_handle, const T * input, int n_rows, int n_cols, T * predictions, bool verbose) const { - ASSERT(trees, "Cannot predict! No trees in the forest."); - ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows); - ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols); - ASSERT(predictions != nullptr, "Error! User has not allocated memory for predictions."); + this->error_checking(input, predictions, n_rows, n_cols, true); std::vector h_predictions(n_rows); const cumlHandle_impl& handle = user_handle.getImpl(); cudaStream_t stream = user_handle.getStream(); + std::vector h_input(n_rows * n_cols); + MLCommon::updateHost(h_input.data(), input, n_rows * n_cols, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + int row_size = n_cols; for (int row_id = 0; row_id < n_rows; row_id++) { @@ -537,7 +556,7 @@ void rfRegressor::predict(const cumlHandle& user_handle, const T * input, int if (verbose) { std::cout << "\n\n"; std::cout << "Predict for sample: "; - for (int i = 0; i < n_cols; i++) std::cout << input[row_id*row_size + i] << ", "; + for (int i = 0; i < n_cols; i++) std::cout << h_input[row_id*row_size + i] << ", "; std::cout << std::endl; } @@ -550,7 +569,7 @@ void rfRegressor::predict(const cumlHandle& user_handle, const T * input, int trees[i].print(); } T prediction; - trees[i].predict(user_handle, &input[row_id * row_size], 1, n_cols, &prediction, verbose); + trees[i].predict(user_handle, &h_input[row_id * row_size], 1, n_cols, &prediction, verbose); sum_predictions += prediction; } // Random forest's prediction is the arithmetic mean of all its decision tree predictions. 
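Caller-side sketch of the new contract: the forest-level predict expects device pointers for both input and predictions, and stages host copies internally because the per-tree predict still walks rows on the CPU. The include path and buffer names here are illustrative:

    #include <cuda_runtime.h>
    #include <vector>
    #include "randomforest/randomforest.h"  // assumed include path

    template <typename T>
    void predict_on_device(const ML::cumlHandle &handle,
                           const ML::rfRegressor<T> *rf,
                           const std::vector<T> &h_test, int n_rows, int n_cols) {
        T *d_test = nullptr;
        T *d_preds = nullptr;
        cudaMalloc(&d_test, h_test.size() * sizeof(T));
        cudaMalloc(&d_preds, n_rows * sizeof(T));
        cudaMemcpy(d_test, h_test.data(), h_test.size() * sizeof(T),
                   cudaMemcpyHostToDevice);
        // Passing h_test.data() here instead would trip error_checking's
        // "Expected both input and labels/predictions to be GPU pointers" assert.
        rf->predict(handle, d_test, n_rows, n_cols, d_preds, false);
        cudaFree(d_test);
        cudaFree(d_preds);
    }
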
diff --git a/cpp/src/randomforest/randomforest.h b/cpp/src/randomforest/randomforest.h index b914373d23..72e25f3d1a 100644 --- a/cpp/src/randomforest/randomforest.h +++ b/cpp/src/randomforest/randomforest.h @@ -36,7 +36,7 @@ struct RF_metrics { // Classification metrics float accuracy = -1.0f; - // Regression metrics - TODO FIXME change the type? + // Regression metrics double mean_abs_error = -1.0; double mean_squared_error = -1.0; double median_abs_error = -1.0; @@ -94,6 +94,8 @@ class rf { void prepare_fit_per_tree(const ML::cumlHandle_impl& handle, int tree_id, int n_rows, int n_sampled_rows, unsigned int * selected_rows, unsigned int * sorted_selected_rows, char * rows_temp_storage, size_t temp_storage_bytes); + void error_checking(const T * input, L * predictions, int n_rows, int n_cols, bool is_predict) const; + public: rf(RF_params cfg_rf_params, int cfg_rf_type=RF_type::CLASSIFICATION); diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index c9a65fa750..74a3fc0081 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -96,9 +96,11 @@ protected: int inference_data_len = params.n_inference_rows * params.n_cols; inference_data_h = {30.0, 10.0, 1.0, 20.0, 2.0, 10.0, 0.0, 40.0}; inference_data_h.resize(inference_data_len); + allocate(inference_data_d, inference_data_len); + updateDevice(inference_data_d, inference_data_h.data(), data_len, stream); // Predict and compare against known labels - RF_metrics tmp = cross_validate(handle, rf_classifier, inference_data_h.data(), labels, + RF_metrics tmp = cross_validate(handle, rf_classifier, inference_data_d, labels, params.n_inference_rows, params.n_cols, predicted_labels, false); accuracy = tmp.accuracy; @@ -120,13 +122,14 @@ protected: CUDA_CHECK(cudaFree(labels)); CUDA_CHECK(cudaFree(predicted_labels)); CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(inference_data_d)); delete rf_classifier; } protected: RfInputs params; - T * data; + T * data, * inference_data_d; int * labels; std::vector inference_data_h; std::vector labels_h; @@ -186,11 +189,11 @@ protected: int inference_data_len = params.n_inference_rows * params.n_cols; inference_data_h = {0.0, 10.0, 0.0, 20.0, 0.0, 30.0, 0.0, 40.0}; inference_data_h.resize(inference_data_len); + allocate(inference_data_d, inference_data_len); + updateDevice(inference_data_d, inference_data_h.data(), data_len, stream); - //TODO FIXME stream - // Predict and compare against known labels - RF_metrics tmp = cross_validate(handle, rf_regressor, inference_data_h.data(), labels, + RF_metrics tmp = cross_validate(handle, rf_regressor, inference_data_d, labels, params.n_inference_rows, params.n_cols, predicted_labels, false); mse = tmp.mean_squared_error; @@ -210,13 +213,14 @@ protected: CUDA_CHECK(cudaFree(labels)); CUDA_CHECK(cudaFree(predicted_labels)); CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(inference_data_d)); delete rf_regressor; } protected: RfInputs params; - T * data; + T * data, * inference_data_d; T * labels; std::vector inference_data_h; std::vector labels_h; From babb624fc487468ae48f4c7efc4f4d7e2232f6ee Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Wed, 12 Jun 2019 10:34:33 -0700 Subject: [PATCH 35/51] Added unit-tests for Accuracy score. - Also updated min_rows_per_node boundary check (>= 2). 
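The expected accuracy in these tests is pinned analytically: if changed_n of n predictions are perturbed, accuracy_score must return exactly (n - changed_n) / n, e.g. (1000 - 80) / 1000 = 0.92 for the {1000, 80} case, which is why the assertions can use == rather than an approximate comparison. A host-side sketch of the definition being tested (not the prim's implementation; names are illustrative):

	#include <vector>

	// Accuracy = fraction of predictions that exactly match the reference.
	template <typename T>
	float host_accuracy_ref(const std::vector<T> &pred, const std::vector<T> &ref) {
		int correct = 0;
		for (size_t i = 0; i < pred.size(); i++) {
			if (pred[i] == ref[i]) correct++;
		}
		return static_cast<float>(correct) / pred.size();
	}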
---
 cpp/src/decisiontree/decisiontree.cu |   2 +-
 cpp/test/prims/score.cu              | 104 ++++++++++++++++++++++++++-
 2 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu
index 167e557d02..8d9e0c21f7 100644
--- a/cpp/src/decisiontree/decisiontree.cu
+++ b/cpp/src/decisiontree/decisiontree.cu
@@ -85,7 +85,7 @@ void DecisionTreeParams::validity_check() const {
 	ASSERT((n_bins > 0), "Invalid n_bins %d", n_bins);
 	ASSERT((split_algo >= 0) && (split_algo < SPLIT_ALGO::SPLIT_ALGO_END), "split_algo value %d outside permitted [0, %d) range", split_algo, SPLIT_ALGO::SPLIT_ALGO_END);
-	ASSERT((min_rows_per_node > 0), "Invalid min # rows per node %d", min_rows_per_node);
+	ASSERT((min_rows_per_node >= 2), "Invalid min # rows per node value %d. Should be >= 2.", min_rows_per_node);
 }
 
 /**
diff --git a/cpp/test/prims/score.cu b/cpp/test/prims/score.cu
index de4baa4cd0..32a34ab4e7 100644
--- a/cpp/test/prims/score.cu
+++ b/cpp/test/prims/score.cu
@@ -77,5 +77,107 @@ TEST(ScoreTestLowScore, Result) {
 	CUDA_CHECK(cudaStreamDestroy(stream));
 }
 
-}}
+struct AccuracyInputs {
+	/**
+	 * Number of predictions.
+	 */
+	int n;
+	/**
+	 * Number of predictions w/ different values than their corresponding element in reference predictions.
+	 * Valid range [0, n]. changed_n in [0, n] will yield accuracy of (n - changed_n) / n.
+	 */
+	int changed_n;
+	/**
+	 * Seed for randomly generated predictions.
+	 */
+	unsigned long long int seed;
+};
+
+std::ostream &operator<<(::std::ostream &os, const AccuracyInputs &acc_inputs) {
+	os << "AccuracyInputs are {" << acc_inputs.n << ", " << acc_inputs.changed_n << ", " << acc_inputs.seed << "}" << std::endl;
+	return os;
+}
+
+template <typename T>
+__global__ void change_vals(T * predictions, T * ref_predictions, const int changed_n) {
+	int tid = threadIdx.x + blockIdx.x * blockDim.x;
+	if (tid < changed_n) {
+		predictions[tid] = ref_predictions[tid] + 1; // change first changed_n predictions
+	}
+}
+
+template <typename T>
+class AccuracyTest : public ::testing::TestWithParam<AccuracyInputs> {
+protected:
+
+	void SetUp() override {
+		params = ::testing::TestWithParam<AccuracyInputs>::GetParam();
+		ASSERT((params.changed_n <= params.n) && (params.changed_n >= 0), "Invalid params.");
+
+		Random::Rng r(params.seed);
+		CUDA_CHECK(cudaStreamCreate(&stream));
+		std::shared_ptr<deviceAllocator> d_allocator(new defaultDeviceAllocator);
+
+		allocate(predictions, params.n);
+		allocate(ref_predictions, params.n);
+		r.normal(ref_predictions, params.n, (T) 0.0, (T) 1.0, stream);
+		copyAsync(predictions, ref_predictions, params.n, stream);
+		CUDA_CHECK(cudaStreamSynchronize(stream));
+
+		//Modify params.changed_n unique predictions to a different value. New value is irrelevant.
+		if (params.changed_n > 0) {
+			int threads = 64;
+			int blocks = ceildiv(params.changed_n, threads);
+			//@todo Could also generate params.changed_n unique random positions in [0, n) range, instead of changing the first ones.
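+			// One thread per perturbed element: adding 1 to the reference value
+			// guarantees a mismatch, so exactly changed_n of the n predictions
+			// disagree with ref_predictions.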
+			change_vals<<<blocks, threads, 0, stream>>>(predictions, ref_predictions, params.changed_n);
+			CUDA_CHECK(cudaGetLastError());
+			CUDA_CHECK(cudaStreamSynchronize(stream));
+		}
+
+		computed_accuracy = MLCommon::Score::accuracy_score(predictions, ref_predictions, params.n, d_allocator, stream);
+		ref_accuracy = (params.n - params.changed_n) * 1.0f / params.n;
+		//std::cout << "computed_accuracy is " << computed_accuracy << " ref_accuracy is " << ref_accuracy << std::endl;
+	}
+
+	void TearDown() override {
+		CUDA_CHECK(cudaFree(predictions));
+		CUDA_CHECK(cudaFree(ref_predictions));
+		CUDA_CHECK(cudaStreamDestroy(stream));
+		computed_accuracy = -1.0f;
+		ref_accuracy = -1.0f;
+	}
+
+	AccuracyInputs params;
+	T * predictions, * ref_predictions;
+	float computed_accuracy, ref_accuracy;
+	cudaStream_t stream;
+};
+
+const std::vector<AccuracyInputs> inputs = {
+	{1, 1, 1234ULL},	// single element, wrong prediction
+	{1, 0, 1234ULL},	// single element, perfect prediction
+	{2, 1, 1234ULL},	// multiple elements, 0.5 accuracy
+	{1000, 0, 1234ULL},	// multiple elements, perfect predictions
+	{1000, 1000, 1234ULL},	// multiple elements, no correct predictions
+	{1000, 80, 1234ULL},	// multiple elements, prediction mix
+	{1000, 45, 1234ULL}	// multiple elements, prediction mix
+};
+
+typedef AccuracyTest<float> AccuracyTestF;
+TEST_P(AccuracyTestF, Result) {
+	ASSERT_TRUE(computed_accuracy == ref_accuracy);
+}
+
+typedef AccuracyTest<double> AccuracyTestD;
+TEST_P(AccuracyTestD, Result) {
+	ASSERT_TRUE(computed_accuracy == ref_accuracy);
+}
+
+INSTANTIATE_TEST_CASE_P(AccuracyTests, AccuracyTestF, ::testing::ValuesIn(inputs));
+INSTANTIATE_TEST_CASE_P(AccuracyTests, AccuracyTestD, ::testing::ValuesIn(inputs));
+
+} // end namespace Score
+} // end namespace MLCommon

From 5f4a5fd3a0ced6bf5ab62a5534af36d9de9bd757 Mon Sep 17 00:00:00 2001
From: Vishal Mehta
Date: Thu, 13 Jun 2019 15:14:07 +0200
Subject: [PATCH 36/51] added support for the case where histograms for a
 large number of features do not fit in shared mem

---
 cpp/src/decisiontree/decisiontree.cu               |  12 +-
 cpp/src/decisiontree/kernels/batch_cal.cuh         |  27 +++
 .../kernels/evaluate_classifier.cuh                | 172 +++++++++++-------
 3 files changed, 138 insertions(+), 73 deletions(-)
 create mode 100644 cpp/src/decisiontree/kernels/batch_cal.cuh

diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu
index 8d9e0c21f7..a87c7ad352 100644
--- a/cpp/src/decisiontree/decisiontree.cu
+++ b/cpp/src/decisiontree/decisiontree.cu
@@ -190,13 +190,13 @@ void DecisionTreeBase::plant(const cumlHandle_impl& handle, T *data, const
 	max_shared_mem = prop.sharedMemPerBlock;
 
 	if (split_algo == SPLIT_ALGO::HIST) {
-		shmem_used += 2 * sizeof(T) * ncols;
+		shmem_used += 2 * sizeof(T);
 	}
 	if (typeid(L) == typeid(int)) { // Classification
-		shmem_used += nbins * n_unique_labels * sizeof(int) * ncols;
+		shmem_used += nbins * n_unique_labels * sizeof(int);
 	} else { // Regression
-		shmem_used += nbins * sizeof(T) * ncols * 3;
-		shmem_used += nbins * sizeof(int) * ncols;
+		shmem_used += nbins * sizeof(T) * 3;
+		shmem_used += nbins * sizeof(int);
 	}
 	ASSERT(shmem_used <= max_shared_mem, "Shared memory per block limit %zd , requested %zd \n", max_shared_mem, shmem_used);
 
@@ -410,10 +410,10 @@ void DecisionTreeClassifier::find_best_fruit_all(T *data, int *labels, const
 	if (this->split_criterion == CRITERION::GINI) {
 		best_split_all_cols_classifier(data, rowids, labels, current_nbins, n_sampled_rows, this->n_unique_labels, this->dinfo.NLocalrows, colselector,
-					this->tempmem[0], &split_info[0], ques, gain, this->split_algo);
+
this->tempmem[0], &split_info[0], ques, gain, this->split_algo, this->max_shared_mem); } else { best_split_all_cols_classifier(data, rowids, labels, current_nbins, n_sampled_rows, this->n_unique_labels, this->dinfo.NLocalrows, colselector, - this->tempmem[0], &split_info[0], ques, gain, this->split_algo); + this->tempmem[0], &split_info[0], ques, gain, this->split_algo, this->max_shared_mem); } } diff --git a/cpp/src/decisiontree/kernels/batch_cal.cuh b/cpp/src/decisiontree/kernels/batch_cal.cuh new file mode 100644 index 0000000000..ef0b2302b7 --- /dev/null +++ b/cpp/src/decisiontree/kernels/batch_cal.cuh @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +/* Return max. possible number of columns that can be processed within avail_shared_memory. + Expects that requested_shared_memory is O(ncols) */ +int get_batch_cols_cnt(const size_t avail_shared_memory, const size_t requested_shared_memory, const int ncols) { + int ncols_in_batch = ncols; + int ncols_factor = requested_shared_memory / ncols; + if (requested_shared_memory > avail_shared_memory) { + ncols_in_batch = avail_shared_memory / ncols_factor; // floor div. + } + return ncols_in_batch; +} diff --git a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh index 133cdc9d37..ebedfbe75e 100644 --- a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh +++ b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh @@ -18,6 +18,7 @@ #include #include #include "metric.cuh" +#include "batch_cal.cuh" #include "../memory.h" #include "col_condenser.cuh" #include @@ -29,82 +30,111 @@ column order is as per colids (bootstrapped random cols) for each col there are nbins histograms */ template -__global__ void all_cols_histograms_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const int nbins, const int nrows, const int ncols, const int n_unique_labels, const T* __restrict__ globalminmax, int* histout) { - +__global__ void all_cols_histograms_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const int nbins, const int nrows, const int ncols, const int batch_ncols, const int n_unique_labels, const T* __restrict__ globalminmax, int* histout) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; - T *minmaxshared = (T*)shmem; - int *shmemhist = (int*)(shmem + 2*ncols*sizeof(T)); - - for (int i=threadIdx.x; i < 2*ncols; i += blockDim.x) { - minmaxshared[i] = globalminmax[i]; - } - - for (int i = threadIdx.x; i < n_unique_labels*nbins*ncols; i += blockDim.x) { - shmemhist[i] = 0; - } - - __syncthreads(); - - for (unsigned int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) { - int mycolid = (int)( i / nrows); - int coloffset = mycolid*n_unique_labels*nbins; - - T delta = (minmaxshared[mycolid + ncols] - minmaxshared[mycolid]) / (nbins); - T base_quesval = minmaxshared[mycolid] + delta; + + int colstep = (int)(ncols/batch_ncols); 
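+	// The next two lines round colstep up to ceil(ncols / batch_ncols); the
+	// final, possibly partial, batch is handled by shrinking batchsz in the loop.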
+ if((ncols % batch_ncols) != 0) + colstep++; + + int batchsz = batch_ncols; + for(int k = 0; k < colstep; k++) { + + if(k == (colstep-1) && ( (ncols % batch_ncols) != 0) ) { + batchsz = ncols % batch_ncols; + } + + T *minmaxshared = (T*)shmem; + int *shmemhist = (int*)(shmem + 2*batchsz*sizeof(T)); + + for (int i=threadIdx.x; i < batchsz; i += blockDim.x) { + minmaxshared[i] = globalminmax[k*batch_ncols + i]; + minmaxshared[i + batchsz] = globalminmax[k*batch_ncols + i + ncols]; + } + + for (int i = threadIdx.x; i < n_unique_labels*nbins*batchsz; i += blockDim.x) { + shmemhist[i] = 0; + } - T localdata = data[i]; - int label = labels[ i % nrows ]; - for (int j=0; j < nbins; j++) { - T quesval = base_quesval + j * delta; + __syncthreads(); - if (localdata <= quesval) { - atomicAdd(&shmemhist[label + n_unique_labels * j + coloffset], 1); + for (unsigned int i = tid; i < nrows*batchsz; i += blockDim.x*gridDim.x) { + int mycolid = (int)( i / nrows); + int coloffset = mycolid*n_unique_labels*nbins; + + T delta = (minmaxshared[mycolid + batchsz] - minmaxshared[mycolid]) / (nbins); + T base_quesval = minmaxshared[mycolid] + delta; + + T localdata = data[i + k*batch_ncols*nrows]; + int label = labels[ i % nrows ]; + for (int j=0; j < nbins; j++) { + T quesval = base_quesval + j * delta; + + if (localdata <= quesval) { + atomicAdd(&shmemhist[label + n_unique_labels * j + coloffset], 1); + } } + } - } - - __syncthreads(); - - for (int i = threadIdx.x; i < ncols*n_unique_labels*nbins; i += blockDim.x) { - atomicAdd(&histout[i], shmemhist[i]); + __syncthreads(); + for (int i = threadIdx.x; i < batchsz*n_unique_labels*nbins; i += blockDim.x) { + atomicAdd(&histout[k*batch_ncols*n_unique_labels*nbins + i], shmemhist[i]); + } + + __syncthreads(); } } template -__global__ void all_cols_histograms_global_quantile_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int n_unique_labels, int* histout, const T* __restrict__ quantile) { +__global__ void all_cols_histograms_global_quantile_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int batch_ncols, const int n_unique_labels, int* histout, const T* __restrict__ quantile) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; int *shmemhist = (int*)(shmem); - - for (int i = threadIdx.x; i < n_unique_labels*nbins*ncols; i += blockDim.x) { - shmemhist[i] = 0; - } - - __syncthreads(); - - for (unsigned int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) { - int mycolid = (int)( i / nrows); - int coloffset = mycolid*n_unique_labels*nbins; - - T localdata = data[i]; - int label = labels[ i % nrows ]; - for (int j=0; j < nbins; j++) { - int quantile_index = colids[mycolid] * nbins + j; - T quesval = quantile[quantile_index]; - if (localdata <= quesval) { - atomicAdd(&shmemhist[label + n_unique_labels * j + coloffset], 1); + + int colstep = (int)(ncols/batch_ncols); + if((ncols % batch_ncols) != 0) + colstep++; + + int batchsz = batch_ncols; + for(int k = 0; k < colstep; k++) { + + if(k == (colstep-1) && ( (ncols % batch_ncols) != 0) ) { + batchsz = ncols % batch_ncols; + } + + for (int i = threadIdx.x; i < n_unique_labels*nbins*batchsz; i += blockDim.x) { + shmemhist[i] = 0; + } + + __syncthreads(); + + for (unsigned int i = tid; i < nrows*batchsz; i += blockDim.x*gridDim.x) { + int mycolid = 
(int)( i / nrows); + int coloffset = mycolid*n_unique_labels*nbins; + + T localdata = data[k*batch_ncols*nrows + i]; + int label = labels[ i % nrows ]; + for (int j=0; j < nbins; j++) { + int quantile_index = colids[k*batch_ncols + mycolid] * nbins + j; + T quesval = quantile[quantile_index]; + if (localdata <= quesval) { + atomicAdd(&shmemhist[label + n_unique_labels * j + coloffset], 1); + } } + } - - } - - __syncthreads(); - - for (int i = threadIdx.x; i < ncols*n_unique_labels*nbins; i += blockDim.x) { - atomicAdd(&histout[i], shmemhist[i]); + + __syncthreads(); + + for (int i = threadIdx.x; i < batchsz*n_unique_labels*nbins; i += blockDim.x) { + atomicAdd(&histout[k*batch_ncols*n_unique_labels*nbins + i], shmemhist[i]); + } + + __syncthreads(); } } @@ -192,7 +222,7 @@ void find_best_split_classifier(const std::shared_ptr> tem template -void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, const L *labels, const int nbins, const int nrows, const int n_unique_labels, const int rowoffset, const std::vector& colselector, const std::shared_ptr> tempmem, MetricInfo split_info[3], MetricQuestion & ques, float & gain, const int split_algo) +void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, const L *labels, const int nbins, const int nrows, const int n_unique_labels, const int rowoffset, const std::vector& colselector, const std::shared_ptr> tempmem, MetricInfo split_info[3], MetricQuestion & ques, float & gain, const int split_algo, const size_t max_shared_mem) { unsigned int* d_colids = tempmem->d_colids->data(); T* d_globalminmax = tempmem->d_globalminmax->data(); @@ -225,17 +255,25 @@ void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, c CUDA_CHECK(cudaGetLastError()); L *labelptr = tempmem->sampledlabels->data(); get_sampled_labels(labels, labelptr, rowids, nrows, tempmem->stream); - - shmemsize = n_hist_bytes; - + + int batch_ncols; + size_t shmem_needed = ncols * n_unique_labels * nbins * sizeof(int); + if(split_algo == ML::SPLIT_ALGO::HIST) + shmem_needed = ncols * 2 * sizeof(T); + + batch_ncols = get_batch_cols_cnt(max_shared_mem, shmem_needed, ncols); + + shmemsize = batch_ncols * n_unique_labels * nbins * sizeof(int); + blocks = MLCommon::ceildiv(batch_ncols * nrows, threads); + if (split_algo == ML::SPLIT_ALGO::HIST) { - shmemsize += col_minmax_bytes; - all_cols_histograms_kernel_class<<stream>>>(tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, n_unique_labels, d_globalminmax, d_histout); + shmemsize += 2 * batch_ncols * sizeof(T); + all_cols_histograms_kernel_class<<stream>>>(tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, batch_ncols, n_unique_labels, d_globalminmax, d_histout); } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { - all_cols_histograms_global_quantile_kernel_class<<stream>>>(tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols, n_unique_labels, d_histout, tempmem->d_quantile->data()); + all_cols_histograms_global_quantile_kernel_class<<stream>>>(tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols, batch_ncols, n_unique_labels, d_histout, tempmem->d_quantile->data()); } CUDA_CHECK(cudaGetLastError()); - + MLCommon::updateHost(h_histout, d_histout, n_hist_elements, tempmem->stream); CUDA_CHECK(cudaStreamSynchronize(tempmem->stream)); From ae744ad90ca68555201c063b13736d03ea55f07c Mon Sep 17 00:00:00 2001 From: Vishal Mehta Date: Thu, 13 Jun 2019 15:21:55 +0200 Subject: [PATCH 37/51] fixed missing plus sign --- 
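Note on the fix: for the HIST path the shared-memory estimate must cover both the per-column min/max cache and the class histograms, and assigning with = instead of accumulating with += silently dropped the histogram bytes. A worked check of the corrected arithmetic, assuming float data, nbins = 8, and n_unique_labels = 4: the histograms cost n_unique_labels * nbins * sizeof(int) = 4 * 8 * 4 = 128 bytes per column and the min/max cache costs 2 * sizeof(float) = 8 bytes per column, so get_batch_cols_cnt() should size batches against 136 bytes per column rather than 8.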
cpp/src/decisiontree/kernels/evaluate_classifier.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh index ebedfbe75e..6664bebc88 100644 --- a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh +++ b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh @@ -259,7 +259,7 @@ void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, c int batch_ncols; size_t shmem_needed = ncols * n_unique_labels * nbins * sizeof(int); if(split_algo == ML::SPLIT_ALGO::HIST) - shmem_needed = ncols * 2 * sizeof(T); + shmem_needed += ncols * 2 * sizeof(T); batch_ncols = get_batch_cols_cnt(max_shared_mem, shmem_needed, ncols); From fae84feb2bcbf731f1f5bc2318adcc770523ad15 Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Thu, 13 Jun 2019 07:23:09 -0700 Subject: [PATCH 38/51] Added unit-tests for regression metrics --- cpp/test/prims/score.cu | 159 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 158 insertions(+), 1 deletion(-) diff --git a/cpp/test/prims/score.cu b/cpp/test/prims/score.cu index 32a34ab4e7..d66b1eb1ef 100644 --- a/cpp/test/prims/score.cu +++ b/cpp/test/prims/score.cu @@ -19,6 +19,7 @@ #include "random/rng.h" #include "test_utils.h" #include +#include namespace MLCommon { namespace Score { @@ -77,6 +78,8 @@ TEST(ScoreTestLowScore, Result) { CUDA_CHECK(cudaStreamDestroy(stream)); } +// Tests for accuracy_score + struct AccuracyInputs { /** * Number of predictions. @@ -175,9 +178,163 @@ TEST_P(AccuracyTestD, Result) { } INSTANTIATE_TEST_CASE_P(AccuracyTests, AccuracyTestF, ::testing::ValuesIn(inputs)); - INSTANTIATE_TEST_CASE_P(AccuracyTests, AccuracyTestD, ::testing::ValuesIn(inputs)); +// Tests for regression_metrics + +template +struct RegressionInputs { + T tolerance; + int n; // number of predictions + bool hardcoded_preds; // (hardcoded_preds) ? use predictions, ref_predictions : use randomly generated arrays. 
+ std::vector predictions; + std::vector ref_predictions; + T predictions_range[2]; // predictions in predictions_range if not hardcoded_preds + T ref_predictions_range[2]; // predictions in ref_predictions_range if not hardcoded_preds + unsigned long long int seed; +}; + +template +std::ostream &operator<<(std::ostream &os, const RegressionInputs ®_inputs) { + os << "RegressionInputs are {" << reg_inputs.tolerance << ", " << reg_inputs.n << ", " << reg_inputs.hardcoded_preds << ", "; + if (reg_inputs.hardcoded_preds) { + os << "{"; + for (int i = 0; i < reg_inputs.n; i++) os << reg_inputs.predictions[i] << ", "; + os << "}, {"; + for (int i = 0; i < reg_inputs.n; i++) os << reg_inputs.ref_predictions[i] << ", "; + os << "}"; + os << "{" << reg_inputs.predictions_range[0] << ", " << reg_inputs.predictions_range[1] << "}, "; + os << "{" << reg_inputs.ref_predictions_range[0] << ", " << reg_inputs.ref_predictions_range[1] << "}"; + } else { + os << "{}, {}, {}, {}"; + } + os << ", " << reg_inputs.seed; + return os; +} + +template +void host_regression_computations(std::vector &predictions, std::vector &ref_predictions, const int n, std::vector ®ression_metrics) { + + double abs_difference_sum = 0; + double mse_sum = 0; + std::vector abs_diffs(n); + + for (int i = 0; i < n; i++) { + double abs_diff = abs(predictions[i] - ref_predictions[i]); + abs_difference_sum += abs_diff; + mse_sum += pow(predictions[i] - ref_predictions[i], 2); + abs_diffs[i] = abs_diff; + } + + regression_metrics[0] = abs_difference_sum / n; + regression_metrics[1] = mse_sum / n; + + std::sort(abs_diffs.begin(), abs_diffs.end()); + int middle = n / 2; + if (n % 2 == 1) { + regression_metrics[2] = abs_diffs[middle]; + } else { + regression_metrics[2] = (abs_diffs[middle] + abs_diffs[middle - 1]) / 2; + } +} + +template +class RegressionMetricsTest : public ::testing::TestWithParam> { +protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + computed_regression_metrics.assign(3, -1.0); + ref_regression_metrics.assign(3, -1.0); + + CUDA_CHECK(cudaStreamCreate(&stream)); + std::shared_ptr d_allocator(new defaultDeviceAllocator); + + allocate(d_predictions, params.n); + allocate(d_ref_predictions, params.n); + + if (params.hardcoded_preds) { + updateDevice(d_predictions, params.predictions.data(), params.n, stream); + updateDevice(d_ref_predictions, params.ref_predictions.data(), params.n, stream); + } else { + params.predictions.resize(params.n); + params.ref_predictions.resize(params.n); + Random::Rng r(params.seed); + // randomly generate arrays + r.uniform(d_predictions, params.n, params.predictions_range[0], params.predictions_range[1], stream); + r.uniform(d_ref_predictions, params.n, params.ref_predictions_range[0], params.ref_predictions_range[1], stream); + // copy to host to compute reference regression metrics + updateHost(params.predictions.data(), d_predictions, params.n, stream); + updateHost(params.ref_predictions.data(), d_ref_predictions, params.n, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + + MLCommon::Score::regression_metrics(d_predictions, d_ref_predictions, params.n, d_allocator, stream, + computed_regression_metrics[0], computed_regression_metrics[1], computed_regression_metrics[2]); + + host_regression_computations(params.predictions, params.ref_predictions, params.n, ref_regression_metrics); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaFree(d_predictions)); + 
CUDA_CHECK(cudaFree(d_ref_predictions)); + } + + RegressionInputs params; + T * d_predictions, * d_ref_predictions; + std::vector computed_regression_metrics; + std::vector ref_regression_metrics; + cudaStream_t stream; +}; + +const std::vector> regression_inputs_float = { + {0.00001f, 1, true, {10.2f}, {20.2f}, {}, {}, 1234ULL}, // single element + {0.00001f, 2, true, {10.2f, 40.2f}, {20.2f, 80.2f}, {}, {}, 1234ULL}, // two elements, mean same as median + // next three inputs should result in identical regression metrics values + {0.00001f, 6, true, {10.5f, 20.5f, 30.5f, 40.5f, 50.5f, 60.5f}, {20.5f, 40.5f, 55.5f, 80.5f, 100.5f, 120.5f}, {}, {}, 1234ULL}, // diffs all negative, reverse sorted + {0.00001f, 6, true, {20.5f, 40.5f, 55.5f, 80.5f, 100.5f, 120.5f}, {10.5f, 20.5f, 30.5f, 40.5f, 50.5f, 60.5f}, {}, {}, 1234ULL}, // diffs all positive, already sorted + {0.00001f, 6, true, {40.5f, 55.5f, 20.5f, 120.5f, 100.5f, 80.5f}, {20.5f, 30.5f, 10.5f, 60.5f, 50.5f, 40.5f}, {}, {}, 1234ULL}, // mix + {0.00001f, 6, true, {10.5f, 20.5f, 30.5f, 40.5f, 50.5f, 60.5f}, {10.5f, 20.5f, 30.5f, 40.5f, 50.5f, 60.5f}, {}, {}, 1234ULL}, // identical predictions (0 error) + {0.00001f, 6, true, {10.5f, 20.5f, 30.5f, 40.5f, 50.5f, 60.5f}, {20.5f, 30.5f, 40.5f, 50.5f, 60.5f, 70.5f}, {}, {}, 1234ULL}, // predictions[i] - ref_predictions[i] const for each i + {0.00001f, 2048, false, {}, {}, {-2048.0f, 2048.0f}, {-2048.0f, 2048.0f}, 1234ULL}, // random mix, even number of elements + {0.00001f, 2049, false, {}, {}, {-2048.0f, 2048.0f}, {-2048.0f, 2048.0f}, 1234ULL}, // random mix, odd number of elements + {0.00001f, 1024, false, {}, {}, {0.0f, 2048.0f}, {8192.0f, 16384.0f}, 1234ULL}, // random mix, diffs are all negative + {0.00001f, 1024, false, {}, {}, {8192.0f, 16384.0f}, {0.0f, 2048.0f}, 1234ULL} // random mix, diffs are all positive +}; + +const std::vector> regression_inputs_double = { + {0.0000001, 1, true, {10.2}, {20.2}, {}, {}, 1234ULL}, // single element + {0.0000001, 2, true, {10.2, 40.2}, {20.2, 80.2}, {}, {}, 1234ULL}, // two elements + {0.0000001, 6, true, {10.5, 20.5, 30.5, 40.5, 50.5, 60.5}, {20.5, 40.5, 55.5, 80.5, 100.5, 120.5}, {}, {}, 1234ULL}, // diffs all negative, reverse sorted + {0.0000001, 6, true, {20.5, 40.5, 55.5, 80.5, 100.5, 120.5}, {10.5, 20.5, 30.5, 40.5, 50.5, 60.5}, {}, {}, 1234ULL}, // diffs all positive, already sorted + {0.0000001, 6, true, {40.5, 55.5, 20.5, 120.5, 100.5, 80.5}, {20.5, 30.5, 10.5, 60.5, 50.5, 40.5}, {}, {}, 1234ULL}, // mix + {0.0000001, 6, true, {10.5, 20.5, 30.5, 40.5, 50.5, 60.5}, {10.5, 20.5, 30.5, 40.5, 50.5, 60.5}, {}, {}, 1234ULL}, // identical predictions (0 error) + {0.0000001, 6, true, {10.5, 20.5, 30.5, 40.5, 50.5, 60.5}, {20.5, 30.5, 40.5, 50.5, 60.5, 70.5}, {}, {}, 1234ULL}, // predictions[i] - ref_predictions[i] const for each i + {0.0000001, 2048, false, {}, {}, {-2048.0, 2048.0}, {-2048.0, 2048.0}, 1234ULL}, // random mix, even number of elements + {0.0000001, 2049, false, {}, {}, {-2048.0, 2048.0}, {-2048.0, 2048.0}, 1234ULL}, // random mix, odd number of elements + {0.0000001, 1024, false, {}, {}, {0, 2048}, {8192.0, 16384.0}, 1234ULL}, // random mix, diffs are all negative + {0.0000001, 1024, false, {}, {}, {8192.0, 16384.0}, {0.0, 2048}, 1234ULL} // random mix, diffs are all positive +}; + + +typedef RegressionMetricsTest RegressionMetricsTestF; +TEST_P(RegressionMetricsTestF, Result) { + for (int i = 0; i < 3; i++) { + ASSERT_TRUE(match(computed_regression_metrics[i], ref_regression_metrics[i], CompareApprox(params.tolerance))); + } +} + 
+typedef RegressionMetricsTest<double> RegressionMetricsTestD;
+TEST_P(RegressionMetricsTestD, Result) {
+	for (int i = 0; i < 3; i++) {
+		ASSERT_TRUE(match(computed_regression_metrics[i], ref_regression_metrics[i], CompareApprox<double>(params.tolerance)));
+	}
+}
+
+INSTANTIATE_TEST_CASE_P(RegressionMetricsTests, RegressionMetricsTestF, ::testing::ValuesIn(regression_inputs_float));
+INSTANTIATE_TEST_CASE_P(RegressionMetricsTests, RegressionMetricsTestD, ::testing::ValuesIn(regression_inputs_double));
+
 } // end namespace Score
 } // end namespace MLCommon

From 6f604cce75a0d63e3271c052eeee0c6d1d0948fc Mon Sep 17 00:00:00 2001
From: Vishal Mehta
Date: Thu, 13 Jun 2019 17:19:57 +0200
Subject: [PATCH 39/51] limit max blocks for classifier kernels

---
 cpp/src/decisiontree/kernels/evaluate_classifier.cuh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
index 6664bebc88..826b3355d9 100644
--- a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
+++ b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
@@ -265,7 +265,9 @@ void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, c
 
 	shmemsize = batch_ncols * n_unique_labels * nbins * sizeof(int);
 	blocks = MLCommon::ceildiv(batch_ncols * nrows, threads);
-
+	if (blocks > 65536)
+		blocks = 65536;
+
 	if (split_algo == ML::SPLIT_ALGO::HIST) {
 		shmemsize += 2 * batch_ncols * sizeof(T);
 		all_cols_histograms_kernel_class<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, batch_ncols, n_unique_labels, d_globalminmax, d_histout);

From 18ef5b2d1774b1bf09c53a99d1ff2ff12ec6bcbe Mon Sep 17 00:00:00 2001
From: Myrto Papadopoulou
Date: Fri, 14 Jun 2019 03:22:10 -0700
Subject: [PATCH 40/51] Added support for wider datasets for minmax prim.

- Added batching support to minmaxKernel to enable datasets with a large
  number of features.
- Added an extra testcase where ncols wouldn't previously fit in the
  available shared memory.
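The batch size here is driven by the kernel's only per-column shared-memory cost: two values of type T, the running min and max. A sketch of the sizing arithmetic, assuming a GPU with 48 KB of shared memory per block and float data:

	// batch_ncols = min(ncols, max_shared_mem / (2 * sizeof(T)))
	//             = min(ncols, 49152 / 8) = min(ncols, 6144)
	// num_batches = ceildiv(ncols, batch_ncols)

Under that assumption the new ncols = 8192 testcase runs as two batches (6144 + 2048 columns), where the unbatched kernel would have requested 64 KB of shared memory and failed to launch.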
--- cpp/src_prims/stats/minmax.h | 88 ++++++++++++++++++++++++------------ cpp/test/prims/minmax.cu | 7 ++- 2 files changed, 65 insertions(+), 30 deletions(-) diff --git a/cpp/src_prims/stats/minmax.h b/cpp/src_prims/stats/minmax.h index b17b0ab00b..d0abdbeb5c 100644 --- a/cpp/src_prims/stats/minmax.h +++ b/cpp/src_prims/stats/minmax.h @@ -38,39 +38,58 @@ template __global__ void minmaxKernel(const T* data, const unsigned int* rowids, const unsigned int* colids, int nrows, int ncols, int row_stride, T* g_min, T* g_max, T* sampledcols, - T init_min_val) { + T init_min_val, int batch_ncols, int num_batches) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; T *s_min = (T*)shmem; - T *s_max = (T*)(shmem + sizeof(T) * ncols); - for (int i = threadIdx.x; i < ncols; i += blockDim.x) { - s_min[i] = init_min_val; - s_max[i] = -init_min_val; + T *s_max = (T*)(shmem + sizeof(T) * batch_ncols); + + int last_batch_ncols = ncols % batch_ncols; + if (last_batch_ncols == 0) { + last_batch_ncols = batch_ncols; } - __syncthreads(); - for (int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) { - int col = i / nrows; - int row = i % nrows; - if(colids != nullptr) { - col = colids[col]; + int orig_batch_ncols = batch_ncols; + + for (int batch_id = 0; batch_id < num_batches; batch_id++) { + if (batch_id == num_batches - 1) { + batch_ncols = last_batch_ncols; } - if(rowids != nullptr) { - row = rowids[row]; + + for (int i = threadIdx.x; i < batch_ncols; i += blockDim.x) { + s_min[i] = init_min_val; + s_max[i] = -init_min_val; } - int index = row + col * row_stride; - T coldata = data[index]; - //Min max values are saved in shared memory and global memory as per the shuffled colids. - myAtomicMin(&s_min[(int)(i / nrows)], coldata); - myAtomicMax(&s_max[(int)(i / nrows)], coldata); - if(sampledcols != nullptr) { - sampledcols[i] = coldata; + __syncthreads(); + + for (int i = tid; i < nrows * batch_ncols; i += blockDim.x*gridDim.x) { + + int col = (batch_id * orig_batch_ncols) + (i / nrows); + int row = i % nrows; + if (colids != nullptr) { + col = colids[col]; + } + if (rowids != nullptr) { + row = rowids[row]; + } + int index = row + col * row_stride; + T coldata = data[index]; + + //Min max values are saved in shared memory and global memory as per the shuffled colids. + myAtomicMin(&s_min[(int)(i / nrows)], coldata); + myAtomicMax(&s_max[(int)(i / nrows)], coldata); + if (sampledcols != nullptr) { + sampledcols[batch_id * orig_batch_ncols + i] = coldata; + } } - } - __syncthreads(); - // finally, perform global mem atomics - for (int j = threadIdx.x; j < ncols; j+= blockDim.x) { - myAtomicMin(&g_min[j], s_min[j]); - myAtomicMax(&g_max[j], s_max[j]); + __syncthreads(); + + // finally, perform global mem atomics + for (int j = threadIdx.x; j < batch_ncols; j+= blockDim.x) { + myAtomicMin(&g_min[batch_id * orig_batch_ncols + j], s_min[j]); + myAtomicMax(&g_max[batch_id * orig_batch_ncols + j], s_max[j]); + } + __syncthreads(); } } @@ -93,7 +112,7 @@ __global__ void minmaxKernel(const T* data, const unsigned int* rowids, * @param globalmin final col-wise global minimum (size = ncols) * @param globalmax final col-wise global maximum (size = ncols) * @param sampledcols output sampled data. Pass nullptr if you don't need this - * @param init_val initial minimum value to be + * @param init_val initial minimum value to be * @param stream: cuda stream * @note This method makes the following assumptions: * 1. 
input and output matrices are assumed to be col-major @@ -112,9 +131,22 @@ void minmax(const T* data, const unsigned int* rowids, const unsigned int* colid nblks = ceildiv(nrows * ncols, TPB); nblks = min(nblks, 65536); size_t smemSize = sizeof(T) * 2 * ncols; + + // Get available shared memory size. + cudaDeviceProp prop; + int dev_ID = 0; + CUDA_CHECK(cudaGetDevice(&dev_ID)); + CUDA_CHECK(cudaGetDeviceProperties(&prop, dev_ID)); + size_t max_shared_mem = prop.sharedMemPerBlock; + + // Compute the batch_ncols, in [1, ncols] range, that meet the available shared memory constraints. + int batch_ncols = min(ncols, (int) (max_shared_mem / (sizeof(T) * 2))); + int num_batches = ceildiv(ncols, batch_ncols); + smemSize = sizeof(T) * 2 * batch_ncols; + minmaxKernel<<>>( data, rowids, colids, nrows, ncols, row_stride, globalmin, globalmax, - sampledcols, init_val); + sampledcols, init_val, batch_ncols, num_batches); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/test/prims/minmax.cu b/cpp/test/prims/minmax.cu index d99e983438..bfbe62ca36 100644 --- a/cpp/test/prims/minmax.cu +++ b/cpp/test/prims/minmax.cu @@ -124,7 +124,8 @@ const std::vector> inputsf = { {0.00001f, 8192, 128, 1234ULL}, {0.00001f, 8192, 256, 1234ULL}, {0.00001f, 8192, 512, 1234ULL}, - {0.00001f, 8192, 1024, 1234ULL}}; + {0.00001f, 8192, 1024, 1234ULL}, + {0.00001f, 1024, 8192, 1234ULL}}; const std::vector> inputsd = { {0.0000001, 1024, 32, 1234ULL}, @@ -144,7 +145,9 @@ const std::vector> inputsd = { {0.0000001, 8192, 128, 1234ULL}, {0.0000001, 8192, 256, 1234ULL}, {0.0000001, 8192, 512, 1234ULL}, - {0.0000001, 8192, 1024, 1234ULL}}; + {0.0000001, 8192, 1024, 1234ULL}, + {0.0000001, 1024, 8192, 1234ULL}}; + typedef MinMaxTest MinMaxTestF; TEST_P(MinMaxTestF, Result) { From 4ab84a22f49024de7251d25049ed8b7151d72bc0 Mon Sep 17 00:00:00 2001 From: Vishal Mehta Date: Fri, 14 Jun 2019 13:35:15 +0200 Subject: [PATCH 41/51] loop around regressor kernels for large number of features --- cpp/src/decisiontree/decisiontree.cu | 6 +- .../kernels/evaluate_classifier.cuh | 5 +- .../kernels/evaluate_regressor.cuh | 373 +++++++++++------- 3 files changed, 239 insertions(+), 145 deletions(-) diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu index a87c7ad352..2388e77daa 100644 --- a/cpp/src/decisiontree/decisiontree.cu +++ b/cpp/src/decisiontree/decisiontree.cu @@ -186,7 +186,7 @@ void DecisionTreeBase::plant(const cumlHandle_impl& handle, T *data, const feature_selector.resize((int) (colper * dinfo.Ncols)); cudaDeviceProp prop; - CUDA_CHECK(cudaGetDeviceProperties(&prop, 0)); + CUDA_CHECK(cudaGetDeviceProperties(&prop, handle.getDevice())); max_shared_mem = prop.sharedMemPerBlock; if (split_algo == SPLIT_ALGO::HIST) { @@ -458,10 +458,10 @@ void DecisionTreeRegressor::find_best_fruit_all(T *data, T *labels, const flo if (this->split_criterion == CRITERION::MSE) { best_split_all_cols_regressor(data, rowids, labels, current_nbins, n_sampled_rows, this->dinfo.NLocalrows, colselector, - this->tempmem[0], split_info, ques, gain, this->split_algo); + this->tempmem[0], split_info, ques, gain, this->split_algo, this->max_shared_mem); } else { best_split_all_cols_regressor(data, rowids, labels, current_nbins, n_sampled_rows, this->dinfo.NLocalrows, colselector, - this->tempmem[0], split_info, ques, gain, this->split_algo); + this->tempmem[0], split_info, ques, gain, this->split_algo, this->max_shared_mem); } } diff --git a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh 
b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh index 826b3355d9..db847496ef 100644 --- a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh +++ b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh @@ -49,9 +49,8 @@ __global__ void all_cols_histograms_kernel_class(const T* __restrict__ data, con T *minmaxshared = (T*)shmem; int *shmemhist = (int*)(shmem + 2*batchsz*sizeof(T)); - for (int i=threadIdx.x; i < batchsz; i += blockDim.x) { - minmaxshared[i] = globalminmax[k*batch_ncols + i]; - minmaxshared[i + batchsz] = globalminmax[k*batch_ncols + i + ncols]; + for (int i=threadIdx.x; i < 2*batchsz; i += blockDim.x) { + (i < batchsz) ? (minmaxshared[i] = globalminmax[k*batch_ncols + i] ) : (minmaxshared[i] = globalminmax[k*batch_ncols + (i-batchsz) + ncols]); } for (int i = threadIdx.x; i < n_unique_labels*nbins*batchsz; i += blockDim.x) { diff --git a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh index f7402ec7fa..f56ac7bf7c 100644 --- a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh +++ b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh @@ -19,64 +19,80 @@ #include #include "metric.cuh" #include "../memory.h" +#include "batch_cal.cuh" #include "col_condenser.cuh" #include #include "../algo_helper.h" #include "stats/minmax.h" template -__global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const int nbins, const int nrows, const int ncols, const T* __restrict__ globalminmax, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T pred_parent) { - +__global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const int nbins, const int nrows, const int ncols, const int batch_ncols, const T* __restrict__ globalminmax, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T pred_parent) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; - T *minmaxshared = (T*)shmem; - T *shmem_pred = (T*)(shmem + 2*ncols*sizeof(T)); - T *shmem_mse = (T*)(shmem + 2*ncols*sizeof(T) + nbins*ncols*sizeof(T)); - int *shmem_count = (int*)(shmem + 2*ncols*sizeof(T) + 3*nbins*ncols*sizeof(T)); - - for (int i=threadIdx.x; i < 2*ncols; i += blockDim.x) { - minmaxshared[i] = globalminmax[i]; - } - for (int i = threadIdx.x; i < nbins*ncols; i += blockDim.x) { - shmem_count[i] = countout[i]; - shmem_pred[i] = predout[i]; - shmem_mse[i] = 0.0; - shmem_mse[i + ncols*nbins] = 0.0; - } + int colstep = (int)(ncols/batch_ncols); + if((ncols % batch_ncols) != 0) + colstep++; - __syncthreads(); + int batchsz = batch_ncols; + for(int k = 0; k < colstep; k++) { + + if(k == (colstep-1) && ( (ncols % batch_ncols) != 0) ) { + batchsz = ncols % batch_ncols; + } + + T *minmaxshared = (T*)shmem; + T *shmem_pred = (T*)(shmem + 2*batchsz*sizeof(T)); + T *shmem_mse = (T*)(shmem + 2*batchsz*sizeof(T) + nbins*batchsz*sizeof(T)); + int *shmem_count = (int*)(shmem + 2*batchsz*sizeof(T) + 3*nbins*batchsz*sizeof(T)); + + for (int i=threadIdx.x; i < 2*batchsz; i += blockDim.x) { + (i < batchsz) ? 
(minmaxshared[i] = globalminmax[k*batch_ncols + i] ) : (minmaxshared[i] = globalminmax[k*batch_ncols + (i-batchsz) + ncols]); + } + + for (int i = threadIdx.x; i < nbins*batchsz; i += blockDim.x) { + shmem_count[i] = countout[i + k*nbins*batch_ncols]; + shmem_pred[i] = predout[i + k*nbins*batch_ncols]; + shmem_mse[i] = 0.0; + shmem_mse[i + batchsz*nbins] = 0.0; + } - for (unsigned int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) { - int mycolid = (int)( i / nrows); - int coloffset = mycolid*nbins; + __syncthreads(); + + for (unsigned int i = tid; i < nrows*batchsz; i += blockDim.x*gridDim.x) { + int mycolid = (int)( i / nrows); + int coloffset = mycolid*nbins; - T delta = (minmaxshared[mycolid + ncols] - minmaxshared[mycolid]) / (nbins); - T base_quesval = minmaxshared[mycolid] + delta; + T delta = (minmaxshared[mycolid + batchsz] - minmaxshared[mycolid]) / (nbins); + T base_quesval = minmaxshared[mycolid] + delta; - T localdata = data[i]; - T label = labels[ i % nrows]; - for (int j=0; j < nbins; j++) { - T quesval = base_quesval + j * delta; + T localdata = data[i + k*batch_ncols*nrows]; + T label = labels[ i % nrows]; + for (int j=0; j < nbins; j++) { + T quesval = base_quesval + j * delta; - if (localdata <= quesval) { - T temp = shmem_pred[coloffset +j] / shmem_count[coloffset + j] ; - temp = label - temp; - atomicAdd(&shmem_mse[j + coloffset], F::exec(temp)); - } else { - T temp = ( pred_parent*nrows - shmem_pred[coloffset +j] ) / (nrows - shmem_count[coloffset + j] ); - temp = label - temp; - atomicAdd(&shmem_mse[j + coloffset + ncols*nbins], F::exec(temp)); - } + if (localdata <= quesval) { + T temp = shmem_pred[coloffset +j] / shmem_count[coloffset + j] ; + temp = label - temp; + atomicAdd(&shmem_mse[j + coloffset], F::exec(temp)); + } else { + T temp = ( pred_parent*nrows - shmem_pred[coloffset +j] ) / (nrows - shmem_count[coloffset + j] ); + temp = label - temp; + atomicAdd(&shmem_mse[j + coloffset + batchsz*nbins], F::exec(temp)); + } + } } - - } - __syncthreads(); + __syncthreads(); - for (int i = threadIdx.x; i < 2*ncols*nbins; i += blockDim.x) { - atomicAdd(&mseout[i], shmem_mse[i]); + for (int i = threadIdx.x; i < batchsz*nbins; i += blockDim.x) { + atomicAdd(&mseout[i + k*batch_ncols*nbins], shmem_mse[i]); + atomicAdd(&mseout[i + k*batch_ncols*nbins + ncols*nbins], shmem_mse[i + batchsz*nbins]); + } + + __syncthreads(); } } @@ -85,140 +101,189 @@ __global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const column order is as per colids (bootstrapped random cols) for each col there are nbins histograms */ template -__global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const int nbins, const int nrows, const int ncols, const T* __restrict__ globalminmax, T* predout, int* countout) { +__global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const int nbins, const int nrows, const int ncols, const int batch_ncols, const T* __restrict__ globalminmax, T* predout, int* countout) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; - T *minmaxshared = (T*)shmem; - T *shmem_pred = (T*)(shmem + 2*ncols*sizeof(T)); - int *shmem_count = (int*)(shmem + 2*ncols*sizeof(T) + nbins*ncols*sizeof(T)); + + int colstep = (int)(ncols/batch_ncols); + if((ncols % batch_ncols) != 0) + colstep++; - for (int i=threadIdx.x; i < 2*ncols; i += blockDim.x) { - minmaxshared[i] = globalminmax[i]; - } + int batchsz = batch_ncols; + for(int 
k = 0; k < colstep; k++) { + + if(k == (colstep-1) && ( (ncols % batch_ncols) != 0) ) { + batchsz = ncols % batch_ncols; + } - for (int i = threadIdx.x; i < nbins*ncols; i += blockDim.x) { - shmem_pred[i] = 0; - shmem_count[i] = 0; - } + T *minmaxshared = (T*)shmem; + T *shmem_pred = (T*)(shmem + 2*batchsz*sizeof(T)); + int *shmem_count = (int*)(shmem + 2*batchsz*sizeof(T) + nbins*batchsz*sizeof(T)); + + for (int i=threadIdx.x; i < 2*batchsz; i += blockDim.x) { + (i < batchsz) ? (minmaxshared[i] = globalminmax[k*batch_ncols + i] ) : (minmaxshared[i] = globalminmax[k*batch_ncols + (i-batchsz) + ncols]); + //minmaxshared[i] = globalminmax[k*batch_ncols + i]; + //minmaxshared[i + batchsz] = globalminmax[k*batch_ncols + i + ncols]; + } + + for (int i = threadIdx.x; i < nbins*batchsz; i += blockDim.x) { + shmem_pred[i] = 0; + shmem_count[i] = 0; + } - __syncthreads(); + __syncthreads(); - for (unsigned int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) { - int mycolid = (int)( i / nrows); - int coloffset = mycolid*nbins; + for (unsigned int i = tid; i < nrows*batchsz; i += blockDim.x*gridDim.x) { + int mycolid = (int)( i / nrows); + int coloffset = mycolid*nbins; - T delta = (minmaxshared[mycolid + ncols] - minmaxshared[mycolid]) / (nbins); - T base_quesval = minmaxshared[mycolid] + delta; + T delta = (minmaxshared[mycolid + batchsz] - minmaxshared[mycolid]) / (nbins); + T base_quesval = minmaxshared[mycolid] + delta; - T localdata = data[i]; - T label = labels[ i % nrows ]; - for (int j=0; j < nbins; j++) { - T quesval = base_quesval + j * delta; + T localdata = data[i + k*batch_ncols*nrows]; + T label = labels[ i % nrows ]; + for (int j=0; j < nbins; j++) { + T quesval = base_quesval + j * delta; - if (localdata <= quesval) { - atomicAdd(&shmem_count[j + coloffset], 1); - atomicAdd(&shmem_pred[j + coloffset], label); + if (localdata <= quesval) { + atomicAdd(&shmem_count[j + coloffset], 1); + atomicAdd(&shmem_pred[j + coloffset], label); + } } + } + + __syncthreads(); - } - - __syncthreads(); + for (int i = threadIdx.x; i < batchsz*nbins; i += blockDim.x) { + atomicAdd(&predout[i + k*batch_ncols*nbins], shmem_pred[i]); + atomicAdd(&countout[i + k*batch_ncols*nbins], shmem_count[i]); + } - for (int i = threadIdx.x; i < ncols*nbins; i += blockDim.x) { - atomicAdd(&predout[i], shmem_pred[i]); - atomicAdd(&countout[i], shmem_count[i]); + __syncthreads(); } } template -__global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, T* predout, int* countout, const T* __restrict__ quantile) { +__global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int batch_ncols, T* predout, int* countout, const T* __restrict__ quantile) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; - T *shmem_pred = (T*) (shmem); - int *shmem_count = (int*)(shmem + nbins*ncols*sizeof(T)); - - for (int i = threadIdx.x; i < nbins*ncols; i += blockDim.x) { - shmem_pred[i] = 0; - shmem_count[i] = 0; - } + + int colstep = (int)(ncols/batch_ncols); + if((ncols % batch_ncols) != 0) + colstep++; + + int batchsz = batch_ncols; + for(int k = 0; k < colstep; k++) { + + if(k == (colstep-1) && ( (ncols % batch_ncols) != 0) ) { + batchsz = ncols % batch_ncols; + } - __syncthreads(); + T 
*shmem_pred = (T*) (shmem); + int *shmem_count = (int*)(shmem + nbins*batchsz*sizeof(T)); - for (unsigned int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) { - int mycolid = (int)( i / nrows); - int coloffset = mycolid*nbins; + for (int i = threadIdx.x; i < nbins*batchsz; i += blockDim.x) { + shmem_pred[i] = 0; + shmem_count[i] = 0; + } - T localdata = data[i]; - T label = labels[ i % nrows ]; - for (int j=0; j < nbins; j++) { - int quantile_index = colids[mycolid] * nbins + j; - T quesval = quantile[quantile_index]; - if (localdata <= quesval) { - atomicAdd(&shmem_count[j + coloffset], 1); - atomicAdd(&shmem_pred[j + coloffset], label); + __syncthreads(); + + for (unsigned int i = tid; i < nrows*batchsz; i += blockDim.x*gridDim.x) { + int mycolid = (int)( i / nrows); + int coloffset = mycolid*nbins; + + T localdata = data[i + k*batch_ncols*nrows]; + T label = labels[ i % nrows ]; + for (int j=0; j < nbins; j++) { + int quantile_index = colids[mycolid + k*batch_ncols] * nbins + j; + T quesval = quantile[quantile_index]; + if (localdata <= quesval) { + atomicAdd(&shmem_count[j + coloffset], 1); + atomicAdd(&shmem_pred[j + coloffset], label); + } } + } - } - - __syncthreads(); + __syncthreads(); - for (int i = threadIdx.x; i < ncols*nbins; i += blockDim.x) { - atomicAdd(&predout[i], shmem_pred[i]); - atomicAdd(&countout[i], shmem_count[i]); + for (int i = threadIdx.x; i < batchsz*nbins; i += blockDim.x) { + atomicAdd(&predout[i + k*batch_ncols*nbins], shmem_pred[i]); + atomicAdd(&countout[i + k*batch_ncols*nbins], shmem_count[i]); + } + __syncthreads(); + } } template -__global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T* __restrict__ quantile, const T pred_parent) { +__global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int batch_ncols, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T* __restrict__ quantile, const T pred_parent) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ char shmem[]; - T *shmem_pred = (T*) (shmem); - T *shmem_mse = (T*)(shmem + nbins*ncols*sizeof(T)); - int *shmem_count = (int*)(shmem + 3*nbins*ncols*sizeof(T)); - - for (int i = threadIdx.x; i < nbins*ncols; i += blockDim.x) { - shmem_count[i] = countout[i]; - shmem_pred[i] = predout[i]; - shmem_mse[i] = 0.0; - shmem_mse[i + nbins*ncols] = 0.0; - } + + int colstep = (int)(ncols/batch_ncols); + if((ncols % batch_ncols) != 0) + colstep++; - __syncthreads(); + int batchsz = batch_ncols; + for(int k = 0; k < colstep; k++) { + + if(k == (colstep-1) && ( (ncols % batch_ncols) != 0) ) { + batchsz = ncols % batch_ncols; + } + + T *shmem_pred = (T*)(shmem); + T *shmem_mse = (T*)(shmem + nbins*batchsz*sizeof(T)); + int *shmem_count = (int*)(shmem + 3*nbins*batchsz*sizeof(T)); + + for (int i = threadIdx.x; i < nbins*batchsz; i += blockDim.x) { + shmem_count[i] = countout[i + k*nbins*batch_ncols]; + shmem_pred[i] = predout[i + k*nbins*batch_ncols]; + shmem_mse[i] = 0.0; + shmem_mse[i + batchsz*nbins] = 0.0; + } + + __syncthreads(); - for (unsigned int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) { - int mycolid = (int)( i / nrows); - int coloffset = mycolid*nbins; + for 
(unsigned int i = tid; i < nrows*batchsz; i += blockDim.x*gridDim.x) { + int mycolid = (int)( i / nrows); + int coloffset = mycolid*nbins; - T localdata = data[i]; - T label = labels[ i % nrows ]; - for (int j=0; j < nbins; j++) { - int quantile_index = colids[mycolid] * nbins + j; - T quesval = quantile[quantile_index]; - - if (localdata <= quesval) { - T temp = shmem_pred[coloffset +j] / shmem_count[coloffset + j] ; - temp = label - temp; - atomicAdd(&shmem_mse[j + coloffset], F::exec(temp)); - } else { - T temp = ( pred_parent*nrows - shmem_pred[coloffset +j] ) / (nrows - shmem_count[coloffset + j] ); - temp = label - temp; - atomicAdd(&shmem_mse[j + coloffset + ncols*nbins], F::exec(temp)); + T localdata = data[i + k*batch_ncols*nrows]; + T label = labels[ i % nrows ]; + for (int j=0; j < nbins; j++) { + int quantile_index = colids[mycolid + k*batch_ncols] * nbins + j; + T quesval = quantile[quantile_index]; + + if (localdata <= quesval) { + T temp = shmem_pred[coloffset +j] / shmem_count[coloffset + j] ; + temp = label - temp; + atomicAdd(&shmem_mse[j + coloffset], F::exec(temp)); + } else { + T temp = ( pred_parent*nrows - shmem_pred[coloffset +j] ) / (nrows - shmem_count[coloffset + j] ); + temp = label - temp; + atomicAdd(&shmem_mse[j + coloffset + batchsz*nbins], F::exec(temp)); + } + } - + } - - } - __syncthreads(); + __syncthreads(); - for (int i = threadIdx.x; i < 2*ncols*nbins; i += blockDim.x) { - atomicAdd(&mseout[i], shmem_mse[i]); + for (int i = threadIdx.x; i < 2*batchsz*nbins; i += blockDim.x) { + atomicAdd(&mseout[i + k*batch_ncols*nbins], shmem_mse[i]); + atomicAdd(&mseout[i + k*batch_ncols*nbins + ncols*nbins], shmem_mse[i + batchsz*nbins]); + } + __syncthreads(); + } + } template @@ -286,7 +351,7 @@ void find_best_split_regressor(const std::shared_ptr> tempm template -void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, const T *labels, const int nbins, const int nrows, const int rowoffset, const std::vector& colselector, const std::shared_ptr> tempmem, MetricInfo split_info[3], MetricQuestion & ques, float & gain, const int split_algo) +void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, const T *labels, const int nbins, const int nrows, const int rowoffset, const std::vector& colselector, const std::shared_ptr> tempmem, MetricInfo split_info[3], MetricQuestion & ques, float & gain, const int split_algo, const size_t max_shared_mem) { unsigned int* d_colids = tempmem->d_colids->data(); T* d_globalminmax = tempmem->d_globalminmax->data(); @@ -324,19 +389,49 @@ void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, co allcolsampler_kernel<<stream>>>(data, rowids, d_colids, nrows, ncols, rowoffset, tempmem->temp_data->data()); } CUDA_CHECK(cudaGetLastError()); - - shmemsize = n_pred_bytes + n_count_bytes; + + int batch_ncols; + size_t shmem_needed; T *labelptr = tempmem->sampledlabels->data(); get_sampled_labels(labels, labelptr, rowids, nrows, tempmem->stream); if (split_algo == ML::SPLIT_ALGO::HIST) { - shmemsize += col_minmax_bytes; - all_cols_histograms_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, d_globalminmax, d_predout, d_histout); - compute_mse_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, d_globalminmax, d_mseout, d_predout, d_histout, split_info[0].predict); + shmem_needed = n_pred_bytes + n_count_bytes + col_minmax_bytes; + batch_ncols = get_batch_cols_cnt(max_shared_mem, shmem_needed, ncols); + 
shmemsize = batch_ncols * ( nbins * sizeof(T) + nbins * sizeof(int) + 2 * sizeof(T) ); + blocks = MLCommon::ceildiv(batch_ncols * nrows, threads); + if (blocks > 65536) + blocks = 65536; + + all_cols_histograms_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, batch_ncols, d_globalminmax, d_predout, d_histout); + + shmem_needed += n_mse_bytes; + batch_ncols = get_batch_cols_cnt(max_shared_mem, shmem_needed, ncols); + shmemsize = batch_ncols * ( nbins * sizeof(T) + nbins * sizeof(int) + 2 * sizeof(T) + 2 * nbins * sizeof(T)); + blocks = MLCommon::ceildiv(batch_ncols * nrows, threads); + if (blocks > 65536) + blocks = 65536; + + compute_mse_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, batch_ncols, d_globalminmax, d_mseout, d_predout, d_histout, split_info[0].predict); } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { - all_cols_histograms_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols, d_predout, d_histout, tempmem->d_quantile->data()); - compute_mse_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols, d_mseout, d_predout, d_histout, tempmem->d_quantile->data(), split_info[0].predict); + shmem_needed = n_pred_bytes + n_count_bytes; + batch_ncols = get_batch_cols_cnt(max_shared_mem, shmem_needed, ncols); + shmemsize = batch_ncols * ( nbins * sizeof(T) + nbins * sizeof(int) ); + blocks = MLCommon::ceildiv(batch_ncols * nrows, threads); + if (blocks > 65536) + blocks = 65536; + + all_cols_histograms_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols, batch_ncols, d_predout, d_histout, tempmem->d_quantile->data()); + + shmem_needed += n_mse_bytes; + batch_ncols = get_batch_cols_cnt(max_shared_mem, shmem_needed, ncols); + shmemsize = batch_ncols * ( nbins * sizeof(T) + nbins * sizeof(int) + 2 * nbins * sizeof(T)); + blocks = MLCommon::ceildiv(batch_ncols * nrows, threads); + if (blocks > 65536) + blocks = 65536; + + compute_mse_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols, batch_ncols, d_mseout, d_predout, d_histout, tempmem->d_quantile->data(), split_info[0].predict); } CUDA_CHECK(cudaGetLastError()); From f79f8faf4148cf16671cf5a1a3b79e6a3580b8eb Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Fri, 14 Jun 2019 07:04:04 -0700 Subject: [PATCH 42/51] Minor kernel fix + helper function. --- cpp/src/decisiontree/kernels/batch_cal.cuh | 13 +++++++- .../kernels/evaluate_classifier.cuh | 22 +++++-------- .../kernels/evaluate_regressor.cuh | 31 +++---------------- 3 files changed, 24 insertions(+), 42 deletions(-) diff --git a/cpp/src/decisiontree/kernels/batch_cal.cuh b/cpp/src/decisiontree/kernels/batch_cal.cuh index ef0b2302b7..a6d07ba473 100644 --- a/cpp/src/decisiontree/kernels/batch_cal.cuh +++ b/cpp/src/decisiontree/kernels/batch_cal.cuh @@ -16,7 +16,7 @@ #pragma once /* Return max. possible number of columns that can be processed within avail_shared_memory. - Expects that requested_shared_memory is O(ncols) */ + Expects that requested_shared_memory is a multiple of ncols. 
*/ int get_batch_cols_cnt(const size_t avail_shared_memory, const size_t requested_shared_memory, const int ncols) { int ncols_in_batch = ncols; int ncols_factor = requested_shared_memory / ncols; @@ -25,3 +25,14 @@ int get_batch_cols_cnt(const size_t avail_shared_memory, const size_t requested_ } return ncols_in_batch; } + + +/* Update batch_ncols (max. possible number of columns that can be processed within avail_shared_memory), + blocks (for next kernel launch), and shmemsize (requested shared memory for next kernel launch). + Precondition: requested_shared_memory is a multiple of ncols. */ +void update_kernel_config(const size_t avail_shared_memory, const size_t requested_shared_memory, const int ncols, + const int nrows, const int threads, int & batch_ncols, int & blocks, size_t & shmemsize) { + batch_ncols = get_batch_cols_cnt(avail_shared_memory, requested_shared_memory, ncols); + shmemsize = (requested_shared_memory / ncols) * batch_ncols; // requested_shared_memory is a multiple of ncols for all kernels + blocks = min(MLCommon::ceildiv(batch_ncols * nrows, threads), 65536); +} diff --git a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh index db847496ef..bc45700896 100644 --- a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh +++ b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh @@ -236,9 +236,7 @@ void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, c CUDA_CHECK(cudaMemsetAsync((void*)d_histout, 0, n_hist_bytes, tempmem->stream)); const int threads = 512; - int blocks = MLCommon::ceildiv(nrows * ncols, threads); - if (blocks > 65536) - blocks = 65536; + int blocks = min(MLCommon::ceildiv(nrows * ncols, threads), 65536); /* Kernel allcolsampler_*_kernel: - populates tempmem->tempdata with the sampled column data, @@ -255,20 +253,14 @@ void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, c L *labelptr = tempmem->sampledlabels->data(); get_sampled_labels(labels, labelptr, rowids, nrows, tempmem->stream); - int batch_ncols; - size_t shmem_needed = ncols * n_unique_labels * nbins * sizeof(int); - if(split_algo == ML::SPLIT_ALGO::HIST) - shmem_needed += ncols * 2 * sizeof(T); - - batch_ncols = get_batch_cols_cnt(max_shared_mem, shmem_needed, ncols); - - shmemsize = batch_ncols * n_unique_labels * nbins * sizeof(int); - blocks = MLCommon::ceildiv(batch_ncols * nrows, threads); - if (blocks > 65536) - blocks = 65536; + int batch_ncols = 1; + size_t shmem_needed = n_hist_bytes; + if (split_algo == ML::SPLIT_ALGO::HIST) { + shmem_needed += col_minmax_bytes; + } + update_kernel_config(max_shared_mem, shmem_needed, ncols, nrows, threads, batch_ncols, blocks, shmemsize); if (split_algo == ML::SPLIT_ALGO::HIST) { - shmemsize += 2 * batch_ncols * sizeof(T); all_cols_histograms_kernel_class<<stream>>>(tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, batch_ncols, n_unique_labels, d_globalminmax, d_histout); } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { all_cols_histograms_global_quantile_kernel_class<<stream>>>(tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols, batch_ncols, n_unique_labels, d_histout, tempmem->d_quantile->data()); diff --git a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh index f56ac7bf7c..498e47e3ad 100644 --- a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh +++ b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh @@ -123,8 +123,6 @@ __global__ void 
all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data for (int i=threadIdx.x; i < 2*batchsz; i += blockDim.x) { (i < batchsz) ? (minmaxshared[i] = globalminmax[k*batch_ncols + i] ) : (minmaxshared[i] = globalminmax[k*batch_ncols + (i-batchsz) + ncols]); - //minmaxshared[i] = globalminmax[k*batch_ncols + i]; - //minmaxshared[i + batchsz] = globalminmax[k*batch_ncols + i + ncols]; } for (int i = threadIdx.x; i < nbins*batchsz; i += blockDim.x) { @@ -276,7 +274,7 @@ __global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ dat __syncthreads(); - for (int i = threadIdx.x; i < 2*batchsz*nbins; i += blockDim.x) { + for (int i = threadIdx.x; i < batchsz*nbins; i += blockDim.x) { atomicAdd(&mseout[i + k*batch_ncols*nbins], shmem_mse[i]); atomicAdd(&mseout[i + k*batch_ncols*nbins + ncols*nbins], shmem_mse[i + batchsz*nbins]); } @@ -398,38 +396,19 @@ void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, co if (split_algo == ML::SPLIT_ALGO::HIST) { shmem_needed = n_pred_bytes + n_count_bytes + col_minmax_bytes; - batch_ncols = get_batch_cols_cnt(max_shared_mem, shmem_needed, ncols); - shmemsize = batch_ncols * ( nbins * sizeof(T) + nbins * sizeof(int) + 2 * sizeof(T) ); - blocks = MLCommon::ceildiv(batch_ncols * nrows, threads); - if (blocks > 65536) - blocks = 65536; - + update_kernel_config(max_shared_mem, shmem_needed, ncols, nrows, threads, batch_ncols, blocks, shmemsize); all_cols_histograms_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, batch_ncols, d_globalminmax, d_predout, d_histout); shmem_needed += n_mse_bytes; - batch_ncols = get_batch_cols_cnt(max_shared_mem, shmem_needed, ncols); - shmemsize = batch_ncols * ( nbins * sizeof(T) + nbins * sizeof(int) + 2 * sizeof(T) + 2 * nbins * sizeof(T)); - blocks = MLCommon::ceildiv(batch_ncols * nrows, threads); - if (blocks > 65536) - blocks = 65536; - + update_kernel_config(max_shared_mem, shmem_needed, ncols, nrows, threads, batch_ncols, blocks, shmemsize); compute_mse_minmax_kernel_reg<<stream>>>(tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, batch_ncols, d_globalminmax, d_mseout, d_predout, d_histout, split_info[0].predict); } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { shmem_needed = n_pred_bytes + n_count_bytes; - batch_ncols = get_batch_cols_cnt(max_shared_mem, shmem_needed, ncols); - shmemsize = batch_ncols * ( nbins * sizeof(T) + nbins * sizeof(int) ); - blocks = MLCommon::ceildiv(batch_ncols * nrows, threads); - if (blocks > 65536) - blocks = 65536; - + update_kernel_config(max_shared_mem, shmem_needed, ncols, nrows, threads, batch_ncols, blocks, shmemsize); all_cols_histograms_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols, batch_ncols, d_predout, d_histout, tempmem->d_quantile->data()); shmem_needed += n_mse_bytes; - batch_ncols = get_batch_cols_cnt(max_shared_mem, shmem_needed, ncols); - shmemsize = batch_ncols * ( nbins * sizeof(T) + nbins * sizeof(int) + 2 * nbins * sizeof(T)); - blocks = MLCommon::ceildiv(batch_ncols * nrows, threads); - if (blocks > 65536) - blocks = 65536; + update_kernel_config(max_shared_mem, shmem_needed, ncols, nrows, threads, batch_ncols, blocks, shmemsize); compute_mse_global_quantile_kernel_reg<<stream>>>(tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols, batch_ncols, d_mseout, d_predout, d_histout, tempmem->d_quantile->data(), split_info[0].predict); } From 8d326b35a5d535a750e95dbb794f9182a419ed9f Mon Sep 17 
00:00:00 2001 From: Myrto Papadopoulou Date: Tue, 18 Jun 2019 03:16:37 -0700 Subject: [PATCH 43/51] Python related updates to rf/dt, randomforest.pyx - Also more clang formatting changes that the previous commit missed. --- cpp/src/decisiontree/kernels/batch_cal.cuh | 29 +- .../kernels/evaluate_classifier.cuh | 507 ++++++----- .../kernels/evaluate_regressor.cuh | 814 ++++++++++-------- cpp/src/decisiontree/kernels/metric.cuh | 250 +++--- cpp/src/decisiontree/kernels/metric_def.h | 27 +- cpp/src/randomforest/randomforest.cu | 6 +- cpp/src/randomforest/randomforest.h | 2 +- cpp/test/sg/rf_test.cu | 6 +- python/cuml/ensemble/randomforest.pyx | 25 +- 9 files changed, 892 insertions(+), 774 deletions(-) diff --git a/cpp/src/decisiontree/kernels/batch_cal.cuh b/cpp/src/decisiontree/kernels/batch_cal.cuh index a6d07ba473..10681d1033 100644 --- a/cpp/src/decisiontree/kernels/batch_cal.cuh +++ b/cpp/src/decisiontree/kernels/batch_cal.cuh @@ -17,22 +17,27 @@ #pragma once /* Return max. possible number of columns that can be processed within avail_shared_memory. Expects that requested_shared_memory is a multiple of ncols. */ -int get_batch_cols_cnt(const size_t avail_shared_memory, const size_t requested_shared_memory, const int ncols) { - int ncols_in_batch = ncols; - int ncols_factor = requested_shared_memory / ncols; - if (requested_shared_memory > avail_shared_memory) { - ncols_in_batch = avail_shared_memory / ncols_factor; // floor div. - } - return ncols_in_batch; +int get_batch_cols_cnt(const size_t avail_shared_memory, + const size_t requested_shared_memory, const int ncols) { + int ncols_in_batch = ncols; + int ncols_factor = requested_shared_memory / ncols; + if (requested_shared_memory > avail_shared_memory) { + ncols_in_batch = avail_shared_memory / ncols_factor; // floor div. + } + return ncols_in_batch; } - /* Update batch_ncols (max. possible number of columns that can be processed within avail_shared_memory), blocks (for next kernel launch), and shmemsize (requested shared memory for next kernel launch). Precondition: requested_shared_memory is a multiple of ncols. 
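   A minimal usage sketch (hypothetical values, same arithmetic as
   get_batch_cols_cnt above):
     int batch_ncols, blocks; size_t shmemsize;
     update_kernel_config(48 * 1024, 96 * 1024, 64, 10000, 512,
                          batch_ncols, blocks, shmemsize);
     // -> batch_ncols == 32, shmemsize == 48 * 1024,
     //    blocks == ceildiv(32 * 10000, 512) == 625 (capped at 65536)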
*/ -void update_kernel_config(const size_t avail_shared_memory, const size_t requested_shared_memory, const int ncols, - const int nrows, const int threads, int & batch_ncols, int & blocks, size_t & shmemsize) { - batch_ncols = get_batch_cols_cnt(avail_shared_memory, requested_shared_memory, ncols); - shmemsize = (requested_shared_memory / ncols) * batch_ncols; // requested_shared_memory is a multiple of ncols for all kernels +void update_kernel_config(const size_t avail_shared_memory, + const size_t requested_shared_memory, const int ncols, + const int nrows, const int threads, int& batch_ncols, + int& blocks, size_t& shmemsize) { + batch_ncols = + get_batch_cols_cnt(avail_shared_memory, requested_shared_memory, ncols); + shmemsize = + (requested_shared_memory / ncols) * + batch_ncols; // requested_shared_memory is a multiple of ncols for all kernels blocks = min(MLCommon::ceildiv(batch_ncols * nrows, threads), 65536); } diff --git a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh index bc45700896..aee1e5fc4f 100644 --- a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh +++ b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh @@ -15,262 +15,301 @@ */ #pragma once +#include #include #include -#include "metric.cuh" -#include "batch_cal.cuh" +#include "../algo_helper.h" #include "../memory.h" +#include "batch_cal.cuh" #include "col_condenser.cuh" -#include -#include "../algo_helper.h" +#include "metric.cuh" #include "stats/minmax.h" /* The output of the function is a histogram array, of size ncols * nbins * n_unique_labels column order is as per colids (bootstrapped random cols) for each col there are nbins histograms */ -template -__global__ void all_cols_histograms_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const int nbins, const int nrows, const int ncols, const int batch_ncols, const int n_unique_labels, const T* __restrict__ globalminmax, int* histout) { - - int tid = threadIdx.x + blockIdx.x * blockDim.x; - extern __shared__ char shmem[]; - - int colstep = (int)(ncols/batch_ncols); - if((ncols % batch_ncols) != 0) - colstep++; - - int batchsz = batch_ncols; - for(int k = 0; k < colstep; k++) { - - if(k == (colstep-1) && ( (ncols % batch_ncols) != 0) ) { - batchsz = ncols % batch_ncols; - } - - T *minmaxshared = (T*)shmem; - int *shmemhist = (int*)(shmem + 2*batchsz*sizeof(T)); - - for (int i=threadIdx.x; i < 2*batchsz; i += blockDim.x) { - (i < batchsz) ? 
(minmaxshared[i] = globalminmax[k*batch_ncols + i] ) : (minmaxshared[i] = globalminmax[k*batch_ncols + (i-batchsz) + ncols]); - } - - for (int i = threadIdx.x; i < n_unique_labels*nbins*batchsz; i += blockDim.x) { - shmemhist[i] = 0; - } - - __syncthreads(); - - for (unsigned int i = tid; i < nrows*batchsz; i += blockDim.x*gridDim.x) { - int mycolid = (int)( i / nrows); - int coloffset = mycolid*n_unique_labels*nbins; - - T delta = (minmaxshared[mycolid + batchsz] - minmaxshared[mycolid]) / (nbins); - T base_quesval = minmaxshared[mycolid] + delta; - - T localdata = data[i + k*batch_ncols*nrows]; - int label = labels[ i % nrows ]; - for (int j=0; j < nbins; j++) { - T quesval = base_quesval + j * delta; - - if (localdata <= quesval) { - atomicAdd(&shmemhist[label + n_unique_labels * j + coloffset], 1); - } - } - - } - - __syncthreads(); - for (int i = threadIdx.x; i < batchsz*n_unique_labels*nbins; i += blockDim.x) { - atomicAdd(&histout[k*batch_ncols*n_unique_labels*nbins + i], shmemhist[i]); - } - - __syncthreads(); - } -} +template +__global__ void all_cols_histograms_kernel_class( + const T* __restrict__ data, const int* __restrict__ labels, const int nbins, + const int nrows, const int ncols, const int batch_ncols, + const int n_unique_labels, const T* __restrict__ globalminmax, int* histout) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ char shmem[]; + + int colstep = (int)(ncols / batch_ncols); + if ((ncols % batch_ncols) != 0) colstep++; + + int batchsz = batch_ncols; + for (int k = 0; k < colstep; k++) { + if (k == (colstep - 1) && ((ncols % batch_ncols) != 0)) { + batchsz = ncols % batch_ncols; + } + + T* minmaxshared = (T*)shmem; + int* shmemhist = (int*)(shmem + 2 * batchsz * sizeof(T)); -template -__global__ void all_cols_histograms_global_quantile_kernel_class(const T* __restrict__ data, const int* __restrict__ labels, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int batch_ncols, const int n_unique_labels, int* histout, const T* __restrict__ quantile) { - - int tid = threadIdx.x + blockIdx.x * blockDim.x; - extern __shared__ char shmem[]; - int *shmemhist = (int*)(shmem); - - int colstep = (int)(ncols/batch_ncols); - if((ncols % batch_ncols) != 0) - colstep++; - - int batchsz = batch_ncols; - for(int k = 0; k < colstep; k++) { - - if(k == (colstep-1) && ( (ncols % batch_ncols) != 0) ) { - batchsz = ncols % batch_ncols; - } - - for (int i = threadIdx.x; i < n_unique_labels*nbins*batchsz; i += blockDim.x) { - shmemhist[i] = 0; - } - - __syncthreads(); - - for (unsigned int i = tid; i < nrows*batchsz; i += blockDim.x*gridDim.x) { - int mycolid = (int)( i / nrows); - int coloffset = mycolid*n_unique_labels*nbins; - - T localdata = data[k*batch_ncols*nrows + i]; - int label = labels[ i % nrows ]; - for (int j=0; j < nbins; j++) { - int quantile_index = colids[k*batch_ncols + mycolid] * nbins + j; - T quesval = quantile[quantile_index]; - if (localdata <= quesval) { - atomicAdd(&shmemhist[label + n_unique_labels * j + coloffset], 1); - } - } - - } - - __syncthreads(); - - for (int i = threadIdx.x; i < batchsz*n_unique_labels*nbins; i += blockDim.x) { - atomicAdd(&histout[k*batch_ncols*n_unique_labels*nbins + i], shmemhist[i]); - } - - __syncthreads(); - } + for (int i = threadIdx.x; i < 2 * batchsz; i += blockDim.x) { + (i < batchsz) ? 
(minmaxshared[i] = globalminmax[k * batch_ncols + i]) + : (minmaxshared[i] = + globalminmax[k * batch_ncols + (i - batchsz) + ncols]); + } + + for (int i = threadIdx.x; i < n_unique_labels * nbins * batchsz; + i += blockDim.x) { + shmemhist[i] = 0; + } + + __syncthreads(); + + for (unsigned int i = tid; i < nrows * batchsz; + i += blockDim.x * gridDim.x) { + int mycolid = (int)(i / nrows); + int coloffset = mycolid * n_unique_labels * nbins; + + T delta = + (minmaxshared[mycolid + batchsz] - minmaxshared[mycolid]) / (nbins); + T base_quesval = minmaxshared[mycolid] + delta; + + T localdata = data[i + k * batch_ncols * nrows]; + int label = labels[i % nrows]; + for (int j = 0; j < nbins; j++) { + T quesval = base_quesval + j * delta; + + if (localdata <= quesval) { + atomicAdd(&shmemhist[label + n_unique_labels * j + coloffset], 1); + } + } + } + + __syncthreads(); + for (int i = threadIdx.x; i < batchsz * n_unique_labels * nbins; + i += blockDim.x) { + atomicAdd(&histout[k * batch_ncols * n_unique_labels * nbins + i], + shmemhist[i]); + } + + __syncthreads(); + } } -template -void find_best_split_classifier(const std::shared_ptr> tempmem, const int nbins, const int n_unique_labels, const std::vector& col_selector, MetricInfo split_info[3], const int nrows, MetricQuestion & ques, float & gain, const int split_algo) { - - gain = 0.0f; - int best_col_id = -1; - int best_bin_id = -1; - - int n_cols = col_selector.size(); - for (int col_id = 0; col_id < n_cols; col_id++) { - - int col_hist_base_index = col_id * nbins * n_unique_labels; - // tempmem->h_histout holds n_cols histograms of nbins of n_unique_labels each. - for (int i = 0; i < nbins; i++) { - - // if tmp_lnrows or tmp_rnrows is 0, the corresponding gini will be 1 but that doesn't - // matter as it won't count in the info_gain computation. - int tmp_lnrows = 0; - - //separate loop for now to avoid overflow. - for (int j = 0; j < n_unique_labels; j++) { - int hist_index = i * n_unique_labels + j; - tmp_lnrows += tempmem->h_histout->data()[col_hist_base_index + hist_index]; - } - int tmp_rnrows = nrows - tmp_lnrows; - - if (tmp_lnrows == 0 || tmp_rnrows == 0) - continue; - - std::vector tmp_histleft(n_unique_labels); - std::vector tmp_histright(n_unique_labels); - - // Compute gini right and gini left value for each bin. 
- for (int j = 0; j < n_unique_labels; j++) { - int hist_index = i * n_unique_labels + j; - tmp_histleft[j] = tempmem->h_histout->data()[col_hist_base_index + hist_index]; - tmp_histright[j] = split_info[0].hist[j] - tmp_histleft[j]; - } - - float tmp_gini_left = F::exec(tmp_histleft, tmp_lnrows); - float tmp_gini_right = F::exec(tmp_histright, tmp_rnrows); - - ASSERT((tmp_gini_left >= 0.0f) && (tmp_gini_left <= 1.0f), "gini left value %f not in [0.0, 1.0]", tmp_gini_left); - ASSERT((tmp_gini_right >= 0.0f) && (tmp_gini_right <= 1.0f), "gini right value %f not in [0.0, 1.0]", tmp_gini_right); - - float impurity = (tmp_lnrows * 1.0f/nrows) * tmp_gini_left + (tmp_rnrows * 1.0f/nrows) * tmp_gini_right; - float info_gain = split_info[0].best_metric - impurity; - - - // Compute best information col_gain so far - if (info_gain > gain) { - gain = info_gain; - best_bin_id = i; - best_col_id = col_id; - split_info[1].best_metric = tmp_gini_left; - split_info[2].best_metric = tmp_gini_right; - } - } - } - - if (best_col_id == -1 || best_bin_id == -1) - return; - - split_info[1].hist.resize(n_unique_labels); - split_info[2].hist.resize(n_unique_labels); - for (int j = 0; j < n_unique_labels; j++) { - split_info[1].hist[j] = tempmem->h_histout->data()[best_col_id * n_unique_labels * nbins + best_bin_id * n_unique_labels + j]; - split_info[2].hist[j] = split_info[0].hist[j] - split_info[1].hist[j]; - } - - if (split_algo == ML::SPLIT_ALGO::HIST) { - ques.set_question_fields(best_col_id, col_selector[best_col_id], best_bin_id, nbins, n_cols, std::numeric_limits::max(), -std::numeric_limits::max(), (T) 0); - } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { - T ques_val; - T *d_quantile = tempmem->d_quantile->data(); - int q_index = col_selector[best_col_id] * nbins + best_bin_id; - MLCommon::updateHost(&ques_val, &d_quantile[q_index], 1, tempmem->stream); - CUDA_CHECK(cudaStreamSynchronize(tempmem->stream)); - ques.set_question_fields(best_col_id, col_selector[best_col_id], best_bin_id, nbins, n_cols, std::numeric_limits::max(), -std::numeric_limits::max(), ques_val); - } - return; +template +__global__ void all_cols_histograms_global_quantile_kernel_class( + const T* __restrict__ data, const int* __restrict__ labels, + const unsigned int* __restrict__ colids, const int nbins, const int nrows, + const int ncols, const int batch_ncols, const int n_unique_labels, + int* histout, const T* __restrict__ quantile) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ char shmem[]; + int* shmemhist = (int*)(shmem); + + int colstep = (int)(ncols / batch_ncols); + if ((ncols % batch_ncols) != 0) colstep++; + + int batchsz = batch_ncols; + for (int k = 0; k < colstep; k++) { + if (k == (colstep - 1) && ((ncols % batch_ncols) != 0)) { + batchsz = ncols % batch_ncols; + } + + for (int i = threadIdx.x; i < n_unique_labels * nbins * batchsz; + i += blockDim.x) { + shmemhist[i] = 0; + } + + __syncthreads(); + + for (unsigned int i = tid; i < nrows * batchsz; + i += blockDim.x * gridDim.x) { + int mycolid = (int)(i / nrows); + int coloffset = mycolid * n_unique_labels * nbins; + + T localdata = data[k * batch_ncols * nrows + i]; + int label = labels[i % nrows]; + for (int j = 0; j < nbins; j++) { + int quantile_index = colids[k * batch_ncols + mycolid] * nbins + j; + T quesval = quantile[quantile_index]; + if (localdata <= quesval) { + atomicAdd(&shmemhist[label + n_unique_labels * j + coloffset], 1); + } + } + } + + __syncthreads(); + + for (int i = threadIdx.x; i < batchsz * 
n_unique_labels * nbins; + i += blockDim.x) { + atomicAdd(&histout[k * batch_ncols * n_unique_labels * nbins + i], + shmemhist[i]); + } + + __syncthreads(); + } } +template +void find_best_split_classifier( + const std::shared_ptr> tempmem, const int nbins, + const int n_unique_labels, const std::vector& col_selector, + MetricInfo split_info[3], const int nrows, MetricQuestion& ques, + float& gain, const int split_algo) { + gain = 0.0f; + int best_col_id = -1; + int best_bin_id = -1; + + int n_cols = col_selector.size(); + for (int col_id = 0; col_id < n_cols; col_id++) { + int col_hist_base_index = col_id * nbins * n_unique_labels; + // tempmem->h_histout holds n_cols histograms of nbins of n_unique_labels each. + for (int i = 0; i < nbins; i++) { + // if tmp_lnrows or tmp_rnrows is 0, the corresponding gini will be 1 but that doesn't + // matter as it won't count in the info_gain computation. + int tmp_lnrows = 0; + + //separate loop for now to avoid overflow. + for (int j = 0; j < n_unique_labels; j++) { + int hist_index = i * n_unique_labels + j; + tmp_lnrows += + tempmem->h_histout->data()[col_hist_base_index + hist_index]; + } + int tmp_rnrows = nrows - tmp_lnrows; + + if (tmp_lnrows == 0 || tmp_rnrows == 0) continue; + + std::vector tmp_histleft(n_unique_labels); + std::vector tmp_histright(n_unique_labels); + + // Compute gini right and gini left value for each bin. + for (int j = 0; j < n_unique_labels; j++) { + int hist_index = i * n_unique_labels + j; + tmp_histleft[j] = + tempmem->h_histout->data()[col_hist_base_index + hist_index]; + tmp_histright[j] = split_info[0].hist[j] - tmp_histleft[j]; + } + + float tmp_gini_left = F::exec(tmp_histleft, tmp_lnrows); + float tmp_gini_right = F::exec(tmp_histright, tmp_rnrows); + + ASSERT((tmp_gini_left >= 0.0f) && (tmp_gini_left <= 1.0f), + "gini left value %f not in [0.0, 1.0]", tmp_gini_left); + ASSERT((tmp_gini_right >= 0.0f) && (tmp_gini_right <= 1.0f), + "gini right value %f not in [0.0, 1.0]", tmp_gini_right); -template -void best_split_all_cols_classifier(const T *data, const unsigned int* rowids, const L *labels, const int nbins, const int nrows, const int n_unique_labels, const int rowoffset, const std::vector& colselector, const std::shared_ptr> tempmem, MetricInfo split_info[3], MetricQuestion & ques, float & gain, const int split_algo, const size_t max_shared_mem) -{ - unsigned int* d_colids = tempmem->d_colids->data(); - T* d_globalminmax = tempmem->d_globalminmax->data(); - int *d_histout = tempmem->d_histout->data(); - int *h_histout = tempmem->h_histout->data(); + float impurity = (tmp_lnrows * 1.0f / nrows) * tmp_gini_left + + (tmp_rnrows * 1.0f / nrows) * tmp_gini_right; + float info_gain = split_info[0].best_metric - impurity; - int ncols = colselector.size(); - int col_minmax_bytes = sizeof(T) * 2 * ncols; - int n_hist_elements = n_unique_labels * nbins * ncols; - int n_hist_bytes = n_hist_elements * sizeof(int); + // Compute best information col_gain so far + if (info_gain > gain) { + gain = info_gain; + best_bin_id = i; + best_col_id = col_id; + split_info[1].best_metric = tmp_gini_left; + split_info[2].best_metric = tmp_gini_right; + } + } + } - CUDA_CHECK(cudaMemsetAsync((void*)d_histout, 0, n_hist_bytes, tempmem->stream)); + if (best_col_id == -1 || best_bin_id == -1) return; - const int threads = 512; - int blocks = min(MLCommon::ceildiv(nrows * ncols, threads), 65536); + split_info[1].hist.resize(n_unique_labels); + split_info[2].hist.resize(n_unique_labels); + for (int j = 0; j < n_unique_labels; j++) { + 
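+      // The left child's histogram is read back from the winning
+      // (column, bin) slot of h_histout; the right child's histogram is
+      // derived as the parent count minus the left count, avoiding a
+      // second device read.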
split_info[1].hist[j] = + tempmem->h_histout->data()[best_col_id * n_unique_labels * nbins + + best_bin_id * n_unique_labels + j]; + split_info[2].hist[j] = split_info[0].hist[j] - split_info[1].hist[j]; + } - /* Kernel allcolsampler_*_kernel: + if (split_algo == ML::SPLIT_ALGO::HIST) { + ques.set_question_fields( + best_col_id, col_selector[best_col_id], best_bin_id, nbins, n_cols, + std::numeric_limits::max(), -std::numeric_limits::max(), (T)0); + } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { + T ques_val; + T* d_quantile = tempmem->d_quantile->data(); + int q_index = col_selector[best_col_id] * nbins + best_bin_id; + MLCommon::updateHost(&ques_val, &d_quantile[q_index], 1, tempmem->stream); + CUDA_CHECK(cudaStreamSynchronize(tempmem->stream)); + ques.set_question_fields( + best_col_id, col_selector[best_col_id], best_bin_id, nbins, n_cols, + std::numeric_limits::max(), -std::numeric_limits::max(), ques_val); + } + return; +} + +template +void best_split_all_cols_classifier( + const T* data, const unsigned int* rowids, const L* labels, const int nbins, + const int nrows, const int n_unique_labels, const int rowoffset, + const std::vector& colselector, + const std::shared_ptr> tempmem, + MetricInfo split_info[3], MetricQuestion& ques, float& gain, + const int split_algo, const size_t max_shared_mem) { + unsigned int* d_colids = tempmem->d_colids->data(); + T* d_globalminmax = tempmem->d_globalminmax->data(); + int* d_histout = tempmem->d_histout->data(); + int* h_histout = tempmem->h_histout->data(); + + int ncols = colselector.size(); + int col_minmax_bytes = sizeof(T) * 2 * ncols; + int n_hist_elements = n_unique_labels * nbins * ncols; + int n_hist_bytes = n_hist_elements * sizeof(int); + + CUDA_CHECK( + cudaMemsetAsync((void*)d_histout, 0, n_hist_bytes, tempmem->stream)); + + const int threads = 512; + int blocks = min(MLCommon::ceildiv(nrows * ncols, threads), 65536); + + /* Kernel allcolsampler_*_kernel: - populates tempmem->tempdata with the sampled column data, - and computes min max histograms in tempmem->d_globalminmax *if minmax in name across all columns. 
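+     For the minmax path, the per-column minima land in
+     d_globalminmax[0 .. ncols-1] and the maxima in
+     d_globalminmax[ncols .. 2*ncols-1]; the histogram kernels below stage
+     this layout into shared memory one column batch at a time.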
*/ - size_t shmemsize = col_minmax_bytes; - if (split_algo == ML::SPLIT_ALGO::HIST) { // Histograms (min, max) - MLCommon::Stats::minmax(data, rowids, d_colids, nrows, ncols, rowoffset, &d_globalminmax[0], &d_globalminmax[colselector.size()], tempmem->temp_data->data(), tempmem->stream); - } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { // Global quantiles; just col condenser - allcolsampler_kernel<<stream>>>(data, rowids, d_colids, nrows, ncols, rowoffset, tempmem->temp_data->data()); - } - CUDA_CHECK(cudaGetLastError()); - L *labelptr = tempmem->sampledlabels->data(); - get_sampled_labels(labels, labelptr, rowids, nrows, tempmem->stream); - - int batch_ncols = 1; - size_t shmem_needed = n_hist_bytes; - if (split_algo == ML::SPLIT_ALGO::HIST) { - shmem_needed += col_minmax_bytes; - } - update_kernel_config(max_shared_mem, shmem_needed, ncols, nrows, threads, batch_ncols, blocks, shmemsize); - - if (split_algo == ML::SPLIT_ALGO::HIST) { - all_cols_histograms_kernel_class<<stream>>>(tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, batch_ncols, n_unique_labels, d_globalminmax, d_histout); - } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { - all_cols_histograms_global_quantile_kernel_class<<stream>>>(tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols, batch_ncols, n_unique_labels, d_histout, tempmem->d_quantile->data()); - } - CUDA_CHECK(cudaGetLastError()); - - MLCommon::updateHost(h_histout, d_histout, n_hist_elements, tempmem->stream); - CUDA_CHECK(cudaStreamSynchronize(tempmem->stream)); - - find_best_split_classifier(tempmem, nbins, n_unique_labels, colselector, &split_info[0], nrows, ques, gain, split_algo); - return; -} + size_t shmemsize = col_minmax_bytes; + if (split_algo == ML::SPLIT_ALGO::HIST) { // Histograms (min, max) + MLCommon::Stats::minmax( + data, rowids, d_colids, nrows, ncols, rowoffset, &d_globalminmax[0], + &d_globalminmax[colselector.size()], tempmem->temp_data->data(), + tempmem->stream); + } else if (split_algo == + ML::SPLIT_ALGO:: + GLOBAL_QUANTILE) { // Global quantiles; just col condenser + allcolsampler_kernel<<stream>>>( + data, rowids, d_colids, nrows, ncols, rowoffset, + tempmem->temp_data->data()); + } + CUDA_CHECK(cudaGetLastError()); + L* labelptr = tempmem->sampledlabels->data(); + get_sampled_labels(labels, labelptr, rowids, nrows, tempmem->stream); + + int batch_ncols = 1; + size_t shmem_needed = n_hist_bytes; + if (split_algo == ML::SPLIT_ALGO::HIST) { + shmem_needed += col_minmax_bytes; + } + update_kernel_config(max_shared_mem, shmem_needed, ncols, nrows, threads, + batch_ncols, blocks, shmemsize); + + if (split_algo == ML::SPLIT_ALGO::HIST) { + all_cols_histograms_kernel_class<<stream>>>( + tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, batch_ncols, + n_unique_labels, d_globalminmax, d_histout); + } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { + all_cols_histograms_global_quantile_kernel_class<<< + blocks, threads, shmemsize, tempmem->stream>>>( + tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols, + batch_ncols, n_unique_labels, d_histout, tempmem->d_quantile->data()); + } + CUDA_CHECK(cudaGetLastError()); + MLCommon::updateHost(h_histout, d_histout, n_hist_elements, tempmem->stream); + CUDA_CHECK(cudaStreamSynchronize(tempmem->stream)); + + find_best_split_classifier(tempmem, nbins, n_unique_labels, + colselector, &split_info[0], nrows, ques, + gain, split_algo); + return; +} diff --git a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh 
b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh index 498e47e3ad..7400c5cded 100644 --- a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh +++ b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh @@ -15,411 +15,461 @@ */ #pragma once +#include #include #include -#include "metric.cuh" +#include "../algo_helper.h" #include "../memory.h" #include "batch_cal.cuh" #include "col_condenser.cuh" -#include -#include "../algo_helper.h" +#include "metric.cuh" #include "stats/minmax.h" -template -__global__ void compute_mse_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const int nbins, const int nrows, const int ncols, const int batch_ncols, const T* __restrict__ globalminmax, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T pred_parent) { - - int tid = threadIdx.x + blockIdx.x * blockDim.x; - extern __shared__ char shmem[]; - - int colstep = (int)(ncols/batch_ncols); - if((ncols % batch_ncols) != 0) - colstep++; - - int batchsz = batch_ncols; - for(int k = 0; k < colstep; k++) { - - if(k == (colstep-1) && ( (ncols % batch_ncols) != 0) ) { - batchsz = ncols % batch_ncols; - } - - T *minmaxshared = (T*)shmem; - T *shmem_pred = (T*)(shmem + 2*batchsz*sizeof(T)); - T *shmem_mse = (T*)(shmem + 2*batchsz*sizeof(T) + nbins*batchsz*sizeof(T)); - int *shmem_count = (int*)(shmem + 2*batchsz*sizeof(T) + 3*nbins*batchsz*sizeof(T)); - - for (int i=threadIdx.x; i < 2*batchsz; i += blockDim.x) { - (i < batchsz) ? (minmaxshared[i] = globalminmax[k*batch_ncols + i] ) : (minmaxshared[i] = globalminmax[k*batch_ncols + (i-batchsz) + ncols]); - } - - for (int i = threadIdx.x; i < nbins*batchsz; i += blockDim.x) { - shmem_count[i] = countout[i + k*nbins*batch_ncols]; - shmem_pred[i] = predout[i + k*nbins*batch_ncols]; - shmem_mse[i] = 0.0; - shmem_mse[i + batchsz*nbins] = 0.0; - } - - __syncthreads(); - - for (unsigned int i = tid; i < nrows*batchsz; i += blockDim.x*gridDim.x) { - int mycolid = (int)( i / nrows); - int coloffset = mycolid*nbins; - - T delta = (minmaxshared[mycolid + batchsz] - minmaxshared[mycolid]) / (nbins); - T base_quesval = minmaxshared[mycolid] + delta; - - T localdata = data[i + k*batch_ncols*nrows]; - T label = labels[ i % nrows]; - for (int j=0; j < nbins; j++) { - T quesval = base_quesval + j * delta; - - if (localdata <= quesval) { - T temp = shmem_pred[coloffset +j] / shmem_count[coloffset + j] ; - temp = label - temp; - atomicAdd(&shmem_mse[j + coloffset], F::exec(temp)); - } else { - T temp = ( pred_parent*nrows - shmem_pred[coloffset +j] ) / (nrows - shmem_count[coloffset + j] ); - temp = label - temp; - atomicAdd(&shmem_mse[j + coloffset + batchsz*nbins], F::exec(temp)); - } - - } - } - - __syncthreads(); - - for (int i = threadIdx.x; i < batchsz*nbins; i += blockDim.x) { - atomicAdd(&mseout[i + k*batch_ncols*nbins], shmem_mse[i]); - atomicAdd(&mseout[i + k*batch_ncols*nbins + ncols*nbins], shmem_mse[i + batchsz*nbins]); - } - - __syncthreads(); - } +template +__global__ void compute_mse_minmax_kernel_reg( + const T* __restrict__ data, const T* __restrict__ labels, const int nbins, + const int nrows, const int ncols, const int batch_ncols, + const T* __restrict__ globalminmax, T* mseout, const T* __restrict__ predout, + const int* __restrict__ countout, const T pred_parent) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ char shmem[]; + + int colstep = (int)(ncols / batch_ncols); + if ((ncols % batch_ncols) != 0) colstep++; + + int batchsz = batch_ncols; + for (int k = 0; k < colstep; k++) { 
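+    // Each iteration covers one batch of batch_ncols columns so that the
+    // per-batch shared-memory arrays (minmax, pred, mse, count) fit in the
+    // dynamic shared memory sized by the caller; the last batch shrinks to
+    // ncols % batch_ncols when ncols is not a multiple of batch_ncols.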
+ if (k == (colstep - 1) && ((ncols % batch_ncols) != 0)) { + batchsz = ncols % batch_ncols; + } + + T* minmaxshared = (T*)shmem; + T* shmem_pred = (T*)(shmem + 2 * batchsz * sizeof(T)); + T* shmem_mse = + (T*)(shmem + 2 * batchsz * sizeof(T) + nbins * batchsz * sizeof(T)); + int* shmem_count = + (int*)(shmem + 2 * batchsz * sizeof(T) + 3 * nbins * batchsz * sizeof(T)); + + for (int i = threadIdx.x; i < 2 * batchsz; i += blockDim.x) { + (i < batchsz) ? (minmaxshared[i] = globalminmax[k * batch_ncols + i]) + : (minmaxshared[i] = + globalminmax[k * batch_ncols + (i - batchsz) + ncols]); + } + + for (int i = threadIdx.x; i < nbins * batchsz; i += blockDim.x) { + shmem_count[i] = countout[i + k * nbins * batch_ncols]; + shmem_pred[i] = predout[i + k * nbins * batch_ncols]; + shmem_mse[i] = 0.0; + shmem_mse[i + batchsz * nbins] = 0.0; + } + + __syncthreads(); + + for (unsigned int i = tid; i < nrows * batchsz; + i += blockDim.x * gridDim.x) { + int mycolid = (int)(i / nrows); + int coloffset = mycolid * nbins; + + T delta = + (minmaxshared[mycolid + batchsz] - minmaxshared[mycolid]) / (nbins); + T base_quesval = minmaxshared[mycolid] + delta; + + T localdata = data[i + k * batch_ncols * nrows]; + T label = labels[i % nrows]; + for (int j = 0; j < nbins; j++) { + T quesval = base_quesval + j * delta; + + if (localdata <= quesval) { + T temp = shmem_pred[coloffset + j] / shmem_count[coloffset + j]; + temp = label - temp; + atomicAdd(&shmem_mse[j + coloffset], F::exec(temp)); + } else { + T temp = (pred_parent * nrows - shmem_pred[coloffset + j]) / + (nrows - shmem_count[coloffset + j]); + temp = label - temp; + atomicAdd(&shmem_mse[j + coloffset + batchsz * nbins], F::exec(temp)); + } + } + } + + __syncthreads(); + + for (int i = threadIdx.x; i < batchsz * nbins; i += blockDim.x) { + atomicAdd(&mseout[i + k * batch_ncols * nbins], shmem_mse[i]); + atomicAdd(&mseout[i + k * batch_ncols * nbins + ncols * nbins], + shmem_mse[i + batchsz * nbins]); + } + + __syncthreads(); + } } /* The output of the function is a histogram array, of size ncols * nbins * n_unique_lables column order is as per colids (bootstrapped random cols) for each col there are nbins histograms */ -template -__global__ void all_cols_histograms_minmax_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const int nbins, const int nrows, const int ncols, const int batch_ncols, const T* __restrict__ globalminmax, T* predout, int* countout) { - - int tid = threadIdx.x + blockIdx.x * blockDim.x; - extern __shared__ char shmem[]; - - int colstep = (int)(ncols/batch_ncols); - if((ncols % batch_ncols) != 0) - colstep++; - - int batchsz = batch_ncols; - for(int k = 0; k < colstep; k++) { - - if(k == (colstep-1) && ( (ncols % batch_ncols) != 0) ) { - batchsz = ncols % batch_ncols; - } - - T *minmaxshared = (T*)shmem; - T *shmem_pred = (T*)(shmem + 2*batchsz*sizeof(T)); - int *shmem_count = (int*)(shmem + 2*batchsz*sizeof(T) + nbins*batchsz*sizeof(T)); - - for (int i=threadIdx.x; i < 2*batchsz; i += blockDim.x) { - (i < batchsz) ? 
(minmaxshared[i] = globalminmax[k*batch_ncols + i] ) : (minmaxshared[i] = globalminmax[k*batch_ncols + (i-batchsz) + ncols]); - } - - for (int i = threadIdx.x; i < nbins*batchsz; i += blockDim.x) { - shmem_pred[i] = 0; - shmem_count[i] = 0; - } - - __syncthreads(); - - for (unsigned int i = tid; i < nrows*batchsz; i += blockDim.x*gridDim.x) { - int mycolid = (int)( i / nrows); - int coloffset = mycolid*nbins; - - T delta = (minmaxshared[mycolid + batchsz] - minmaxshared[mycolid]) / (nbins); - T base_quesval = minmaxshared[mycolid] + delta; - - T localdata = data[i + k*batch_ncols*nrows]; - T label = labels[ i % nrows ]; - for (int j=0; j < nbins; j++) { - T quesval = base_quesval + j * delta; - - if (localdata <= quesval) { - atomicAdd(&shmem_count[j + coloffset], 1); - atomicAdd(&shmem_pred[j + coloffset], label); - } - } - - } - - __syncthreads(); - - for (int i = threadIdx.x; i < batchsz*nbins; i += blockDim.x) { - atomicAdd(&predout[i + k*batch_ncols*nbins], shmem_pred[i]); - atomicAdd(&countout[i + k*batch_ncols*nbins], shmem_count[i]); - } - - __syncthreads(); - } +template +__global__ void all_cols_histograms_minmax_kernel_reg( + const T* __restrict__ data, const T* __restrict__ labels, const int nbins, + const int nrows, const int ncols, const int batch_ncols, + const T* __restrict__ globalminmax, T* predout, int* countout) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ char shmem[]; + + int colstep = (int)(ncols / batch_ncols); + if ((ncols % batch_ncols) != 0) colstep++; + + int batchsz = batch_ncols; + for (int k = 0; k < colstep; k++) { + if (k == (colstep - 1) && ((ncols % batch_ncols) != 0)) { + batchsz = ncols % batch_ncols; + } + + T* minmaxshared = (T*)shmem; + T* shmem_pred = (T*)(shmem + 2 * batchsz * sizeof(T)); + int* shmem_count = + (int*)(shmem + 2 * batchsz * sizeof(T) + nbins * batchsz * sizeof(T)); + + for (int i = threadIdx.x; i < 2 * batchsz; i += blockDim.x) { + (i < batchsz) ? 
(minmaxshared[i] = globalminmax[k * batch_ncols + i]) + : (minmaxshared[i] = + globalminmax[k * batch_ncols + (i - batchsz) + ncols]); + } + + for (int i = threadIdx.x; i < nbins * batchsz; i += blockDim.x) { + shmem_pred[i] = 0; + shmem_count[i] = 0; + } + + __syncthreads(); + + for (unsigned int i = tid; i < nrows * batchsz; + i += blockDim.x * gridDim.x) { + int mycolid = (int)(i / nrows); + int coloffset = mycolid * nbins; + + T delta = + (minmaxshared[mycolid + batchsz] - minmaxshared[mycolid]) / (nbins); + T base_quesval = minmaxshared[mycolid] + delta; + + T localdata = data[i + k * batch_ncols * nrows]; + T label = labels[i % nrows]; + for (int j = 0; j < nbins; j++) { + T quesval = base_quesval + j * delta; + + if (localdata <= quesval) { + atomicAdd(&shmem_count[j + coloffset], 1); + atomicAdd(&shmem_pred[j + coloffset], label); + } + } + } + + __syncthreads(); + + for (int i = threadIdx.x; i < batchsz * nbins; i += blockDim.x) { + atomicAdd(&predout[i + k * batch_ncols * nbins], shmem_pred[i]); + atomicAdd(&countout[i + k * batch_ncols * nbins], shmem_count[i]); + } + + __syncthreads(); + } } -template -__global__ void all_cols_histograms_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int batch_ncols, T* predout, int* countout, const T* __restrict__ quantile) { - - int tid = threadIdx.x + blockIdx.x * blockDim.x; - extern __shared__ char shmem[]; - - int colstep = (int)(ncols/batch_ncols); - if((ncols % batch_ncols) != 0) - colstep++; - - int batchsz = batch_ncols; - for(int k = 0; k < colstep; k++) { - - if(k == (colstep-1) && ( (ncols % batch_ncols) != 0) ) { - batchsz = ncols % batch_ncols; - } - - T *shmem_pred = (T*) (shmem); - int *shmem_count = (int*)(shmem + nbins*batchsz*sizeof(T)); - - for (int i = threadIdx.x; i < nbins*batchsz; i += blockDim.x) { - shmem_pred[i] = 0; - shmem_count[i] = 0; - } - - __syncthreads(); - - for (unsigned int i = tid; i < nrows*batchsz; i += blockDim.x*gridDim.x) { - int mycolid = (int)( i / nrows); - int coloffset = mycolid*nbins; - - T localdata = data[i + k*batch_ncols*nrows]; - T label = labels[ i % nrows ]; - for (int j=0; j < nbins; j++) { - int quantile_index = colids[mycolid + k*batch_ncols] * nbins + j; - T quesval = quantile[quantile_index]; - if (localdata <= quesval) { - atomicAdd(&shmem_count[j + coloffset], 1); - atomicAdd(&shmem_pred[j + coloffset], label); - } - } - - } - - __syncthreads(); - - for (int i = threadIdx.x; i < batchsz*nbins; i += blockDim.x) { - atomicAdd(&predout[i + k*batch_ncols*nbins], shmem_pred[i]); - atomicAdd(&countout[i + k*batch_ncols*nbins], shmem_count[i]); - } - __syncthreads(); - - } +template +__global__ void all_cols_histograms_global_quantile_kernel_reg( + const T* __restrict__ data, const T* __restrict__ labels, + const unsigned int* __restrict__ colids, const int nbins, const int nrows, + const int ncols, const int batch_ncols, T* predout, int* countout, + const T* __restrict__ quantile) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ char shmem[]; + + int colstep = (int)(ncols / batch_ncols); + if ((ncols % batch_ncols) != 0) colstep++; + + int batchsz = batch_ncols; + for (int k = 0; k < colstep; k++) { + if (k == (colstep - 1) && ((ncols % batch_ncols) != 0)) { + batchsz = ncols % batch_ncols; + } + + T* shmem_pred = (T*)(shmem); + int* shmem_count = (int*)(shmem + nbins * batchsz * sizeof(T)); + + for (int i = threadIdx.x; i < 
nbins * batchsz; i += blockDim.x) { + shmem_pred[i] = 0; + shmem_count[i] = 0; + } + + __syncthreads(); + + for (unsigned int i = tid; i < nrows * batchsz; + i += blockDim.x * gridDim.x) { + int mycolid = (int)(i / nrows); + int coloffset = mycolid * nbins; + + T localdata = data[i + k * batch_ncols * nrows]; + T label = labels[i % nrows]; + for (int j = 0; j < nbins; j++) { + int quantile_index = colids[mycolid + k * batch_ncols] * nbins + j; + T quesval = quantile[quantile_index]; + if (localdata <= quesval) { + atomicAdd(&shmem_count[j + coloffset], 1); + atomicAdd(&shmem_pred[j + coloffset], label); + } + } + } + + __syncthreads(); + + for (int i = threadIdx.x; i < batchsz * nbins; i += blockDim.x) { + atomicAdd(&predout[i + k * batch_ncols * nbins], shmem_pred[i]); + atomicAdd(&countout[i + k * batch_ncols * nbins], shmem_count[i]); + } + __syncthreads(); + } } -template -__global__ void compute_mse_global_quantile_kernel_reg(const T* __restrict__ data, const T* __restrict__ labels, const unsigned int* __restrict__ colids, const int nbins, const int nrows, const int ncols, const int batch_ncols, T* mseout, const T* __restrict__ predout, const int* __restrict__ countout, const T* __restrict__ quantile, const T pred_parent) { - - int tid = threadIdx.x + blockIdx.x * blockDim.x; - extern __shared__ char shmem[]; - - int colstep = (int)(ncols/batch_ncols); - if((ncols % batch_ncols) != 0) - colstep++; - - int batchsz = batch_ncols; - for(int k = 0; k < colstep; k++) { - - if(k == (colstep-1) && ( (ncols % batch_ncols) != 0) ) { - batchsz = ncols % batch_ncols; - } - - T *shmem_pred = (T*)(shmem); - T *shmem_mse = (T*)(shmem + nbins*batchsz*sizeof(T)); - int *shmem_count = (int*)(shmem + 3*nbins*batchsz*sizeof(T)); - - for (int i = threadIdx.x; i < nbins*batchsz; i += blockDim.x) { - shmem_count[i] = countout[i + k*nbins*batch_ncols]; - shmem_pred[i] = predout[i + k*nbins*batch_ncols]; - shmem_mse[i] = 0.0; - shmem_mse[i + batchsz*nbins] = 0.0; - } - - __syncthreads(); - - for (unsigned int i = tid; i < nrows*batchsz; i += blockDim.x*gridDim.x) { - int mycolid = (int)( i / nrows); - int coloffset = mycolid*nbins; - - T localdata = data[i + k*batch_ncols*nrows]; - T label = labels[ i % nrows ]; - for (int j=0; j < nbins; j++) { - int quantile_index = colids[mycolid + k*batch_ncols] * nbins + j; - T quesval = quantile[quantile_index]; - - if (localdata <= quesval) { - T temp = shmem_pred[coloffset +j] / shmem_count[coloffset + j] ; - temp = label - temp; - atomicAdd(&shmem_mse[j + coloffset], F::exec(temp)); - } else { - T temp = ( pred_parent*nrows - shmem_pred[coloffset +j] ) / (nrows - shmem_count[coloffset + j] ); - temp = label - temp; - atomicAdd(&shmem_mse[j + coloffset + batchsz*nbins], F::exec(temp)); - } - - } - - } - - __syncthreads(); - - for (int i = threadIdx.x; i < batchsz*nbins; i += blockDim.x) { - atomicAdd(&mseout[i + k*batch_ncols*nbins], shmem_mse[i]); - atomicAdd(&mseout[i + k*batch_ncols*nbins + ncols*nbins], shmem_mse[i + batchsz*nbins]); - } - __syncthreads(); - - } - +template +__global__ void compute_mse_global_quantile_kernel_reg( + const T* __restrict__ data, const T* __restrict__ labels, + const unsigned int* __restrict__ colids, const int nbins, const int nrows, + const int ncols, const int batch_ncols, T* mseout, + const T* __restrict__ predout, const int* __restrict__ countout, + const T* __restrict__ quantile, const T pred_parent) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ char shmem[]; + + int colstep = (int)(ncols / 
batch_ncols); + if ((ncols % batch_ncols) != 0) colstep++; + + int batchsz = batch_ncols; + for (int k = 0; k < colstep; k++) { + if (k == (colstep - 1) && ((ncols % batch_ncols) != 0)) { + batchsz = ncols % batch_ncols; + } + + T* shmem_pred = (T*)(shmem); + T* shmem_mse = (T*)(shmem + nbins * batchsz * sizeof(T)); + int* shmem_count = (int*)(shmem + 3 * nbins * batchsz * sizeof(T)); + + for (int i = threadIdx.x; i < nbins * batchsz; i += blockDim.x) { + shmem_count[i] = countout[i + k * nbins * batch_ncols]; + shmem_pred[i] = predout[i + k * nbins * batch_ncols]; + shmem_mse[i] = 0.0; + shmem_mse[i + batchsz * nbins] = 0.0; + } + + __syncthreads(); + + for (unsigned int i = tid; i < nrows * batchsz; + i += blockDim.x * gridDim.x) { + int mycolid = (int)(i / nrows); + int coloffset = mycolid * nbins; + + T localdata = data[i + k * batch_ncols * nrows]; + T label = labels[i % nrows]; + for (int j = 0; j < nbins; j++) { + int quantile_index = colids[mycolid + k * batch_ncols] * nbins + j; + T quesval = quantile[quantile_index]; + + if (localdata <= quesval) { + T temp = shmem_pred[coloffset + j] / shmem_count[coloffset + j]; + temp = label - temp; + atomicAdd(&shmem_mse[j + coloffset], F::exec(temp)); + } else { + T temp = (pred_parent * nrows - shmem_pred[coloffset + j]) / + (nrows - shmem_count[coloffset + j]); + temp = label - temp; + atomicAdd(&shmem_mse[j + coloffset + batchsz * nbins], F::exec(temp)); + } + } + } + + __syncthreads(); + + for (int i = threadIdx.x; i < batchsz * nbins; i += blockDim.x) { + atomicAdd(&mseout[i + k * batch_ncols * nbins], shmem_mse[i]); + atomicAdd(&mseout[i + k * batch_ncols * nbins + ncols * nbins], + shmem_mse[i + batchsz * nbins]); + } + __syncthreads(); + } } -template -void find_best_split_regressor(const std::shared_ptr> tempmem, const int nbins, const std::vector& col_selector, MetricInfo split_info[3], const int nrows, MetricQuestion & ques, float & gain, const int split_algo) { - - gain = 0.0f; - int best_col_id = -1; - int best_bin_id = -1; - - int n_cols = col_selector.size(); - for (int col_id = 0; col_id < n_cols; col_id++) { - - int col_count_base_index = col_id * nbins; - // tempmem->h_histout holds n_cols histograms of nbins of n_unique_labels each. - for (int i = 0; i < nbins; i++) { - - int tmp_lnrows = tempmem->h_histout->data()[col_count_base_index + i]; - int tmp_rnrows = nrows - tmp_lnrows; - - if (tmp_lnrows == 0 || tmp_rnrows == 0) - continue; - - float tmp_pred_left = tempmem->h_predout->data()[col_count_base_index + i]; - float tmp_pred_right = (nrows * split_info[0].predict) - tmp_pred_left; - tmp_pred_left /= tmp_lnrows; - tmp_pred_right /= tmp_rnrows; - - // Compute MSE right and MSE left value for each bin. 
- float tmp_mse_left = tempmem->h_mseout->data()[col_count_base_index + i]; - float tmp_mse_right = tempmem->h_mseout->data()[col_count_base_index + i + n_cols*nbins]; - tmp_mse_left /= tmp_lnrows; - tmp_mse_right /= tmp_rnrows; - - float impurity = (tmp_lnrows * 1.0f/nrows) * tmp_mse_left + (tmp_rnrows * 1.0f/nrows) * tmp_mse_right; - float info_gain = split_info[0].best_metric - impurity; - - // Compute best information col_gain so far - if (info_gain > gain) { - gain = info_gain; - best_bin_id = i; - best_col_id = col_id; - split_info[1].best_metric = tmp_mse_left; - split_info[2].best_metric = tmp_mse_right; - split_info[1].predict = tmp_pred_left; - split_info[2].predict = tmp_pred_right; - } - } - } - - if (best_col_id == -1 || best_bin_id == -1) - return; - - if (split_algo == ML::SPLIT_ALGO::HIST) { - ques.set_question_fields(best_col_id, col_selector[best_col_id], best_bin_id, nbins, n_cols, std::numeric_limits::max(), -std::numeric_limits::max(), (T) 0); - } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { - T ques_val; - T *d_quantile = tempmem->d_quantile->data(); - int q_index = col_selector[best_col_id] * nbins + best_bin_id; - MLCommon::updateHost(&ques_val, &d_quantile[q_index], 1, tempmem->stream); - CUDA_CHECK(cudaStreamSynchronize(tempmem->stream)); - ques.set_question_fields(best_col_id, col_selector[best_col_id], best_bin_id, nbins, n_cols, std::numeric_limits::max(), -std::numeric_limits::max(), ques_val); - } - return; +template +void find_best_split_regressor( + const std::shared_ptr> tempmem, const int nbins, + const std::vector& col_selector, MetricInfo split_info[3], + const int nrows, MetricQuestion& ques, float& gain, const int split_algo) { + gain = 0.0f; + int best_col_id = -1; + int best_bin_id = -1; + + int n_cols = col_selector.size(); + for (int col_id = 0; col_id < n_cols; col_id++) { + int col_count_base_index = col_id * nbins; + // tempmem->h_histout holds n_cols histograms of nbins of n_unique_labels each. + for (int i = 0; i < nbins; i++) { + int tmp_lnrows = tempmem->h_histout->data()[col_count_base_index + i]; + int tmp_rnrows = nrows - tmp_lnrows; + + if (tmp_lnrows == 0 || tmp_rnrows == 0) continue; + + float tmp_pred_left = + tempmem->h_predout->data()[col_count_base_index + i]; + float tmp_pred_right = (nrows * split_info[0].predict) - tmp_pred_left; + tmp_pred_left /= tmp_lnrows; + tmp_pred_right /= tmp_rnrows; + + // Compute MSE right and MSE left value for each bin. 
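+      // h_mseout packs the left-partition residual sums in its first
+      // n_cols * nbins entries and the right-partition sums in the next
+      // n_cols * nbins; dividing by each child's row count below converts
+      // the sums to per-child means (squared or absolute residuals,
+      // depending on the compute_mse kernel's functor).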
+      float tmp_mse_left = tempmem->h_mseout->data()[col_count_base_index + i];
+      float tmp_mse_right =
+        tempmem->h_mseout->data()[col_count_base_index + i + n_cols * nbins];
+      tmp_mse_left /= tmp_lnrows;
+      tmp_mse_right /= tmp_rnrows;
+
+      float impurity = (tmp_lnrows * 1.0f / nrows) * tmp_mse_left +
+                       (tmp_rnrows * 1.0f / nrows) * tmp_mse_right;
+      float info_gain = split_info[0].best_metric - impurity;
+
+      // Compute best information gain so far
+      if (info_gain > gain) {
+        gain = info_gain;
+        best_bin_id = i;
+        best_col_id = col_id;
+        split_info[1].best_metric = tmp_mse_left;
+        split_info[2].best_metric = tmp_mse_right;
+        split_info[1].predict = tmp_pred_left;
+        split_info[2].predict = tmp_pred_right;
+      }
+    }
+  }
+
+  if (best_col_id == -1 || best_bin_id == -1) return;
+
+  if (split_algo == ML::SPLIT_ALGO::HIST) {
+    ques.set_question_fields(
+      best_col_id, col_selector[best_col_id], best_bin_id, nbins, n_cols,
+      std::numeric_limits<T>::max(), -std::numeric_limits<T>::max(), (T)0);
+  } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) {
+    T ques_val;
+    T* d_quantile = tempmem->d_quantile->data();
+    int q_index = col_selector[best_col_id] * nbins + best_bin_id;
+    MLCommon::updateHost(&ques_val, &d_quantile[q_index], 1, tempmem->stream);
+    CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
+    ques.set_question_fields(
+      best_col_id, col_selector[best_col_id], best_bin_id, nbins, n_cols,
+      std::numeric_limits<T>::max(), -std::numeric_limits<T>::max(), ques_val);
+  }
+  return;
 }
 
-template<typename T>
-void best_split_all_cols_regressor(const T *data, const unsigned int* rowids, const T *labels, const int nbins, const int nrows, const int rowoffset, const std::vector<unsigned int>& colselector, const std::shared_ptr<TemporaryMemory<T, T>> tempmem, MetricInfo<T> split_info[3], MetricQuestion<T> & ques, float & gain, const int split_algo, const size_t max_shared_mem)
-{
-	unsigned int* d_colids = tempmem->d_colids->data();
-	T* d_globalminmax = tempmem->d_globalminmax->data();
-	int *d_histout = tempmem->d_histout->data();
-	int *h_histout = tempmem->h_histout->data();
-	T* d_mseout = tempmem->d_mseout->data();
-	T* h_mseout = tempmem->h_mseout->data();
-	T* d_predout = tempmem->d_predout->data();
-	T* h_predout = tempmem->h_predout->data();
-
-	int ncols = colselector.size();
-	int col_minmax_bytes = sizeof(T) * 2 * ncols;
-	int n_pred_bytes = nbins * sizeof(T) * ncols;
-	int n_count_bytes = nbins * ncols * sizeof(int);
-	int n_mse_bytes = 2 * nbins * sizeof(T) * ncols;
-
-	CUDA_CHECK(cudaMemsetAsync((void*)d_mseout, 0, n_mse_bytes, tempmem->stream));
-	CUDA_CHECK(cudaMemsetAsync((void*)d_predout, 0, n_pred_bytes, tempmem->stream));
-	CUDA_CHECK(cudaMemsetAsync((void*)d_histout, 0, n_count_bytes, tempmem->stream));
-
-	const int threads = 512;
-	int blocks = MLCommon::ceildiv(nrows * ncols, threads);
-	if (blocks > 65536)
-		blocks = 65536;
-
-	/* Kernel allcolsampler_*_kernel:
+template <typename T>
+void best_split_all_cols_regressor(
+  const T* data, const unsigned int* rowids, const T* labels, const int nbins,
+  const int nrows, const int rowoffset,
+  const std::vector<unsigned int>& colselector,
+  const std::shared_ptr<TemporaryMemory<T, T>> tempmem,
+  MetricInfo<T> split_info[3], MetricQuestion<T>& ques, float& gain,
+  const int split_algo, const size_t max_shared_mem) {
+  unsigned int* d_colids = tempmem->d_colids->data();
+  T* d_globalminmax = tempmem->d_globalminmax->data();
+  int* d_histout = tempmem->d_histout->data();
+  int* h_histout = tempmem->h_histout->data();
+  T* d_mseout = tempmem->d_mseout->data();
+  T* h_mseout = tempmem->h_mseout->data();
+  T* d_predout = tempmem->d_predout->data();
+  T* h_predout = tempmem->h_predout->data();
+
+  int ncols = colselector.size();
+  int col_minmax_bytes = sizeof(T) * 2 * ncols;
+  int n_pred_bytes = nbins * sizeof(T) * ncols;
+  int n_count_bytes = nbins * ncols * sizeof(int);
+  int n_mse_bytes = 2 * nbins * sizeof(T) * ncols;
+
+  CUDA_CHECK(cudaMemsetAsync((void*)d_mseout, 0, n_mse_bytes, tempmem->stream));
+  CUDA_CHECK(
+    cudaMemsetAsync((void*)d_predout, 0, n_pred_bytes, tempmem->stream));
+  CUDA_CHECK(
+    cudaMemsetAsync((void*)d_histout, 0, n_count_bytes, tempmem->stream));
+
+  const int threads = 512;
+  int blocks = MLCommon::ceildiv(nrows * ncols, threads);
+  if (blocks > 65536) blocks = 65536;
+
   /* Kernel allcolsampler_*_kernel:
	- populates tempmem->tempdata with the sampled column data,
	- and computes min max histograms in tempmem->d_globalminmax *if minmax in name* across all columns.
   */
-	size_t shmemsize = col_minmax_bytes;
-	if (split_algo == ML::SPLIT_ALGO::HIST) { // Histograms (min, max)
-		MLCommon::Stats::minmax(data, rowids, d_colids, nrows, ncols, rowoffset, &d_globalminmax[0], &d_globalminmax[colselector.size()], tempmem->temp_data->data(), tempmem->stream);
-	} else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) { // Global quantiles; just col condenser
-		allcolsampler_kernel<<<blocks, threads, 0, tempmem->stream>>>(data, rowids, d_colids, nrows, ncols, rowoffset, tempmem->temp_data->data());
-	}
-	CUDA_CHECK(cudaGetLastError());
-
-	int batch_ncols;
-	size_t shmem_needed;
-
-	T *labelptr = tempmem->sampledlabels->data();
-	get_sampled_labels(labels, labelptr, rowids, nrows, tempmem->stream);
-
-	if (split_algo == ML::SPLIT_ALGO::HIST) {
-		shmem_needed = n_pred_bytes + n_count_bytes + col_minmax_bytes;
-		update_kernel_config(max_shared_mem, shmem_needed, ncols, nrows, threads, batch_ncols, blocks, shmemsize);
-		all_cols_histograms_minmax_kernel_reg<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, batch_ncols, d_globalminmax, d_predout, d_histout);
-
-		shmem_needed += n_mse_bytes;
-		update_kernel_config(max_shared_mem, shmem_needed, ncols, nrows, threads, batch_ncols, blocks, shmemsize);
-		compute_mse_minmax_kernel_reg<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, batch_ncols, d_globalminmax, d_mseout, d_predout, d_histout, split_info[0].predict);
-	} else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) {
-		shmem_needed = n_pred_bytes + n_count_bytes;
-		update_kernel_config(max_shared_mem, shmem_needed, ncols, nrows, threads, batch_ncols, blocks, shmemsize);
-		all_cols_histograms_global_quantile_kernel_reg<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols, batch_ncols, d_predout, d_histout, tempmem->d_quantile->data());
-
-		shmem_needed += n_mse_bytes;
-		update_kernel_config(max_shared_mem, shmem_needed, ncols, nrows, threads, batch_ncols, blocks, shmemsize);
-
-		compute_mse_global_quantile_kernel_reg<<<blocks, threads, shmemsize, tempmem->stream>>>(tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols, batch_ncols, d_mseout, d_predout, d_histout, tempmem->d_quantile->data(), split_info[0].predict);
-	}
-	CUDA_CHECK(cudaGetLastError());
-
-	MLCommon::updateHost(h_mseout, d_mseout, n_mse_bytes / sizeof(T), tempmem->stream);
-	MLCommon::updateHost(h_histout, d_histout, n_count_bytes / sizeof(int), tempmem->stream);
-	MLCommon::updateHost(h_predout, d_predout, n_pred_bytes / sizeof(T), tempmem->stream);
-	CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
-
-	find_best_split_regressor(tempmem, nbins, colselector, &split_info[0], nrows, ques, gain, split_algo);
-	return;
+  size_t shmemsize = col_minmax_bytes;
+  if (split_algo == ML::SPLIT_ALGO::HIST) {  // Histograms (min, max)
+    MLCommon::Stats::minmax(
+      data, rowids, d_colids, nrows, ncols, rowoffset, &d_globalminmax[0],
+      &d_globalminmax[colselector.size()], tempmem->temp_data->data(),
+      tempmem->stream);
+  } else if (split_algo ==
+             ML::SPLIT_ALGO::
+               GLOBAL_QUANTILE) {  // Global quantiles; just col condenser
+    allcolsampler_kernel<<<blocks, threads, 0, tempmem->stream>>>(
+      data, rowids, d_colids, nrows, ncols, rowoffset,
+      tempmem->temp_data->data());
+  }
+  CUDA_CHECK(cudaGetLastError());
+
+  int batch_ncols;
+  size_t shmem_needed;
+
+  T* labelptr = tempmem->sampledlabels->data();
+  get_sampled_labels(labels, labelptr, rowids, nrows, tempmem->stream);
+
+  if (split_algo == ML::SPLIT_ALGO::HIST) {
+    shmem_needed = n_pred_bytes + n_count_bytes + col_minmax_bytes;
+    update_kernel_config(max_shared_mem, shmem_needed, ncols, nrows, threads,
+                         batch_ncols, blocks, shmemsize);
+    all_cols_histograms_minmax_kernel_reg<<<blocks, threads, shmemsize,
+                                            tempmem->stream>>>(
+      tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, batch_ncols,
+      d_globalminmax, d_predout, d_histout);
+
+    shmem_needed += n_mse_bytes;
+    update_kernel_config(max_shared_mem, shmem_needed, ncols, nrows, threads,
+                         batch_ncols, blocks, shmemsize);
+    compute_mse_minmax_kernel_reg
+      <<<blocks, threads, shmemsize, tempmem->stream>>>(
+        tempmem->temp_data->data(), labelptr, nbins, nrows, ncols, batch_ncols,
+        d_globalminmax, d_mseout, d_predout, d_histout, split_info[0].predict);
+  } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) {
+    shmem_needed = n_pred_bytes + n_count_bytes;
+    update_kernel_config(max_shared_mem, shmem_needed, ncols, nrows, threads,
+                         batch_ncols, blocks, shmemsize);
+    all_cols_histograms_global_quantile_kernel_reg<<<blocks, threads, shmemsize,
+                                                     tempmem->stream>>>(
+      tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols,
+      batch_ncols, d_predout, d_histout, tempmem->d_quantile->data());
+
+    shmem_needed += n_mse_bytes;
+    update_kernel_config(max_shared_mem, shmem_needed, ncols, nrows, threads,
+                         batch_ncols, blocks, shmemsize);
+
+    compute_mse_global_quantile_kernel_reg
+      <<<blocks, threads, shmemsize, tempmem->stream>>>(
+        tempmem->temp_data->data(), labelptr, d_colids, nbins, nrows, ncols,
+        batch_ncols, d_mseout, d_predout, d_histout,
+        tempmem->d_quantile->data(), split_info[0].predict);
+  }
+  CUDA_CHECK(cudaGetLastError());
+
+  MLCommon::updateHost(h_mseout, d_mseout, n_mse_bytes / sizeof(T),
+                       tempmem->stream);
+  MLCommon::updateHost(h_histout, d_histout, n_count_bytes / sizeof(int),
+                       tempmem->stream);
+  MLCommon::updateHost(h_predout, d_predout, n_pred_bytes / sizeof(T),
+                       tempmem->stream);
+  CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
+
+  find_best_split_regressor(tempmem, nbins, colselector, &split_info[0], nrows,
+                            ques, gain, split_algo);
+  return;
 }
-
diff --git a/cpp/src/decisiontree/kernels/metric.cuh b/cpp/src/decisiontree/kernels/metric.cuh
index 56469664c3..50d9a52cee 100644
--- a/cpp/src/decisiontree/kernels/metric.cuh
+++ b/cpp/src/decisiontree/kernels/metric.cuh
@@ -15,141 +15,173 @@
  */
 
 #pragma once
-#include
-#include "../memory.h"
-#include
-#include "metric_def.h"
 #include "cuda_utils.h"
+#include "metric_def.h"
+
+template <typename T>
+void MetricQuestion<T>::set_question_fields(int cfg_bootcolumn, int cfg_column,
+                                            int cfg_batch_id, int cfg_nbins,
+                                            int cfg_ncols, T cfg_min, T cfg_max,
+                                            T cfg_value) {
+  bootstrapped_column = cfg_bootcolumn;
+  original_column = cfg_column;
+  batch_id = cfg_batch_id;
+  min = cfg_min;
+  max = cfg_max;
+  nbins = cfg_nbins;
+  ncols = cfg_ncols;
+  value = cfg_value;  // Will be updated in make_split
+}
+
+template <typename T>
+__device__ __forceinline__ T
+SquareFunctor::exec(T x) {
+  return MLCommon::myPow(x, (T)2);
+}
 
-template<typename T>
-void MetricQuestion<T>::set_question_fields(int cfg_bootcolumn, int cfg_column, int cfg_batch_id, int cfg_nbins, int cfg_ncols, T cfg_min, T cfg_max, T cfg_value) {
-	bootstrapped_column = cfg_bootcolumn;
-	original_column = cfg_column;
-	batch_id = cfg_batch_id;
-	min = cfg_min;
-	max = cfg_max;
-	nbins = cfg_nbins;
-	ncols = cfg_ncols;
-	value = cfg_value; // Will be updated in make_split
+template <typename T>
+__device__ __forceinline__ T AbsFunctor::exec(T x) {
+  return MLCommon::myAbs(x);
 }
 
-__global__ void gini_kernel(const int* __restrict__ labels, const int nrows, const int nmax, int* histout)
-{
-	int tid = threadIdx.x + blockIdx.x * blockDim.x;
-	extern __shared__ unsigned int shmemhist[];
-	if (threadIdx.x < nmax)
-		shmemhist[threadIdx.x] = 0;
+float GiniFunctor::exec(std::vector<int> &hist, int nrows) {
+  float gval = 1.0;
+  for (int i = 0; i < hist.size(); i++) {
+    float prob = ((float)hist[i]) / nrows;
+    gval -= prob * prob;
+  }
+  return gval;
+}
+
+float EntropyFunctor::exec(std::vector<int> &hist, int nrows) {
+  float eval = 0.0;
+  for (int i = 0; i < hist.size(); i++) {
+    float prob = ((float)hist[i]) / nrows;
+    eval += prob * logf(prob);
+  }
+  return (-1 * eval);
+}
 
-	__syncthreads();
+__global__ void gini_kernel(const int *__restrict__ labels, const int nrows,
+                            const int nmax, int *histout) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  extern __shared__ unsigned int shmemhist[];
+  if (threadIdx.x < nmax) shmemhist[threadIdx.x] = 0;
 
-	if (tid < nrows) {
-		int label = labels[tid];
-		atomicAdd(&shmemhist[label], 1);
-	}
+  __syncthreads();
 
-	__syncthreads();
+  if (tid < nrows) {
+    int label = labels[tid];
+    atomicAdd(&shmemhist[label], 1);
+  }
 
-	if (threadIdx.x < nmax)
-		atomicAdd(&histout[threadIdx.x], shmemhist[threadIdx.x]);
+  __syncthreads();
 
-	return;
+  if (threadIdx.x < nmax)
+    atomicAdd(&histout[threadIdx.x], shmemhist[threadIdx.x]);
+
+  return;
 }
 
-template<typename T>
-__global__ void pred_kernel(const T* __restrict__ labels, const int nrows, T* predout)
-{
-	int tid = threadIdx.x + blockIdx.x * blockDim.x;
-	__shared__ T shmempred;
+template <typename T>
+__global__ void pred_kernel(const T *__restrict__ labels, const int nrows,
+                            T *predout) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  __shared__ T shmempred;
 
-	if (threadIdx.x == 0)
-		shmempred = 0;
+  if (threadIdx.x == 0) shmempred = 0;
 
-	__syncthreads();
+  __syncthreads();
 
-	if (tid < nrows) {
-		T label = labels[tid];
-		atomicAdd(&shmempred, label);
-	}
+  if (tid < nrows) {
+    T label = labels[tid];
+    atomicAdd(&shmempred, label);
+  }
 
-	__syncthreads();
+  __syncthreads();
 
-	if (threadIdx.x == 0) {
-		atomicAdd(predout, shmempred);
-	}
+  if (threadIdx.x == 0) {
+    atomicAdd(predout, shmempred);
+  }
 
-	return;
+  return;
 }
 
-template<typename T, typename F>
-__global__ void mse_kernel(const T* __restrict__ labels, const int nrows, const T* predout, T* mseout)
-{
-	int tid = threadIdx.x + blockIdx.x * blockDim.x;
-	__shared__ T shmemmse;
-
-	if (threadIdx.x == 0) {
-		shmemmse = 0;
-	}
+template <typename T, typename F>
+__global__ void mse_kernel(const T *__restrict__ labels, const int nrows,
+                           const T *predout, T *mseout) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  __shared__ T shmemmse;
 
-	__syncthreads();
+  if (threadIdx.x == 0) {
+    shmemmse = 0;
+  }
 
-	if (tid < nrows) {
-		T label = labels[tid] - (predout[0]/nrows);
-		atomicAdd(&shmemmse, F::exec(label));
-	}
+  __syncthreads();
 
-	__syncthreads();
+  if (tid < nrows) {
+    T label = labels[tid] - (predout[0] / nrows);
+    atomicAdd(&shmemmse, F::exec(label));
+  }
 
-	if (threadIdx.x == 0) {
-		atomicAdd(mseout, shmemmse);
-	}
+  __syncthreads();
 
-	return;
-}
+  if (threadIdx.x == 0) {
+    atomicAdd(mseout, shmemmse);
+  }
 
-template<typename T, typename F>
-void gini(int *labels_in, const int nrows, const std::shared_ptr<TemporaryMemory<T, int>> tempmem, MetricInfo<T> & split_info, int & unique_labels)
-{
-	int *dhist = tempmem->d_hist->data();
-	int *hhist = tempmem->h_hist->data();
-
-	CUDA_CHECK(cudaMemsetAsync(dhist, 0, sizeof(int)*unique_labels, tempmem->stream));
-	gini_kernel<<< MLCommon::ceildiv(nrows, 128), 128, sizeof(int)*unique_labels, tempmem->stream>>>(labels_in, nrows, unique_labels, dhist);
-	CUDA_CHECK(cudaGetLastError());
-	MLCommon::updateHost(hhist, dhist, unique_labels, tempmem->stream);
-	CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
-
-	split_info.hist.resize(unique_labels, 0);
-	for (int i=0; i < unique_labels; i++) {
-		split_info.hist[i] = hhist[i];
-	}
-
-	split_info.best_metric = F::exec(split_info.hist, nrows);
-
-	return;
+  return;
 }
 
-template<typename T, typename F>
-void mse(T *labels_in, const int nrows, const std::shared_ptr<TemporaryMemory<T, T>> tempmem, MetricInfo<T> & split_info)
-{
-	T *dpred = tempmem->d_predout->data();
-	T *dmse = tempmem->d_mseout->data();
-	T *hmse = tempmem->h_mseout->data();
-	T *hpred = tempmem->h_predout->data();
-
-	CUDA_CHECK(cudaMemsetAsync(dpred, 0, sizeof(T), tempmem->stream));
-	CUDA_CHECK(cudaMemsetAsync(dmse, 0, sizeof(T), tempmem->stream));
-
-	pred_kernel<<< MLCommon::ceildiv(nrows, 128), 128, 0, tempmem->stream>>>(labels_in, nrows, dpred);
-	CUDA_CHECK(cudaGetLastError());
-	mse_kernel<T, F><<< MLCommon::ceildiv(nrows, 128), 128, 0, tempmem->stream>>>(labels_in, nrows, dpred, dmse);
-	CUDA_CHECK(cudaGetLastError());
-
-	MLCommon::updateHost(hmse, dmse, 1, tempmem->stream);
-	MLCommon::updateHost(hpred, dpred, 1, tempmem->stream);
-	CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
-
-	split_info.best_metric = (float)hmse[0] / (float)nrows; //Update split metric value
-	split_info.predict = hpred[0] / (T)nrows;
-	return;
+template <typename T, typename F>
+void gini(int *labels_in, const int nrows,
+          const std::shared_ptr<TemporaryMemory<T, int>> tempmem,
+          MetricInfo<T> &split_info, const int unique_labels) {
+  int *dhist = tempmem->d_hist->data();
+  int *hhist = tempmem->h_hist->data();
+
+  CUDA_CHECK(
+    cudaMemsetAsync(dhist, 0, sizeof(int) * unique_labels, tempmem->stream));
+  gini_kernel<<<MLCommon::ceildiv(nrows, 128), 128,
+                sizeof(int) * unique_labels, tempmem->stream>>>(
+    labels_in, nrows, unique_labels, dhist);
+  CUDA_CHECK(cudaGetLastError());
+  MLCommon::updateHost(hhist, dhist, unique_labels, tempmem->stream);
+  CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
+
+  split_info.hist.resize(unique_labels, 0);
+  for (int i = 0; i < unique_labels; i++) {
+    split_info.hist[i] = hhist[i];
+  }
+
+  split_info.best_metric = F::exec(split_info.hist, nrows);
+
+  return;
 }
 
+template <typename T, typename F>
+void mse(T *labels_in, const int nrows,
+         const std::shared_ptr<TemporaryMemory<T, T>> tempmem,
+         MetricInfo<T> &split_info) {
+  T *dpred = tempmem->d_predout->data();
+  T *dmse = tempmem->d_mseout->data();
+  T *hmse = tempmem->h_mseout->data();
+  T *hpred = tempmem->h_predout->data();
+
+  CUDA_CHECK(cudaMemsetAsync(dpred, 0, sizeof(T), tempmem->stream));
+  CUDA_CHECK(cudaMemsetAsync(dmse, 0, sizeof(T), tempmem->stream));
+
+  pred_kernel<T><<<MLCommon::ceildiv(nrows, 128), 128, 0, tempmem->stream>>>(
+    labels_in, nrows, dpred);
+  CUDA_CHECK(cudaGetLastError());
+  mse_kernel<T, F><<<MLCommon::ceildiv(nrows, 128), 128, 0, tempmem->stream>>>(
+    labels_in, nrows, dpred, dmse);
+  CUDA_CHECK(cudaGetLastError());
+
+  MLCommon::updateHost(hmse, dmse, 1, tempmem->stream);
+  MLCommon::updateHost(hpred, dpred, 1, tempmem->stream);
+  CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
+
+  split_info.best_metric =
+    (float)hmse[0] / (float)nrows;  //Update split metric value
+  split_info.predict = hpred[0] / (T)nrows;
+  return;
+}
diff --git a/cpp/src/decisiontree/kernels/metric_def.h b/cpp/src/decisiontree/kernels/metric_def.h
index 65881e0fec..ffc7102d38 100644
--- a/cpp/src/decisiontree/kernels/metric_def.h
+++ b/cpp/src/decisiontree/kernels/metric_def.h
@@ -19,7 +19,6 @@
 #include
 #include
 #include "../memory.h"
-#include "cuda_utils.h"
 
 template <class T>
 struct MetricQuestion {
@@ -58,36 +57,18 @@ struct MetricInfo {
 
 struct SquareFunctor {
   template <typename T>
-  static __device__ __forceinline__ T exec(T x) {
-    return MLCommon::myPow(x, (T)2);
-  }
+  static __device__ __forceinline__ T exec(T x);
 };
 
 struct AbsFunctor {
   template <typename T>
-  static __device__ __forceinline__ T exec(T x) {
-    return MLCommon::myAbs(x);
-  }
+  static __device__ __forceinline__ T exec(T x);
 };
 
 struct GiniFunctor {
-  static float exec(std::vector<int>& hist, int nrows) {
-    float gval = 1.0;
-    for (int i = 0; i < hist.size(); i++) {
-      float prob = ((float)hist[i]) / nrows;
-      gval -= prob * prob;
-    }
-    return gval;
-  }
+  static float exec(std::vector<int>& hist, int nrows);
 };
 
 struct EntropyFunctor {
-  static float exec(std::vector<int>& hist, int nrows) {
-    float eval = 0.0;
-    for (int i = 0; i < hist.size(); i++) {
-      float prob = ((float)hist[i]) / nrows;
-      eval += prob * logf(prob);
-    }
-    return (-1 * eval);
-  }
+  static float exec(std::vector<int>& hist, int nrows);
 };
diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu
index d9f9167038..82de741bc3 100644
--- a/cpp/src/randomforest/randomforest.cu
+++ b/cpp/src/randomforest/randomforest.cu
@@ -26,7 +26,7 @@ namespace ML {
  * @param[in] cfg_accuracy: accuracy.
  */
 RF_metrics::RF_metrics(float cfg_accuracy)
-  : rf_type(RF_type::CLASSIFICATION), accuracy(cfg_accuracy) {};
+  : rf_type(RF_type::CLASSIFICATION), accuracy(cfg_accuracy){};
 
 /**
  * @brief Construct RF_metrics.
@@ -39,7 +39,7 @@ RF_metrics::RF_metrics(double cfg_mean_abs_error,
                        double cfg_mean_squared_error,
   : rf_type(RF_type::REGRESSION),
     mean_abs_error(cfg_mean_abs_error),
     mean_squared_error(cfg_mean_squared_error),
-    median_abs_error(cfg_median_abs_error) {};
+    median_abs_error(cfg_median_abs_error){};
 
 /**
  * @brief Print either accuracy metric for classification, or mean absolute error, mean squared error,
@@ -111,7 +111,7 @@ void postprocess_labels(int n_rows, std::vector<int>& labels,
 /**
 * @brief Random forest hyper-parameter object default constructor (1 tree).
 */
-//RF_params::RF_params() : n_trees(1) {}
+RF_params::RF_params() : n_trees(1) {}
 
 /**
 * @brief Random forest hyper-parameter object constructor to set n_trees member.
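For reference, the two classification metrics implemented above (GiniFunctor::exec and EntropyFunctor::exec in metric.cuh) are the standard impurity measures over a node's class histogram, with p_i = hist[i] / nrows exactly as computed in the loops:

    G = 1 - \sum_i p_i^2          % Gini impurity
    H = -\sum_i p_i \log p_i      % entropy

Both are zero for a pure node; Gini is bounded above by 1, while entropy is bounded by \log k for k equiprobable classes, which is the difference the EntropyFunctor::max_val fix later in this series accounts for.
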
diff --git a/cpp/src/randomforest/randomforest.h b/cpp/src/randomforest/randomforest.h index 85bb3a70de..ad23c69353 100644 --- a/cpp/src/randomforest/randomforest.h +++ b/cpp/src/randomforest/randomforest.h @@ -65,7 +65,7 @@ struct RF_params { */ DecisionTree::DecisionTreeParams tree_params; - //RF_params(); + RF_params(); RF_params(int cfg_n_trees); RF_params(bool cfg_bootstrap, bool cfg_bootstrap_features, int cfg_n_trees, float cfg_rows_sample); diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index aa15e436ac..3edc404290 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -235,7 +235,8 @@ class RfRegressorTest : public ::testing::TestWithParam> { const std::vector> inputsf2_clf = { {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2, - CRITERION::GINI}, // single tree forest, bootstrap false, unlimited depth, 4 bins + CRITERION:: + GINI}, // single tree forest, bootstrap false, unlimited depth, 4 bins {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, CRITERION::GINI}, // single tree forest, bootstrap false, depth of 8, 4 bins {4, 2, 10, 1.0f, 1.0f, 4, 8, -1, false, false, 4, SPLIT_ALGO::HIST, 2, @@ -257,8 +258,7 @@ const std::vector> inputsf2_clf = { {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::HIST, 2, CRITERION::ENTROPY}, {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE, - 2, CRITERION::ENTROPY} -}; + 2, CRITERION::ENTROPY}}; const std::vector> inputsd2_clf = { // Same as inputsf2_clf {4, 2, 1, 1.0f, 1.0f, 4, -1, -1, false, false, 4, SPLIT_ALGO::HIST, 2, diff --git a/python/cuml/ensemble/randomforest.pyx b/python/cuml/ensemble/randomforest.pyx index 1ea55ff4a0..8c474b5ba3 100644 --- a/python/cuml/ensemble/randomforest.pyx +++ b/python/cuml/ensemble/randomforest.pyx @@ -48,6 +48,13 @@ cdef extern from "randomforest/randomforest.h" namespace "ML": CLASSIFICATION, REGRESSION + cdef enum CRITERION: + GINI, + ENTROPY, + MSE, + MAE, + CRITERION_END + cdef struct RF_params: pass @@ -98,7 +105,7 @@ cdef extern from "randomforest/randomforest.h" namespace "ML": cdef RF_params set_rf_class_obj(int, int, float, int, int, int, - bool, bool, int, int) except + + bool, bool, int, int, CRITERION) except + cdef class RandomForest_impl(): @@ -112,6 +119,7 @@ cdef class RandomForest_impl(): cdef object max_features cdef object n_bins cdef object split_algo + cdef object split_criterion cdef object min_rows_per_node cdef object bootstrap cdef object bootstrap_features @@ -126,7 +134,7 @@ cdef class RandomForest_impl(): def __cinit__(self, n_estimators=10, max_depth=-1, handle=None, max_features=1.0, n_bins=8, - split_algo=0, min_rows_per_node=2, + split_algo=0, split_criterion=0, min_rows_per_node=2, bootstrap=True, bootstrap_features=False, type_model="classifier", verbose=False, rows_sample=1.0, max_leaves=-1, @@ -134,6 +142,7 @@ cdef class RandomForest_impl(): self.handle = handle self.split_algo = split_algo + self.split_criterion = split_criterion self.min_rows_per_node = min_rows_per_node self.bootstrap_features = bootstrap_features self.rows_sample = rows_sample @@ -197,7 +206,8 @@ cdef class RandomForest_impl(): self.bootstrap_features, self.bootstrap, self.n_estimators, - self.rows_sample) + self.rows_sample, + self.split_criterion) self.rf_classifier32 = new \ rfClassifier[float](rf_param) @@ -356,7 +366,7 @@ class RandomForestClassifier(Base): handle : cuml.Handle If it is None, a new one is created just for this class split_algo : The type of algorithm to be used to create the 
trees. - 0 for HIST, 1 for GLOBAL_QUANTILE and 3 for SPLIT_ALGO_END. + 0 for HIST, 1 for GLOBAL_QUANTILE and 2 for SPLIT_ALGO_END. default = 0 bootstrap : Control bootstrapping. If set, each tree in the forest is built @@ -379,7 +389,7 @@ class RandomForestClassifier(Base): """ def __init__(self, n_estimators=10, max_depth=-1, handle=None, max_features=1.0, n_bins=8, - split_algo=0, min_rows_per_node=2, + split_algo=0, split_criterion=0, min_rows_per_node=2, bootstrap=True, bootstrap_features=False, type_model="classifier", verbose=False, rows_sample=1.0, max_leaves=-1, @@ -410,6 +420,7 @@ class RandomForestClassifier(Base): super(RandomForestClassifier, self).__init__(handle, verbose) self.split_algo = split_algo + self.split_criterion = split_criterion self.min_rows_per_node = min_rows_per_node self.bootstrap_features = bootstrap_features self.rows_sample = rows_sample @@ -424,7 +435,7 @@ class RandomForestClassifier(Base): self._impl = RandomForest_impl(n_estimators, max_depth, self.handle, max_features, n_bins, - split_algo, min_rows_per_node, + split_algo, split_criterion, min_rows_per_node, bootstrap, bootstrap_features, type_model, verbose, rows_sample, max_leaves, @@ -494,7 +505,7 @@ class RandomForestClassifier(Base): params = dict() self.variables = ['n_estimators', 'max_depth', 'handle', 'max_features', 'n_bins', - 'split_algo', 'min_rows_per_node', + 'split_algo', 'split_criterion', 'min_rows_per_node', 'bootstrap', 'bootstrap_features', 'verbose', 'rows_sample', 'max_leaves'] From b588047e1e0e8d704c42e114eee738b59f70cff1 Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Tue, 18 Jun 2019 07:57:24 -0700 Subject: [PATCH 44/51] WIP python wrapper changes. - Examples run but accuracy is low. TODO Debug - Corrected doxygen comments in randomforest.cu --- cpp/src/randomforest/randomforest.cu | 32 +++++++++++------------ python/cuml/ensemble/randomforest.pyx | 35 +++++++++++++++++--------- python/cuml/test/test_random_forest.py | 3 ++- 3 files changed, 41 insertions(+), 29 deletions(-) diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu index 82de741bc3..a912732927 100644 --- a/cpp/src/randomforest/randomforest.cu +++ b/cpp/src/randomforest/randomforest.cu @@ -410,7 +410,7 @@ void rfClassifier::fit(const cumlHandle& user_handle, T* input, int n_rows, * @brief Predict target feature for input data; n-ary classification for single feature supported. * @tparam T: data type for input data (float or double). * @param[in] user_handle: cumlHandle. - * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. + * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. @@ -476,7 +476,7 @@ void rfClassifier::predict(const cumlHandle& user_handle, const T* input, * @brief Predict target feature for input data and validate against ref_labels. * @tparam T: data type for input data (float or double). * @param[in] user_handle: cumlHandle. - * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. + * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU pointer. * @param[in] ref_labels: label values for cross validation (n_rows elements); GPU pointer. * @param[in] n_rows: number of data samples. 
* @param[in] n_cols: number of features (excluding target feature). @@ -598,7 +598,7 @@ void rfRegressor::fit(const cumlHandle& user_handle, T* input, int n_rows, * @brief Predict target feature for input data; regression for single feature supported. * @tparam T: data type for input data (float or double). * @param[in] user_handle: cumlHandle. - * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. + * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. @@ -654,7 +654,7 @@ void rfRegressor::predict(const cumlHandle& user_handle, const T* input, * @brief Predict target feature for input data and validate against ref_labels. * @tparam T: data type for input data (float or double). * @param[in] user_handle: cumlHandle. - * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. + * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU pointer. * @param[in] ref_labels: label values for cross validation (n_rows elements); GPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). @@ -738,7 +738,7 @@ void fit(const cumlHandle& user_handle, rfClassifier* rf_classifier, * @brief Predict target feature for input data of type float; n-ary classification for single feature supported. * @param[in] user_handle: cumlHandle. * @param[in] rf_classifier: pointer to the rfClassifier object. The user should have previously called fit to build the random forest. - * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. + * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. @@ -755,7 +755,7 @@ void predict(const cumlHandle& user_handle, * @brief Predict target feature for input data of type double; n-ary classification for single feature supported. * @param[in] user_handle: cumlHandle. * @param[in] rf_classifier: pointer to the rfClassifier object. The user should have previously called fit to build the random forest. - * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. + * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. @@ -772,8 +772,8 @@ void predict(const cumlHandle& user_handle, * @brief Predict target feature for input data of type float and validate against ref_labels. * @param[in] user_handle: cumlHandle. * @param[in] rf_classifier: pointer to the rfClassifier object. The user should have previously called fit to build the random forest. - * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. - * @param[in] ref_labels: label values for cross validation (n_rows elements); CPU pointer. + * @param[in] input: test data (n_rows samples, n_cols features) in row major format. 
GPU pointer. + * @param[in] ref_labels: label values for cross validation (n_rows elements); GPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. @@ -791,8 +791,8 @@ RF_metrics cross_validate(const cumlHandle& user_handle, * @brief Predict target feature for input data of type double and validate against ref_labels. * @param[in] user_handle: cumlHandle. * @param[in] rf_classifier: pointer to the rfClassifier object. The user should have previously called fit to build the random forest. - * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. - * @param[in] ref_labels: label values for cross validation (n_rows elements); CPU pointer. + * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU pointer. + * @param[in] ref_labels: label values for cross validation (n_rows elements); GPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. @@ -853,7 +853,7 @@ void fit(const cumlHandle& user_handle, rfRegressor* rf_regressor, * @brief Predict target feature for input data of type float; regression for single feature supported. * @param[in] user_handle: cumlHandle. * @param[in] rf_regressor: pointer to the rfRegressor object. The user should have previously called fit to build the random forest. - * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. + * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. @@ -870,7 +870,7 @@ void predict(const cumlHandle& user_handle, * @brief Predict target feature for input data of type double; regression for single feature supported. * @param[in] user_handle: cumlHandle. * @param[in] rf_regressor: pointer to the rfRegressor object. The user should have previously called fit to build the random forest. - * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. + * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. @@ -887,8 +887,8 @@ void predict(const cumlHandle& user_handle, * @brief Predict target feature for input data of type float and validate against ref_labels. * @param[in] user_handle: cumlHandle. * @param[in] rf_regressor: pointer to the rfRegressor object. The user should have previously called fit to build the random forest. - * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. - * @param[in] ref_labels: label values for cross validation (n_rows elements); CPU pointer. + * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU pointer. + * @param[in] ref_labels: label values for cross validation (n_rows elements); GPU pointer. * @param[in] n_rows: number of data samples. 
* @param[in] n_cols: number of features (excluding target feature). * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. @@ -907,8 +907,8 @@ RF_metrics cross_validate(const cumlHandle& user_handle, * @brief Predict target feature for input data of type double and validate against ref_labels. * @param[in] user_handle: cumlHandle. * @param[in] rf_regressor: pointer to the rfRegressor object. The user should have previously called fit to build the random forest. - * @param[in] input: test data (n_rows samples, n_cols features) in row major format. CPU pointer. - * @param[in] ref_labels: label values for cross validation (n_rows elements); CPU pointer. + * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU pointer. + * @param[in] ref_labels: label values for cross validation (n_rows elements); GPU pointer. * @param[in] n_rows: number of data samples. * @param[in] n_cols: number of features (excluding target feature). * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated. diff --git a/python/cuml/ensemble/randomforest.pyx b/python/cuml/ensemble/randomforest.pyx index 8c474b5ba3..64798527a3 100644 --- a/python/cuml/ensemble/randomforest.pyx +++ b/python/cuml/ensemble/randomforest.pyx @@ -241,16 +241,16 @@ cdef class RandomForest_impl(): def predict(self, X): cdef uintptr_t X_ptr - X_ptr = X.ctypes.data - n_rows, n_cols = np.shape(X) + X_m, X_ptr, n_rows, n_cols, _ = \ + input_to_dev_array(X, order='C') # row major format if n_cols != self.n_cols: raise ValueError(" The number of columns/features in the training" " and test data should be the same ") - preds = np.zeros(n_rows, - dtype=np.int32) - - cdef uintptr_t preds_ptr = preds.ctypes.data + preds = np.zeros(n_rows, dtype=np.int32) + cdef uintptr_t preds_ptr; + preds_m, preds_ptr, _, _, _ = \ + input_to_dev_array(preds) cdef cumlHandle* handle_ =\ self.handle.getHandle() @@ -278,14 +278,16 @@ cdef class RandomForest_impl(): % (str(self.dtype))) self.handle.sync() + del(X_m) + del(preds_m) return preds def cross_validate(self, X, y): cdef uintptr_t X_ptr, y_ptr - X_ptr = X.ctypes.data - y_ptr = y.ctypes.data - n_rows, n_cols = np.shape(X) + X_m, X_ptr, n_rows, n_cols, _ = \ + input_to_dev_array(X, order='C') # row major format + y_m, y_ptr, _, _, _ = input_to_dev_array(y) if n_cols != self.n_cols: raise ValueError(" The number of columns/features in the training" @@ -293,8 +295,9 @@ cdef class RandomForest_impl(): preds = np.zeros(n_rows, dtype=np.int32) - - cdef uintptr_t preds_ptr = (preds).ctypes.data + cdef uintptr_t preds_ptr; + preds_m, preds_ptr, _, _, _ = \ + input_to_dev_array(preds) cdef cumlHandle* handle_ =\ self.handle.getHandle() @@ -320,6 +323,9 @@ cdef class RandomForest_impl(): self.verbose) self.handle.sync() + del(X_m) + del(y_m) + del(preds_m) return self.stats @@ -368,6 +374,10 @@ class RandomForestClassifier(Base): split_algo : The type of algorithm to be used to create the trees. 0 for HIST, 1 for GLOBAL_QUANTILE and 2 for SPLIT_ALGO_END. default = 0 + split_criterion: The criterion used to split nodes. + 0 for GINI, 1 for ENTROPY, 4 for CRITERION_END. + 2 and 3 not valid for classification + default = 0 bootstrap : Control bootstrapping. If set, each tree in the forest is built on a bootstrapped sample with replacement. 
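The integer codes in the split_criterion docstring above follow the declaration order of the CRITERION enum mirrored in this file's extern block. A minimal sketch of the implied numbering (the explicit values are inferred from declaration order and the docstring; the header itself does not spell them out):

    // CRITERION as exposed to the Python wrapper: GINI/ENTROPY apply to
    // classification, MSE/MAE to regression.
    enum CRITERION {
      GINI = 0,
      ENTROPY = 1,
      MSE = 2,
      MAE = 3,
      CRITERION_END = 4,
    };
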
@@ -435,7 +445,8 @@ class RandomForestClassifier(Base): self._impl = RandomForest_impl(n_estimators, max_depth, self.handle, max_features, n_bins, - split_algo, split_criterion, min_rows_per_node, + split_algo, split_criterion, + min_rows_per_node, bootstrap, bootstrap_features, type_model, verbose, rows_sample, max_leaves, diff --git a/python/cuml/test/test_random_forest.py b/python/cuml/test/test_random_forest.py index c27b379cea..b45c387e69 100644 --- a/python/cuml/test/test_random_forest.py +++ b/python/cuml/test/test_random_forest.py @@ -22,7 +22,8 @@ def test_rf_predict_numpy(datatype, use_handle): y_test = np.asarray(y[900:, ]) handle, stream = get_handle(use_handle) cuml_model = curfc(max_features=1.0, - n_bins=4, split_algo=0, min_rows_per_node=2, + n_bins=4, split_algo=0, split_criterion=0, + min_rows_per_node=2, n_estimators=40, handle=handle, max_leaves=-1) cuml_model.fit(X_train, y_train) cu_predict = cuml_model.predict(X_test) From 11818901783782d15d6c19bd10a9d2c9945048a5 Mon Sep 17 00:00:00 2001 From: Myrto Papadopoulou Date: Wed, 19 Jun 2019 02:05:49 -0700 Subject: [PATCH 45/51] Fixed Python wrapper --- python/cuml/ensemble/randomforest.pyx | 12 +++++++----- python/cuml/test/test_random_forest.py | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/python/cuml/ensemble/randomforest.pyx b/python/cuml/ensemble/randomforest.pyx index 64798527a3..4ba96b9228 100644 --- a/python/cuml/ensemble/randomforest.pyx +++ b/python/cuml/ensemble/randomforest.pyx @@ -241,14 +241,15 @@ cdef class RandomForest_impl(): def predict(self, X): cdef uintptr_t X_ptr + # row major format X_m, X_ptr, n_rows, n_cols, _ = \ - input_to_dev_array(X, order='C') # row major format + input_to_dev_array(X, order='C') if n_cols != self.n_cols: raise ValueError(" The number of columns/features in the training" " and test data should be the same ") preds = np.zeros(n_rows, dtype=np.int32) - cdef uintptr_t preds_ptr; + cdef uintptr_t preds_ptr preds_m, preds_ptr, _, _, _ = \ input_to_dev_array(preds) cdef cumlHandle* handle_ =\ @@ -278,6 +279,7 @@ cdef class RandomForest_impl(): % (str(self.dtype))) self.handle.sync() + preds = preds_m.copy_to_host() #synchronous w/o a stream del(X_m) del(preds_m) return preds @@ -286,7 +288,7 @@ cdef class RandomForest_impl(): cdef uintptr_t X_ptr, y_ptr X_m, X_ptr, n_rows, n_cols, _ = \ - input_to_dev_array(X, order='C') # row major format + input_to_dev_array(X, order='C') y_m, y_ptr, _, _, _ = input_to_dev_array(y) if n_cols != self.n_cols: @@ -295,8 +297,8 @@ cdef class RandomForest_impl(): preds = np.zeros(n_rows, dtype=np.int32) - cdef uintptr_t preds_ptr; - preds_m, preds_ptr, _, _, _ = \ + cdef uintptr_t preds_ptr + preds_m, preds_ptr, _, _, _ = \ input_to_dev_array(preds) cdef cumlHandle* handle_ =\ diff --git a/python/cuml/test/test_random_forest.py b/python/cuml/test/test_random_forest.py index b45c387e69..a9b272defa 100644 --- a/python/cuml/test/test_random_forest.py +++ b/python/cuml/test/test_random_forest.py @@ -22,9 +22,9 @@ def test_rf_predict_numpy(datatype, use_handle): y_test = np.asarray(y[900:, ]) handle, stream = get_handle(use_handle) cuml_model = curfc(max_features=1.0, - n_bins=4, split_algo=0, split_criterion=0, + n_bins=8, split_algo=0, split_criterion=0, min_rows_per_node=2, - n_estimators=40, handle=handle, max_leaves=-1) + n_estimators=40, handle=handle, max_leaves=-1, max_depth=-1) cuml_model.fit(X_train, y_train) cu_predict = cuml_model.predict(X_test) cu_acc = accuracy_score(y_test, cu_predict) From 
51017ac5bf02504d333d54eb662db0245dfb8d73 Mon Sep 17 00:00:00 2001
From: Vishal Mehta
Date: Mon, 24 Jun 2019 11:30:13 +0200
Subject: [PATCH 46/51] adding host quantile data structure and removing host
 mem copies for node questions

---
 cpp/src/decisiontree/kernels/evaluate_classifier.cuh | 4 +---
 cpp/src/decisiontree/kernels/evaluate_regressor.cuh  | 4 +---
 cpp/src/decisiontree/kernels/quantile.cuh            | 3 ++-
 cpp/src/decisiontree/memory.cuh                      | 4 ++++
 cpp/src/decisiontree/memory.h                        | 1 +
 5 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
index aee1e5fc4f..eadde4f329 100644
--- a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
+++ b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
@@ -227,10 +227,8 @@ void find_best_split_classifier(
       std::numeric_limits<T>::max(), -std::numeric_limits<T>::max(), (T)0);
   } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) {
     T ques_val;
-    T* d_quantile = tempmem->d_quantile->data();
     int q_index = col_selector[best_col_id] * nbins + best_bin_id;
-    MLCommon::updateHost(&ques_val, &d_quantile[q_index], 1, tempmem->stream);
-    CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
+    ques_val = tempmem->h_quantile->data()[q_index];
     ques.set_question_fields(
       best_col_id, col_selector[best_col_id], best_bin_id, nbins, n_cols,
       std::numeric_limits<T>::max(), -std::numeric_limits<T>::max(), ques_val);
diff --git a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh
index 7400c5cded..a651f256af 100644
--- a/cpp/src/decisiontree/kernels/evaluate_regressor.cuh
+++ b/cpp/src/decisiontree/kernels/evaluate_regressor.cuh
@@ -354,10 +354,8 @@ void find_best_split_regressor(
       std::numeric_limits<T>::max(), -std::numeric_limits<T>::max(), (T)0);
   } else if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) {
     T ques_val;
-    T* d_quantile = tempmem->d_quantile->data();
     int q_index = col_selector[best_col_id] * nbins + best_bin_id;
-    MLCommon::updateHost(&ques_val, &d_quantile[q_index], 1, tempmem->stream);
-    CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
+    ques_val = tempmem->h_quantile->data()[q_index];
     ques.set_question_fields(
       best_col_id, col_selector[best_col_id], best_bin_id, nbins, n_cols,
       std::numeric_limits<T>::max(), -std::numeric_limits<T>::max(), ques_val);
diff --git a/cpp/src/decisiontree/kernels/quantile.cuh b/cpp/src/decisiontree/kernels/quantile.cuh
index 69216e7a27..0cfefdcc34 100644
--- a/cpp/src/decisiontree/kernels/quantile.cuh
+++ b/cpp/src/decisiontree/kernels/quantile.cuh
@@ -122,7 +122,8 @@ void preprocess_quantile(const T *data, const unsigned int *rowids,
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
   }
-
+  MLCommon::updateHost(tempmem->h_quantile->data(), tempmem->d_quantile->data(),
+                       nbins * ncols, tempmem->stream);
   d_keys_out->release(tempmem->stream);
   d_offsets->release(tempmem->stream);
   d_temp_storage->release(tempmem->stream);
diff --git a/cpp/src/decisiontree/memory.cuh b/cpp/src/decisiontree/memory.cuh
index 7968522f5c..f6de336080 100644
--- a/cpp/src/decisiontree/memory.cuh
+++ b/cpp/src/decisiontree/memory.cuh
@@ -46,6 +46,8 @@ TemporaryMemory<T, L>::TemporaryMemory(const ML::cumlHandle_impl& handle, int N,
   totalmem += n_hist_elements * sizeof(int) + N * extra_elements * sizeof(T);
 
   if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE) {
+    h_quantile = new MLCommon::host_buffer<T>(handle.getHostAllocator(), stream,
+                                              n_bins * quantile_elements);
     d_quantile = new MLCommon::device_buffer<T>(
       handle.getDeviceAllocator(), stream, n_bins * quantile_elements);
     totalmem += n_bins * extra_elements * sizeof(T);
@@ -123,6 +125,8 @@ TemporaryMemory<T, L>::~TemporaryMemory() {
 
   if (d_quantile != nullptr) {
     d_quantile->release(stream);
+    h_quantile->release(stream);
+    delete h_quantile;
     delete d_quantile;
   }
 
diff --git a/cpp/src/decisiontree/memory.h b/cpp/src/decisiontree/memory.h
index e59df95650..803b4bf44b 100644
--- a/cpp/src/decisiontree/memory.h
+++ b/cpp/src/decisiontree/memory.h
@@ -55,6 +55,7 @@ struct TemporaryMemory {
 
   //For quantiles
   MLCommon::device_buffer<T> *d_quantile = nullptr;
+  MLCommon::host_buffer<T> *h_quantile = nullptr;
 
   const ML::cumlHandle_impl &ml_handle;
 
From 10473d32f9bcd792e5d52a3010c13b949871b91d Mon Sep 17 00:00:00 2001
From: Vishal Mehta
Date: Wed, 26 Jun 2019 11:54:21 +0200
Subject: [PATCH 47/51] quantile fix; once per RF; quantile per tree flag;
 default false; tempmem optimization

---
 cpp/src/decisiontree/decisiontree.cu          | 66 +++++++++++--------
 cpp/src/decisiontree/decisiontree.h           | 19 ++++--
 .../decisiontree/kernels/col_condenser.cuh    |  7 +-
 .../kernels/evaluate_classifier.cuh           |  2 +-
 cpp/src/decisiontree/kernels/quantile.cuh     |  1 +
 cpp/src/decisiontree/kernels/quantile.h       | 23 +++++++
 cpp/src/randomforest/randomforest.cu          | 37 +++++++++--
 cpp/src/randomforest/randomforest.h           |  3 +-
 cpp/test/sg/rf_test.cu                        |  4 +-
 9 files changed, 117 insertions(+), 45 deletions(-)
 create mode 100644 cpp/src/decisiontree/kernels/quantile.h

diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu
index 81344acff7..f6857c6e2d 100644
--- a/cpp/src/decisiontree/decisiontree.cu
+++ b/cpp/src/decisiontree/decisiontree.cu
@@ -70,12 +70,10 @@ DecisionTreeParams::DecisionTreeParams() {}
 /**
  * @brief Decision tree hyper-parameter object constructor to set all DecisionTreeParams members.
  */
-DecisionTreeParams::DecisionTreeParams(int cfg_max_depth, int cfg_max_leaves,
-                                       float cfg_max_features, int cfg_n_bins,
-                                       int cfg_split_algo,
-                                       int cfg_min_rows_per_node,
-                                       bool cfg_bootstrap_features,
-                                       CRITERION cfg_split_criterion)
+DecisionTreeParams::DecisionTreeParams(
+  int cfg_max_depth, int cfg_max_leaves, float cfg_max_features, int cfg_n_bins,
+  int cfg_split_algo, int cfg_min_rows_per_node, bool cfg_bootstrap_features,
+  CRITERION cfg_split_criterion, bool cfg_quantile_per_tree)
   : max_depth(cfg_max_depth),
     max_leaves(cfg_max_leaves),
     max_features(cfg_max_features),
     n_bins(cfg_n_bins),
     split_algo(cfg_split_algo),
     min_rows_per_node(cfg_min_rows_per_node),
     bootstrap_features(cfg_bootstrap_features),
-    split_criterion(cfg_split_criterion) {}
+    split_criterion(cfg_split_criterion),
+    quantile_per_tree(cfg_quantile_per_tree) {}
 
 /**
  * @brief Check validity of all decision tree hyper-parameters.
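A minimal usage sketch of the extended constructor (argument values are illustrative only; the trailing flag is the new cfg_quantile_per_tree, and the ML:: qualification is assumed from the surrounding namespace):

    // Global-quantile splits with 8 bins; quantiles are computed once per
    // forest (quantile_per_tree = false, the new default) instead of once
    // per tree.
    DecisionTree::DecisionTreeParams tree_params(
      /*max_depth=*/8, /*max_leaves=*/-1, /*max_features=*/1.0f,
      /*n_bins=*/8, /*split_algo=*/ML::SPLIT_ALGO::GLOBAL_QUANTILE,
      /*min_rows_per_node=*/2, /*bootstrap_features=*/false,
      ML::CRITERION::MSE, /*quantile_per_tree=*/false);

Passing true restores the previous behavior of recomputing quantiles inside every tree.
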
@@ -180,7 +179,8 @@ void DecisionTreeBase<T, L>::plant(
   L *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels,
   int maxdepth, int max_leaf_nodes, const float colper, int n_bins,
   int split_algo_flag, int cfg_min_rows_per_node, bool cfg_bootstrap_features,
-  CRITERION cfg_split_criterion) {
+  CRITERION cfg_split_criterion, bool quantile_per_tree,
+  std::shared_ptr<TemporaryMemory<T, L>> in_tempmem) {
   split_algo = split_algo_flag;
   dinfo.NLocalrows = nrows;
   dinfo.NGlobalrows = nrows;
@@ -226,10 +226,16 @@
                                shmem_used);
 
   for (int i = 0; i < MAXSTREAMS; i++) {
-    tempmem[i] = std::make_shared<TemporaryMemory<T, L>>(
-      handle, n_sampled_rows, ncols, MAXSTREAMS, unique_labels, n_bins,
-      split_algo);
-    if (split_algo == SPLIT_ALGO::GLOBAL_QUANTILE) {
+    if (in_tempmem != nullptr) {
+      tempmem[i] = in_tempmem;
+    } else {
+      tempmem[i] = std::make_shared<TemporaryMemory<T, L>>(
+        handle, n_sampled_rows, ncols, MAXSTREAMS, unique_labels, n_bins,
+        split_algo);
+      quantile_per_tree = true;
+    }
+    if (split_algo == SPLIT_ALGO::GLOBAL_QUANTILE &&
+        quantile_per_tree == true) {
      preprocess_quantile(data, rowids, n_sampled_rows, ncols, dinfo.NLocalrows,
                          n_bins, tempmem[i]);
     }
@@ -240,9 +246,10 @@
   MLCommon::TimerCPU timer;
   root = grow_tree(data, colper, labels, 0, rowids, n_sampled_rows, split_info);
   construct_time = timer.getElapsedSeconds();
-
-  for (int i = 0; i < MAXSTREAMS; i++) {
-    tempmem[i].reset();
+  if (in_tempmem == nullptr) {
+    for (int i = 0; i < MAXSTREAMS; i++) {
+      tempmem[i].reset();
+    }
   }
 }
 
@@ -397,7 +404,8 @@ template <typename T, typename L>
 void DecisionTreeBase<T, L>::base_fit(
   const ML::cumlHandle &handle, T *data, const int ncols, const int nrows,
   L *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels,
-  DecisionTreeParams &tree_params, bool is_classifier) {
+  DecisionTreeParams &tree_params, bool is_classifier,
+  std::shared_ptr<TemporaryMemory<T, L>> in_tempmem) {
   const char *CRITERION_NAME[] = {"GINI", "ENTROPY", "MSE", "MAE", "END"};
   CRITERION default_criterion =
     (is_classifier) ? CRITERION::GINI : CRITERION::MSE;
@@ -426,7 +434,7 @@ void DecisionTreeBase<T, L>::base_fit(
     unique_labels, tree_params.max_depth, tree_params.max_leaves,
     tree_params.max_features, tree_params.n_bins, tree_params.split_algo,
     tree_params.min_rows_per_node, tree_params.bootstrap_features,
-    tree_params.split_criterion);
+    tree_params.split_criterion, tree_params.quantile_per_tree, in_tempmem);
 }
 
 /**
@@ -446,13 +454,13 @@ void DecisionTreeBase<T, L>::base_fit(
 * @param[in] tree_params: Decision Tree training hyper parameter struct.
 */
 template <typename T>
-void DecisionTreeClassifier<T>::fit(const ML::cumlHandle &handle, T *data,
-                                    const int ncols, const int nrows,
-                                    int *labels, unsigned int *rowids,
-                                    const int n_sampled_rows, int unique_labels,
-                                    DecisionTreeParams tree_params) {
+void DecisionTreeClassifier<T>::fit(
+  const ML::cumlHandle &handle, T *data, const int ncols, const int nrows,
+  int *labels, unsigned int *rowids, const int n_sampled_rows,
+  int unique_labels, DecisionTreeParams tree_params,
+  std::shared_ptr<TemporaryMemory<T, int>> in_tempmem) {
   this->base_fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
-                 unique_labels, tree_params, true);
+                 unique_labels, tree_params, true, in_tempmem);
 }
 
 template <typename T>
@@ -511,13 +519,13 @@ void DecisionTreeClassifier<T>::find_best_fruit_all(
 * @param[in] tree_params: Decision Tree training hyper parameter struct.
 */
 template <typename T>
-void DecisionTreeRegressor<T>::fit(const ML::cumlHandle &handle, T *data,
-                                   const int ncols, const int nrows, T *labels,
-                                   unsigned int *rowids,
-                                   const int n_sampled_rows,
-                                   DecisionTreeParams tree_params) {
+void DecisionTreeRegressor<T>::fit(
+  const ML::cumlHandle &handle, T *data, const int ncols, const int nrows,
+  T *labels, unsigned int *rowids, const int n_sampled_rows,
+  DecisionTreeParams tree_params,
+  std::shared_ptr<TemporaryMemory<T, T>> in_tempmem) {
   this->base_fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows, 1,
-                 tree_params, false);
+                 tree_params, false, in_tempmem);
 }
 
 template <typename T>
diff --git a/cpp/src/decisiontree/decisiontree.h b/cpp/src/decisiontree/decisiontree.h
index 64d7ad54ac..6f4a9dd9fa 100644
--- a/cpp/src/decisiontree/decisiontree.h
+++ b/cpp/src/decisiontree/decisiontree.h
@@ -85,6 +85,10 @@ struct DecisionTreeParams {
   * Whether to bootstrap columns with or without replacement.
   */
   bool bootstrap_features = false;
+  /**
+   * Whether quantiles need to be computed for individual trees in the RF. Default: quantiles are computed just once per RF. Only affects the GLOBAL_QUANTILE algorithm.
+   **/
+  bool quantile_per_tree = false;
   /**
   * Node split criterion. GINI and Entropy for classification, MSE or MAE for regression.
   */
@@ -94,7 +98,7 @@ struct DecisionTreeParams {
   DecisionTreeParams();
   DecisionTreeParams(int cfg_max_depth, int cfg_max_leaves,
                      float cfg_max_features, int cfg_n_bins, int cfg_split_aglo,
                      int cfg_min_rows_per_node, bool cfg_bootstrap_features,
-                     CRITERION cfg_split_criterion);
+                     CRITERION cfg_split_criterion, bool cfg_quantile_per_tree);
   void validity_check() const;
   void print() const;
 };
@@ -133,7 +137,9 @@ class DecisionTreeBase {
              int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8,
              int split_algo_flag = SPLIT_ALGO::HIST,
              int cfg_min_rows_per_node = 2, bool cfg_bootstrap_features = false,
-             CRITERION cfg_split_criterion = CRITERION::CRITERION_END);
+             CRITERION cfg_split_criterion = CRITERION::CRITERION_END,
+             bool cfg_quantile_per_tree = false,
+             std::shared_ptr<TemporaryMemory<T, L>> in_tempmem = nullptr);
   void init_depth_zero(const L *labels, std::vector<unsigned int> &colselector,
                        const unsigned int *rowids, const int n_sampled_rows,
                        const std::shared_ptr<TemporaryMemory<T, L>> tempmem);
@@ -148,7 +154,8 @@ class DecisionTreeBase {
   void base_fit(const ML::cumlHandle &handle, T *data, const int ncols,
                 const int nrows, L *labels, unsigned int *rowids,
                 const int n_sampled_rows, int unique_labels,
-                DecisionTreeParams &tree_params, bool is_classifier);
+                DecisionTreeParams &tree_params, bool is_classifier,
+                std::shared_ptr<TemporaryMemory<T, L>> in_tempmem);
 
  public:
   // Printing utility for high level tree info.
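The new in_tempmem parameter exists so the forest can share one workspace across all trees. A rough caller-side sketch, assuming TemporaryMemory is templated on <data type, label type> as elsewhere in this series and that handle_impl, the n_* sizes, rowids, trees and tree_params are in scope (the concrete version appears in the randomforest.cu changes further below):

    // One workspace for the whole forest; quantiles preprocessed once.
    auto tempmem = std::make_shared<TemporaryMemory<float, int>>(
      handle_impl, n_sampled_rows, n_cols, /*n_streams=*/1, n_unique_labels,
      n_bins, split_algo);
    if (split_algo == ML::SPLIT_ALGO::GLOBAL_QUANTILE)
      preprocess_quantile(input, /*rowids=*/nullptr, n_sampled_rows, n_cols,
                          /*rowoffset=*/n_rows, n_bins, tempmem);
    for (int i = 0; i < n_trees; i++) {
      trees[i].fit(handle, input, n_cols, n_rows, labels, rowids,
                   n_sampled_rows, n_unique_labels, tree_params, tempmem);
    }
    // A standalone tree passes nullptr instead; plant() then allocates its
    // own TemporaryMemory and forces quantile_per_tree = true.
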
@@ -176,7 +183,8 @@ class DecisionTreeClassifier : public DecisionTreeBase<T, int> {
   void fit(const ML::cumlHandle &handle, T *data, const int ncols,
            const int nrows, int *labels, unsigned int *rowids,
            const int n_sampled_rows, const int unique_labels,
-           DecisionTreeParams tree_params);
+           DecisionTreeParams tree_params,
+           std::shared_ptr<TemporaryMemory<T, int>> in_tempmem = nullptr);
 
  private:
   /* depth is used to distinguish between root and other tree nodes for computations */
@@ -191,7 +199,8 @@ class DecisionTreeRegressor : public DecisionTreeBase<T, T> {
  public:
   void fit(const ML::cumlHandle &handle, T *data, const int ncols,
            const int nrows, T *labels, unsigned int *rowids,
-           const int n_sampled_rows, DecisionTreeParams tree_params);
+           const int n_sampled_rows, DecisionTreeParams tree_params,
+           std::shared_ptr<TemporaryMemory<T, T>> in_tempmem = nullptr);
 
  private:
   /* depth is used to distinguish between root and other tree nodes for computations */
diff --git a/cpp/src/decisiontree/kernels/col_condenser.cuh b/cpp/src/decisiontree/kernels/col_condenser.cuh
index 79de6061af..891d28617f 100644
--- a/cpp/src/decisiontree/kernels/col_condenser.cuh
+++ b/cpp/src/decisiontree/kernels/col_condenser.cuh
@@ -58,7 +58,12 @@ __global__ void allcolsampler_kernel(const T* __restrict__ data,
       myrowstart = newcolid * rowoffset;
     }
 
-    int index = rowids[i % nrows] + myrowstart;
+    int index;
+    if (rowids != nullptr) {
+      index = rowids[i % nrows] + myrowstart;
+    } else {
+      index = i % nrows + myrowstart;
+    }
     sampledcols[i] = data[index];
   }
   return;
diff --git a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
index eadde4f329..fc771306f7 100644
--- a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
+++ b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
@@ -282,7 +282,7 @@ void best_split_all_cols_classifier(
   L* labelptr = tempmem->sampledlabels->data();
   get_sampled_labels(labels, labelptr, rowids, nrows, tempmem->stream);
 
-  int batch_ncols = 1;
+  int batch_ncols;
   size_t shmem_needed = n_hist_bytes;
   if (split_algo == ML::SPLIT_ALGO::HIST) {
     shmem_needed += col_minmax_bytes;
diff --git a/cpp/src/decisiontree/kernels/quantile.cuh b/cpp/src/decisiontree/kernels/quantile.cuh
index 0cfefdcc34..649090bceb 100644
--- a/cpp/src/decisiontree/kernels/quantile.cuh
+++ b/cpp/src/decisiontree/kernels/quantile.cuh
@@ -17,6 +17,7 @@
 #pragma once
 #include "col_condenser.cuh"
 #include "cub/cub.cuh"
+#include "quantile.h"
 
 __global__ void set_sorting_offset(const int nrows, const int ncols,
                                    int *offsets) {
diff --git a/cpp/src/decisiontree/kernels/quantile.h b/cpp/src/decisiontree/kernels/quantile.h
new file mode 100644
index 0000000000..61cac17b13
--- /dev/null
+++ b/cpp/src/decisiontree/kernels/quantile.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include "../memory.h"
+template <typename T, typename L>
+void preprocess_quantile(const T *data, const unsigned int *rowids,
+                         const int n_sampled_rows, const int ncols,
+                         const int rowoffset, const int nbins,
+                         std::shared_ptr<TemporaryMemory<T, L>> tempmem);
diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu
index a912732927..47a5ed22ac 100644
--- a/cpp/src/randomforest/randomforest.cu
+++ b/cpp/src/randomforest/randomforest.cu
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "../decisiontree/kernels/quantile.h"
+#include "../decisiontree/memory.h"
 #include "random/permute.h"
 #include "random/rng.h"
 #include "randomforest.h"
@@ -260,6 +262,9 @@ void rf<T, L>::prepare_fit_per_tree(const ML::cumlHandle_impl& handle,
             1000);  // Ensure the seed for each tree is different and meaningful.
   r.uniformInt(selected_rows, n_sampled_rows, (unsigned int)0,
                (unsigned int)n_rows, stream);
+  //thrust::sequence(thrust::cuda::par.on(stream), sorted_selected_rows,
+  //                 sorted_selected_rows + n_sampled_rows);
+
   CUDA_CHECK(cub::DeviceRadixSort::SortKeys(
     (void*)rows_temp_storage, temp_storage_bytes, selected_rows,
     sorted_selected_rows, n_sampled_rows, 0, 8 * sizeof(unsigned int),
@@ -380,7 +385,15 @@ void rfClassifier<T>::fit(const cumlHandle& user_handle, T* input, int n_rows,
   // Allocate temporary storage
   rows_temp_storage = new MLCommon::device_buffer<char>(
     handle.getDeviceAllocator(), stream, temp_storage_bytes);
-
+  std::shared_ptr<TemporaryMemory<T, int>> tempmem =
+    std::make_shared<TemporaryMemory<T, int>>(
+      user_handle.getImpl(), n_sampled_rows, n_cols, 1, n_unique_labels,
+      this->rf_params.tree_params.n_bins,
+      this->rf_params.tree_params.split_algo);
+  if (this->rf_params.tree_params.split_algo == SPLIT_ALGO::GLOBAL_QUANTILE) {
+    preprocess_quantile(input, nullptr, n_sampled_rows, n_cols, n_rows,
+                        this->rf_params.tree_params.n_bins, tempmem);
+  }
   for (int i = 0; i < this->rf_params.n_trees; i++) {
     this->prepare_fit_per_tree(handle, i, n_rows, n_sampled_rows,
                                selected_rows.data(),
@@ -396,13 +409,14 @@ void rfClassifier<T>::fit(const cumlHandle& user_handle, T* input, int n_rows,
 
     trees[i].fit(user_handle, input, n_cols, n_rows, labels,
                  sorted_selected_rows.data(), n_sampled_rows, n_unique_labels,
-                 this->rf_params.tree_params);
+                 this->rf_params.tree_params, tempmem);
   }
 
   //Cleanup
   rows_temp_storage->release(stream);
   selected_rows.release(stream);
   sorted_selected_rows.release(stream);
+  tempmem.reset();
   delete rows_temp_storage;
 }
 
@@ -569,7 +583,16 @@ void rfRegressor<T>::fit(const cumlHandle& user_handle, T* input, int n_rows,
   // Allocate temporary storage
   rows_temp_storage = new MLCommon::device_buffer<char>(
     handle.getDeviceAllocator(), stream, temp_storage_bytes);
-
+  std::shared_ptr<TemporaryMemory<T, T>> tempmem =
+    std::make_shared<TemporaryMemory<T, T>>(
+      user_handle.getImpl(), n_sampled_rows, n_cols, 1, 1,
+      this->rf_params.tree_params.n_bins,
+      this->rf_params.tree_params.split_algo);
+
+  if (this->rf_params.tree_params.split_algo == SPLIT_ALGO::GLOBAL_QUANTILE) {
+    preprocess_quantile(input, nullptr, n_sampled_rows, n_cols, n_rows,
+                        this->rf_params.tree_params.n_bins, tempmem);
+  }
   for (int i = 0; i < this->rf_params.n_trees; i++) {
     this->prepare_fit_per_tree(handle, i, n_rows, n_sampled_rows,
                                selected_rows.data(),
@@ -585,12 +608,13 @@ void rfRegressor<T>::fit(const cumlHandle& user_handle, T* input, int n_rows,
 
     trees[i].fit(user_handle, input, n_cols, n_rows, labels,
                  sorted_selected_rows.data(), n_sampled_rows,
-                 this->rf_params.tree_params);
+                 this->rf_params.tree_params, tempmem);
   }
 
   //Cleanup
   rows_temp_storage->release(stream);
   selected_rows.release(stream);
   sorted_selected_rows.release(stream);
+  tempmem.reset();
   delete rows_temp_storage;
 }
 
@@ -810,10 +834,11 @@ RF_metrics cross_validate(const cumlHandle& user_handle,
 RF_params set_rf_class_obj(int max_depth, int max_leaves, float max_features,
                            int n_bins, int split_algo, int min_rows_per_node,
                            bool bootstrap_features, bool bootstrap, int n_trees,
-                           int rows_sample, CRITERION split_criterion) {
+                           int rows_sample, CRITERION split_criterion,
+                           bool quantile_per_tree) {
   DecisionTree::DecisionTreeParams tree_params(
     max_depth, max_leaves, max_features, n_bins, split_algo, min_rows_per_node,
-    bootstrap_features, split_criterion);
+    bootstrap_features, split_criterion, quantile_per_tree);
   RF_params rf_params(bootstrap, bootstrap_features, n_trees, rows_sample,
                       tree_params);
   return rf_params;
diff --git a/cpp/src/randomforest/randomforest.h b/cpp/src/randomforest/randomforest.h
index fde6983a05..872515c0f7 100644
--- a/cpp/src/randomforest/randomforest.h
+++ b/cpp/src/randomforest/randomforest.h
@@ -179,7 +179,8 @@ RF_metrics cross_validate(const cumlHandle& user_handle,
 RF_params set_rf_class_obj(int max_depth, int max_leaves, float max_features,
                            int n_bins, int split_algo, int min_rows_per_node,
                            bool bootstrap_features, bool bootstrap, int n_trees,
-                           int rows_sample, CRITERION split_criterion);
+                           int rows_sample, CRITERION split_criterion,
+                           bool quantile_per_tree);
 
 // ----------------------------- Regression ----------------------------------- //
 
diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu
index 3edc404290..9d6c917e07 100644
--- a/cpp/test/sg/rf_test.cu
+++ b/cpp/test/sg/rf_test.cu
@@ -56,7 +56,7 @@ class RfClassifierTest : public ::testing::TestWithParam<RfInputs<T>> {
     DecisionTree::DecisionTreeParams tree_params(
       params.max_depth, params.max_leaves, params.max_features, params.n_bins,
       params.split_algo, params.min_rows_per_node, params.bootstrap_features,
-      CRITERION::GINI);
+      CRITERION::GINI, false);
     RF_params rf_params(params.bootstrap, params.bootstrap_features,
                         params.n_trees, params.rows_sample, tree_params);
     //rf_params.print();
@@ -153,7 +153,7 @@ class RfRegressorTest : public ::testing::TestWithParam<RfInputs<T>> {
     DecisionTree::DecisionTreeParams tree_params(
       params.max_depth, params.max_leaves, params.max_features, params.n_bins,
       params.split_algo, params.min_rows_per_node, params.bootstrap_features,
-      params.split_criterion);
+      params.split_criterion, false);
     RF_params rf_params(params.bootstrap, params.bootstrap_features,
                         params.n_trees, params.rows_sample, tree_params);
     //rf_params.print();
 
From 95eb5f1e306d0ecc01ce8cd9b13bbd3b83243e11 Mon Sep 17 00:00:00 2001
From: Vishal Mehta
Date: Wed, 26 Jun 2019 15:18:37 +0200
Subject: [PATCH 48/51] critical fix: entropy functor cannot have log(0)

---
 cpp/src/decisiontree/kernels/metric.cuh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/src/decisiontree/kernels/metric.cuh b/cpp/src/decisiontree/kernels/metric.cuh
index 50d9a52cee..054b580934 100644
--- a/cpp/src/decisiontree/kernels/metric.cuh
+++ b/cpp/src/decisiontree/kernels/metric.cuh
@@ -55,8 +55,10 @@ float GiniFunctor::exec(std::vector<int> &hist, int nrows) {
 float EntropyFunctor::exec(std::vector<int> &hist, int nrows) {
   float eval = 0.0;
   for (int i = 0; i < hist.size(); i++) {
-    float prob = ((float)hist[i]) / nrows;
-    eval += prob * logf(prob);
+    if (hist[i] != 0) {
+      float prob = ((float)hist[i]) / nrows;
+      eval += prob * logf(prob);
+    }
   }
   return (-1 * eval);
 }
 
From db787efabe7ad0080ef69689ee6d521269aec54d Mon Sep 17 00:00:00 2001
From: Myrto Papadopoulou
Date: Fri,
From db787efabe7ad0080ef69689ee6d521269aec54d Mon Sep 17 00:00:00 2001
From: Myrto Papadopoulou
Date: Fri, 28 Jun 2019 01:36:18 -0700
Subject: [PATCH 49/51] Python style fix.

---
 python/cuml/test/test_random_forest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cuml/test/test_random_forest.py b/python/cuml/test/test_random_forest.py
index 53ff219ec4..53f3005270 100644
--- a/python/cuml/test/test_random_forest.py
+++ b/python/cuml/test/test_random_forest.py
@@ -63,7 +63,7 @@ def test_rf_predict_numpy(datatype, use_handle, split_algo,
                                    n_bins=8, split_algo=0, split_criterion=0,
                                    min_rows_per_node=2,
                                    n_estimators=40, handle=handle, max_leaves=-1,
-                                   max_depth=-1) #TODO FIXME confirm max_depth and n_estimators
+                                   max_depth=-1)
     cuml_model.fit(X_train, y_train)
     cu_predict = cuml_model.predict(X_test)
     cu_acc = accuracy_score(y_test, cu_predict)

From ffb80e2d2037d4677df77722f4ddaf8dbfbcc4f8 Mon Sep 17 00:00:00 2001
From: Vishal Mehta
Date: Mon, 1 Jul 2019 16:52:23 +0200
Subject: [PATCH 50/51] using proper split criterion in rf test, fixing max
 value range for entropy, added missing rf quantile check

---
 .../kernels/evaluate_classifier.cuh       |  5 +--
 cpp/src/decisiontree/kernels/metric.cuh   |  6 ++++
 cpp/src/decisiontree/kernels/metric_def.h |  2 ++
 cpp/src/randomforest/randomforest.cu      | 33 +++++++++----------
 cpp/test/sg/rf_test.cu                    | 14 ++++----
 5 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
index fc771306f7..5d63c094e2 100644
--- a/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
+++ b/cpp/src/decisiontree/kernels/evaluate_classifier.cuh
@@ -190,9 +190,10 @@ void find_best_split_classifier(
       float tmp_gini_left = F::exec(tmp_histleft, tmp_lnrows);
       float tmp_gini_right = F::exec(tmp_histright, tmp_rnrows);

-      ASSERT((tmp_gini_left >= 0.0f) && (tmp_gini_left <= 1.0f),
+      float max_value = F::max_val(n_unique_labels);
+      ASSERT((tmp_gini_left >= 0.0f) && (tmp_gini_left <= max_value),
              "gini left value %f not in [0.0, 1.0]", tmp_gini_left);
-      ASSERT((tmp_gini_right >= 0.0f) && (tmp_gini_right <= 1.0f),
+      ASSERT((tmp_gini_right >= 0.0f) && (tmp_gini_right <= max_value),
              "gini right value %f not in [0.0, 1.0]", tmp_gini_right);

       float impurity = (tmp_lnrows * 1.0f / nrows) * tmp_gini_left +
diff --git a/cpp/src/decisiontree/kernels/metric.cuh b/cpp/src/decisiontree/kernels/metric.cuh
index 054b580934..33c6266141 100644
--- a/cpp/src/decisiontree/kernels/metric.cuh
+++ b/cpp/src/decisiontree/kernels/metric.cuh
@@ -43,6 +43,12 @@ __device__ __forceinline__ T AbsFunctor::exec(T x) {
   return MLCommon::myAbs(x);
 }

+float GiniFunctor::max_val(int nclass) { return 1.0; }
+
+float EntropyFunctor::max_val(int nclass) {
+  float prob = 1.0 / nclass;
+  return (-1.0 * nclass * prob * logf(prob));
+}
 float GiniFunctor::exec(std::vector<int>& hist, int nrows) {
   float gval = 1.0;
   for (int i = 0; i < hist.size(); i++) {
diff --git a/cpp/src/decisiontree/kernels/metric_def.h b/cpp/src/decisiontree/kernels/metric_def.h
index ffc7102d38..68a8d379b7 100644
--- a/cpp/src/decisiontree/kernels/metric_def.h
+++ b/cpp/src/decisiontree/kernels/metric_def.h
@@ -67,8 +67,10 @@ struct AbsFunctor {
 struct GiniFunctor {
   static float exec(std::vector<int>& hist, int nrows);
+  static float max_val(int nclass);
 };

 struct EntropyFunctor {
   static float exec(std::vector<int>& hist, int nrows);
+  static float max_val(int nclass);
 };
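The max_val addition exists because the two criteria have different ranges (sketch below; entropy_max_val is a hypothetical stand-in for the EntropyFunctor::max_val added above). Gini impurity, 1 - sum(p_i^2), always stays below 1.0, but entropy, -sum(p_i * log(p_i)), peaks at the uniform distribution with value log(nclass); that exceeds 1.0 for any nclass >= 3, so a fixed 1.0f upper bound rejects perfectly valid multi-class entropy values:

#include <cmath>
#include <cstdio>

// Largest attainable entropy: the uniform distribution over nclass classes.
float entropy_max_val(int nclass) {
  float prob = 1.0f / nclass;
  return -1.0f * nclass * prob * logf(prob);  // algebraically logf(nclass)
}

int main() {
  for (int nclass : {2, 3, 4, 8}) {
    printf("nclass=%d  max entropy=%f\n", nclass, entropy_max_val(nclass));
  }
  // nclass=4 prints 1.386294, above the old hard-coded 1.0f ASSERT bound.
  return 0;
}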
diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu
index 0334eaf5db..595369ba8f 100644
--- a/cpp/src/randomforest/randomforest.cu
+++ b/cpp/src/randomforest/randomforest.cu
@@ -390,7 +390,8 @@ void rfClassifier<T>::fit(const cumlHandle& user_handle, T* input, int n_rows,
     user_handle.getImpl(), n_sampled_rows, n_cols, 1, n_unique_labels,
     this->rf_params.tree_params.n_bins, this->rf_params.tree_params.split_algo);

-  if ((this->rf_params.tree_params.split_algo == SPLIT_ALGO::GLOBAL_QUANTILE) && !(this->rf_params.tree_params.quantile_per_tree)) {
+  if ((this->rf_params.tree_params.split_algo == SPLIT_ALGO::GLOBAL_QUANTILE) &&
+      !(this->rf_params.tree_params.quantile_per_tree)) {
     preprocess_quantile(input, nullptr, n_sampled_rows, n_cols, n_rows,
                         this->rf_params.tree_params.n_bins, tempmem);
   }
@@ -587,7 +588,8 @@ void rfRegressor<T>::fit(const cumlHandle& user_handle, T* input, int n_rows,
     this->rf_params.tree_params.n_bins,
     this->rf_params.tree_params.split_algo);

-  if (this->rf_params.tree_params.split_algo == SPLIT_ALGO::GLOBAL_QUANTILE) {
+  if ((this->rf_params.tree_params.split_algo == SPLIT_ALGO::GLOBAL_QUANTILE) &&
+      !(this->rf_params.tree_params.quantile_per_tree)) {
     preprocess_quantile(input, nullptr, n_sampled_rows, n_cols, n_rows,
                         this->rf_params.tree_params.n_bins, tempmem);
   }
@@ -684,9 +686,8 @@ void rfRegressor<T>::predict(const cumlHandle& user_handle, const T* input,
  * @param[in] verbose: flag for debugging purposes.
  */
 template <typename T>
-RF_metrics rfRegressor<T>::score(const cumlHandle& user_handle,
-                                 const T* input, const T* ref_labels,
-                                 int n_rows, int n_cols,
+RF_metrics rfRegressor<T>::score(const cumlHandle& user_handle, const T* input,
+                                 const T* ref_labels, int n_rows, int n_cols,
                                  T* predictions, bool verbose) const {
   predict(user_handle, input, n_rows, n_cols, predictions, verbose);
@@ -917,12 +918,11 @@ void predict(const cumlHandle& user_handle,
  * @param[in] verbose: flag for debugging purposes.
  */
 RF_metrics score(const cumlHandle& user_handle,
-                 const rfRegressor<float>* rf_regressor,
-                 const float* input, const float* ref_labels,
-                 int n_rows, int n_cols, float* predictions,
-                 bool verbose) {
-  return rf_regressor->score(user_handle, input, ref_labels, n_rows,
-                             n_cols, predictions, verbose);
+                 const rfRegressor<float>* rf_regressor, const float* input,
+                 const float* ref_labels, int n_rows, int n_cols,
+                 float* predictions, bool verbose) {
+  return rf_regressor->score(user_handle, input, ref_labels, n_rows, n_cols,
+                             predictions, verbose);
 }
@@ -937,12 +937,11 @@ RF_metrics score(const cumlHandle& user_handle,
  * @param[in] verbose: flag for debugging purposes.
  */
 RF_metrics score(const cumlHandle& user_handle,
-                 const rfRegressor<double>* rf_regressor,
-                 const double* input, const double* ref_labels,
-                 int n_rows, int n_cols, double* predictions,
-                 bool verbose) {
-  return rf_regressor->score(user_handle, input, ref_labels, n_rows,
-                             n_cols, predictions, verbose);
+                 const rfRegressor<double>* rf_regressor, const double* input,
+                 const double* ref_labels, int n_rows, int n_cols,
+                 double* predictions, bool verbose) {
+  return rf_regressor->score(user_handle, input, ref_labels, n_rows, n_cols,
+                             predictions, verbose);
 }

 };  // namespace ML
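The two fit() hunks above make the classifier and regressor agree on when forest-level quantiles are precomputed. A condensed sketch of that gating (hypothetical types and enum values, abbreviated from the patch's rf_params.tree_params fields): preprocess_quantile runs once per forest only when the global-quantile split algorithm is selected and per-tree quantile computation is turned off; with quantile_per_tree set, quantile computation is left to the individual trees instead. Before this patch, the regressor path ignored the flag and always precomputed:

// Condensed, hypothetical form of the condition now shared by
// rfClassifier<T>::fit and rfRegressor<T>::fit.
enum class SplitAlgo { HIST, GLOBAL_QUANTILE };

struct TreeParams {
  SplitAlgo split_algo;
  bool quantile_per_tree;  // if true, quantiles are recomputed inside each tree
};

// True when quantiles should be precomputed once for the whole forest.
bool needs_forest_level_quantiles(const TreeParams& p) {
  return (p.split_algo == SplitAlgo::GLOBAL_QUANTILE) && !p.quantile_per_tree;
}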
diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu
index 2dad3e07f2..e00bda61de 100644
--- a/cpp/test/sg/rf_test.cu
+++ b/cpp/test/sg/rf_test.cu
@@ -56,7 +56,7 @@ class RfClassifierTest : public ::testing::TestWithParam<RfInputs<T>> {
     DecisionTree::DecisionTreeParams tree_params(
       params.max_depth, params.max_leaves, params.max_features, params.n_bins,
       params.split_algo, params.min_rows_per_node, params.bootstrap_features,
-      CRITERION::GINI, false);
+      params.split_criterion, false);
     RF_params rf_params(params.bootstrap, params.bootstrap_features,
                         params.n_trees, params.rows_sample, tree_params);
     //rf_params.print();
@@ -102,9 +102,9 @@ class RfClassifierTest : public ::testing::TestWithParam<RfInputs<T>> {
     updateDevice(inference_data_d, inference_data_h.data(), data_len, stream);

     // Predict and compare against known labels
-    RF_metrics tmp = score(handle, rf_classifier, inference_data_d,
-                           labels, params.n_inference_rows,
-                           params.n_cols, predicted_labels, false);
+    RF_metrics tmp =
+      score(handle, rf_classifier, inference_data_d, labels,
+            params.n_inference_rows, params.n_cols, predicted_labels, false);
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaStreamDestroy(stream));
@@ -196,9 +196,9 @@ class RfRegressorTest : public ::testing::TestWithParam<RfInputs<T>> {
     updateDevice(inference_data_d, inference_data_h.data(), data_len, stream);

     // Predict and compare against known labels
-    RF_metrics tmp = score(handle, rf_regressor, inference_data_d,
-                           labels, params.n_inference_rows,
-                           params.n_cols, predicted_labels, false);
+    RF_metrics tmp =
+      score(handle, rf_regressor, inference_data_d, labels,
+            params.n_inference_rows, params.n_cols, predicted_labels, false);
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaStreamDestroy(stream));

From c6c5a5ecc7dd00ce06f1858e1a43d50b0d225fbb Mon Sep 17 00:00:00 2001
From: Vishal Mehta
Date: Mon, 1 Jul 2019 17:47:19 +0200
Subject: [PATCH 51/51] changing ChangeLog entry to branch-0.9

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 78a107b9dd..3cbd400320 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,7 @@

 ## New Features

 - PR #766: Expose score method based on inertia for KMeans
+- PR #635: Random Forest & Decision Tree Regression (Single-GPU)

 ## Improvements
@@ -19,7 +20,6 @@
 - PR #636: Rand Index metric ml-prim
 - PR #515: Added Random Projection feature
 - PR #504: Contingency matrix ml-prim
-- PR #635: Random Forest & Decision Tree Regression (Single-GPU)
 - PR #644: Add train_test_split utility for cuDF dataframes
 - PR #612: Allow Cuda Array Interface, Numba inputs and input code refactor
 - PR #641: C: Separate C-wrapper library build to generate libcuml.so