[Review] Random Forest & Decision Tree Regression + major updates to Classification #635

Merged Jul 1, 2019 · 58 commits (changes shown from 46 commits)

Commits
e333239
Added rfRegressor class to random forest.
myrtw May 7, 2019
af36570
Added base dt class and DecisionTreeRegressor
myrtw May 7, 2019
545582d
More class updates.
myrtw May 8, 2019
8c5dd25
TreeNode updates & more dt class changes.
myrtw May 8, 2019
b9c5c6f
added regression kernels, modified naming convention to metric quest…
vishalmehta1991 May 8, 2019
04da29a
More decision tree changes
myrtw May 8, 2019
14fb0e0
added kernels for mean squared error
vishalmehta1991 May 9, 2019
feaac6d
added all regression code / kernels, now compiles, next step is testing
vishalmehta1991 May 9, 2019
09eb479
Code in flux. Regression related changes.
myrtw May 9, 2019
1b2b8d3
fixed right mse, it needs to be computed in kernel
vishalmehta1991 May 10, 2019
a6457fc
Added support for MSE or MAE split criterion.
myrtw May 13, 2019
d8176b7
Fixed split_criterion config in rf_test.
myrtw May 13, 2019
229bd03
relocating functors, adding inline to device functors, adding entropy…
vishalmehta1991 May 13, 2019
f2a8336
Removed useless mem alloc+added tmp testing script
myrtw May 13, 2019
76d2e7e
added iota and permute on GPU using thrust and ml-prims
vishalmehta1991 May 14, 2019
9a5f32c
Preprocess quantiles in batches.
myrtw May 14, 2019
b6ec0fb
merged new cuml dir structure
vishalmehta1991 May 15, 2019
b472bbd
Swapped cudaMemcpy w/ updateDevice/Host/Async.
myrtw May 15, 2019
d6027a9
Made rowids and colids unsigned int.
myrtw May 16, 2019
29ac9ee
now using minmax primitive with column sampler
vishalmehta1991 May 16, 2019
8c5ed32
deleted col_minmax kernel now using ml-prims
vishalmehta1991 May 16, 2019
280bce6
adding missing stream in cub
vishalmehta1991 May 17, 2019
8648363
Reordered call to find_best_fruit_all function.
myrtw May 21, 2019
d48e9c2
Fixed nbins bug for GLOBAL_QUANTILE.
myrtw May 27, 2019
21d30e9
Changelog update.
myrtw May 27, 2019
6d374b3
removing unused function parameters and some comments
vishalmehta1991 May 28, 2019
507601d
adding label sampler in tree build
vishalmehta1991 May 29, 2019
76a5d3b
name change for gini/mse metric files
vishalmehta1991 May 29, 2019
c5abd26
Renamed dt class; sorted rf rowIDs; del MemGetInfo
myrtw May 30, 2019
35f1b59
Moved plant, grow_tree methods to DecisionTreeBase class.
myrtw May 30, 2019
f8cb923
added depth zero helper function
vishalmehta1991 Jun 5, 2019
8417d37
Copied RF predictions on device. Added metrics too.
myrtw Jun 6, 2019
895ccf6
moving allocations outside the loop
vishalmehta1991 Jun 6, 2019
5250be8
Added helper for tree fit.
myrtw Jun 6, 2019
e67b1f1
Merge remote-tracking branch 'origin/branch-0.8' into fea-ext-randomf…
vishalmehta1991 Jun 7, 2019
1dd63f4
Made RF's data input for predictions a GPU ptr.
myrtw Jun 7, 2019
babb624
Added unit-tests for Accuracy score.
myrtw Jun 12, 2019
5f4a5fd
added support for when large number of features histograms do not fin…
vishalmehta1991 Jun 13, 2019
ae744ad
fixed missing plus sign
vishalmehta1991 Jun 13, 2019
fae84fe
Added unit-tests for regression metrics
myrtw Jun 13, 2019
dd7fa2d
Merge branch 'fea-ext-randomforest_regression' of github.com:vishalme…
vishalmehta1991 Jun 13, 2019
6f604cc
blocks max limit for classifier
vishalmehta1991 Jun 13, 2019
18ef5b2
Added support for wider datasets for minmax prim.
myrtw Jun 14, 2019
4ab84a2
loop around regressor kernels for large number of features
vishalmehta1991 Jun 14, 2019
83d4dfb
Merge branch 'fea-ext-randomforest_regression' of github.com:vishalme…
vishalmehta1991 Jun 14, 2019
f79f8fa
Minor kernel fix + helper function.
myrtw Jun 14, 2019
e677e55
Merge branch 'branch-0.8' of github.com:rapidsai/cuml into fea-ext-ra…
myrtw Jun 18, 2019
8d326b3
Python related updates to rf/dt, randomforest.pyx
myrtw Jun 18, 2019
b588047
WIP python wrapper changes.
myrtw Jun 18, 2019
1181890
Fixed Python wrapper
myrtw Jun 19, 2019
5907085
Merge branch 'branch-0.8' of github.com:rapidsai/cuml into fea-ext-ra…
myrtw Jun 19, 2019
51017ac
adding host quantile data structure and removing host mem copies for n…
vishalmehta1991 Jun 24, 2019
10473d3
quantile fix; once per RF; quantile per tree flag; default false; tem…
vishalmehta1991 Jun 26, 2019
95eb5f1
critical fix: entropy functor cannot have log(0)
vishalmehta1991 Jun 26, 2019
9ad7e0f
Merge branch 'branch-0.9' of github.com:rapidsai/cuml into fea-ext-ra…
myrtw Jun 27, 2019
db787ef
Python style fix.
myrtw Jun 28, 2019
ffb80e2
using proper split criterion in rf test, fixing max value range for e…
vishalmehta1991 Jul 1, 2019
c6c5a5e
changing ChangeLog entry to branch-0.9
vishalmehta1991 Jul 1, 2019
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -3,6 +3,7 @@
## New Features
- PR #515: Added Random Projection feature
- PR #504: Contingency matrix ml-prim
- PR #635: Random Forest & Decision Tree Regression (Single-GPU)

## Improvements

5 changes: 5 additions & 0 deletions cpp/src/decisiontree/algo_helper.h
@@ -21,4 +21,9 @@ namespace ML {
enum SPLIT_ALGO {
HIST, GLOBAL_QUANTILE, SPLIT_ALGO_END,
};

enum CRITERION {
GINI, ENTROPY, MSE, MAE, CRITERION_END,
};

};
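
The new CRITERION values are consumed through the split_criterion field added to DecisionTreeParams in the decisiontree.h diff below. A minimal sketch of selecting a criterion, assuming those definitions (include paths are assumptions):

#include "decisiontree/algo_helper.h"   // include path assumed
#include "decisiontree/decisiontree.h"  // include path assumed

// GINI and ENTROPY apply to classification, MSE and MAE to regression
// (per the split_criterion comment in the header diff below);
// split_criterion defaults to CRITERION_END in DecisionTreeParams.
ML::DecisionTree::DecisionTreeParams make_regression_params() {
  ML::DecisionTree::DecisionTreeParams params;   // library defaults
  params.split_criterion = ML::CRITERION::MSE;   // or ML::CRITERION::MAE
  return params;
}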
509 changes: 355 additions & 154 deletions cpp/src/decisiontree/decisiontree.cu

Large diffs are not rendered by default.

146 changes: 95 additions & 51 deletions cpp/src/decisiontree/decisiontree.h
@@ -16,8 +16,7 @@

#pragma once
#include "algo_helper.h"
#include "kernels/gini_def.h"
#include "memory.cuh"
#include "kernels/metric_def.h"
#include <common/Timer.h>
#include <vector>
#include <algorithm>
@@ -27,22 +26,25 @@
#include <common/cumlHandle.hpp>

namespace ML {

bool is_dev_ptr(const void *p);

namespace DecisionTree {

template<class T>
struct Question {
int column;
T value;
void update(const GiniQuestion<T> & ques);
void update(const MetricQuestion<T> & ques);
};

template<class T>
template<class T, class L>
struct TreeNode {
TreeNode *left = nullptr;
TreeNode *right = nullptr;
int class_predict;
L prediction;
Question<T> question;
T gini_val;
T split_metric_val;

void print(std::ostream& os) const;
};
@@ -81,75 +83,104 @@ struct DecisionTreeParams {
*/
int min_rows_per_node = 2;
/**
* Wheather to bootstarp columns with or without replacement
* Whether to bootstrap columns with or without replacement.
*/
bool bootstrap_features = false;

/**
* Node split criterion. GINI and Entropy for classification, MSE or MAE for regression.
*/
CRITERION split_criterion = CRITERION_END;

DecisionTreeParams();
DecisionTreeParams(int cfg_max_depth, int cfg_max_leaves, float cfg_max_features, int cfg_n_bins, int cfg_split_algo, int cfg_min_rows_per_node, bool cfg_bootstrap_features);
DecisionTreeParams(int cfg_max_depth, int cfg_max_leaves, float cfg_max_features, int cfg_n_bins, int cfg_split_algo, int cfg_min_rows_per_node, bool cfg_bootstrap_features, CRITERION cfg_split_criterion);
void validity_check() const;
void print() const;
};

template<class T>
class DecisionTreeClassifier {
template<class T, class L>
class DecisionTreeBase {
protected:
int split_algo;
TreeNode<T, L> *root = nullptr;
int nbins;
DataInfo dinfo;
int treedepth;
int depth_counter = 0;
int maxleaves;
int leaf_counter = 0;
std::vector<std::shared_ptr<TemporaryMemory<T, L>>> tempmem;
size_t total_temp_mem;
const int MAXSTREAMS = 1;
size_t max_shared_mem;
size_t shmem_used = 0;
int n_unique_labels = -1; // number of unique labels in dataset
double construct_time;
int min_rows_per_node;
bool bootstrap_features;
CRITERION split_criterion;
std::vector<unsigned int> feature_selector;

void print_node(const std::string& prefix, const TreeNode<T, L>* const node, bool isLeft) const;
void split_branch(T *data, MetricQuestion<T> & ques, const int n_sampled_rows, int& nrowsleft, int& nrowsright, unsigned int* rowids);

void plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, L *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels,
int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2,
bool cfg_bootstrap_features=false, CRITERION cfg_split_criterion=CRITERION::CRITERION_END);
void init_depth_zero(const L* labels, std::vector<unsigned int>& colselector, const unsigned int* rowids, const int n_sampled_rows, const std::shared_ptr<TemporaryMemory<T,L>> tempmem);
TreeNode<T, L> * grow_tree(T *data, const float colper, L *labels, int depth, unsigned int* rowids, const int n_sampled_rows, MetricInfo<T> prev_split_info);
virtual void find_best_fruit_all(T *data, L *labels, const float colper, MetricQuestion<T> & ques, float& gain, unsigned int* rowids,
const int n_sampled_rows, MetricInfo<T> split_info[3], int depth) = 0;
void base_fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, L *labels, unsigned int *rowids,
const int n_sampled_rows, int unique_labels, DecisionTreeParams & tree_params, bool is_classifier);

public:
// Printing utility for high level tree info.
void print_tree_summary() const;

// Printing utility for debug and looking at nodes and leaves.
void print() const;

// Predict labels for n_rows rows, with n_cols features each, for a given tree. rows in row-major format.
void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, L * predictions, bool verbose=false) const;
void predict_all(const T * rows, const int n_rows, const int n_cols, L * preds, bool verbose=false) const;
L predict_one(const T * row, const TreeNode<T, L> * const node, bool verbose=false) const;

}; // End DecisionTreeBase Class

private:
int split_algo;
TreeNode<T> *root = nullptr;
int nbins;
DataInfo dinfo;
int treedepth;
int depth_counter = 0;
int maxleaves;
int leaf_counter = 0;
std::vector<std::shared_ptr<TemporaryMemory<T>>> tempmem;
size_t total_temp_mem;
const int MAXSTREAMS = 1;
size_t max_shared_mem;
size_t shmem_used = 0;
int n_unique_labels = -1; // number of unique labels in dataset
double construct_time;
int min_rows_per_node;
bool bootstrap_features;
std::vector<int> feature_selector;

template<class T>
class DecisionTreeClassifier : public DecisionTreeBase<T, int> {
public:
// Expects column major T dataset, integer labels
// data, labels are both device ptr.
// Assumption: labels are all mapped to contiguous numbers starting from 0 during preprocessing. Needed for gini hist impl.
void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids,
const int n_sampled_rows, const int unique_labels, DecisionTreeParams tree_params);

/* Predict labels for n_rows rows, with n_cols features each, for a given tree. rows in row-major format. */
void predict(const ML::cumlHandle& handle, const T * rows, const int n_rows, const int n_cols, int* predictions, bool verbose=false) const;

// Printing utility for high level tree info.
void print_tree_summary() const;

// Printing utility for debug and looking at nodes and leaves.
void print() const;

private:
// Same as above fit, but planting is better for a tree then fitting.
void plant(const cumlHandle_impl& handle, T *data, const int ncols, const int nrows, int *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels,
int maxdepth = -1, int max_leaf_nodes = -1, const float colper = 1.0, int n_bins = 8, int split_algo_flag = SPLIT_ALGO::HIST, int cfg_min_rows_per_node=2, bool cfg_bootstrap_features=false);
/* depth is used to distinguish between root and other tree nodes for computations */
void find_best_fruit_all(T *data, int *labels, const float colper, MetricQuestion<T> & ques, float& gain, unsigned int* rowids,
const int n_sampled_rows, MetricInfo<T> split_info[3], int depth);
}; // End DecisionTreeClassifier Class

TreeNode<T> * grow_tree(T *data, const float colper, int *labels, int depth, unsigned int* rowids, const int n_sampled_rows, GiniInfo prev_split_info);
template<class T>
class DecisionTreeRegressor : public DecisionTreeBase<T, T> {
public:
void fit(const ML::cumlHandle& handle, T *data, const int ncols, const int nrows, T *labels, unsigned int *rowids,
const int n_sampled_rows, DecisionTreeParams tree_params);

private:
/* depth is used to distinguish between root and other tree nodes for computations */
void find_best_fruit_all(T *data, int *labels, const float colper, GiniQuestion<T> & ques, float& gain, unsigned int* rowids,
const int n_sampled_rows, GiniInfo split_info[3], int depth);
void split_branch(T *data, GiniQuestion<T> & ques, const int n_sampled_rows, int& nrowsleft, int& nrowsright, unsigned int* rowids);
void classify_all(const T * rows, const int n_rows, const int n_cols, int* preds, bool verbose=false) const;
int classify(const T * row, const TreeNode<T> * const node, bool verbose=false) const;
void print_node(const std::string& prefix, const TreeNode<T>* const node, bool isLeft) const;
}; // End DecisionTree Class
void find_best_fruit_all(T *data, T *labels, const float colper, MetricQuestion<T> & ques, float& gain, unsigned int* rowids,
const int n_sampled_rows, MetricInfo<T> split_info[3], int depth);
}; // End DecisionTreeRegressor Class

} //End namespace DecisionTree


// Stateless API functions

Member:
There is one very interesting issue that just arose while @Salonijain27 was writing cython classes for Random Forest: these are not stateless APIs! These functions seem to just be an alternative way of calling the DecisionTreeClassifier (or equivalent RF class) fit, but the object is still stateful! A stateless API means that the state is kept by the client (Python) and the C++ code does not keep the state; in this case the C++ object still keeps the state in spite of there being a fit function outside of it.

This not only complicates the cython work (not a deal breaker; it can be made to work) but also potentially complicates the pickling (model saving) functionality. For all the algorithms that are stateless (most of cuML) we get that functionality automatically, since the Python objects are the state of the algorithm. Wanted to start a quick conversation regarding this.

Member:
(Of course feel free to correct any of this if the analysis/conclusions above are wrong!)

Member:
Hmm.. that's interesting. IIUC, UMAP/kNN both expose such "stateful" classes, no? How are they being handled in cython world?

@cjnolet ^^^

Member:
Yes they do, and that is the reason UMAP and KNN are the two classes that cannot currently be pickled (this was raised only a few weeks ago when a user ran into that issue). There is even an open issue #415 to create a stateless version of kNN.

Member:
Also, this is not necessarily something that needs to change per se, though it could make things much easier. We can look into using the __getstate__ and __setstate__ Python functions to implement the pickling. I will be looking into this today and the first days of next week. But even then, the question of whether we want uniformity in the statelessness (or statefulness) of the C++ API still stands.

Member @cjnolet (May 31, 2019):
I noticed in the initial PR for the RF and DT algorithms that the stateless functions are being called by functions that maintain their own state and not the other way around. Unfortunately, as we were ramping up to release, I did not have the time to bring it up before it was merged. UMAP was implemented with a stateful convenience API built on top of a set of stateless functions and converting the cython to use those stateless functions will be trivial.

KNN was originally forced to be stateful because the FAISS Indexing API was designed in such a way that wrapping it with cython was non-trivial and the state was better off hidden from the cython layer altogether. Recent updates to their API have eliminated this problem and kNN will soon be reduced to a simple stateless kneighbors function on the C++ side. I'm also considering moving it to the prims.

I do think we should continue to provide stateful convenience classes in the C++ layer. I would recommend, as a solution here, to expose a set of flat stateless functions, have the caller maintain the state, and also expose a stateful convenience API that is based on those stateless functions, maintaining the state for the users of the C++ API.

Contributor (Author):
We plan to implement a newer version of the flat API for the 0.9 release. The current rfRegressor API is in line with the branch-0.8 rfClassifier.
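
For illustration, a minimal sketch of the split @cjnolet describes above: caller-owned state, a flat stateless function, and a thin stateful convenience wrapper built on it. All names here are hypothetical, not cuML's actual API.

#include <vector>

struct TreeModel {                  // hypothetical caller-owned state
  std::vector<int> nodes;
};

// Stateless: the caller (e.g. the Python layer) owns the model object,
// so pickling reduces to serializing TreeModel.
void fit_tree(TreeModel& model, const float* data, int nrows, int ncols) {
  model.nodes.assign(1, 0);         // placeholder for the real tree build
}

// Stateful convenience API, implemented on top of the stateless function.
class DecisionTreeHandle {
  TreeModel model_;
 public:
  void fit(const float* data, int nrows, int ncols) {
    fit_tree(model_, data, nrows, ncols);
  }
  const TreeModel& model() const { return model_; }
};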

// ----------------------------- Classification ----------------------------------- //

void fit(const ML::cumlHandle& handle, DecisionTree::DecisionTreeClassifier<float> * dt_classifier, float *data, const int ncols, const int nrows, int *labels,
unsigned int *rowids, const int n_sampled_rows, int unique_labels, DecisionTree::DecisionTreeParams tree_params);

@@ -161,4 +192,17 @@ void predict(const ML::cumlHandle& handle, const DecisionTree::DecisionTreeClass
void predict(const ML::cumlHandle& handle, const DecisionTree::DecisionTreeClassifier<double> * dt_classifier, const double * rows,
const int n_rows, const int n_cols, int* predictions, bool verbose=false);

// ----------------------------- Regression ----------------------------------- //

void fit(const ML::cumlHandle& handle, DecisionTree::DecisionTreeRegressor<float> * dt_regressor, float *data, const int ncols, const int nrows, float *labels,
unsigned int *rowids, const int n_sampled_rows, DecisionTree::DecisionTreeParams tree_params);

void fit(const ML::cumlHandle& handle, DecisionTree::DecisionTreeRegressor<double> * dt_regressor, double *data, const int ncols, const int nrows, double *labels,
unsigned int *rowids, const int n_sampled_rows, DecisionTree::DecisionTreeParams tree_params);

void predict(const ML::cumlHandle& handle, const DecisionTree::DecisionTreeRegressor<float> * dt_regressor, const float * rows,
const int n_rows, const int n_cols, float * predictions, bool verbose=false);
void predict(const ML::cumlHandle& handle, const DecisionTree::DecisionTreeRegressor<double> * dt_regressor, const double * rows,
const int n_rows, const int n_cols, double * predictions, bool verbose=false);

} //End namespace ML
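
A usage sketch of the regression entry points above, assuming a valid cumlHandle and device allocations for the column-major data, labels, row IDs, and output buffer (illustrative, not a tested example from this PR):

// All d_* pointers are assumed to be valid device allocations;
// d_data is column-major with ncols columns of nrows values each.
void fit_and_predict(const ML::cumlHandle& handle,
                     float* d_data, int ncols, int nrows,
                     float* d_labels, unsigned int* d_rowids,
                     int n_sampled_rows,
                     const float* d_test_rows, int n_test_rows,
                     float* d_predictions) {
  ML::DecisionTree::DecisionTreeRegressor<float> dt_regressor;
  ML::DecisionTree::DecisionTreeParams tree_params;  // library defaults
  tree_params.split_criterion = ML::CRITERION::MSE;  // or ML::CRITERION::MAE
  ML::fit(handle, &dt_regressor, d_data, ncols, nrows, d_labels,
          d_rowids, n_sampled_rows, tree_params);
  ML::predict(handle, &dt_regressor, d_test_rows, n_test_rows, ncols,
              d_predictions, /*verbose=*/false);
}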
38 changes: 38 additions & 0 deletions cpp/src/decisiontree/kernels/batch_cal.cuh
@@ -0,0 +1,38 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once
/* Return max. possible number of columns that can be processed within avail_shared_memory.
Expects that requested_shared_memory is a multiple of ncols. */
int get_batch_cols_cnt(const size_t avail_shared_memory, const size_t requested_shared_memory, const int ncols) {
int ncols_in_batch = ncols;
int ncols_factor = requested_shared_memory / ncols;
if (requested_shared_memory > avail_shared_memory) {
ncols_in_batch = avail_shared_memory / ncols_factor; // floor div.
}
return ncols_in_batch;
}


/* Update batch_ncols (max. possible number of columns that can be processed within avail_shared_memory),
blocks (for next kernel launch), and shmemsize (requested shared memory for next kernel launch).
Precondition: requested_shared_memory is a multiple of ncols. */
void update_kernel_config(const size_t avail_shared_memory, const size_t requested_shared_memory, const int ncols,
const int nrows, const int threads, int & batch_ncols, int & blocks, size_t & shmemsize) {
batch_ncols = get_batch_cols_cnt(avail_shared_memory, requested_shared_memory, ncols);
shmemsize = (requested_shared_memory / ncols) * batch_ncols; // requested_shared_memory is a multiple of ncols for all kernels
blocks = min(MLCommon::ceildiv(batch_ncols * nrows, threads), 65536);
}
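
A usage sketch of these helpers with hypothetical sizes; the per-column shared-memory cost of nbins floats is an assumption for illustration:

// Suppose a kernel needs nbins floats of shared memory per column.
void configure_launch_example() {
  const int ncols = 2000, nrows = 100000, threads = 128, nbins = 8;
  const size_t avail_shmem = 48 * 1024;  // e.g. 48 KB per block
  const size_t requested_shmem =
      static_cast<size_t>(ncols) * nbins * sizeof(float);  // multiple of ncols

  int batch_ncols, blocks;
  size_t shmemsize;
  update_kernel_config(avail_shmem, requested_shmem, ncols, nrows, threads,
                       batch_ncols, blocks, shmemsize);
  // Launch as kernel<<<blocks, threads, shmemsize>>>(...), looping over the
  // columns batch_ncols at a time until all ncols are processed.
}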
55 changes: 7 additions & 48 deletions cpp/src/decisiontree/kernels/col_condenser.cuh
@@ -28,71 +28,30 @@ __global__ void get_sampled_column_kernel(const T* __restrict__ column, T *outco
return;
}

void get_sampled_labels(const int *labels, int *outlabels, unsigned int* rowids, const int n_sampled_rows, const cudaStream_t stream) {
template<typename T>
void get_sampled_labels(const T *labels, T *outlabels, const unsigned int* rowids, const int n_sampled_rows, const cudaStream_t stream) {
int threads = 128;
get_sampled_column_kernel<int><<<MLCommon::ceildiv(n_sampled_rows, threads), threads, 0, stream>>>(labels, outlabels, rowids, n_sampled_rows);
get_sampled_column_kernel<T><<<MLCommon::ceildiv(n_sampled_rows, threads), threads, 0, stream>>>(labels, outlabels, rowids, n_sampled_rows);
CUDA_CHECK(cudaGetLastError());
return;
}

template<typename T>
__global__ void allcolsampler_kernel(const T* __restrict__ data, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nrows, const int ncols, const int rowoffset, T* sampledcols)
__global__ void allcolsampler_kernel(const T* __restrict__ data, const unsigned int* __restrict__ rowids, const unsigned int* __restrict__ colids, const int nrows, const int ncols, const int rowoffset, T* sampledcols)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;

for (unsigned int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) {
int newcolid = (int)(i / nrows);
int myrowstart;
if( colids != nullptr)
if (colids != nullptr) {
myrowstart = colids[ newcolid ] * rowoffset;
else
} else {
myrowstart = newcolid * rowoffset;
}

int index = rowids[ i % nrows] + myrowstart;
sampledcols[i] = data[index];
}
return;
}

template<typename T>
__global__ void allcolsampler_minmax_kernel(const T* __restrict__ data, const unsigned int* __restrict__ rowids, const int* __restrict__ colids, const int nrows, const int ncols, const int rowoffset, T* globalmin, T* globalmax, T* sampledcols, T init_min_val)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ char shmem[];
T *minshared = (T*)shmem;
T *maxshared = (T*)(shmem + sizeof(T) * ncols);

for (int i = threadIdx.x; i < ncols; i += blockDim.x) {
minshared[i] = init_min_val;
maxshared[i] = -init_min_val;
}

// Initialize min max in global memory
if (tid < ncols) {
globalmin[tid] = init_min_val;
globalmax[tid] = -init_min_val;
}

__syncthreads();

for (unsigned int i = tid; i < nrows*ncols; i += blockDim.x*gridDim.x) {
int newcolid = (int)(i / nrows);
int myrowstart = colids[ newcolid ] * rowoffset;
int index = rowids[ i % nrows] + myrowstart;
T coldata = data[index];

MLCommon::myAtomicMin(&minshared[newcolid], coldata);
MLCommon::myAtomicMax(&maxshared[newcolid], coldata);
sampledcols[i] = coldata;
}

__syncthreads();

for (int j = threadIdx.x; j < ncols; j+= blockDim.x) {
MLCommon::myAtomicMin(&globalmin[j], minshared[j]);
MLCommon::myAtomicMax(&globalmax[j], maxshared[j]);
}

return;
}
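
A hedged sketch of launching the revised kernel; the wrapper function and pointer setup are assumptions, not code from this PR:

// Gather ncols sampled columns for n_sampled_rows rows into a contiguous
// column-major buffer; all pointers are assumed to be device allocations.
template <typename T>
void sample_all_columns(const T* d_data, const unsigned int* d_rowids,
                        const unsigned int* d_colids, const int n_sampled_rows,
                        const int ncols, const int rowoffset, T* d_sampledcols,
                        const cudaStream_t stream) {
  const int threads = 128;
  const int blocks = MLCommon::ceildiv(n_sampled_rows * ncols, threads);
  allcolsampler_kernel<T><<<blocks, threads, 0, stream>>>(
      d_data, d_rowids, d_colids, n_sampled_rows, ncols, rowoffset,
      d_sampledcols);
  CUDA_CHECK(cudaGetLastError());
}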
