update RF docs (#4138)

* Some updates to RF documentation * to be merged after #4216 Authors: - Venkat (https://github.com/venkywonka) Approvers: - Rory Mitchell (https://github.com/RAMitchell) - Vinay Deshpande (https://github.com/vinaydes) - Dante Gama Dessavre (https://github.com/dantegd) URL: #4138
rapidsai · Oct 29, 2021 · 42303fb · 42303fb
1 parent a7d0300
commit 42303fb
Show file tree

Hide file tree

Showing 7 changed files with 220 additions and 204 deletions.
diff --git a/cpp/include/cuml/tree/decisiontree.hpp b/cpp/include/cuml/tree/decisiontree.hpp
@@ -28,11 +28,11 @@ namespace DT {
 
 struct DecisionTreeParams {
   /**
-   * Maximum tree depth. Unlimited (e.g., until leaves are pure), if -1.
+   * Maximum tree depth. Unlimited (i.e., until leaves are pure), if `-1`.
    */
   int max_depth;
   /**
-   * Maximum leaf nodes per tree. Soft constraint. Unlimited, if -1.
+   * Maximum leaf nodes per tree. Soft constraint. Unlimited, if `-1`.
    */
   int max_leaves;
   /**

diff --git a/python/cuml/benchmark/ci_benchmark.py b/python/cuml/benchmark/ci_benchmark.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -173,7 +173,6 @@ def make_bench_configs(long_config):
                 bench_dims=default_dims,
                 cuml_param_override_list=[
                     {"n_bins": [8, 32]},
-                    {"split_algo": [0, 1]},
                     {"max_features": ['sqrt', 1.0]},
                 ],
             )

diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py
@@ -49,14 +49,14 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin,
     Future versions of the API will support more flexible data
     distribution and additional input types.
 
-    The distributed algorithm uses an embarrassingly-parallel
-    approach. For a forest with N trees being built on w workers, each
-    worker simply builds N/w trees on the data it has available
+    The distributed algorithm uses an *embarrassingly-parallel*
+    approach. For a forest with `N` trees being built on `w` workers, each
+    worker simply builds `N/w` trees on the data it has available
     locally. In many cases, partitioning the data so that each worker
     builds trees on a subset of the total dataset works well, but
     it generally requires the data to be well-shuffled in advance.
     Alternatively, callers can replicate all of the data across
-    workers so that rf.fit receives w partitions, each containing the
+    workers so that ``rf.fit`` receives `w` partitions, each containing the
     same data. This would produce results approximately identical to
     single-GPU fitting.
 
@@ -65,7 +65,7 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin,
 
     Parameters
     -----------
-    n_estimators : int (default = 10)
+    n_estimators : int (default = 100)
                    total number of trees in the forest (not per-worker)
     handle : cuml.Handle
         Specifies the cuml.handle that holds internal CUDA state for
@@ -74,43 +74,54 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin,
         run different models concurrently in different streams by creating
         handles in several streams.
         If it is None, a new one is created.
-    split_criterion : int or string (default = 0 ('gini'))
-        The criterion used to split nodes.
-        0 or 'gini' for GINI, 1 or 'entropy' for ENTROPY,
-        2 or 'mse' for MSE,
-        4 or 'poisson' for POISSON,
-        5 or 'gamma' for GAMMA,
-        6 or 'inverse_gaussian' for INVERSE_GAUSSIAN,
-        2, 'mse', 4, 'poisson', 5, 'gamma', 6, 'inverse_gaussian' not valid
-        for classification
+    split_criterion : int or string (default = ``0`` (``'gini'``))
+        The criterion used to split nodes.\n
+         * ``0`` or ``'gini'`` for gini impurity
+         * ``1`` or ``'entropy'`` for information gain (entropy)
+         * ``2`` or ``'mse'`` for mean squared error
+         * ``4`` or ``'poisson'`` for poisson half deviance
+         * ``5`` or ``'gamma'`` for gamma half deviance
+         * ``6`` or ``'inverse_gaussian'`` for inverse gaussian deviance
+        ``2``, ``'mse'``, ``4``, ``'poisson'``, ``5``, ``'gamma'``, ``6``,
+        ``'inverse_gaussian'`` not valid for classification
     bootstrap : boolean (default = True)
-        Control bootstrapping.
-        If set, each tree in the forest is built
-        on a bootstrapped sample with replacement.
-        If False, the whole dataset is used to build each tree.
+        Control bootstrapping.\n
+         * If ``True``, each tree in the forest is built on a bootstrapped
+           sample with replacement.
+         * If ``False``, the whole dataset is used to build each tree.
     max_samples : float (default = 1.0)
         Ratio of dataset rows used while fitting each tree.
     max_depth : int (default = -1)
-        Maximum tree depth. Unlimited (i.e, until leaves are pure), if -1.
+        Maximum tree depth. Unlimited (i.e., until leaves are pure), if ``-1``.
     max_leaves : int (default = -1)
-        Maximum leaf nodes per tree. Soft constraint. Unlimited, if -1.
+        Maximum leaf nodes per tree. Soft constraint. Unlimited, if ``-1``.
     max_features : float (default = 'auto')
         Ratio of number of features (columns) to consider
-        per node split.
-    n_bins : int (default = 8)
+        per node split.\n
+         * If type ``int`` then ``max_features`` is the absolute count of
+           features to be used.
+         * If type ``float`` then ``max_features`` is a fraction.
+         * If ``'auto'`` then ``max_features=n_features`` (ratio ``1.0``).
+         * If ``'sqrt'`` then ``max_features=1/sqrt(n_features)``.
+         * If ``'log2'`` then ``max_features=log2(n_features)/n_features``.
+         * If ``None``, then ``max_features = 1.0``.
+    n_bins : int (default = 128)
         Number of bins used by the split algorithm.
     min_samples_leaf : int or float (default = 1)
-        The minimum number of samples (rows) in each leaf node.
-        If int, then min_samples_leaf represents the minimum number.
-        If float, then min_samples_leaf represents a fraction and
-        ceil(min_samples_leaf * n_rows) is the minimum number of samples
-        for each leaf node.
+        The minimum number of samples (rows) in each leaf node.\n
+         * If type ``int``, then ``min_samples_leaf`` represents the minimum
+           number.
+         * If type ``float``, then ``min_samples_leaf`` represents a fraction
+           and ``ceil(min_samples_leaf * n_rows)`` is the minimum number of
+           samples for each leaf node.
     min_samples_split : int or float (default = 2)
-        The minimum number of samples required to split an internal node.
-        If int, then min_samples_split represents the minimum number.
-        If float, then min_samples_split represents a fraction and
-        ceil(min_samples_split * n_rows) is the minimum number of samples
-        for each split.
+        The minimum number of samples required to split an internal
+        node.\n
+         * If type ``int``, then ``min_samples_split`` represents the minimum
+           number.
+         * If type ``float``, then ``min_samples_split`` represents a fraction
+           and ``ceil(min_samples_split * n_rows)`` is the minimum number of
+           samples for each split.
     n_streams : int (default = 4 )
         Number of parallel streams used for forest building
     workers : optional, list of strings
@@ -139,7 +150,7 @@ def __init__(
         workers=None,
         client=None,
         verbose=False,
-        n_estimators=10,
+        n_estimators=100,
         random_state=None,
         ignore_empty_partitions=False,
         **kwargs
@@ -330,7 +341,7 @@ def predict(self, X, algo='auto', threshold=0.5,
             for inference.
 
         Returns
-        ----------
+        -------
         y : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, 1)
 
         """
@@ -404,8 +415,9 @@ def predict_model_on_cpu(self, X, convert_dtype=True):
             When set to True, the predict method will, when necessary, convert
             the input to the data type which was used to train the model. This
             will increase memory used for the method.
+
         Returns
-        ----------
+        -------
         y : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, 1)
         """
         c = default_client()
@@ -501,9 +513,7 @@ def predict_proba(self, X,
 
         Returns
         -------
-        y : NumPy
-           Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_classes)
-
+        y : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_classes)
         """
         if self._get_internal_model() is None:
             self._set_internal_model(self._concat_treelite_models())

diff --git a/python/cuml/dask/ensemble/randomforestregressor.py b/python/cuml/dask/ensemble/randomforestregressor.py
@@ -42,14 +42,14 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin,
     distribution and additional input types. User-facing APIs are
     expected to change in upcoming versions.
 
-    The distributed algorithm uses an embarrassingly-parallel
-    approach. For a forest with N trees being built on w workers, each
-    worker simply builds N/w trees on the data it has available
+    The distributed algorithm uses an *embarrassingly-parallel*
+    approach. For a forest with `N` trees being built on `w` workers, each
+    worker simply builds `N/w` trees on the data it has available
     locally. In many cases, partitioning the data so that each worker
     builds trees on a subset of the total dataset works well, but
     it generally requires the data to be well-shuffled in advance.
     Alternatively, callers can replicate all of the data across
-    workers so that rf.fit receives w partitions, each containing the
+    workers so that ``rf.fit`` receives `w` partitions, each containing the
     same data. This would produce results approximately identical to
     single-GPU fitting.
 
@@ -58,7 +58,7 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin,
 
     Parameters
     -----------
-    n_estimators : int (default = 10)
+    n_estimators : int (default = 100)
         total number of trees in the forest (not per-worker)
     handle : cuml.Handle
         Specifies the cuml.handle that holds internal CUDA state for
@@ -67,56 +67,60 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin,
         run different models concurrently in different streams by creating
         handles in several streams.
         If it is None, a new one is created.
-    split_criterion : int or string (default = 2 ('mse'))
-        The criterion used to split nodes.
-        0 or 'gini' for GINI, 1 or 'entropy' for ENTROPY,
-        2 or 'mse' for MSE,
-        4 or 'poisson' for POISSON,
-        5 or 'gamma' for GAMMA,
-        6 or 'inverse_gaussian' for INVERSE_GAUSSIAN,
-        0, 'gini', 1, 'entropy' not valid for regression
+    split_criterion : int or string (default = ``2`` (``'mse'``))
+        The criterion used to split nodes.\n
+         * ``0`` or ``'gini'`` for gini impurity
+         * ``1`` or ``'entropy'`` for information gain (entropy)
+         * ``2`` or ``'mse'`` for mean squared error
+         * ``4`` or ``'poisson'`` for poisson half deviance
+         * ``5`` or ``'gamma'`` for gamma half deviance
+         * ``6`` or ``'inverse_gaussian'`` for inverse gaussian deviance
+        ``0``, ``'gini'``, ``1``, ``'entropy'`` not valid for regression
     bootstrap : boolean (default = True)
-        Control bootstrapping.
-        If set, each tree in the forest is built
-        on a bootstrapped sample with replacement.
-        If False, the whole dataset is used to build each tree.
+        Control bootstrapping.\n
+         * If ``True``, each tree in the forest is built on a bootstrapped
+           sample with replacement.
+         * If ``False``, the whole dataset is used to build each tree.
     max_samples : float (default = 1.0)
         Ratio of dataset rows used while fitting each tree.
     max_depth : int (default = -1)
-        Maximum tree depth. Unlimited (i.e, until leaves are pure), if -1.
+        Maximum tree depth. Unlimited (i.e., until leaves are pure), if ``-1``.
     max_leaves : int (default = -1)
-        Maximum leaf nodes per tree. Soft constraint. Unlimited, if -1.
-    max_features : int or float or string or None (default = 'auto')
+        Maximum leaf nodes per tree. Soft constraint. Unlimited, if ``-1``.
+    max_features : float (default = 'auto')
         Ratio of number of features (columns) to consider
-        per node split.
-        If int then max_features/n_features.
-        If float then max_features is a fraction.
-        If 'auto' then max_features=n_features which is 1.0.
-        If 'sqrt' then max_features=1/sqrt(n_features).
-        If 'log2' then max_features=log2(n_features)/n_features.
-        If None, then max_features=n_features which is 1.0.
-    n_bins : int (default = 8)
+        per node split.\n
+         * If type ``int`` then ``max_features`` is the absolute count of
+           features to be used.
+         * If type ``float`` then ``max_features`` is a fraction.
+         * If ``'auto'`` then ``max_features=n_features`` (ratio ``1.0``).
+         * If ``'sqrt'`` then ``max_features=1/sqrt(n_features)``.
+         * If ``'log2'`` then ``max_features=log2(n_features)/n_features``.
+         * If ``None``, then ``max_features = 1.0``.
+    n_bins : int (default = 128)
         Number of bins used by the split algorithm.
     min_samples_leaf : int or float (default = 1)
-        The minimum number of samples (rows) in each leaf node.
-        If int, then min_samples_leaf represents the minimum number.
-        If float, then min_samples_leaf represents a fraction and
-        ceil(min_samples_leaf * n_rows) is the minimum number of samples
-        for each leaf node.
+        The minimum number of samples (rows) in each leaf node.\n
+         * If type ``int``, then ``min_samples_leaf`` represents the minimum
+           number.
+         * If type ``float``, then ``min_samples_leaf`` represents a
+           fraction and ``ceil(min_samples_leaf * n_rows)`` is the minimum
+           number of samples for each leaf node.
     min_samples_split : int or float (default = 2)
-        The minimum number of samples required to split an internal node.
-        If int, then min_samples_split represents the minimum number.
-        If float, then min_samples_split represents a fraction and
-        ceil(min_samples_split * n_rows) is the minimum number of samples
-        for each split.
+        The minimum number of samples required to split an internal node.\n
+         * If type ``int``, then ``min_samples_split`` represents the minimum
+           number.
+         * If type ``float``, then ``min_samples_split`` represents a fraction
+           and ``ceil(min_samples_split * n_rows)`` is the minimum number of
+           samples for each split.
     accuracy_metric : string (default = 'r2')
         Decides the metric used to evaluate the performance of the model.
         In the 0.16 release, the default scoring metric was changed
-        from mean squared error to r-squared.
-        for r-squared : 'r2'
-        for median of abs error : 'median_ae'
-        for mean of abs error : 'mean_ae'
-        for mean square error' : 'mse'
+        from mean squared error to r-squared.\n
+         * for r-squared : ``'r2'``
+         * for median of abs error : ``'median_ae'``
+         * for mean of abs error : ``'mean_ae'``
+         * for mean squared error : ``'mse'``
     n_streams : int (default = 4 )
         Number of parallel streams used for forest building
     workers : optional, list of strings
@@ -141,7 +145,7 @@ def __init__(
         workers=None,
         client=None,
         verbose=False,
-        n_estimators=10,
+        n_estimators=100,
         random_state=None,
         ignore_empty_partitions=False,
         **kwargs

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
@@ -46,8 +46,8 @@ class BaseRandomForestModel(Base):
                     'bootstrap',
                     'verbose', 'max_samples',
                     'max_leaves',
-                    'accuracy_metric',
-                    'max_batch_size', 'n_streams', 'dtype',
+                    'accuracy_metric', 'max_batch_size',
+                    'n_streams', 'dtype',
                     'output_type', 'min_weight_fraction_leaf', 'n_jobs',
                     'max_leaf_nodes', 'min_impurity_split', 'oob_score',
                     'random_state', 'warm_start', 'class_weight',
@@ -106,7 +106,7 @@ class BaseRandomForestModel(Base):
         if ((random_state is not None) and (n_streams != 1)):
             warnings.warn("For reproducible results in Random Forest"
                           " Classifier or for almost reproducible results"
-                          " in Random Forest Regressor, n_streams==1 is "
+                          " in Random Forest Regressor, n_streams=1 is "
                           "recommended. If n_streams is > 1, results may vary "
                           "due to stream/thread timing differences, even when "
                           "random_state is set")