From afd671d76408686fb6cda80e7b684486d99fb768 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 14 Dec 2020 15:07:06 -0600 Subject: [PATCH 1/2] [doc] [dask] Add example on early stopping with Dask --- doc/tutorials/dask.rst | 77 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst index 1d00f7754696..6706a69ff8cb 100644 --- a/doc/tutorials/dask.rst +++ b/doc/tutorials/dask.rst @@ -273,6 +273,83 @@ actual computation will return a coroutine and hence require awaiting: # Use `client.compute` instead of the `compute` method from dask collection print(await client.compute(prediction)) +***************************** +Evaluation and Early Stopping +***************************** + +.. versionadded:: 1.3.0 + +The Dask interface allows the use of validation sets that are stored in distributed collections (Dask DataFrame or Dask Array). These can be used for evaluation and early stopping. + +To enable evaluation, pass one or more validation sets containing ``DaskDMatrix`` objects; to enable early stopping, also pass ``early_stopping_rounds``. + +.. 
code-block:: python + + import dask.array as da + import xgboost as xgb + + num_rows = 1e6 + num_features = 100 + num_partitions = 10 + rows_per_chunk = num_rows / num_partitions + + data = da.random.random( + size=(num_rows, num_features), + chunks=(rows_per_chunk, num_features) + ) + + labels = da.random.random( + size=(num_rows, 1), + chunks=(rows_per_chunk, 1) + ) + + X_eval = da.random.random( + size=(num_rows, num_features), + chunks=(rows_per_chunk, num_features) + ) + + y_eval = da.random.random( + size=(num_rows, 1), + chunks=(rows_per_chunk, 1) + ) + + dtrain = xgb.dask.DaskDMatrix( + client=client, + data=data, + label=labels + ) + + dvalid = xgb.dask.DaskDMatrix( + client=client, + data=X_eval, + label=y_eval + ) + + result = xgb.dask.train( + client=client, + params={ + "objective": "reg:squarederror", + }, + dtrain=dtrain, + num_boost_round=10, + evals=[(dvalid, "valid1")], + early_stopping_rounds=3 + ) + +When validation sets are provided to ``xgb.dask.train()`` in this way, the dictionary returned by ``xgb.dask.train()`` contains, under the ``"history"`` key, a history of evaluation metrics for each validation set, across all boosting rounds. + +.. code-block:: python + + print(result["history"]) + # {'valid1': OrderedDict([('rmse', [0.28857, 0.28858, 0.288592, 0.288598])])} + +If early stopping is enabled by also passing ``early_stopping_rounds``, you can check the best iteration in the returned booster. + +.. code-block:: python + + bst = result["booster"] + print(booster.best_iteration) + ***************************************************************************** Why is the initialization of ``DaskDMatrix`` so slow and throws weird errors ***************************************************************************** From bf58056cc827ddbe383e9390235e7026217d2492 Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 15 Dec 2020 16:42:07 +0800 Subject: [PATCH 2/2] Typo, model slice. 
--- doc/tutorials/dask.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst index 6706a69ff8cb..80754baaca53 100644 --- a/doc/tutorials/dask.rst +++ b/doc/tutorials/dask.rst @@ -347,8 +347,9 @@ If early stopping is enabled by also passing ``early_stopping_rounds``, you can .. code-block:: python - bst = result["booster"] + booster = result["booster"] print(booster.best_iteration) + best_model = booster[: booster.best_iteration] ***************************************************************************** Why is the initialization of ``DaskDMatrix`` so slow and throws weird errors