From ef4863c8bddd10b3cc340e624a526c685d6260f8 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 17 Jun 2024 15:56:34 +0800 Subject: [PATCH 1/3] Fix categorical data with external memory. - Fix the cut IO functions. --- src/common/hist_util.h | 3 +-- src/data/gradient_index.cc | 5 ++--- src/data/histogram_cut_format.h | 13 ++++++++++++- tests/python/test_data_iterator.py | 13 +++++++++++++ 4 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/common/hist_util.h b/src/common/hist_util.h index e829752dae3d..8f940500f73c 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -1,5 +1,5 @@ /** - * Copyright 2017-2024 by XGBoost Contributors + * Copyright 2017-2024, XGBoost Contributors * \file hist_util.h * \brief Utility for fast histogram aggregation * \author Philip Cho, Tianqi Chen @@ -11,7 +11,6 @@ #include // for uint32_t #include #include -#include #include #include diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc index 493aded70098..e600892db90f 100644 --- a/src/data/gradient_index.cc +++ b/src/data/gradient_index.cc @@ -4,7 +4,6 @@ */ #include "gradient_index.h" -#include #include #include #include // for forward @@ -126,8 +125,8 @@ INSTANTIATION_PUSH(data::ColumnarAdapterBatch) void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) { auto make_index = [this, n_index](auto t, common::BinTypeSize t_size) { // Must resize instead of allocating a new one. This function is called everytime a - // new batch is pushed, and we grow the size accordingly without loosing the data the - // previous batches. + // new batch is pushed, and we grow the size accordingly without losing the data in + // the previous batches. 
using T = decltype(t); std::size_t n_bytes = sizeof(T) * n_index; CHECK_GE(n_bytes, this->data.size()); diff --git a/src/data/histogram_cut_format.h b/src/data/histogram_cut_format.h index 45a96134f8d0..d4eb81ad2849 100644 --- a/src/data/histogram_cut_format.h +++ b/src/data/histogram_cut_format.h @@ -1,5 +1,5 @@ /** - * Copyright 2021-2023, XGBoost contributors + * Copyright 2021-2024, XGBoost contributors */ #ifndef XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_ #define XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_ @@ -23,6 +23,15 @@ inline bool ReadHistogramCuts(common::HistogramCuts *cuts, common::AlignedResour if (!common::ReadVec(fi, &cuts->min_vals_.HostVector())) { return false; } + bool has_cat{false}; + if (!fi->Read(&has_cat)) { + return false; + } + decltype(cuts->MaxCategory()) max_cat{0}; + if (!fi->Read(&max_cat)) { + return false; + } + cuts->SetCategorical(has_cat, max_cat); return true; } @@ -32,6 +41,8 @@ inline std::size_t WriteHistogramCuts(common::HistogramCuts const &cuts, bytes += common::WriteVec(fo, cuts.Values()); bytes += common::WriteVec(fo, cuts.Ptrs()); bytes += common::WriteVec(fo, cuts.MinValues()); + bytes += fo->Write(cuts.HasCategorical()); + bytes += fo->Write(cuts.MaxCategory()); return bytes; } } // namespace xgboost::data diff --git a/tests/python/test_data_iterator.py b/tests/python/test_data_iterator.py index 7f0153565c4b..5f7536086058 100644 --- a/tests/python/test_data_iterator.py +++ b/tests/python/test_data_iterator.py @@ -52,6 +52,19 @@ def test_single_batch(tree_method: str = "approx") -> None: assert from_np.get_dump() == from_it.get_dump() +def test_with_cat_single() -> None: + X, y = tm.make_categorical(n_samples=128, n_features=3, n_categories=6, onehot=False) + Xy = xgb.DMatrix(SingleBatch(data=X, label=y), enable_categorical=True) + from_it = xgb.train({}, Xy, num_boost_round=3) + + Xy = xgb.DMatrix(X, y, enable_categorical=True) + from_Xy = xgb.train({}, Xy, num_boost_round=3) + + jit = from_it.save_raw(raw_format="json") + jxy = 
from_Xy.save_raw(raw_format="json") + assert jit == jxy + + def run_data_iterator( n_samples_per_batch: int, n_features: int, From 5dbf4182d907082597b5a634b49c412942705400 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 17 Jun 2024 16:54:11 +0800 Subject: [PATCH 2/3] lint. --- tests/python/test_data_iterator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/python/test_data_iterator.py b/tests/python/test_data_iterator.py index 5f7536086058..e665bcb10d9f 100644 --- a/tests/python/test_data_iterator.py +++ b/tests/python/test_data_iterator.py @@ -53,7 +53,9 @@ def test_single_batch(tree_method: str = "approx") -> None: def test_with_cat_single() -> None: - X, y = tm.make_categorical(n_samples=128, n_features=3, n_categories=6, onehot=False) + X, y = tm.make_categorical( + n_samples=128, n_features=3, n_categories=6, onehot=False + ) Xy = xgb.DMatrix(SingleBatch(data=X, label=y), enable_categorical=True) from_it = xgb.train({}, Xy, num_boost_round=3) From 4145a49f45a48042efe3d762595fcaa971c37143 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 17 Jun 2024 16:59:12 +0800 Subject: [PATCH 3/3] typing. --- demo/guide-python/external_memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/guide-python/external_memory.py b/demo/guide-python/external_memory.py index b19f550c9149..e1bcbe99ae62 100644 --- a/demo/guide-python/external_memory.py +++ b/demo/guide-python/external_memory.py @@ -43,7 +43,7 @@ def make_batches( class Iterator(xgboost.DataIter): """A custom iterator for loading files in batches.""" - def __init__(self, file_paths: List[Tuple[str, str]]): + def __init__(self, file_paths: List[Tuple[str, str]]) -> None: self._file_paths = file_paths self._it = 0 # XGBoost will generate some cache files under current directory with the prefix