From b2cae34a8ea110f7fb8a96cf4cc7b6e7ae95ddb0 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 23 Jul 2024 02:13:15 +0800
Subject: [PATCH] Fix integer overflow. (#10615)

---
 src/data/ellpack_page.cuh           |  4 ++--
 src/tree/gpu_hist/feature_groups.cu |  5 +++--
 src/tree/gpu_hist/histogram.cu      | 33 +++++++++++++++++++----------
 3 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh
index 18b9384afbd7..88873d0c2d4a 100644
--- a/src/data/ellpack_page.cuh
+++ b/src/data/ellpack_page.cuh
@@ -24,7 +24,7 @@ struct EllpackDeviceAccessor {
   /*! \brief Whether or not if the matrix is dense. */
   bool is_dense;
   /*! \brief Row length for ELLPACK, equal to number of features. */
-  size_t row_stride;
+  bst_idx_t row_stride;
   bst_idx_t base_rowid{0};
   bst_idx_t n_rows{0};
   common::CompressedIterator<uint32_t> gidx_iter;
@@ -118,7 +118,7 @@
    * not found).
    */
   [[nodiscard]] XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
-  [[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); }
+  [[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return this->NumBins(); }
 
   [[nodiscard]] XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
 
diff --git a/src/tree/gpu_hist/feature_groups.cu b/src/tree/gpu_hist/feature_groups.cu
index 52e58da7efbb..c6c6619852ca 100644
--- a/src/tree/gpu_hist/feature_groups.cu
+++ b/src/tree/gpu_hist/feature_groups.cu
@@ -31,11 +31,12 @@ FeatureGroups::FeatureGroups(const common::HistogramCuts& cuts, bool is_dense,
 
   for (size_t i = 2; i < cut_ptrs.size(); ++i) {
     int last_start = bin_segments_h.back();
+    // Push a new group whenever the size of required bin storage is greater than the
+    // shared memory size.
     if (cut_ptrs[i] - last_start > max_shmem_bins) {
       feature_segments_h.push_back(i - 1);
       bin_segments_h.push_back(cut_ptrs[i - 1]);
-      max_group_bins = std::max(max_group_bins,
-                                bin_segments_h.back() - last_start);
+      max_group_bins = std::max(max_group_bins, bin_segments_h.back() - last_start);
     }
   }
   feature_segments_h.push_back(cut_ptrs.size() - 1);
diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu
index 372a5c09ba0c..e90b6831fcfd 100644
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -23,6 +23,18 @@ struct Pair {
 __host__ XGBOOST_DEV_INLINE Pair operator+(Pair const& lhs, Pair const& rhs) {
   return {lhs.first + rhs.first, lhs.second + rhs.second};
 }
+
+XGBOOST_DEV_INLINE bst_idx_t IterIdx(EllpackDeviceAccessor const& matrix,
+                                     RowPartitioner::RowIndexT ridx, FeatureGroup const& group,
+                                     bst_idx_t idx, std::int32_t feature_stride) {
+  // ridx_local = ridx - base_rowid      <== Row index local to each batch
+  // entry_idx = ridx_local * row_stride <== Starting entry index for this row in the matrix
+  // entry_idx += start_feature          <== Inside a row, first column inside this feature group
+  // idx % feature_stride                <== The feature index local to the current feature group
+  // entry_idx += idx % feature_stride   <== Final index.
+  return (ridx - matrix.base_rowid) * matrix.row_stride + group.start_feature +
+         idx % feature_stride;
+}
 }  // anonymous namespace
 
 struct Clip : public thrust::unary_function<GradientPair, Pair> {
@@ -159,15 +171,16 @@ class HistogramAgent {
          idx < std::min(offset + kBlockThreads * kItemsPerTile, n_elements_);
          idx += kBlockThreads) {
       Idx ridx = d_ridx_[idx / feature_stride_];
-      Idx midx = (ridx - matrix_.base_rowid) * matrix_.row_stride + group_.start_feature +
-                 idx % feature_stride_;
-      bst_bin_t gidx = matrix_.gidx_iter[midx] - group_.start_bin;
-      if (matrix_.is_dense || gidx != matrix_.NumBins()) {
+      bst_bin_t gidx = matrix_.gidx_iter[IterIdx(matrix_, ridx, group_, idx, feature_stride_)];
+      if (matrix_.is_dense || gidx != matrix_.NullValue()) {
         auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]);
-        AtomicAddGpairShared(smem_arr_ + gidx, adjusted);
+        // Subtract start_bin to write to group-local histogram. If this is not a dense
+        // matrix, then start_bin is 0 since featuregrouping doesn't support sparse data.
+        AtomicAddGpairShared(smem_arr_ + gidx - group_.start_bin, adjusted);
       }
     }
   }
+
   // Instruction level parallelism by loop unrolling
   // Allows the kernel to pipeline many operations while waiting for global memory
   // Increases the throughput of this kernel significantly
@@ -187,12 +200,11 @@ class HistogramAgent {
 #pragma unroll
     for (int i = 0; i < kItemsPerThread; i++) {
       gpair[i] = d_gpair_[ridx[i]];
-      gidx[i] = matrix_.gidx_iter[(ridx[i] - matrix_.base_rowid) * matrix_.row_stride +
-                                  group_.start_feature + idx[i] % feature_stride_];
+      gidx[i] = matrix_.gidx_iter[IterIdx(matrix_, ridx[i], group_, idx[i], feature_stride_)];
     }
 #pragma unroll
     for (int i = 0; i < kItemsPerThread; i++) {
-      if ((matrix_.is_dense || gidx[i] != matrix_.NumBins())) {
+      if ((matrix_.is_dense || gidx[i] != matrix_.NullValue())) {
         auto adjusted = rounding_.ToFixedPoint(gpair[i]);
         AtomicAddGpairShared(smem_arr_ + gidx[i] - group_.start_bin, adjusted);
       }
@@ -219,9 +231,8 @@ class HistogramAgent {
   __device__ void BuildHistogramWithGlobal() {
     for (auto idx : dh::GridStrideRange(static_cast<Idx>(0), n_elements_)) {
       Idx ridx = d_ridx_[idx / feature_stride_];
-      bst_bin_t gidx = matrix_.gidx_iter[(ridx - matrix_.base_rowid) * matrix_.row_stride +
-                                         group_.start_feature + idx % feature_stride_];
-      if (matrix_.is_dense || gidx != matrix_.NumBins()) {
+      bst_bin_t gidx = matrix_.gidx_iter[IterIdx(matrix_, ridx, group_, idx, feature_stride_)];
+      if (matrix_.is_dense || gidx != matrix_.NullValue()) {
         auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]);
         AtomicAddGpairGlobal(d_node_hist_ + gidx, adjusted);
       }
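Note on the overflow itself: the flattened ELLPACK entry index, (ridx - base_rowid) * row_stride + start_feature + idx % feature_stride, can exceed the 32-bit range on large datasets, which is why the patch routes it through IterIdx and returns the 64-bit bst_idx_t instead of holding it in the 32-bit row-index type. The sketch below is not part of the patch; it is a minimal standalone illustration of that truncation, using made-up row counts and strides.

// Standalone illustration only -- not part of the patch. Shows how an ELLPACK
// entry index silently wraps when held in a 32-bit type; the values are made up
// so that the product exceeds 2^32.
#include <cstdint>
#include <iostream>

int main() {
  std::uint32_t ridx = 50'000'000;  // row index, fits comfortably in 32 bits
  std::uint64_t row_stride = 128;   // entries per ELLPACK row (number of features)

  // 64-bit arithmetic, as with bst_idx_t: 6'400'000'000, larger than 2^32.
  std::uint64_t entry_idx = static_cast<std::uint64_t>(ridx) * row_stride;

  // Truncated when forced back into a 32-bit index:
  // 6'400'000'000 mod 2^32 == 2'105'032'704, a silently wrong offset.
  std::uint32_t wrapped = static_cast<std::uint32_t>(ridx * row_stride);

  std::cout << entry_idx << " vs " << wrapped << "\n";
  return 0;
}

With a few hundred features per row, row indices far below 2^32 already push the flattened entry index past the 32-bit limit, so the index has to be computed and carried in 64 bits end to end.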