From 0ff8adae3ec1ae03c9efddb74bb19aadf0ea393c Mon Sep 17 00:00:00 2001 From: Smirnov Date: Sun, 2 Dec 2018 18:58:30 +0300 Subject: [PATCH 01/31] Initial performance optimizations for xgboost --- src/common/hist_util.cc | 119 ++++++++++++++++++++++++++-------------- src/common/hist_util.h | 6 +- 2 files changed, 81 insertions(+), 44 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index da6e4d770bff..eb9d0cff1c6e 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -8,11 +8,20 @@ #include #include #include +#include #include "./random.h" #include "./column_matrix.h" #include "./hist_util.h" #include "./quantile.h" +#include + +#if defined(_MSC_VER) || defined(__INTEL_COMPILER) + #include + #define PREFETCH_READ_T0(addr) _mm_prefetch((char *)addr, _MM_HINT_T0) +#else + #define PREFETCH_READ_T0(addr) __builtin_prefetch((char *)addr, 0, 3) +#endif namespace xgboost { namespace common { @@ -399,56 +408,82 @@ void GHistBuilder::BuildHist(const std::vector& gpair, const RowSetCollection::Elem row_indices, const GHistIndexMatrix& gmat, GHistRow hist) { - data_.resize(nbins_ * nthread_, GHistEntry()); - std::fill(data_.begin(), data_.end(), GHistEntry()); - - constexpr int kUnroll = 8; // loop unrolling factor const auto nthread = static_cast(this->nthread_); - const size_t nrows = row_indices.end - row_indices.begin; - const size_t rest = nrows % kUnroll; + data_.resize(nbins_ * nthread_); - #pragma omp parallel for num_threads(nthread) schedule(guided) - for (bst_omp_uint i = 0; i < nrows - rest; i += kUnroll) { - const bst_omp_uint tid = omp_get_thread_num(); - const size_t off = tid * nbins_; - size_t rid[kUnroll]; - size_t ibegin[kUnroll]; - size_t iend[kUnroll]; - GradientPair stat[kUnroll]; - for (int k = 0; k < kUnroll; ++k) { - rid[k] = row_indices.begin[i + k]; - } - for (int k = 0; k < kUnroll; ++k) { - ibegin[k] = gmat.row_ptr[rid[k]]; - iend[k] = gmat.row_ptr[rid[k] + 1]; - } - for (int k = 0; k < kUnroll; ++k) { - stat[k] = gpair[rid[k]]; + const size_t* rid = row_indices.begin; + const size_t nrows = row_indices.Size(); + const uint32_t* index = gmat.index.data(); + const size_t* row_ptr = gmat.row_ptr.data(); + const float* pgh = (float*)gpair.data(); + + float* hist_data = (float*)hist.begin; + float* data = (float*)data_.data(); + + const size_t block_size = 512; + size_t n_blocks = nrows/block_size; + n_blocks += !!(nrows - n_blocks*block_size); + + const size_t nthread_to_process = std::min((size_t)nthread, (size_t)n_blocks); + memset(thread_init_.data(), '\0', nthread_to_process*sizeof(size_t)); + + #pragma omp parallel for num_threads(nthread_to_process) schedule(guided) + for (size_t iblock = 0; iblock < n_blocks; iblock++) { + dmlc::omp_uint tid = omp_get_thread_num(); + float* data_local_hist = ((nthread_to_process == 1) ? hist_data : (float*)(data_.data() + tid * nbins_)); + + if (!thread_init_[tid]) { + memset(data_local_hist, '\0', 2*nbins_*sizeof(float)); + thread_init_[tid] = true; } - for (int k = 0; k < kUnroll; ++k) { - for (size_t j = ibegin[k]; j < iend[k]; ++j) { - const uint32_t bin = gmat.index[j]; - data_[off + bin].Add(stat[k]); + + const size_t istart = iblock*block_size; + const size_t iend = (((iblock+1)*block_size > nrows) ? 
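+        // the final block can be short: clamp its end to nrows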
nrows : istart + block_size); + for(size_t i = istart; i < iend; ++i) { + const size_t icol_start = row_ptr[rid[i]]; + const size_t icol_end = row_ptr[rid[i]+1]; + + PREFETCH_READ_T0(row_ptr + rid[i+10]); + PREFETCH_READ_T0(pgh + 2*rid[i+10]); + + for (size_t j = icol_start; j < icol_end; ++j) { + const uint32_t idx_bin = 2*index[j]; + const size_t idx_gh = 2*rid[i]; + + data_local_hist[idx_bin] += pgh[idx_gh]; + data_local_hist[idx_bin+1] += pgh[idx_gh+1]; } } } - for (size_t i = nrows - rest; i < nrows; ++i) { - const size_t rid = row_indices.begin[i]; - const size_t ibegin = gmat.row_ptr[rid]; - const size_t iend = gmat.row_ptr[rid + 1]; - const GradientPair stat = gpair[rid]; - for (size_t j = ibegin; j < iend; ++j) { - const uint32_t bin = gmat.index[j]; - data_[bin].Add(stat); + + if(nthread_to_process > 1) { + const size_t size = (2*nbins_); + const size_t block_size = 1024; + size_t n_blocks = size/block_size; + n_blocks += !!(size - n_blocks*block_size); + + size_t n_worked_bins = 0; + for(size_t i = 0; i < nthread_to_process; ++i) { + if (thread_init_[i]) { + thread_init_[n_worked_bins++] = i; + } } - } - /* reduction */ - const uint32_t nbins = nbins_; - #pragma omp parallel for num_threads(nthread) schedule(static) - for (bst_omp_uint bin_id = 0; bin_id < bst_omp_uint(nbins); ++bin_id) { - for (bst_omp_uint tid = 0; tid < nthread; ++tid) { - hist.begin[bin_id].Add(data_[tid * nbins_ + bin_id]); + const size_t nthreads_for_merge = std::min((size_t)nthread, (size_t)n_blocks); + #pragma omp parallel for num_threads(nthreads_for_merge) schedule(guided) + for (size_t iblock = 0; iblock < n_blocks; iblock++) { + const size_t istart = iblock*block_size; + const size_t iend = (((iblock+1)*block_size > size) ? size : istart + block_size); + + const size_t bin = 2*thread_init_[0]*nbins_; + memcpy(hist_data + istart, (data + bin + istart), sizeof(float)*(iend - istart)); + + for(size_t i_bin_part = 1; i_bin_part < n_worked_bins; ++i_bin_part) { + const size_t bin = 2*thread_init_[i_bin_part]*nbins_; + for(size_t i = istart; i < iend; i++) { + hist_data[i] += data[bin + i]; + } + } } } } diff --git a/src/common/hist_util.h b/src/common/hist_util.h index ad83dd6c8e18..5f8d687a3888 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -21,9 +21,9 @@ namespace common { /*! \brief sums of gradient statistics corresponding to a histogram bin */ struct GHistEntry { /*! \brief sum of first-order gradient statistics */ - double sum_grad{0}; + float sum_grad{0}; /*! \brief sum of second-order gradient statistics */ - double sum_hess{0}; + float sum_hess{0}; GHistEntry() = default; @@ -238,6 +238,7 @@ class GHistBuilder { inline void Init(size_t nthread, uint32_t nbins) { nthread_ = nthread; nbins_ = nbins; + thread_init_.resize(nthread_); } // construct a histogram via histogram aggregation @@ -259,6 +260,7 @@ class GHistBuilder { /*! 
\brief number of all bins over all features */ uint32_t nbins_; std::vector data_; + std::vector thread_init_; }; From 40c07c71023b4cdda754e5291392521715afe128 Mon Sep 17 00:00:00 2001 From: Smirnov Date: Sun, 2 Dec 2018 19:04:14 +0300 Subject: [PATCH 02/31] remove includes --- src/common/hist_util.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index eb9d0cff1c6e..825492cff696 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -8,13 +8,11 @@ #include #include #include -#include #include "./random.h" #include "./column_matrix.h" #include "./hist_util.h" #include "./quantile.h" -#include #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #include From c80f4bc847ab9cf8e8a52261a107ec5602c617c9 Mon Sep 17 00:00:00 2001 From: Smirnov Date: Tue, 4 Dec 2018 16:45:53 +0300 Subject: [PATCH 03/31] revert float->double --- src/common/hist_util.cc | 10 +++++----- src/common/hist_util.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 825492cff696..31b05ee69cb4 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -415,8 +415,8 @@ void GHistBuilder::BuildHist(const std::vector& gpair, const size_t* row_ptr = gmat.row_ptr.data(); const float* pgh = (float*)gpair.data(); - float* hist_data = (float*)hist.begin; - float* data = (float*)data_.data(); + double* hist_data = (double*)hist.begin; + double* data = (double*)data_.data(); const size_t block_size = 512; size_t n_blocks = nrows/block_size; @@ -428,10 +428,10 @@ void GHistBuilder::BuildHist(const std::vector& gpair, #pragma omp parallel for num_threads(nthread_to_process) schedule(guided) for (size_t iblock = 0; iblock < n_blocks; iblock++) { dmlc::omp_uint tid = omp_get_thread_num(); - float* data_local_hist = ((nthread_to_process == 1) ? hist_data : (float*)(data_.data() + tid * nbins_)); + double* data_local_hist = ((nthread_to_process == 1) ? hist_data : (double*)(data_.data() + tid * nbins_)); if (!thread_init_[tid]) { - memset(data_local_hist, '\0', 2*nbins_*sizeof(float)); + memset(data_local_hist, '\0', 2*nbins_*sizeof(double)); thread_init_[tid] = true; } @@ -474,7 +474,7 @@ void GHistBuilder::BuildHist(const std::vector& gpair, const size_t iend = (((iblock+1)*block_size > size) ? size : istart + block_size); const size_t bin = 2*thread_init_[0]*nbins_; - memcpy(hist_data + istart, (data + bin + istart), sizeof(float)*(iend - istart)); + memcpy(hist_data + istart, (data + bin + istart), sizeof(double)*(iend - istart)); for(size_t i_bin_part = 1; i_bin_part < n_worked_bins; ++i_bin_part) { const size_t bin = 2*thread_init_[i_bin_part]*nbins_; diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 5f8d687a3888..30d0454c6454 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -21,9 +21,9 @@ namespace common { /*! \brief sums of gradient statistics corresponding to a histogram bin */ struct GHistEntry { /*! \brief sum of first-order gradient statistics */ - float sum_grad{0}; + double sum_grad{0}; /*! 
\brief sum of second-order gradient statistics */ - float sum_hess{0}; + double sum_hess{0}; GHistEntry() = default; From 32e88bb6b794027a7935b50aa7aefe7a23b86093 Mon Sep 17 00:00:00 2001 From: Smirnov Date: Tue, 4 Dec 2018 20:53:18 +0300 Subject: [PATCH 04/31] fix for CI --- src/common/hist_util.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 31b05ee69cb4..d389c521bdbf 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -426,7 +426,7 @@ void GHistBuilder::BuildHist(const std::vector& gpair, memset(thread_init_.data(), '\0', nthread_to_process*sizeof(size_t)); #pragma omp parallel for num_threads(nthread_to_process) schedule(guided) - for (size_t iblock = 0; iblock < n_blocks; iblock++) { + for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) { dmlc::omp_uint tid = omp_get_thread_num(); double* data_local_hist = ((nthread_to_process == 1) ? hist_data : (double*)(data_.data() + tid * nbins_)); @@ -441,8 +441,8 @@ void GHistBuilder::BuildHist(const std::vector& gpair, const size_t icol_start = row_ptr[rid[i]]; const size_t icol_end = row_ptr[rid[i]+1]; - PREFETCH_READ_T0(row_ptr + rid[i+10]); - PREFETCH_READ_T0(pgh + 2*rid[i+10]); + if (i < iend-10) PREFETCH_READ_T0(row_ptr + rid[i+10]); + if (i < iend-10) PREFETCH_READ_T0(pgh + 2*rid[i+10]); for (size_t j = icol_start; j < icol_end; ++j) { const uint32_t idx_bin = 2*index[j]; @@ -469,7 +469,7 @@ void GHistBuilder::BuildHist(const std::vector& gpair, const size_t nthreads_for_merge = std::min((size_t)nthread, (size_t)n_blocks); #pragma omp parallel for num_threads(nthreads_for_merge) schedule(guided) - for (size_t iblock = 0; iblock < n_blocks; iblock++) { + for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) { const size_t istart = iblock*block_size; const size_t iend = (((iblock+1)*block_size > size) ? 
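      // clamp the last block of the 2*nbins_ merged (grad, hess) values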
size : istart + block_size); From f6c44a6e2affe0fa8c7cb67fc1e043e69570a2ab Mon Sep 17 00:00:00 2001 From: Smirnov Date: Wed, 5 Dec 2018 00:43:05 +0300 Subject: [PATCH 05/31] fix for CI --- src/common/hist_util.cc | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index d389c521bdbf..2926b712f2df 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -16,9 +16,9 @@ #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #include - #define PREFETCH_READ_T0(addr) _mm_prefetch((char *)addr, _MM_HINT_T0) + #define PREFETCH_READ_T0(addr) _mm_prefetch(addr, _MM_HINT_T0) #else - #define PREFETCH_READ_T0(addr) __builtin_prefetch((char *)addr, 0, 3) + #define PREFETCH_READ_T0(addr) __builtin_prefetch(addr, 0, 3) #endif namespace xgboost { @@ -413,10 +413,10 @@ void GHistBuilder::BuildHist(const std::vector& gpair, const size_t nrows = row_indices.Size(); const uint32_t* index = gmat.index.data(); const size_t* row_ptr = gmat.row_ptr.data(); - const float* pgh = (float*)gpair.data(); + const float* pgh = reinterpret_cast(gpair.data()); - double* hist_data = (double*)hist.begin; - double* data = (double*)data_.data(); + double* hist_data = reinterpret_cast(hist.begin); + double* data = reinterpret_cast(data_.data()); const size_t block_size = 512; size_t n_blocks = nrows/block_size; @@ -428,7 +428,8 @@ void GHistBuilder::BuildHist(const std::vector& gpair, #pragma omp parallel for num_threads(nthread_to_process) schedule(guided) for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) { dmlc::omp_uint tid = omp_get_thread_num(); - double* data_local_hist = ((nthread_to_process == 1) ? hist_data : (double*)(data_.data() + tid * nbins_)); + double* data_local_hist = ((nthread_to_process == 1) ? hist_data : + reinterpret_cast(data_.data() + tid * nbins_)); if (!thread_init_[tid]) { memset(data_local_hist, '\0', 2*nbins_*sizeof(double)); @@ -437,12 +438,12 @@ void GHistBuilder::BuildHist(const std::vector& gpair, const size_t istart = iblock*block_size; const size_t iend = (((iblock+1)*block_size > nrows) ? 
nrows : istart + block_size); - for(size_t i = istart; i < iend; ++i) { + for (size_t i = istart; i < iend; ++i) { const size_t icol_start = row_ptr[rid[i]]; const size_t icol_end = row_ptr[rid[i]+1]; - if (i < iend-10) PREFETCH_READ_T0(row_ptr + rid[i+10]); - if (i < iend-10) PREFETCH_READ_T0(pgh + 2*rid[i+10]); + if (i < nrows-16) PREFETCH_READ_T0(row_ptr + rid[i+10]); + if (i < nrows-16) PREFETCH_READ_T0(pgh + 2*rid[i+10]); for (size_t j = icol_start; j < icol_end; ++j) { const uint32_t idx_bin = 2*index[j]; @@ -454,14 +455,14 @@ void GHistBuilder::BuildHist(const std::vector& gpair, } } - if(nthread_to_process > 1) { + if (nthread_to_process > 1) { const size_t size = (2*nbins_); const size_t block_size = 1024; size_t n_blocks = size/block_size; n_blocks += !!(size - n_blocks*block_size); size_t n_worked_bins = 0; - for(size_t i = 0; i < nthread_to_process; ++i) { + for (size_t i = 0; i < nthread_to_process; ++i) { if (thread_init_[i]) { thread_init_[n_worked_bins++] = i; } @@ -476,9 +477,9 @@ void GHistBuilder::BuildHist(const std::vector& gpair, const size_t bin = 2*thread_init_[0]*nbins_; memcpy(hist_data + istart, (data + bin + istart), sizeof(double)*(iend - istart)); - for(size_t i_bin_part = 1; i_bin_part < n_worked_bins; ++i_bin_part) { + for (size_t i_bin_part = 1; i_bin_part < n_worked_bins; ++i_bin_part) { const size_t bin = 2*thread_init_[i_bin_part]*nbins_; - for(size_t i = istart; i < iend; i++) { + for (size_t i = istart; i < iend; i++) { hist_data[i] += data[bin + i]; } } From c36127de063cfce95e42e64cfd83f896a802f23b Mon Sep 17 00:00:00 2001 From: Smirnov Date: Wed, 5 Dec 2018 10:07:10 +0300 Subject: [PATCH 06/31] fix for CI --- src/common/hist_util.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 2926b712f2df..cfb857f73248 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -16,9 +16,9 @@ #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #include - #define PREFETCH_READ_T0(addr) _mm_prefetch(addr, _MM_HINT_T0) + #define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast(addr), _MM_HINT_T0) #else - #define PREFETCH_READ_T0(addr) __builtin_prefetch(addr, 0, 3) + #define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast(addr), 0, 3) #endif namespace xgboost { @@ -422,7 +422,8 @@ void GHistBuilder::BuildHist(const std::vector& gpair, size_t n_blocks = nrows/block_size; n_blocks += !!(nrows - n_blocks*block_size); - const size_t nthread_to_process = std::min((size_t)nthread, (size_t)n_blocks); + const size_t nthread_to_process = std::min(reinterpret_cast(nthread), + reinterpret_cast(n_blocks)); memset(thread_init_.data(), '\0', nthread_to_process*sizeof(size_t)); #pragma omp parallel for num_threads(nthread_to_process) schedule(guided) @@ -468,7 +469,7 @@ void GHistBuilder::BuildHist(const std::vector& gpair, } } - const size_t nthreads_for_merge = std::min((size_t)nthread, (size_t)n_blocks); + const size_t nthreads_for_merge = std::min(reinterpret_cast(nthread), reinterpret_cast(n_blocks)); #pragma omp parallel for num_threads(nthreads_for_merge) schedule(guided) for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) { const size_t istart = iblock*block_size; From dd1294432c8388a171e424e4b9d7e8c7fc9d3000 Mon Sep 17 00:00:00 2001 From: Smirnov Date: Wed, 5 Dec 2018 10:57:58 +0300 Subject: [PATCH 07/31] fix for CI --- src/common/hist_util.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/common/hist_util.cc 
b/src/common/hist_util.cc index cfb857f73248..0b0ea7920abb 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -16,9 +16,9 @@ #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #include - #define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast(addr), _MM_HINT_T0) + #define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast(addr), _MM_HINT_T0) #else - #define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast(addr), 0, 3) + #define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast(addr), 0, 3) #endif namespace xgboost { @@ -422,8 +422,8 @@ void GHistBuilder::BuildHist(const std::vector& gpair, size_t n_blocks = nrows/block_size; n_blocks += !!(nrows - n_blocks*block_size); - const size_t nthread_to_process = std::min(reinterpret_cast(nthread), - reinterpret_cast(n_blocks)); + const size_t nthread_to_process = std::min(static_cast(nthread), + static_cast(n_blocks)); memset(thread_init_.data(), '\0', nthread_to_process*sizeof(size_t)); #pragma omp parallel for num_threads(nthread_to_process) schedule(guided) @@ -469,7 +469,8 @@ void GHistBuilder::BuildHist(const std::vector& gpair, } } - const size_t nthreads_for_merge = std::min(reinterpret_cast(nthread), reinterpret_cast(n_blocks)); + const size_t nthreads_for_merge = std::min(static_cast(nthread), + static_cast(n_blocks)); #pragma omp parallel for num_threads(nthreads_for_merge) schedule(guided) for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) { const size_t istart = iblock*block_size; From 416bf2f94dfe1cce922c356110cd17eb454dd421 Mon Sep 17 00:00:00 2001 From: Smirnov Date: Wed, 5 Dec 2018 11:13:33 +0300 Subject: [PATCH 08/31] fix for CI --- src/common/hist_util.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 0b0ea7920abb..80a64c6f0ffe 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -406,7 +406,7 @@ void GHistBuilder::BuildHist(const std::vector& gpair, const RowSetCollection::Elem row_indices, const GHistIndexMatrix& gmat, GHistRow hist) { - const auto nthread = static_cast(this->nthread_); + const size_t nthread = static_cast(this->nthread_); data_.resize(nbins_ * nthread_); const size_t* rid = row_indices.begin; @@ -422,8 +422,7 @@ void GHistBuilder::BuildHist(const std::vector& gpair, size_t n_blocks = nrows/block_size; n_blocks += !!(nrows - n_blocks*block_size); - const size_t nthread_to_process = std::min(static_cast(nthread), - static_cast(n_blocks)); + const size_t nthread_to_process = std::min(nthread, n_blocks); memset(thread_init_.data(), '\0', nthread_to_process*sizeof(size_t)); #pragma omp parallel for num_threads(nthread_to_process) schedule(guided) @@ -469,9 +468,7 @@ void GHistBuilder::BuildHist(const std::vector& gpair, } } - const size_t nthreads_for_merge = std::min(static_cast(nthread), - static_cast(n_blocks)); - #pragma omp parallel for num_threads(nthreads_for_merge) schedule(guided) + #pragma omp parallel for num_threads(std::min(nthread, n_blocks)) schedule(guided) for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) { const size_t istart = iblock*block_size; const size_t iend = (((iblock+1)*block_size > size) ? 
size : istart + block_size); From c862fa83928f929c89e25354a31055b710687453 Mon Sep 17 00:00:00 2001 From: Smirnov Date: Wed, 5 Dec 2018 11:24:39 +0300 Subject: [PATCH 09/31] fix for CI --- src/common/hist_util.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 80a64c6f0ffe..f442e3083db8 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -442,8 +442,8 @@ void GHistBuilder::BuildHist(const std::vector& gpair, const size_t icol_start = row_ptr[rid[i]]; const size_t icol_end = row_ptr[rid[i]+1]; - if (i < nrows-16) PREFETCH_READ_T0(row_ptr + rid[i+10]); - if (i < nrows-16) PREFETCH_READ_T0(pgh + 2*rid[i+10]); + if (i < nrows - 10 - (64/sizeof(row_ptr[0]))) PREFETCH_READ_T0(row_ptr + rid[i+10]); + if (i < nrows - 10 - (64/sizeof(pgh[0]))) PREFETCH_READ_T0(pgh + 2*rid[i+10]); for (size_t j = icol_start; j < icol_end; ++j) { const uint32_t idx_bin = 2*index[j]; From 1d595667722b1b95226a813b03def65282dba290 Mon Sep 17 00:00:00 2001 From: Smirnov Date: Wed, 5 Dec 2018 12:50:18 +0300 Subject: [PATCH 10/31] fix for CI --- src/common/hist_util.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index f442e3083db8..1670587c87e2 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -442,8 +442,8 @@ void GHistBuilder::BuildHist(const std::vector& gpair, const size_t icol_start = row_ptr[rid[i]]; const size_t icol_end = row_ptr[rid[i]+1]; - if (i < nrows - 10 - (64/sizeof(row_ptr[0]))) PREFETCH_READ_T0(row_ptr + rid[i+10]); - if (i < nrows - 10 - (64/sizeof(pgh[0]))) PREFETCH_READ_T0(pgh + 2*rid[i+10]); + if (i < nrows - 10 - (64/sizeof(*rid))) PREFETCH_READ_T0(row_ptr + rid[i+10]); + if (i < nrows - 10 - (64/sizeof(*rid))) PREFETCH_READ_T0(pgh + 2*rid[i+10]); for (size_t j = icol_start; j < icol_end; ++j) { const uint32_t idx_bin = 2*index[j]; From b7685df8d56dcbade14af0b1c9a68d200145234b Mon Sep 17 00:00:00 2001 From: Smirnov Date: Wed, 5 Dec 2018 13:59:08 +0300 Subject: [PATCH 11/31] fix for CI --- src/common/hist_util.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 1670587c87e2..2593555fc4b5 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -442,8 +442,10 @@ void GHistBuilder::BuildHist(const std::vector& gpair, const size_t icol_start = row_ptr[rid[i]]; const size_t icol_end = row_ptr[rid[i]+1]; - if (i < nrows - 10 - (64/sizeof(*rid))) PREFETCH_READ_T0(row_ptr + rid[i+10]); - if (i < nrows - 10 - (64/sizeof(*rid))) PREFETCH_READ_T0(pgh + 2*rid[i+10]); + constexpr size_t cache_line_size = 64; + constexpr size_t offset_to_prevent_segfault = 2*(cache_line_size/sizeof(*rid)); + if (i < nrows - 10 - offset_to_prevent_segfault) PREFETCH_READ_T0(row_ptr + rid[i+10]); + if (i < nrows - 10 - offset_to_prevent_segfault) PREFETCH_READ_T0(pgh + 2*rid[i+10]); for (size_t j = icol_start; j < icol_end; ++j) { const uint32_t idx_bin = 2*index[j]; From e29229be4fcc1cea018171301715c6d1268113c7 Mon Sep 17 00:00:00 2001 From: Smirnov Date: Wed, 5 Dec 2018 15:14:33 +0300 Subject: [PATCH 12/31] fix for CI --- src/common/hist_util.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 2593555fc4b5..a8c18f28fd5a 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -425,6 +425,11 @@ void GHistBuilder::BuildHist(const std::vector& gpair, const 
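+  // nthread and n_blocks are both size_t now, so std::min needs no casts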
size_t nthread_to_process = std::min(nthread, n_blocks); memset(thread_init_.data(), '\0', nthread_to_process*sizeof(size_t)); + constexpr size_t cache_line_size = 64; + constexpr size_t prefetch_offset = 10; + size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid); + no_prefetch_size = no_prefetch_size > nrows ? nrows : no_prefetch_size; + #pragma omp parallel for num_threads(nthread_to_process) schedule(guided) for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) { dmlc::omp_uint tid = omp_get_thread_num(); @@ -442,10 +447,10 @@ void GHistBuilder::BuildHist(const std::vector& gpair, const size_t icol_start = row_ptr[rid[i]]; const size_t icol_end = row_ptr[rid[i]+1]; - constexpr size_t cache_line_size = 64; - constexpr size_t offset_to_prevent_segfault = 2*(cache_line_size/sizeof(*rid)); - if (i < nrows - 10 - offset_to_prevent_segfault) PREFETCH_READ_T0(row_ptr + rid[i+10]); - if (i < nrows - 10 - offset_to_prevent_segfault) PREFETCH_READ_T0(pgh + 2*rid[i+10]); + if (i < nrows - no_prefetch_size) { + PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]); + PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]); + } for (size_t j = icol_start; j < icol_end; ++j) { const uint32_t idx_bin = 2*index[j]; From d59c3862cdf7defa294972973e041167322f6a2d Mon Sep 17 00:00:00 2001 From: Smirnov Date: Wed, 5 Dec 2018 15:53:54 +0300 Subject: [PATCH 13/31] fix for CI --- src/common/hist_util.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index a8c18f28fd5a..37001fbcc8e0 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -425,8 +425,8 @@ void GHistBuilder::BuildHist(const std::vector& gpair, const size_t nthread_to_process = std::min(nthread, n_blocks); memset(thread_init_.data(), '\0', nthread_to_process*sizeof(size_t)); - constexpr size_t cache_line_size = 64; - constexpr size_t prefetch_offset = 10; + const size_t cache_line_size = 64; + const size_t prefetch_offset = 10; size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid); no_prefetch_size = no_prefetch_size > nrows ? 
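+  // clamped so that (nrows - no_prefetch_size) cannot underflow for tiny inputs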
nrows : no_prefetch_size; From 4a0c9b303d49195321ab5e9c49e11e928f610596 Mon Sep 17 00:00:00 2001 From: Philip Cho Date: Thu, 3 Jan 2019 20:23:21 -0800 Subject: [PATCH 14/31] Check existence of _mm_prefetch and __builtin_prefetch --- .gitignore | 1 + CMakeLists.txt | 26 ++++++++++++++++++++++++++ cmake/build_config.h.in | 7 +++++++ include/xgboost/base.h | 4 ++++ include/xgboost/build_config.h | 20 ++++++++++++++++++++ src/common/hist_util.cc | 12 +++++++----- 6 files changed, 65 insertions(+), 5 deletions(-) create mode 100644 cmake/build_config.h.in create mode 100644 include/xgboost/build_config.h diff --git a/.gitignore b/.gitignore index bdfa3322a55a..5131e198399a 100644 --- a/.gitignore +++ b/.gitignore @@ -91,3 +91,4 @@ lib/ metastore_db plugin/updater_gpu/test/cpp/data +/include/xgboost/build_config.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b0d90797a44..061431de5684 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,6 +49,26 @@ if(WIN32 AND MINGW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++") endif() +# Check existence of software pre-fetching +include(CheckCXXSourceCompiles) +check_cxx_source_compiles(" +#include +int main() { + char data = 0; + const char* address = &data; + _mm_prefetch(address, _MM_HINT_NTA); + return 0; +} +" XGBOOST_MM_PREFETCH_PRESENT) +check_cxx_source_compiles(" +int main() { + char data = 0; + const char* address = &data; + __builtin_prefetch(address, 0, 0); + return 0; +} +" XGBOOST_BUILTIN_PREFETCH_PRESENT) + # Sanitizer if(USE_SANITIZER) include(cmake/Sanitizer.cmake) @@ -82,6 +102,12 @@ include_directories ( ${PROJECT_SOURCE_DIR}/rabit/include ) +# Generate configurable header +set(CMAKE_LOCAL "${PROJECT_SOURCE_DIR}/cmake") +set(INCLUDE_ROOT "${PROJECT_SOURCE_DIR}/include") +message(STATUS "${CMAKE_LOCAL}/build_config.h.in -> ${INCLUDE_ROOT}/xgboost/build_config.h") +configure_file("${CMAKE_LOCAL}/build_config.h.in" "${INCLUDE_ROOT}/xgboost/build_config.h") + file(GLOB_RECURSE SOURCES src/*.cc src/*.h diff --git a/cmake/build_config.h.in b/cmake/build_config.h.in new file mode 100644 index 000000000000..b49dde12e123 --- /dev/null +++ b/cmake/build_config.h.in @@ -0,0 +1,7 @@ +#ifndef XGBOOST_BUILD_CONFIG_H_ +#define XGBOOST_BUILD_CONFIG_H_ + +#cmakedefine XGBOOST_MM_PREFETCH_PRESENT +#cmakedefine XGBOOST_BUILTIN_PREFETCH_PRESENT + +#endif // XGBOOST_BUILD_CONFIG_H_ diff --git a/include/xgboost/base.h b/include/xgboost/base.h index 97b140294f22..55c2e4ac7144 100644 --- a/include/xgboost/base.h +++ b/include/xgboost/base.h @@ -218,4 +218,8 @@ using bst_omp_uint = dmlc::omp_uint; // NOLINT #endif #endif } // namespace xgboost + +/* Always keep this #include at the bottom of xgboost/base.h */ +#include + #endif // XGBOOST_BASE_H_ diff --git a/include/xgboost/build_config.h b/include/xgboost/build_config.h new file mode 100644 index 000000000000..1e36dc80889c --- /dev/null +++ b/include/xgboost/build_config.h @@ -0,0 +1,20 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file build_config.h + * \brief Fall-back logic for platform-specific feature detection. 
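+ *        CMake overwrites this header from cmake/build_config.h.in; builds
+ *        without CMake fall back to the compiler checks below.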
+ * \author Hyunsu Philip Cho + */ +#ifndef XGBOOST_BUILD_CONFIG_H_ +#define XGBOOST_BUILD_CONFIG_H_ + +/* default logic for software pre-fetching */ +#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))) || defined(__INTEL_COMPILER) + // Enable _mm_prefetch for Intel compiler and MSVC+x86 + #define XGBOOST_MM_PREFETCH_PRESENT + #define XGBOOST_BUILTIN_PREFETCH_PRESENT +#elif defined(__GNUC__) + // Enable __builtin_prefetch for GCC + #define XGBOOST_BUILTIN_PREFETCH_PRESENT +#endif + +#endif // XGBOOST_BUILD_CONFIG_H_ diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 37001fbcc8e0..d62cc42d2e24 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -14,11 +14,13 @@ #include "./hist_util.h" #include "./quantile.h" -#if defined(_MSC_VER) || defined(__INTEL_COMPILER) - #include - #define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast(addr), _MM_HINT_T0) -#else - #define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast(addr), 0, 3) +#if defined(XGBOOST_MM_PREFETCH_PRESENT) + #include + #define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast(addr), _MM_HINT_T0) +#elif defined(XGBOOST_BUILTIN_PREFETCH_PRESENT) + #define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast(addr), 0, 3) +#else // no SW pre-fetching available; PREFETCH_READ_T0 is no-op + #define PREFETCH_READ_T0(addr) do {} while(0) #endif namespace xgboost { From 6c37c3f787a3332601fd8b2a635f40e9ddeae8a7 Mon Sep 17 00:00:00 2001 From: Philip Cho Date: Thu, 3 Jan 2019 20:34:35 -0800 Subject: [PATCH 15/31] Fix lint --- src/common/hist_util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index d62cc42d2e24..a988d3baf9ff 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -20,7 +20,7 @@ #elif defined(XGBOOST_BUILTIN_PREFETCH_PRESENT) #define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast(addr), 0, 3) #else // no SW pre-fetching available; PREFETCH_READ_T0 is no-op - #define PREFETCH_READ_T0(addr) do {} while(0) + #define PREFETCH_READ_T0(addr) do {} while (0) #endif namespace xgboost { From a3df3368ffc821ad864521b9fdb87fbe94f2da5c Mon Sep 17 00:00:00 2001 From: egor Date: Sun, 2 Jun 2019 17:01:39 +0300 Subject: [PATCH 16/31] optimizations for CPU --- src/common/column_matrix.h | 11 +- src/common/hist_util.cc | 246 +++--- src/common/hist_util.h | 248 ++++-- src/common/row_set.h | 47 +- src/tree/param.h | 43 +- src/tree/split_evaluator.cc | 4 +- src/tree/updater_quantile_hist.cc | 1150 ++++++++++++++++---------- src/tree/updater_quantile_hist.h | 309 ++++--- tests/cpp/tree/test_quantile_hist.cc | 25 +- 9 files changed, 1275 insertions(+), 808 deletions(-) diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h index 510206f50cd7..e55e1ef5777d 100644 --- a/src/common/column_matrix.h +++ b/src/common/column_matrix.h @@ -8,11 +8,11 @@ #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_ #define XGBOOST_COMMON_COLUMN_MATRIX_H_ +#include #include #include #include "hist_util.h" - namespace xgboost { namespace common { @@ -51,6 +51,10 @@ class Column { } const size_t* GetRowData() const { return row_ind_; } + const uint32_t* GetIndex() const { + return index_; + } + private: ColumnType type_; const uint32_t* index_; @@ -80,7 +84,7 @@ class ColumnMatrix { std::fill(feature_counts_.begin(), feature_counts_.end(), 0); uint32_t max_val = std::numeric_limits::max(); - for (bst_uint fid = 0; fid < nfeature; ++fid) { + for (int32_t fid = 0; fid < nfeature; ++fid) { 
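      // each feature's bin id range must fit into uint32_t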
CHECK_LE(gmat.cut.row_ptr[fid + 1] - gmat.cut.row_ptr[fid], max_val); } @@ -113,13 +117,12 @@ class ColumnMatrix { boundary_[fid].index_end = accum_index_; boundary_[fid].row_ind_end = accum_row_ind_; } - index_.resize(boundary_[nfeature - 1].index_end); row_ind_.resize(boundary_[nfeature - 1].row_ind_end); // store least bin id for each feature index_base_.resize(nfeature); - for (bst_uint fid = 0; fid < nfeature; ++fid) { + for (int32_t fid = 0; fid < nfeature; ++fid) { index_base_[fid] = gmat.cut.row_ptr[fid]; } diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 78a5c950b631..1bbe3747c86a 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -2,14 +2,14 @@ * Copyright 2017-2019 by Contributors * \file hist_util.h */ +#include "./hist_util.h" +#include #include #include #include #include - #include "./random.h" #include "./column_matrix.h" -#include "./hist_util.h" #include "./quantile.h" #include "./../tree/updater_quantile_hist.h" @@ -178,7 +178,7 @@ uint32_t HistCutMatrix::GetBinIdx(const Entry& e) { void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) { cut.Init(p_fmat, max_num_bins); - const size_t nthread = omp_get_max_threads(); + const int32_t nthread = omp_get_max_threads(); const uint32_t nbins = cut.row_ptr.back(); hit_count.resize(nbins, 0); hit_count_tloc_.resize(nthread * nbins, 0); @@ -260,8 +260,8 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) { } #pragma omp parallel for num_threads(nthread) schedule(static) - for (bst_omp_uint idx = 0; idx < bst_omp_uint(nbins); ++idx) { - for (size_t tid = 0; tid < nthread; ++tid) { + for (int32_t idx = 0; idx < int32_t(nbins); ++idx) { + for (int32_t tid = 0; tid < nthread; ++tid) { hit_count[idx] += hit_count_tloc_[tid * nbins + idx]; } } @@ -411,7 +411,7 @@ FastFeatureGrouping(const GHistIndexMatrix& gmat, for (auto fid : group) { nnz += feature_nnz[fid]; } - double nnz_rate = static_cast(nnz) / nrow; + float nnz_rate = static_cast(nnz) / nrow; // take apart small sparse group, due it will not gain on speed if (nnz_rate <= param.sparse_threshold) { for (auto fid : group) { @@ -496,177 +496,133 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat, } } -void GHistBuilder::BuildHist(const std::vector& gpair, - const RowSetCollection::Elem row_indices, - const GHistIndexMatrix& gmat, - GHistRow hist) { - const size_t nthread = static_cast(this->nthread_); - data_.resize(nbins_ * nthread_); - - const size_t* rid = row_indices.begin; - const size_t nrows = row_indices.Size(); - const uint32_t* index = gmat.index.data(); - const size_t* row_ptr = gmat.row_ptr.data(); - const float* pgh = reinterpret_cast(gpair.data()); - - double* hist_data = reinterpret_cast(hist.data()); - double* data = reinterpret_cast(data_.data()); - - const size_t block_size = 512; - size_t n_blocks = nrows/block_size; - n_blocks += !!(nrows - n_blocks*block_size); - - const size_t nthread_to_process = std::min(nthread, n_blocks); - memset(thread_init_.data(), '\0', nthread_to_process*sizeof(size_t)); +void BuildHistLocalDense(size_t istart, size_t iend, size_t nrows, const size_t* rid, + const uint32_t* index, const GradientPair::ValueT* pgh, const size_t* row_ptr, + GradStatHist::GradType* data_local_hist, GradStatHist* grad_stat_global) { + GradStatHist grad_stat; // make local var to prevent false sharing + const size_t n_features = row_ptr[rid[istart]+1] - row_ptr[rid[istart]]; const size_t cache_line_size = 64; + const size_t prefetch_step = cache_line_size / sizeof(*index); const 
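  // heuristic lookahead: prefetch data for the row ten iterations ahead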
size_t prefetch_offset = 10; + size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid); no_prefetch_size = no_prefetch_size > nrows ? nrows : no_prefetch_size; -#pragma omp parallel for num_threads(nthread_to_process) schedule(guided) - for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) { - dmlc::omp_uint tid = omp_get_thread_num(); - double* data_local_hist = ((nthread_to_process == 1) ? hist_data : - reinterpret_cast(data_.data() + tid * nbins_)); - - if (!thread_init_[tid]) { - memset(data_local_hist, '\0', 2*nbins_*sizeof(double)); - thread_init_[tid] = true; - } - - const size_t istart = iblock*block_size; - const size_t iend = (((iblock+1)*block_size > nrows) ? nrows : istart + block_size); + if (iend < nrows - no_prefetch_size) { for (size_t i = istart; i < iend; ++i) { - const size_t icol_start = row_ptr[rid[i]]; - const size_t icol_end = row_ptr[rid[i]+1]; + const size_t icol_start = rid[i] * n_features; + const size_t icol_start_prefetch = rid[i+prefetch_offset] * n_features; + const size_t idx_gh = 2*rid[i]; + + PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]); - if (i < nrows - no_prefetch_size) { - PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]); - PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]); + for (size_t j = icol_start_prefetch; j < icol_start_prefetch + n_features; + j += prefetch_step) { + PREFETCH_READ_T0(index + j); } - for (size_t j = icol_start; j < icol_end; ++j) { - const uint32_t idx_bin = 2*index[j]; - const size_t idx_gh = 2*rid[i]; + grad_stat.sum_grad += pgh[idx_gh]; + grad_stat.sum_hess += pgh[idx_gh+1]; + for (size_t j = icol_start; j < icol_start + n_features; ++j) { + const uint32_t idx_bin = 2*index[j]; data_local_hist[idx_bin] += pgh[idx_gh]; data_local_hist[idx_bin+1] += pgh[idx_gh+1]; } } + } else { + for (size_t i = istart; i < iend; ++i) { + const size_t icol_start = rid[i] * n_features; + const size_t idx_gh = 2*rid[i]; + grad_stat.sum_grad += pgh[idx_gh]; + grad_stat.sum_hess += pgh[idx_gh+1]; + + for (size_t j = icol_start; j < icol_start + n_features; ++j) { + const uint32_t idx_bin = 2*index[j]; + data_local_hist[idx_bin] += pgh[idx_gh]; + data_local_hist[idx_bin+1] += pgh[idx_gh+1]; + } + } } + grad_stat_global->Add(grad_stat); +} - if (nthread_to_process > 1) { - const size_t size = (2*nbins_); - const size_t block_size = 1024; - size_t n_blocks = size/block_size; - n_blocks += !!(size - n_blocks*block_size); +void BuildHistLocalSparse(size_t istart, size_t iend, size_t nrows, const size_t* rid, + const uint32_t* index, const GradientPair::ValueT* pgh, const size_t* row_ptr, + GradStatHist::GradType* data_local_hist, GradStatHist* grad_stat_global) { + GradStatHist grad_stat; // make local var to prevent false sharing - size_t n_worked_bins = 0; - for (size_t i = 0; i < nthread_to_process; ++i) { - if (thread_init_[i]) { - thread_init_[n_worked_bins++] = i; - } - } + const size_t cache_line_size = 64; + const size_t prefetch_step = cache_line_size / sizeof(index[0]); + const size_t prefetch_offset = 10; -#pragma omp parallel for num_threads(std::min(nthread, n_blocks)) schedule(guided) - for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) { - const size_t istart = iblock * block_size; - const size_t iend = (((iblock + 1) * block_size > size) ? size : istart + block_size); + size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid); + no_prefetch_size = no_prefetch_size > nrows ? 
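  // same clamp as in the dense kernel, guarding against size_t underflow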
nrows : no_prefetch_size; - const size_t bin = 2 * thread_init_[0] * nbins_; - memcpy(hist_data + istart, (data + bin + istart), sizeof(double) * (iend - istart)); + if (iend < nrows - no_prefetch_size) { + for (size_t i = istart; i < iend; ++i) { + const size_t icol_start = row_ptr[rid[i]]; + const size_t icol_end = row_ptr[rid[i]+1]; + const size_t idx_gh = 2*rid[i]; - for (size_t i_bin_part = 1; i_bin_part < n_worked_bins; ++i_bin_part) { - const size_t bin = 2 * thread_init_[i_bin_part] * nbins_; - for (size_t i = istart; i < iend; i++) { - hist_data[i] += data[bin + i]; - } - } - } - } -} + const size_t icol_start10 = row_ptr[rid[i+prefetch_offset]]; + const size_t icol_end10 = row_ptr[rid[i+prefetch_offset]+1]; + + PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]); -void GHistBuilder::BuildBlockHist(const std::vector& gpair, - const RowSetCollection::Elem row_indices, - const GHistIndexBlockMatrix& gmatb, - GHistRow hist) { - constexpr int kUnroll = 8; // loop unrolling factor - const size_t nblock = gmatb.GetNumBlock(); - const size_t nrows = row_indices.end - row_indices.begin; - const size_t rest = nrows % kUnroll; - -#if defined(_OPENMP) - const auto nthread = static_cast(this->nthread_); // NOLINT -#endif // defined(_OPENMP) - tree::GradStats* p_hist = hist.data(); - -#pragma omp parallel for num_threads(nthread) schedule(guided) - for (bst_omp_uint bid = 0; bid < nblock; ++bid) { - auto gmat = gmatb[bid]; - - for (size_t i = 0; i < nrows - rest; i += kUnroll) { - size_t rid[kUnroll]; - size_t ibegin[kUnroll]; - size_t iend[kUnroll]; - GradientPair stat[kUnroll]; - - for (int k = 0; k < kUnroll; ++k) { - rid[k] = row_indices.begin[i + k]; - ibegin[k] = gmat.row_ptr[rid[k]]; - iend[k] = gmat.row_ptr[rid[k] + 1]; - stat[k] = gpair[rid[k]]; + for (size_t j = icol_start10; j < icol_end10; j+=prefetch_step) { + PREFETCH_READ_T0(index + j); } - for (int k = 0; k < kUnroll; ++k) { - for (size_t j = ibegin[k]; j < iend[k]; ++j) { - const uint32_t bin = gmat.index[j]; - p_hist[bin].Add(stat[k]); - } + + grad_stat.sum_grad += pgh[idx_gh]; + grad_stat.sum_hess += pgh[idx_gh+1]; + + for (size_t j = icol_start; j < icol_end; ++j) { + const uint32_t idx_bin = 2*index[j]; + data_local_hist[idx_bin] += pgh[idx_gh]; + data_local_hist[idx_bin+1] += pgh[idx_gh+1]; } } - for (size_t i = nrows - rest; i < nrows; ++i) { - const size_t rid = row_indices.begin[i]; - const size_t ibegin = gmat.row_ptr[rid]; - const size_t iend = gmat.row_ptr[rid + 1]; - const GradientPair stat = gpair[rid]; - for (size_t j = ibegin; j < iend; ++j) { - const uint32_t bin = gmat.index[j]; - p_hist[bin].Add(stat); + } else { + for (size_t i = istart; i < iend; ++i) { + const size_t icol_start = row_ptr[rid[i]]; + const size_t icol_end = row_ptr[rid[i]+1]; + const size_t idx_gh = 2*rid[i]; + + grad_stat.sum_grad += pgh[idx_gh]; + grad_stat.sum_hess += pgh[idx_gh+1]; + + for (size_t j = icol_start; j < icol_end; ++j) { + const uint32_t idx_bin = 2*index[j]; + data_local_hist[idx_bin] += pgh[idx_gh]; + data_local_hist[idx_bin+1] += pgh[idx_gh+1]; } } } + grad_stat_global->Add(grad_stat); } -void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) { - const uint32_t nbins = static_cast(nbins_); - constexpr int kUnroll = 8; // loop unrolling factor - const uint32_t rest = nbins % kUnroll; - -#if defined(_OPENMP) - const auto nthread = static_cast(this->nthread_); // NOLINT -#endif // defined(_OPENMP) - tree::GradStats* p_self = self.data(); - tree::GradStats* p_sibling = sibling.data(); - 
tree::GradStats* p_parent = parent.data(); - -#pragma omp parallel for num_threads(nthread) schedule(static) - for (bst_omp_uint bin_id = 0; - bin_id < static_cast(nbins - rest); bin_id += kUnroll) { - tree::GradStats pb[kUnroll]; - tree::GradStats sb[kUnroll]; - for (int k = 0; k < kUnroll; ++k) { - pb[k] = p_parent[bin_id + k]; - } - for (int k = 0; k < kUnroll; ++k) { - sb[k] = p_sibling[bin_id + k]; - } - for (int k = 0; k < kUnroll; ++k) { - p_self[bin_id + k].SetSubstract(pb[k], sb[k]); +void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) { + GradStatHist* p_self = self.data(); + GradStatHist* p_sibling = sibling.data(); + GradStatHist* p_parent = parent.data(); + + const size_t size = self.size(); + CHECK_EQ(sibling.size(), size); + CHECK_EQ(parent.size(), size); + + const size_t block_size = 1024; // aproximatly 1024 values per block + size_t n_blocks = size/block_size + !!(size%block_size); + + #pragma omp parallel for + for (int iblock = 0; iblock < n_blocks; ++iblock) { + const size_t ibegin = iblock*block_size; + const size_t iend = (((iblock+1)*block_size > size) ? size : ibegin + block_size); + for (bst_omp_uint bin_id = ibegin; bin_id < iend; bin_id++) { + p_self[bin_id].SetSubstract(p_parent[bin_id], p_sibling[bin_id]); } } - for (uint32_t bin_id = nbins - rest; bin_id < nbins; ++bin_id) { - p_self[bin_id].SetSubstract(p_parent[bin_id], p_sibling[bin_id]); - } } } // namespace common diff --git a/src/common/hist_util.h b/src/common/hist_util.h index dc2b80bb87a5..25efe7bbb2ce 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -11,13 +11,54 @@ #include #include #include +#include +#include #include "row_set.h" #include "../tree/param.h" #include "./quantile.h" #include "./timer.h" #include "../include/rabit/rabit.h" +#include "random.h" namespace xgboost { + +/*! + * \brief A C-style array with in-stack allocation. As long as the array is smaller than MaxStackSize, it will be allocated inside the stack. Otherwise, it will be heap-allocated. + */ +template +class MemStackAllocator { + public: + explicit MemStackAllocator(size_t required_size): required_size_(required_size) { + } + + T* Get() { + if (!ptr_) { + if (MaxStackSize >= required_size_) { + ptr_ = stack_mem_; + } else { + ptr_ = reinterpret_cast(malloc(required_size_ * sizeof(T))); + do_free_ = true; + } + } + + return ptr_; + } + + ~MemStackAllocator() { + if (do_free_) free(ptr_); + } + + private: + T* ptr_ = nullptr; + bool do_free_ = false; + size_t required_size_; + T stack_mem_[MaxStackSize]; +}; + +namespace tree { +class SplitEvaluator; +} + namespace common { /* @@ -134,9 +175,10 @@ using GHistIndexRow = Span; */ struct GHistIndexMatrix { /*! \brief row pointer to rows by element position */ - std::vector row_ptr; + // std::vector row_ptr; + SimpleArray row_ptr; /*! \brief The index data */ - std::vector index; + SimpleArray index; /*! \brief hit count of each index */ std::vector hit_count; /*! \brief The corresponding cuts */ @@ -170,6 +212,11 @@ struct GHistIndexBlock { inline GHistIndexBlock(const size_t* row_ptr, const uint32_t* index) : row_ptr(row_ptr), index(index) {} + + // get i-th row + inline GHistIndexRow operator[](size_t i) const { + return {&index[0] + row_ptr[i], detail::ptrdiff_t(row_ptr[i + 1] - row_ptr[i])}; + } }; class ColumnMatrix; @@ -202,12 +249,63 @@ class GHistIndexBlockMatrix { }; /*! - * \brief histogram of graident statistics for a single node. 
- * Consists of multiple GradStats, each entry showing total graident statistics - * for that particular bin - * Uses global bin id so as to represent all features simultaneously + * \brief used instead of GradStats to have float instead of double to reduce histograms + * this improves performance by 10-30% and memory consumption for histograms by 2x + * accuracy in both cases is the same */ -using GHistRow = Span; +struct GradStatHist { + typedef float GradType; + /*! \brief sum gradient statistics */ + GradType sum_grad; + /*! \brief sum hessian statistics */ + GradType sum_hess; + + GradStatHist() : sum_grad{0}, sum_hess{0} { + static_assert(sizeof(GradStatHist) == 8, + "Size of GradStatHist is not 8 bytes."); + } + + inline void Add(const GradStatHist& b) { + sum_grad += b.sum_grad; + sum_hess += b.sum_hess; + } + + inline void Add(const tree::GradStats& b) { + sum_grad += b.sum_grad; + sum_hess += b.sum_hess; + } + + inline void Add(GradientPair p) { + this->Add(p.GetGrad(), p.GetHess()); + } + + inline void Add(GradType grad, GradType hess) { + sum_grad += grad; + sum_hess += hess; + } + + inline tree::GradStats ToGradStat() const { + return tree::GradStats(sum_grad, sum_hess); + } + + inline void SetSubstract(const GradStatHist& a, const GradStatHist& b) { + sum_grad = a.sum_grad - b.sum_grad; + sum_hess = a.sum_hess - b.sum_hess; + } + + inline void SetSubstract(const tree::GradStats& a, const GradStatHist& b) { + sum_grad = a.sum_grad - b.sum_grad; + sum_hess = a.sum_hess - b.sum_hess; + } + + inline GradType GetGrad() const { return sum_grad; } + inline GradType GetHess() const { return sum_hess; } + inline static void Reduce(GradStatHist& a, const GradStatHist& b) { // NOLINT(*) + a.Add(b); + } +}; + +using GHistRow = Span; /*! * \brief histogram of gradient statistics for multiple nodes @@ -215,49 +313,57 @@ using GHistRow = Span; class HistCollection { public: // access histogram for i-th node - GHistRow operator[](bst_uint nid) const { - constexpr uint32_t kMax = std::numeric_limits::max(); - CHECK_NE(row_ptr_[nid], kMax); - tree::GradStats* ptr = - const_cast(dmlc::BeginPtr(data_) + row_ptr_[nid]); - return {ptr, nbins_}; + inline GHistRow operator[](bst_uint nid) { + if (nid >= data_arr_.size()) { + AddHistRow(nid); + } + return { const_cast(dmlc::BeginPtr(*data_arr_[nid])), nbins_}; } // have we computed a histogram for i-th node? 
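  // (rows are allocated lazily in operator[], so a row exists iff nid < data_arr_.size())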
- bool RowExists(bst_uint nid) const { - const uint32_t k_max = std::numeric_limits::max(); - return (nid < row_ptr_.size() && row_ptr_[nid] != k_max); + inline bool RowExists(bst_uint nid) const { + return nid < data_arr_.size(); } // initialize histogram collection - void Init(uint32_t nbins) { - nbins_ = nbins; - row_ptr_.clear(); - data_.clear(); + inline void Init(uint32_t nbins) { + if (nbins_ != nbins) { + for (size_t i = 0; i < data_arr_.size(); ++i) { + delete data_arr_[i]; + } + data_arr_.clear(); + nbins_ = nbins; + } + } + + ~HistCollection() { + for (size_t i = 0; i < data_arr_.size(); ++i) { + delete data_arr_[i]; + } } // create an empty histogram for i-th node - void AddHistRow(bst_uint nid) { - constexpr uint32_t kMax = std::numeric_limits::max(); - if (nid >= row_ptr_.size()) { - row_ptr_.resize(nid + 1, kMax); + inline void AddHistRow(bst_uint nid) { + if (data_arr_.size() <= nid) { + data_arr_.resize(nid + 1, nullptr); } - CHECK_EQ(row_ptr_[nid], kMax); - row_ptr_[nid] = data_.size(); - data_.resize(data_.size() + nbins_); + if (data_arr_[nid] == nullptr) { + data_arr_[nid] = new std::vector; + } + + if (data_arr_[nid]->size() == 0) { + data_arr_[nid]->resize(nbins_); + } } private: /*! \brief number of all bins over all features */ - uint32_t nbins_; - - std::vector data_; - - /*! \brief row_ptr_[nid] locates bin for historgram of node nid */ - std::vector row_ptr_; + uint32_t nbins_ = 0; + std::vector*> data_arr_; }; + /*! * \brief builder for histograms of gradient statistics */ @@ -267,21 +373,55 @@ class GHistBuilder { inline void Init(size_t nthread, uint32_t nbins) { nthread_ = nthread; nbins_ = nbins; - thread_init_.resize(nthread_); - } - - // construct a histogram via histogram aggregation - void BuildHist(const std::vector& gpair, - const RowSetCollection::Elem row_indices, - const GHistIndexMatrix& gmat, - GHistRow hist); - // same, with feature grouping - void BuildBlockHist(const std::vector& gpair, - const RowSetCollection::Elem row_indices, - const GHistIndexBlockMatrix& gmatb, - GHistRow hist); - // construct a histogram via subtraction trick - void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent); + } + +void BuildBlockHist(const std::vector& gpair, + const RowSetCollection::Elem row_indices, + const GHistIndexBlockMatrix& gmatb, + GHistRow hist) { + constexpr int kUnroll = 8; // loop unrolling factor + const int32_t nblock = gmatb.GetNumBlock(); + const size_t nrows = row_indices.end - row_indices.begin; + const size_t rest = nrows % kUnroll; + + #pragma omp parallel for + for (int32_t bid = 0; bid < nblock; ++bid) { + auto gmat = gmatb[bid]; + + for (size_t i = 0; i < nrows - rest; i += kUnroll) { + size_t rid[kUnroll]; + size_t ibegin[kUnroll]; + size_t iend[kUnroll]; + GradientPair stat[kUnroll]; + for (int k = 0; k < kUnroll; ++k) { + rid[k] = row_indices.begin[i + k]; + } + for (int k = 0; k < kUnroll; ++k) { + ibegin[k] = gmat.row_ptr[rid[k]]; + iend[k] = gmat.row_ptr[rid[k] + 1]; + } + for (int k = 0; k < kUnroll; ++k) { + stat[k] = gpair[rid[k]]; + } + for (int k = 0; k < kUnroll; ++k) { + for (size_t j = ibegin[k]; j < iend[k]; ++j) { + const uint32_t bin = gmat.index[j]; + hist[bin].Add(stat[k]); + } + } + } + for (size_t i = nrows - rest; i < nrows; ++i) { + const size_t rid = row_indices.begin[i]; + const size_t ibegin = gmat.row_ptr[rid]; + const size_t iend = gmat.row_ptr[rid + 1]; + const GradientPair stat = gpair[rid]; + for (size_t j = ibegin; j < iend; ++j) { + const uint32_t bin = gmat.index[j]; + 
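          // scatter-add the row's gradient pair into its global bin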
hist[bin].Add(stat); + } + } + } +} uint32_t GetNumBins() { return nbins_; @@ -292,11 +432,19 @@ class GHistBuilder { size_t nthread_; /*! \brief number of all bins over all features */ uint32_t nbins_; - std::vector thread_init_; - std::vector data_; }; +void BuildHistLocalDense(size_t istart, size_t iend, size_t nrows, const size_t* rid, + const uint32_t* index, const GradientPair::ValueT* pgh, const size_t* row_ptr, + GradStatHist::GradType* data_local_hist, GradStatHist* grad_stat); + +void BuildHistLocalSparse(size_t istart, size_t iend, size_t nrows, const size_t* rid, + const uint32_t* index, const GradientPair::ValueT* pgh, const size_t* row_ptr, + GradStatHist::GradType* data_local_hist, GradStatHist* grad_stat); + +void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent); + } // namespace common } // namespace xgboost #endif // XGBOOST_COMMON_HIST_UTIL_H_ diff --git a/src/common/row_set.h b/src/common/row_set.h index 285988b159c3..39ae404f8779 100644 --- a/src/common/row_set.h +++ b/src/common/row_set.h @@ -27,10 +27,10 @@ class RowSetCollection { // id of node associated with this instance set; -1 means uninitialized Elem() = default; - Elem(const size_t* begin, - const size_t* end, - int node_id) - : begin(begin), end(end), node_id(node_id) {} + Elem(const size_t* begin_, + const size_t* end_, + int node_id_) + : begin(begin_), end(end_), node_id(node_id_) {} inline size_t Size() const { return end - begin; @@ -42,6 +42,10 @@ class RowSetCollection { std::vector right; }; + size_t Size(unsigned node_id) { + return elem_of_each_node_[node_id].Size(); + } + inline std::vector::const_iterator begin() const { // NOLINT return elem_of_each_node_.begin(); } @@ -51,12 +55,12 @@ class RowSetCollection { } /*! \brief return corresponding element set given the node_id */ - inline const Elem& operator[](unsigned node_id) const { - const Elem& e = elem_of_each_node_[node_id]; - CHECK(e.begin != nullptr) - << "access element that is not in the set"; + inline Elem operator[](unsigned node_id) const { + const Elem e = elem_of_each_node_[node_id]; return e; } + + // clear up things inline void Clear() { elem_of_each_node_.clear(); @@ -81,38 +85,29 @@ class RowSetCollection { const size_t* end = dmlc::BeginPtr(row_indices_) + row_indices_.size(); elem_of_each_node_.emplace_back(Elem(begin, end, 0)); } + // split rowset into two inline void AddSplit(unsigned node_id, - const std::vector& row_split_tloc, + size_t iLeft, unsigned left_node_id, unsigned right_node_id) { - const Elem e = elem_of_each_node_[node_id]; - const auto nthread = static_cast(row_split_tloc.size()); + Elem e = elem_of_each_node_[node_id]; + CHECK(e.begin != nullptr); - size_t* all_begin = dmlc::BeginPtr(row_indices_); - size_t* begin = all_begin + (e.begin - all_begin); - size_t* it = begin; - for (bst_omp_uint tid = 0; tid < nthread; ++tid) { - std::copy(row_split_tloc[tid].left.begin(), row_split_tloc[tid].left.end(), it); - it += row_split_tloc[tid].left.size(); - } - size_t* split_pt = it; - for (bst_omp_uint tid = 0; tid < nthread; ++tid) { - std::copy(row_split_tloc[tid].right.begin(), row_split_tloc[tid].right.end(), it); - it += row_split_tloc[tid].right.size(); - } + size_t* begin = const_cast(e.begin); + size_t* split_pt = begin + iLeft; if (left_node_id >= elem_of_each_node_.size()) { - elem_of_each_node_.resize(left_node_id + 1, Elem(nullptr, nullptr, -1)); + elem_of_each_node_.resize((left_node_id + 1)*2, Elem(nullptr, nullptr, -1)); } if (right_node_id >= elem_of_each_node_.size()) { - 
elem_of_each_node_.resize(right_node_id + 1, Elem(nullptr, nullptr, -1)); + elem_of_each_node_.resize((right_node_id + 1)*2, Elem(nullptr, nullptr, -1)); } elem_of_each_node_[left_node_id] = Elem(begin, split_pt, left_node_id); elem_of_each_node_[right_node_id] = Elem(split_pt, e.end, right_node_id); - elem_of_each_node_[node_id] = Elem(nullptr, nullptr, -1); + elem_of_each_node_[node_id] = Elem(begin, e.end, -1); } // stores the row indices in the set diff --git a/src/tree/param.h b/src/tree/param.h index d0d49a403450..cca823d0036e 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -279,7 +279,7 @@ XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess } } else { T w = CalcWeight(p, sum_grad, sum_hess); - T ret = CalcGainGivenWeight(p, sum_grad, sum_hess, w); + T ret = CalcGainGivenWeight(p, sum_grad, sum_hess, w); if (p.reg_alpha == 0.0f) { return ret; } else { @@ -299,7 +299,7 @@ template XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess, T test_grad, T test_hess) { T w = CalcWeight(sum_grad, sum_hess); - T ret = CalcGainGivenWeight(p, test_grad, test_hess); + T ret = CalcGainGivenWeight(p, test_grad, test_hess); if (p.reg_alpha == 0.0f) { return ret; } else { @@ -338,15 +338,16 @@ XGBOOST_DEVICE inline float CalcWeight(const TrainingParams &p, GpairT sum_grad) } /*! \brief core statistics used for tree construction */ -struct XGBOOST_ALIGNAS(16) GradStats { +struct GradStats { + typedef double GradType; /*! \brief sum gradient statistics */ - double sum_grad; + GradType sum_grad; /*! \brief sum hessian statistics */ - double sum_hess; + GradType sum_hess; public: - XGBOOST_DEVICE double GetGrad() const { return sum_grad; } - XGBOOST_DEVICE double GetHess() const { return sum_hess; } + XGBOOST_DEVICE GradType GetGrad() const { return sum_grad; } + XGBOOST_DEVICE GradType GetHess() const { return sum_hess; } XGBOOST_DEVICE GradStats() : sum_grad{0}, sum_hess{0} { static_assert(sizeof(GradStats) == 16, @@ -356,7 +357,7 @@ struct XGBOOST_ALIGNAS(16) GradStats { template XGBOOST_DEVICE explicit GradStats(const GpairT &sum) : sum_grad(sum.GetGrad()), sum_hess(sum.GetHess()) {} - explicit GradStats(const double grad, const double hess) + explicit GradStats(const GradType grad, const GradType hess) : sum_grad(grad), sum_hess(hess) {} /*! * \brief accumulate statistics @@ -381,7 +382,7 @@ struct XGBOOST_ALIGNAS(16) GradStats { /*! \return whether the statistics is not used yet */ inline bool Empty() const { return sum_hess == 0.0; } /*! 
\brief add statistics to the data */ - inline void Add(double grad, double hess) { + inline void Add(GradType grad, GradType hess) { sum_grad += grad; sum_hess += hess; } @@ -411,7 +412,7 @@ struct ValueConstraint { template XGBOOST_DEVICE inline double CalcGain(const ParamT ¶m, GradStats stats) const { - return CalcGainGivenWeight(param, stats.sum_grad, stats.sum_hess, + return CalcGainGivenWeight(param, stats.sum_grad, stats.sum_hess, CalcWeight(param, stats)); } @@ -422,8 +423,8 @@ struct ValueConstraint { double wleft = CalcWeight(param, left); double wright = CalcWeight(param, right); double gain = - CalcGainGivenWeight(param, left.sum_grad, left.sum_hess, wleft) + - CalcGainGivenWeight(param, right.sum_grad, right.sum_hess, wright); + CalcGainGivenWeight(param, left.sum_grad, left.sum_hess, wleft) + + CalcGainGivenWeight(param, right.sum_grad, right.sum_hess, wright); if (constraint == 0) { return gain; } else if (constraint > 0) { @@ -468,6 +469,7 @@ struct SplitEntry { bst_float split_value{0.0f}; GradStats left_sum; GradStats right_sum; + bool default_left{true}; /*! \brief constructor */ SplitEntry() = default; @@ -482,7 +484,11 @@ struct SplitEntry { * \param split_index the feature index where the split is on */ inline bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const { - if (this->SplitIndex() <= split_index) { + if (!std::isfinite(new_loss_chg)) { // in some cases new_loss_chg can be NaN or Inf, + // for example when lambda = 0 & min_child_weight = 0 + // skip value in this case + return false; + } else if (this->SplitIndex() <= split_index) { return new_loss_chg > this->loss_chg; } else { return !(this->loss_chg > new_loss_chg); @@ -500,6 +506,7 @@ struct SplitEntry { this->split_value = e.split_value; this->left_sum = e.left_sum; this->right_sum = e.right_sum; + this->default_left = e.default_left; return true; } else { return false; @@ -514,13 +521,11 @@ struct SplitEntry { * \return whether the proposed split is better and can replace current split */ inline bool Update(bst_float new_loss_chg, unsigned split_index, - bst_float new_split_value, bool default_left, + bst_float new_split_value, bool new_default_left, const GradStats &left_sum, const GradStats &right_sum) { if (this->NeedReplace(new_loss_chg, split_index)) { this->loss_chg = new_loss_chg; - if (default_left) { - split_index |= (1U << 31); - } + this->default_left = new_default_left; this->sindex = split_index; this->split_value = new_split_value; this->left_sum = left_sum; @@ -536,9 +541,9 @@ struct SplitEntry { dst.Update(src); } /*!\return feature index to split on */ - inline unsigned SplitIndex() const { return sindex & ((1U << 31) - 1U); } + inline unsigned SplitIndex() const { return sindex; } /*!\return whether missing value goes to left branch */ - inline bool DefaultLeft() const { return (sindex >> 31) != 0; } + inline bool DefaultLeft() const { return default_left; } }; } // namespace tree diff --git a/src/tree/split_evaluator.cc b/src/tree/split_evaluator.cc index 55d2b99ffae0..716e6d81d834 100644 --- a/src/tree/split_evaluator.cc +++ b/src/tree/split_evaluator.cc @@ -282,7 +282,9 @@ class MonotonicConstraint final : public SplitEvaluator { bst_float leftweight, bst_float rightweight) override { inner_->AddSplit(nodeid, leftid, rightid, featureid, leftweight, rightweight); - bst_uint newsize = std::max(leftid, rightid) + 1; + + bst_uint newsize = std::max(bst_uint(lower_.size()), bst_uint(std::max(leftid, rightid) + 1u)); + lower_.resize(newsize); upper_.resize(newsize); bst_int 
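One note on the SplitEntry change above: the default direction for missing values used to be packed into the top bit of sindex and masked off on every access. A sketch of the encoding being removed, reconstructed from the deleted lines:

    #include <cstdint>
    // old scheme: bit 31 of sindex carried the missing-value direction
    inline uint32_t PackSindex(uint32_t split_index, bool default_left) {
      return split_index | (default_left ? (1U << 31) : 0U);
    }
    inline uint32_t SplitIndexOf(uint32_t sindex) { return sindex & ((1U << 31) - 1U); }
    inline bool DefaultLeftOf(uint32_t sindex) { return (sindex >> 31) != 0; }

The explicit default_left bool costs a little space per SplitEntry but removes the masking, and the new std::isfinite guard in NeedReplace rejects NaN/Inf loss_chg values (possible when lambda = 0 and min_child_weight = 0) before any comparison is attempted.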
constraint = GetConstraint(featureid); diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 52633c099384..7f6ac26b2b1e 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -1,8 +1,8 @@ /*! - * Copyright 2017-2018 by Contributors + * Copyright 2017-2019 by Contributors * \file updater_quantile_hist.cc * \brief use quantized feature values to construct a tree - * \author Philip Cho, Tianqi Checn + * \author Philip Cho, Tianqi Chen, Egor Smirnov */ #include #include @@ -88,94 +89,180 @@ bool QuantileHistMaker::UpdatePredictionCache( } } -void QuantileHistMaker::Builder::SyncHistograms( - int starting_index, - int sync_count, - RegTree *p_tree) { - builder_monitor_.Start("SyncHistograms"); - this->histred_.Allreduce(hist_[starting_index].data(), hist_builder_.GetNumBins() * sync_count); - // use Subtraction Trick - for (auto const& node_pair : nodes_for_subtraction_trick_) { - hist_.AddHistRow(node_pair.first); - SubtractionTrick(hist_[node_pair.first], hist_[node_pair.second], - hist_[(*p_tree)[node_pair.first].Parent()]); - } - builder_monitor_.Stop("SyncHistograms"); +void QuantileHistMaker::Builder::BuildNodeStat( + const GHistIndexMatrix &gmat, + DMatrix *p_fmat, + RegTree *p_tree, + const std::vector &gpair_h, + int32_t nid) { + + // add constraints: nid is a right child here, so register the parent's + // split against its left sibling + if (!(*p_tree)[nid].IsLeftChild() && !(*p_tree)[nid].IsRoot()) { + auto parent_id = (*p_tree)[nid].Parent(); + auto left_sibling_id = (*p_tree)[parent_id].LeftChild(); + auto parent_split_feature_id = snode_[parent_id].best.SplitIndex(); + + spliteval_->AddSplit(parent_id, left_sibling_id, nid, parent_split_feature_id, + snode_[left_sibling_id].weight, snode_[nid].weight); + } } -void QuantileHistMaker::Builder::BuildLocalHistograms( - int *starting_index, - int *sync_count, +void QuantileHistMaker::Builder::BuildNodeStatBatch( const GHistIndexMatrix &gmat, - const GHistIndexBlockMatrix &gmatb, + DMatrix *p_fmat, RegTree *p_tree, - const std::vector &gpair_h) { - builder_monitor_.Start("BuildLocalHistograms"); - for (auto const& entry : qexpand_depth_wise_) { - int nid = entry.nid; - RegTree::Node &node = (*p_tree)[nid]; - if (rabit::IsDistributed()) { - if (node.IsRoot() || node.IsLeftChild()) { - hist_.AddHistRow(nid); - // in distributed setting, we always calculate from left child or root node - BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid], false); - if (!node.IsRoot()) { - nodes_for_subtraction_trick_[(*p_tree)[node.Parent()].RightChild()] = nid; + const std::vector &gpair_h, + const std::vector& nodes) { + perf_monitor.TickStart(); + for (const auto& node : nodes) { + const int32_t nid = node.nid; + const int32_t sibling_nid = node.sibling_nid; + this->InitNewNode(nid, gmat, gpair_h, *p_fmat, p_tree, &(snode_[nid]), (*p_tree)[nid].Parent()); + if (sibling_nid > -1) { + this->InitNewNode(sibling_nid, gmat, gpair_h, *p_fmat, p_tree, + &(snode_[sibling_nid]), (*p_tree)[sibling_nid].Parent()); + } + } + for (const auto& node : nodes) { + const int32_t nid = node.nid; + const int32_t sibling_nid = node.sibling_nid; + BuildNodeStat(gmat, p_fmat, p_tree, gpair_h, nid); + if (sibling_nid > -1) { + BuildNodeStat(gmat, p_fmat, p_tree, gpair_h, sibling_nid); + } + } +
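BuildNodeStatBatch closes just below, followed by the three partition kernel templates. Their dense variants treat the maximum value of the bin-index type as the missing-value marker and compare offset bin ids against the integer split_cond. A hypothetical driver for one row block (the angle-bracket template headers were stripped from this copy of the patch; template <typename RowIdxType, typename IdxType> is assumed, and the buffer names are illustrative):

    // split rows [istart, iend) into scratch buffers; the returned pair is
    // (#rows sent left, #rows sent right)
    std::vector<size_t> left_buf(iend - istart), right_buf(iend - istart);
    std::pair<size_t, size_t> sizes =
        PartitionDenseLeftDefaultKernel<size_t, uint32_t>(
            rid, column.GetIndex(), column.GetBaseIdx(), split_cond,
            istart, iend, left_buf.data(), right_buf.data());

CreateNewNodesBatch stitches these per-block results back into the row set, as sketched further below.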
perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::INIT_NEW_NODE); +} + +template +std::pair PartitionDenseLeftDefaultKernel(const RowIdxType* rid, + const IdxType* idx, const IdxType offset, const int32_t split_cond, + const size_t istart, const size_t iend, RowIdxType* p_left, RowIdxType* p_right) { + size_t ileft = 0; + size_t iright = 0; + + for (size_t i = istart; i < iend; i++) { + if ( idx[rid[i]] == std::numeric_limits::max() || + static_cast(idx[rid[i]] + offset) <= split_cond) { + p_left[ileft++] = rid[i]; + } else { + p_right[iright++] = rid[i]; + } + } + return { ileft, iright }; +} + +template +std::pair PartitionDenseRightDefaultKernel(const RowIdxType* rid, + const IdxType* idx, const IdxType offset, const int32_t split_cond, + const size_t istart, const size_t iend, RowIdxType* p_left, RowIdxType* p_right) { + size_t ileft = 0; + size_t iright = 0; + + for (size_t i = istart; i < iend; i++) { + if (idx[rid[i]] == std::numeric_limits::max() || + static_cast(idx[rid[i]] + offset) > split_cond) { + p_right[iright++] = rid[i]; + } else { + p_left[ileft++] = rid[i]; + } + } + return { ileft, iright }; +} + +template +std::pair PartitionSparseKernel(const RowIdxType* rowid, + const IdxType* idx, const int32_t split_cond, const size_t ibegin, + const size_t iend, RowIdxType* p_left, RowIdxType* p_right, + Column column, bool default_left) { + size_t ileft = 0; + size_t iright = 0; + + if (ibegin < iend) { // ensure that [ibegin, iend) is nonempty range + // search first nonzero row with index >= rowid[ibegin] + const size_t* p = std::lower_bound(column.GetRowData(), + column.GetRowData() + column.Size(), + rowid[ibegin]); + if (p != column.GetRowData() + column.Size() && *p <= rowid[iend - 1]) { + size_t cursor = p - column.GetRowData(); + + for (size_t i = ibegin; i < iend; ++i) { + const size_t rid = rowid[i]; + while (cursor < column.Size() + && column.GetRowIdx(cursor) < rid + && column.GetRowIdx(cursor) <= rowid[iend - 1]) { + ++cursor; + } + if (cursor < column.Size() && column.GetRowIdx(cursor) == rid) { + const uint32_t rbin = column.GetFeatureBinIdx(cursor); + if (static_cast(rbin + column.GetBaseIdx()) <= split_cond) { + p_left[ileft++] = rid; + } else { + p_right[iright++] = rid; + } + ++cursor; + } else { + // missing value + if (default_left) { + p_left[ileft++] = rid; + } else { + p_right[iright++] = rid; + } } - (*sync_count)++; - (*starting_index) = std::min((*starting_index), nid); } - } else { - if (!node.IsRoot() && node.IsLeftChild() && - (row_set_collection_[nid].Size() < - row_set_collection_[(*p_tree)[node.Parent()].RightChild()].Size())) { - hist_.AddHistRow(nid); - BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid], false); - nodes_for_subtraction_trick_[(*p_tree)[node.Parent()].RightChild()] = nid; - (*sync_count)++; - (*starting_index) = std::min((*starting_index), nid); - } else if (!node.IsRoot() && !node.IsLeftChild() && - (row_set_collection_[nid].Size() <= - row_set_collection_[(*p_tree)[node.Parent()].LeftChild()].Size())) { - hist_.AddHistRow(nid); - BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid], false); - nodes_for_subtraction_trick_[(*p_tree)[node.Parent()].LeftChild()] = nid; - (*sync_count)++; - (*starting_index) = std::min((*starting_index), nid); - } else if (node.IsRoot()) { - hist_.AddHistRow(nid); - BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid], false); - (*sync_count)++; - (*starting_index) = std::min((*starting_index), nid); + } else { // all rows in [ibegin, 
iend) have missing values + if (default_left) { + for (size_t i = ibegin; i < iend; ++i) { + const size_t rid = rowid[i]; + p_left[ileft++] = rid; + } + } else { + for (size_t i = ibegin; i < iend; ++i) { + const size_t rid = rowid[i]; + p_right[iright++] = rid; + } } } } - builder_monitor_.Stop("BuildLocalHistograms"); + return {ileft, iright}; } -void QuantileHistMaker::Builder::BuildNodeStats( - const GHistIndexMatrix &gmat, - DMatrix *p_fmat, - RegTree *p_tree, - const std::vector &gpair_h) { - builder_monitor_.Start("BuildNodeStats"); - for (auto const& entry : qexpand_depth_wise_) { - int nid = entry.nid; - this->InitNewNode(nid, gmat, gpair_h, *p_fmat, *p_tree); - // add constraints - if (!(*p_tree)[nid].IsLeftChild() && !(*p_tree)[nid].IsRoot()) { - // it's a right child - auto parent_id = (*p_tree)[nid].Parent(); - auto left_sibling_id = (*p_tree)[parent_id].LeftChild(); - auto parent_split_feature_id = snode_[parent_id].best.SplitIndex(); - spliteval_->AddSplit(parent_id, left_sibling_id, nid, parent_split_feature_id, - snode_[left_sibling_id].weight, snode_[nid].weight); + +int32_t QuantileHistMaker::Builder::FindSplitCond(int32_t nid, + RegTree *p_tree, + const GHistIndexMatrix &gmat) { + bst_float left_leaf_weight = spliteval_->ComputeWeight(nid, + snode_[nid].best.left_sum) * param_.learning_rate; + bst_float right_leaf_weight = spliteval_->ComputeWeight(nid, + snode_[nid].best.right_sum) * param_.learning_rate; + p_tree->ExpandNode(nid, snode_[nid].best.SplitIndex(), snode_[nid].best.split_value, + snode_[nid].best.DefaultLeft(), snode_[nid].weight, left_leaf_weight, + right_leaf_weight, snode_[nid].best.loss_chg, snode_[nid].stats.sum_hess); + + RegTree::Node node = (*p_tree)[nid]; + // Categorize member rows + const bst_uint fid = node.SplitIndex(); + const bst_float split_pt = node.SplitCond(); + const uint32_t lower_bound = gmat.cut.row_ptr[fid]; + const uint32_t upper_bound = gmat.cut.row_ptr[fid + 1]; + int32_t split_cond = -1; + // convert floating-point split_pt into corresponding bin_id + // split_cond = -1 indicates that split_pt is less than all known cut points + CHECK_LT(upper_bound, + static_cast(std::numeric_limits::max())); + for (uint32_t i = lower_bound; i < upper_bound; ++i) { + if (split_pt == gmat.cut.cut[i]) { + split_cond = static_cast(i); } } - builder_monitor_.Stop("BuildNodeStats"); + return split_cond; } -void QuantileHistMaker::Builder::EvaluateSplits( +void QuantileHistMaker::Builder::CreateNewNodesBatch( + const std::vector& nodes, const GHistIndexMatrix &gmat, const ColumnMatrix &column_matrix, DMatrix *p_fmat, @@ -184,49 +271,395 @@ void QuantileHistMaker::Builder::EvaluateSplits( int depth, unsigned *timestamp, std::vector *temp_qexpand_depth) { - for (auto const& entry : qexpand_depth_wise_) { - int nid = entry.nid; - this->EvaluateSplit(nid, gmat, hist_, *p_fmat, *p_tree); - if (snode_[nid].best.loss_chg < kRtEps || + perf_monitor.TickStart(); + const size_t block_size = 2048; + + // inputs for tasks + std::vector nids_to_split; + std::vector cond_to_split; + std::vector n_blocks_vec; + std::vector> tasks; + size_t* buffer = buffer_for_partition_.data(); + size_t cur_buff_offset = 0; + + // buffers to store partial results + std::vector>> buffers_by_nids; + std::vector>> sizes_by_nids; + + auto create_nodes = [&](int32_t this_nid) { + if (snode_[this_nid].best.loss_chg < kRtEps || (param_.max_depth > 0 && depth == param_.max_depth) || (param_.max_leaves > 0 && (*num_leaves) == param_.max_leaves)) { - 
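CreateNewNodesBatch dispatches the kernels above through a two-phase scheme: phase one partitions each fixed-size row block independently into scratch buffers; phase two turns the per-block left/right counts into prefix offsets so every block copies into a disjoint destination range. That keeps the merge parallel, race-free, and stable. A condensed, self-contained sketch of the pattern (goes_left stands in for the kernel dispatch):

    #include <algorithm>
    #include <cstddef>
    #include <cstring>
    #include <functional>
    #include <vector>

    void BlockedStablePartition(size_t* rows, size_t n, size_t block,
                                const std::function<bool(size_t)>& goes_left) {
      const size_t nblocks = n / block + !!(n % block);
      std::vector<std::vector<size_t>> lbuf(nblocks), rbuf(nblocks);
      // phase 1: independent per-block partitions (parallel in the real code)
      for (size_t b = 0; b < nblocks; ++b) {
        const size_t lo = b * block;
        const size_t hi = std::min(n, lo + block);
        for (size_t i = lo; i < hi; ++i) {
          (goes_left(rows[i]) ? lbuf[b] : rbuf[b]).push_back(rows[i]);
        }
      }
      // phase 2: prefix offsets give each block a disjoint target range
      size_t nleft = 0;
      for (const auto& v : lbuf) nleft += v.size();
      size_t loff = 0, roff = nleft;
      for (size_t b = 0; b < nblocks; ++b) {
        if (!lbuf[b].empty())
          std::memcpy(rows + loff, lbuf[b].data(), lbuf[b].size() * sizeof(size_t));
        if (!rbuf[b].empty())
          std::memcpy(rows + roff, rbuf[b].data(), rbuf[b].size() * sizeof(size_t));
        loff += lbuf[b].size();
        roff += rbuf[b].size();
      }
    }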
(*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate); + (*p_tree)[this_nid].SetLeaf(snode_[this_nid].weight * param_.learning_rate); + } else { + nids_to_split.push_back(this_nid); + cond_to_split.push_back(FindSplitCond(this_nid, p_tree, gmat)); + + const size_t nrows = row_set_collection_[this_nid].Size(); + const size_t n_blocks = nrows / block_size + !!(nrows % block_size); + n_blocks_vec.push_back(n_blocks); + + buffers_by_nids.resize(buffers_by_nids.size() + 1); + + for (size_t i = 0; i < n_blocks; ++i) { + const size_t istart = i*block_size; + const size_t iend = (i == n_blocks-1) ? nrows : istart + block_size; + buffers_by_nids.back().push_back({ buffer + cur_buff_offset, + buffer + cur_buff_offset + (iend-istart) }); + cur_buff_offset += 2*(iend-istart); + tasks.emplace_back(cond_to_split.size() - 1, i); + } + sizes_by_nids.emplace_back(n_blocks); + } + }; + for (const auto& node : nodes) { + const int32_t nid = node.nid; + const int32_t sibling_nid = node.sibling_nid; + create_nodes(nid); + + if (sibling_nid > -1) { + create_nodes(sibling_nid); + } + } + + // buffer to store # of rows in left part for each row-block + std::vector left_sizes; + + const int32_t size = tasks.size(); + #pragma omp parallel + { + // compute partial partitions + #pragma omp for schedule(guided) + for (int32_t i = 0; i < size; ++i) { + const size_t node_idx = tasks[i].first; + const size_t iblock = tasks[i].second; + const int32_t split_cond = cond_to_split[node_idx]; + const int32_t nid = nids_to_split[node_idx]; + const bst_uint fid = (*p_tree)[nid].SplitIndex(); + + const size_t nrows = row_set_collection_[nid].Size(); + const size_t nblocks = n_blocks_vec[node_idx]; + const size_t istart = iblock*block_size; + const size_t iend = (iblock == nblocks-1) ? 
nrows : istart + block_size; + + const bool default_left = (*p_tree)[nid].DefaultLeft(); + const Column column = column_matrix.GetColumn(fid); + + const uint32_t* idx = column.GetIndex(); + const size_t* rid = row_set_collection_[nid].begin; + + if (column.GetType() == xgboost::common::kDenseColumn) { + if (default_left) { + sizes_by_nids[node_idx][iblock] = PartitionDenseLeftDefaultKernel( + rid, idx, column.GetBaseIdx(), split_cond, istart, iend, + buffers_by_nids[node_idx][iblock].first, buffers_by_nids[node_idx][iblock].second); + } else { + sizes_by_nids[node_idx][iblock] = PartitionDenseRightDefaultKernel( + rid, idx, column.GetBaseIdx(), split_cond, istart, iend, + buffers_by_nids[node_idx][iblock].first, buffers_by_nids[node_idx][iblock].second); + } + } else { + sizes_by_nids[node_idx][iblock] = PartitionSparseKernel( + rid, idx, split_cond, istart, iend, buffers_by_nids[node_idx][iblock].first, + buffers_by_nids[node_idx][iblock].second, column, default_left); + } + } + + // calculate sizes of left parts in each block + #pragma omp single + { + for (size_t inode = 0; inode < nids_to_split.size(); ++inode) { + size_t nLeft = 0; + for (auto& size : sizes_by_nids[inode]) { + nLeft += size.first; + } + left_sizes.push_back(nLeft); + } + } + + // merge partial results to one + #pragma omp for schedule(guided) + for (int32_t i = 0; i < size; ++i) { + const size_t node_idx = tasks[i].first; + const size_t iblock = tasks[i].second; + + const int32_t nid = nids_to_split[node_idx]; + auto* rid = const_cast(row_set_collection_[nid].begin); + + size_t iLeft = 0; + size_t iRight = 0; + + const size_t nLeft = left_sizes[node_idx]; + + for (size_t j = 0; j < iblock; ++j) { + iLeft += sizes_by_nids[node_idx][j].first; + iRight += sizes_by_nids[node_idx][j].second; + } + + memcpy(rid + iLeft, buffers_by_nids[node_idx][iblock].first, + sizes_by_nids[node_idx][iblock].first * sizeof(rid[0])); + memcpy(rid + nLeft + iRight, buffers_by_nids[node_idx][iblock].second, + sizes_by_nids[node_idx][iblock].second * sizeof(rid[0])); + } + } + // register new nodes + for (size_t i = 0; i < nids_to_split.size(); ++i) { + const int32_t nid = nids_to_split[i]; + const size_t nLeft = left_sizes[i]; + RegTree::Node node = (*p_tree)[nid]; + + const int32_t left_id = node.LeftChild(); + const int32_t right_id = node.RightChild(); + row_set_collection_.AddSplit(nid, nLeft, left_id, right_id); + + if (rabit::IsDistributed() || + row_set_collection_[left_id].Size() < row_set_collection_[right_id].Size()) { + temp_qexpand_depth->push_back(ExpandEntry(left_id, right_id, nid, + depth + 1, 0.0, (*timestamp)++)); } else { - this->ApplySplit(nid, gmat, column_matrix, hist_, *p_fmat, p_tree); - int left_id = (*p_tree)[nid].LeftChild(); - int right_id = (*p_tree)[nid].RightChild(); - temp_qexpand_depth->push_back(ExpandEntry(left_id, - p_tree->GetDepth(left_id), 0.0, (*timestamp)++)); - temp_qexpand_depth->push_back(ExpandEntry(right_id, - p_tree->GetDepth(right_id), 0.0, (*timestamp)++)); - // - 1 parent + 2 new children - (*num_leaves)++; + temp_qexpand_depth->push_back(ExpandEntry(right_id, left_id, nid, + depth + 1, 0.0, (*timestamp)++)); } } + + perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::APPLY_SPLIT); } +std::tuple + QuantileHistMaker::Builder::GetHistBuffer( + std::vector* hist_is_init, std::vector* grad_stats, + size_t block_id, size_t nthread, size_t tid, + std::vector* data_hist, size_t hist_size) { + + const size_t n_hist_for_current_node = hist_is_init->size(); + const size_t hist_id = 
((n_hist_for_current_node == nthread) ? tid : block_id); + + common::GradStatHist::GradType* local_data_hist = (*data_hist)[hist_id]; + if (!(*hist_is_init)[hist_id]) { + std::fill(local_data_hist, local_data_hist + hist_size, 0.0f); + (*hist_is_init)[hist_id] = true; + } + + return std::make_tuple(local_data_hist, &(*grad_stats)[hist_id]); +} + +void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector& nodes, + RegTree* p_tree, const GHistIndexMatrix &gmat, const std::vector& gpair, + std::vector>* hist_buffers, + std::vector>* hist_is_init) { + perf_monitor.TickStart(); + const size_t block_size_rows = 256; + const size_t nthread = static_cast(this->nthread_); + const size_t nbins = gmat.cut.row_ptr.back(); + const size_t hist_size = 2 * nbins; + + hist_buffers->resize(nodes.size()); + hist_is_init->resize(nodes.size()); + + // input data for tasks + int32_t n_tasks = 0; + std::vector task_nid; + std::vector task_node_idx; + std::vector task_block_idx; + + // result vector + std::vector> grad_stats(nodes.size()); + + size_t i_hist = 0; + + // prepare tasks for parallel execution + for (size_t i = 0; i < nodes.size(); ++i) { + const int32_t nid = nodes[i].nid; + const int32_t sibling_nid = nodes[i].sibling_nid; + hist_.AddHistRow(nid); + if (sibling_nid > -1) { + hist_.AddHistRow(sibling_nid); + } + const size_t nrows = row_set_collection_[nid].Size(); + const size_t n_local_blocks = nrows / block_size_rows + !!(nrows % block_size_rows); + const size_t n_local_histograms = std::min(nthread, n_local_blocks); + + for (size_t j = 0; j < n_local_blocks; ++j) { + task_nid.push_back(nid); + task_node_idx.push_back(i); + task_block_idx.push_back(j); + } + n_tasks += n_local_blocks; + + (*hist_buffers)[i].clear(); + for (size_t j = 0; j < n_local_histograms; j++) { + (*hist_buffers)[i].push_back( + reinterpret_cast(hist_buff_[i_hist++].data())); + } + (*hist_is_init)[i].clear(); + (*hist_is_init)[i].resize(n_local_histograms, false); + grad_stats[i].resize(n_local_histograms); + } + const GradientPair::ValueT* const pgh = + reinterpret_cast(gpair.data()); + + // execute tasks in parallel + #pragma omp parallel for schedule(guided) + for (int32_t itask = 0; itask < n_tasks; ++itask) { + const size_t tid = omp_get_thread_num(); + const int32_t nid = task_nid[itask]; + const int32_t block_id = task_block_idx[itask]; + const int32_t node_idx = task_node_idx[itask]; + + common::GradStatHist::GradType* data_local_hist; + common::GradStatHist* grad_stat; + std::tie(data_local_hist, grad_stat) = GetHistBuffer(&(*hist_is_init)[node_idx], + &grad_stats[node_idx], block_id, nthread, tid, + &(*hist_buffers)[node_idx], hist_size); + + const size_t* row_ptr = gmat.row_ptr.data(); + const size_t* rid = row_set_collection_[nid].begin; + + const size_t nrows = row_set_collection_[nid].Size(); + const size_t istart = block_id * block_size_rows; + const size_t iend = (((block_id+1)*block_size_rows > nrows) ?
nrows : istart + block_size_rows); + + if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) { + common::BuildHistLocalDense(istart, iend, nrows, rid, gmat.index.data(), pgh, + row_ptr, data_local_hist, grad_stat); + } else { + common::BuildHistLocalSparse(istart, iend, nrows, rid, gmat.index.data(), pgh, + row_ptr, data_local_hist, grad_stat); + } + } + + SyncHistograms(p_tree, nodes, hist_buffers, hist_is_init, grad_stats); + + perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::BUILD_HIST); +} + +void QuantileHistMaker::Builder::SyncHistograms( + RegTree* p_tree, + const std::vector& nodes, + std::vector>* hist_buffers, + std::vector>* hist_is_init, + const std::vector>& grad_stats) { + if (rabit::IsDistributed()) { + #pragma omp parallel for // TODO(egorsmir): replace with n_features * nodes.size() tasks + for (int i = 0; i < nodes.size(); ++i) { + const int32_t nid = nodes[i].nid; + common::GradStatHist::GradType* hist_data = + reinterpret_cast(hist_[nid].data()); + + ReduceHistograms(hist_data, nullptr, nullptr, 0, hist_builder_.GetNumBins() * 2, i, + *hist_is_init, *hist_buffers); + } + + for (auto elem : nodes) { + this->histred_.Allreduce(hist_[elem.nid].data(), hist_builder_.GetNumBins()); + } + + // TODO(egorsmir): add parallel for + for (auto elem : nodes) { + if (elem.sibling_nid > -1) { + SubtractionTrick(hist_[elem.sibling_nid], hist_[elem.nid], + hist_[(*p_tree)[elem.sibling_nid].Parent()]); + } + } + } + + // merge grad stats + { + for (size_t inode = 0; inode < nodes.size(); ++inode) { + const int32_t nid = nodes[inode].nid; + + if (snode_.size() <= size_t(nid)) { + snode_.resize(nid + 1, NodeEntry(param_)); + } + + common::GradStatHist grad_stat; + for (size_t ihist = 0; ihist < (*hist_is_init)[inode].size(); ++ihist) { + if ((*hist_is_init)[inode][ihist]) { + grad_stat.Add(grad_stats[inode][ihist]); + } + } + this->histred_.Allreduce(&grad_stat, 1); + snode_[nid].stats = grad_stat.ToGradStat(); + + const int32_t sibling_nid = nodes[inode].sibling_nid; + if (sibling_nid > -1) { + if (snode_.size() <= size_t(sibling_nid)) { + snode_.resize(sibling_nid + 1, NodeEntry(param_)); + } + const int parent_id = (*p_tree)[nid].Parent(); + snode_[sibling_nid].stats.SetSubstract(snode_[parent_id].stats, snode_[nid].stats); + } + } + } +} + +void QuantileHistMaker::Builder::ReduceHistograms( + common::GradStatHist::GradType* hist_data, + common::GradStatHist::GradType* sibling_hist_data, + common::GradStatHist::GradType* parent_hist_data, + const size_t ibegin, + const size_t iend, + const size_t inode, + const std::vector>& hist_is_init, + const std::vector>& hist_buffers) { + bool is_init = false; + for (size_t ihist = 0; ihist < hist_is_init[inode].size(); ++ihist) { + common::GradStatHist::GradType* partial_data = hist_buffers[inode][ihist]; + if (hist_is_init[inode][ihist] && is_init) { + for (size_t i = ibegin; i < iend; ++i) { + hist_data[i] += partial_data[i]; + } + } else if (hist_is_init[inode][ihist]) { + for (size_t i = ibegin; i < iend; ++i) { + hist_data[i] = partial_data[i]; + } + is_init = true; + } + } + + if (sibling_hist_data) { + for (size_t i = ibegin; i < iend; ++i) { + sibling_hist_data[i] = parent_hist_data[i] - hist_data[i]; + } + } +} + void QuantileHistMaker::Builder::ExpandWithDepthWidth( const GHistIndexMatrix &gmat, const GHistIndexBlockMatrix &gmatb, const ColumnMatrix &column_matrix, - DMatrix *p_fmat, - RegTree *p_tree, + DMatrix* p_fmat, + RegTree* p_tree,
const std::vector &gpair_h) { unsigned timestamp = 0; int num_leaves = 0; // in depth_wise growing, we feed loss_chg with 0.0 since it is not used anyway - qexpand_depth_wise_.emplace_back(ExpandEntry(0, p_tree->GetDepth(0), 0.0, timestamp++)); + qexpand_depth_wise_.emplace_back(0, -1, ROOT_PARENT_ID, p_tree->GetDepth(0), 0.0, timestamp++); ++num_leaves; + for (int depth = 0; depth < param_.max_depth + 1; depth++) { - int starting_index = std::numeric_limits::max(); - int sync_count = 0; std::vector temp_qexpand_depth; - BuildLocalHistograms(&starting_index, &sync_count, gmat, gmatb, p_tree, gpair_h); - SyncHistograms(starting_index, sync_count, p_tree); - BuildNodeStats(gmat, p_fmat, p_tree, gpair_h); - EvaluateSplits(gmat, column_matrix, p_fmat, p_tree, &num_leaves, depth, ×tamp, - &temp_qexpand_depth); + + // buffer to store partial histograms + std::vector> hist_buffers; + // uint8_t is used instead of bool due to read/write + // to std::vector - thread unsafe + std::vector> hist_is_init; + + BuildHistsBatch(qexpand_depth_wise_, p_tree, gmat, gpair_h, + &hist_buffers, &hist_is_init); + BuildNodeStatBatch(gmat, p_fmat, p_tree, gpair_h, qexpand_depth_wise_); + EvaluateSplitsBatch(qexpand_depth_wise_, gmat, *p_fmat, hist_is_init, hist_buffers, p_tree); + CreateNewNodesBatch(qexpand_depth_wise_, gmat, column_matrix, p_fmat, p_tree, + &num_leaves, depth, ×tamp, &temp_qexpand_depth); + + num_leaves += temp_qexpand_depth.size(); + // clean up qexpand_depth_wise_.clear(); nodes_for_subtraction_trick_.clear(); @@ -246,18 +679,21 @@ void QuantileHistMaker::Builder::ExpandWithLossGuide( DMatrix* p_fmat, RegTree* p_tree, const std::vector& gpair_h) { - unsigned timestamp = 0; int num_leaves = 0; + std::vector> hist_buffers; + std::vector> hist_is_init; + for (int nid = 0; nid < p_tree->param.num_roots; ++nid) { - hist_.AddHistRow(nid); - BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid], true); + std::vector nodes_to_build{ExpandEntry( + 0, -1, ROOT_PARENT_ID, p_tree->GetDepth(0), 0.0, timestamp++)}; - this->InitNewNode(nid, gmat, gpair_h, *p_fmat, *p_tree); + BuildHistsBatch(nodes_to_build, p_tree, gmat, gpair_h, &hist_buffers, &hist_is_init); + BuildNodeStatBatch(gmat, p_fmat, p_tree, gpair_h, nodes_to_build); + EvaluateSplitsBatch(nodes_to_build, gmat, *p_fmat, hist_is_init, hist_buffers, p_tree); - this->EvaluateSplit(nid, gmat, hist_, *p_fmat, *p_tree); - qexpand_loss_guided_->push(ExpandEntry(nid, p_tree->GetDepth(nid), + qexpand_loss_guided_->push(ExpandEntry(nid, -1, -1, p_tree->GetDepth(nid), snode_[nid].best.loss_chg, timestamp++)); ++num_leaves; @@ -265,50 +701,29 @@ void QuantileHistMaker::Builder::ExpandWithLossGuide( while (!qexpand_loss_guided_->empty()) { const ExpandEntry candidate = qexpand_loss_guided_->top(); - const int nid = candidate.nid; + const int32_t nid = candidate.nid; qexpand_loss_guided_->pop(); - if (candidate.loss_chg <= kRtEps - || (param_.max_depth > 0 && candidate.depth == param_.max_depth) - || (param_.max_leaves > 0 && num_leaves == param_.max_leaves) ) { - (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate); - } else { - this->ApplySplit(nid, gmat, column_matrix, hist_, *p_fmat, p_tree); - const int cleft = (*p_tree)[nid].LeftChild(); - const int cright = (*p_tree)[nid].RightChild(); - hist_.AddHistRow(cleft); - hist_.AddHistRow(cright); + std::vector nodes_to_build{candidate}; + std::vector successors; - if (rabit::IsDistributed()) { - // in distributed mode, we need to keep consistent across workers - BuildHist(gpair_h, 
row_set_collection_[cleft], gmat, gmatb, hist_[cleft], true); - SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]); - } else { - if (row_set_collection_[cleft].Size() < row_set_collection_[cright].Size()) { - BuildHist(gpair_h, row_set_collection_[cleft], gmat, gmatb, hist_[cleft], true); - SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]); - } else { - BuildHist(gpair_h, row_set_collection_[cright], gmat, gmatb, hist_[cright], true); - SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]); - } - } + CreateNewNodesBatch(nodes_to_build, gmat, column_matrix, p_fmat, p_tree, + &num_leaves, candidate.depth, ×tamp, &successors); - this->InitNewNode(cleft, gmat, gpair_h, *p_fmat, *p_tree); - this->InitNewNode(cright, gmat, gpair_h, *p_fmat, *p_tree); - bst_uint featureid = snode_[nid].best.SplitIndex(); - spliteval_->AddSplit(nid, cleft, cright, featureid, - snode_[cleft].weight, snode_[cright].weight); + if (!successors.empty()) { + BuildHistsBatch(successors, p_tree, gmat, gpair_h, &hist_buffers, &hist_is_init); + BuildNodeStatBatch(gmat, p_fmat, p_tree, gpair_h, successors); + EvaluateSplitsBatch(successors, gmat, *p_fmat, hist_is_init, hist_buffers, p_tree); - this->EvaluateSplit(cleft, gmat, hist_, *p_fmat, *p_tree); - this->EvaluateSplit(cright, gmat, hist_, *p_fmat, *p_tree); + const int32_t cleft = (*p_tree)[nid].LeftChild(); + const int32_t cright = (*p_tree)[nid].RightChild(); - qexpand_loss_guided_->push(ExpandEntry(cleft, p_tree->GetDepth(cleft), + qexpand_loss_guided_->push(ExpandEntry(cleft, -1, nid, p_tree->GetDepth(cleft), snode_[cleft].best.loss_chg, timestamp++)); - qexpand_loss_guided_->push(ExpandEntry(cright, p_tree->GetDepth(cright), + qexpand_loss_guided_->push(ExpandEntry(cright, -1, nid, p_tree->GetDepth(cright), snode_[cright].best.loss_chg, timestamp++)); - ++num_leaves; // give two and take one, as parent is no longer a leaf } } @@ -320,13 +735,14 @@ void QuantileHistMaker::Builder::Update(const GHistIndexMatrix& gmat, HostDeviceVector* gpair, DMatrix* p_fmat, RegTree* p_tree) { - builder_monitor_.Start("Update"); + perf_monitor.StartPerfMonitor(); const std::vector& gpair_h = gpair->ConstHostVector(); - spliteval_->Reset(); + perf_monitor.TickStart(); this->InitData(gmat, gpair_h, *p_fmat, *p_tree); + perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::INIT_DATA); if (param_.grow_policy == TrainParam::kLossGuide) { ExpandWithLossGuide(gmat, gmatb, column_matrix, p_fmat, p_tree, gpair_h); @@ -337,17 +753,18 @@ void QuantileHistMaker::Builder::Update(const GHistIndexMatrix& gmat, for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) { p_tree->Stat(nid).loss_chg = snode_[nid].best.loss_chg; p_tree->Stat(nid).base_weight = snode_[nid].weight; - p_tree->Stat(nid).sum_hess = static_cast(snode_[nid].stats.sum_hess); + p_tree->Stat(nid).sum_hess = + static_cast(snode_[nid].stats.sum_hess); } pruner_->Update(gpair, p_fmat, std::vector{p_tree}); - builder_monitor_.Stop("Update"); + perf_monitor.EndPerfMonitor(); } bool QuantileHistMaker::Builder::UpdatePredictionCache( - const DMatrix* data, - HostDeviceVector* p_out_preds) { + const DMatrix* data, + HostDeviceVector* p_out_preds) { std::vector& out_preds = p_out_preds->HostVector(); // p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in @@ -363,8 +780,31 @@ bool QuantileHistMaker::Builder::UpdatePredictionCache( CHECK_GT(out_preds.size(), 0U); - for (const RowSetCollection::Elem rowset : row_set_collection_) { - if (rowset.begin != nullptr && rowset.end != nullptr) 
{ + const size_t block_size = 2048; + const size_t n_nodes = row_set_collection_.end() - row_set_collection_.begin(); + std::vector tasks_elem; + std::vector tasks_iblock; + std::vector tasks_nblock; + + for (size_t k = 0; k < n_nodes; ++k) { + const size_t nrows = row_set_collection_[k].Size(); + const size_t nblocks = nrows / block_size + !!(nrows % block_size); + + for (size_t i = 0; i < nblocks; ++i) { + tasks_elem.push_back(row_set_collection_[k]); + tasks_iblock.push_back(i); + tasks_nblock.push_back(nblocks); + } + } + + #pragma omp parallel for schedule(guided) + for (int32_t k = 0; k < tasks_elem.size(); ++k) { + const RowSetCollection::Elem rowset = tasks_elem[k]; + if (rowset.begin != nullptr && rowset.end != nullptr && rowset.node_id != -1) { + const size_t nrows = rowset.Size(); + const size_t iblock = tasks_iblock[k]; + const size_t nblocks = tasks_nblock[k]; + int nid = rowset.node_id; bst_float leaf_value; // if a node is marked as deleted by the pruner, traverse upward to locate @@ -377,8 +817,11 @@ bool QuantileHistMaker::Builder::UpdatePredictionCache( } leaf_value = (*p_last_tree_)[nid].LeafValue(); - for (const size_t* it = rowset.begin; it < rowset.end; ++it) { - out_preds[*it] += leaf_value; + const size_t istart = iblock*block_size; + const size_t iend = (iblock == nblocks-1) ? nrows : istart + block_size; + + for (size_t it = istart; it < iend; ++it) { + out_preds[rowset.begin[it]] += leaf_value; } } } @@ -399,7 +842,6 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat, CHECK(param_.max_depth > 0) << "max_depth cannot be 0 (unlimited) " << "when grow_policy is depthwise."; } - builder_monitor_.Start("InitData"); const auto& info = fmat.Info(); { @@ -410,12 +852,16 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat, // initialize histogram collection uint32_t nbins = gmat.cut.row_ptr.back(); hist_.Init(nbins); + hist_buff_.Init(nbins); // initialize histogram builder -#pragma omp parallel + #pragma omp parallel { this->nthread_ = omp_get_num_threads(); } + + const auto nthread = static_cast(this->nthread_); + row_split_tloc_.resize(nthread); hist_builder_.Init(this->nthread_, nbins); CHECK_EQ(info.root_index_.size(), 0U); @@ -457,7 +903,7 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat, } bool has_neg_hess = false; - for (size_t tid = 0; tid < this->nthread_; ++tid) { + for (int32_t tid = 0; tid < this->nthread_; ++tid) { if (p_buff[tid]) { has_neg_hess = true; } @@ -485,8 +931,8 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat, } } } - row_set_collection_.Init(); + buffer_for_partition_.reserve(2 * info.num_row_); { /* determine layout of data */ @@ -549,290 +995,114 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat, qexpand_depth_wise_.clear(); } } - builder_monitor_.Stop("InitData"); } -void QuantileHistMaker::Builder::EvaluateSplit(const int nid, - const GHistIndexMatrix& gmat, - const HistCollection& hist, - const DMatrix& fmat, - const RegTree& tree) { - builder_monitor_.Start("EvaluateSplit"); - // start enumeration +void QuantileHistMaker::Builder::EvaluateSplitsBatch( + const std::vector& nodes, + const GHistIndexMatrix& gmat, + const DMatrix& fmat, + const std::vector>& hist_is_init, + const std::vector>& hist_buffers, + RegTree* p_tree) { + perf_monitor.TickStart(); const MetaInfo& info = fmat.Info(); - auto p_feature_set = column_sampler_.GetFeatureSet(tree.GetDepth(nid)); - const auto& feature_set = p_feature_set->HostVector(); - 
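The rewritten UpdatePredictionCache above flattens every (leaf, row-block) pair into one task vector so that guided OpenMP scheduling can balance leaves of very different sizes; the same idiom drives BuildHistsBatch and CreateNewNodesBatch. A generic sketch of the pattern (names here are illustrative, not from the patch):

    #include <cstddef>
    #include <vector>

    struct BlockTask { std::size_t group, iblock, nblocks; };

    std::vector<BlockTask> MakeBlockTasks(const std::vector<std::size_t>& group_sizes,
                                          std::size_t block_size) {
      std::vector<BlockTask> tasks;
      for (std::size_t g = 0; g < group_sizes.size(); ++g) {
        // ceil-division without floating point, as used throughout this patch
        const std::size_t nblocks =
            group_sizes[g] / block_size + !!(group_sizes[g] % block_size);
        for (std::size_t b = 0; b < nblocks; ++b) tasks.push_back({g, b, nblocks});
      }
      return tasks;
    }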
const auto nfeature = static_cast(feature_set.size()); - const auto nthread = static_cast(this->nthread_); - best_split_tloc_.resize(nthread); -#pragma omp parallel for schedule(static) num_threads(nthread) - for (bst_omp_uint tid = 0; tid < nthread; ++tid) { - best_split_tloc_[tid] = snode_[nid].best; - } - GHistRow node_hist = hist[nid]; - -#pragma omp parallel for schedule(dynamic) num_threads(nthread) - for (bst_omp_uint i = 0; i < nfeature; ++i) { // NOLINT(*) - const auto feature_id = static_cast(feature_set[i]); - const auto tid = static_cast(omp_get_thread_num()); - const auto node_id = static_cast(nid); - // Narrow search space by dropping features that are not feasible under the - // given set of constraints (e.g. feature interaction constraints) - if (spliteval_->CheckFeatureConstraint(node_id, feature_id)) { - this->EnumerateSplit(-1, gmat, node_hist, snode_[nid], info, - &best_split_tloc_[tid], feature_id, node_id); - this->EnumerateSplit(+1, gmat, node_hist, snode_[nid], info, - &best_split_tloc_[tid], feature_id, node_id); - } - } - for (unsigned tid = 0; tid < nthread; ++tid) { - snode_[nid].best.Update(best_split_tloc_[tid]); - } - builder_monitor_.Stop("EvaluateSplit"); -} - -void QuantileHistMaker::Builder::ApplySplit(int nid, - const GHistIndexMatrix& gmat, - const ColumnMatrix& column_matrix, - const HistCollection& hist, - const DMatrix& fmat, - RegTree* p_tree) { - builder_monitor_.Start("ApplySplit"); - // TODO(hcho3): support feature sampling by levels - - /* 1. Create child nodes */ - NodeEntry& e = snode_[nid]; - bst_float left_leaf_weight = - spliteval_->ComputeWeight(nid, e.best.left_sum) * param_.learning_rate; - bst_float right_leaf_weight = - spliteval_->ComputeWeight(nid, e.best.right_sum) * param_.learning_rate; - p_tree->ExpandNode(nid, e.best.SplitIndex(), e.best.split_value, - e.best.DefaultLeft(), e.weight, left_leaf_weight, - right_leaf_weight, e.best.loss_chg, e.stats.sum_hess); - - /* 2. 
Categorize member rows */ - const auto nthread = static_cast(this->nthread_); - row_split_tloc_.resize(nthread); - for (bst_omp_uint i = 0; i < nthread; ++i) { - row_split_tloc_[i].left.clear(); - row_split_tloc_[i].right.clear(); - } - const bool default_left = (*p_tree)[nid].DefaultLeft(); - const bst_uint fid = (*p_tree)[nid].SplitIndex(); - const bst_float split_pt = (*p_tree)[nid].SplitCond(); - const uint32_t lower_bound = gmat.cut.row_ptr[fid]; - const uint32_t upper_bound = gmat.cut.row_ptr[fid + 1]; - int32_t split_cond = -1; - // convert floating-point split_pt into corresponding bin_id - // split_cond = -1 indicates that split_pt is less than all known cut points - CHECK_LT(upper_bound, - static_cast(std::numeric_limits::max())); - for (uint32_t i = lower_bound; i < upper_bound; ++i) { - if (split_pt == gmat.cut.cut[i]) { - split_cond = static_cast(i); + // prepare tasks + std::vector> tasks; + for (size_t i = 0; i < nodes.size(); ++i) { + auto p_feature_set = column_sampler_.GetFeatureSet(nodes[i].depth); + + const auto& feature_set = p_feature_set->HostVector(); + const auto nfeature = static_cast(feature_set.size()); + for (size_t j = 0; j < nfeature; ++j) { + tasks.emplace_back(i, feature_set[j]); } } - const auto& rowset = row_set_collection_[nid]; - - Column column = column_matrix.GetColumn(fid); - if (column.GetType() == xgboost::common::kDenseColumn) { - ApplySplitDenseData(rowset, gmat, &row_split_tloc_, column, split_cond, - default_left); - } else { - ApplySplitSparseData(rowset, gmat, &row_split_tloc_, column, lower_bound, - upper_bound, split_cond, default_left); - } + // partial results + std::vector> splits(tasks.size()); + // parallel enumeration + #pragma omp parallel for schedule(guided) + for (int32_t i = 0; i < tasks.size(); ++i) { + const int32_t node_idx = tasks[i].first; + const size_t fid = tasks[i].second; + const int32_t nid = nodes[node_idx].nid; + const int32_t sibling_nid = nodes[node_idx].sibling_nid; + const int32_t parent_nid = nodes[node_idx].parent_nid; + + common::GradStatHist::GradType* hist_data = + reinterpret_cast(hist_[nid].data()); + common::GradStatHist::GradType* sibling_hist_data = sibling_nid > -1 ? + reinterpret_cast( + hist_[sibling_nid].data()) : nullptr; + common::GradStatHist::GradType* parent_hist_data = sibling_nid > -1 ? 
+ reinterpret_cast(hist_[parent_nid].data()) : nullptr; + + // reduce needed part of a hist here to have it in cache before enumeratation + if (!rabit::IsDistributed()) { + const std::vector& cut_ptr = gmat.cut.row_ptr; + const size_t ibegin = 2 * cut_ptr[fid]; + const size_t iend = 2 * cut_ptr[fid + 1]; + ReduceHistograms(hist_data, sibling_hist_data, parent_hist_data, ibegin, iend, node_idx, + hist_is_init, hist_buffers); + } - row_set_collection_.AddSplit( - nid, row_split_tloc_, (*p_tree)[nid].LeftChild(), (*p_tree)[nid].RightChild()); - builder_monitor_.Stop("ApplySplit"); -} + if (spliteval_->CheckFeatureConstraint(nid, fid)) { + auto& snode = snode_[nid]; + const bool compute_backward = this->EnumerateSplit(+1, gmat, hist_[nid], snode, + info, &splits[i].first, fid, nid); -void QuantileHistMaker::Builder::ApplySplitDenseData( - const RowSetCollection::Elem rowset, - const GHistIndexMatrix& gmat, - std::vector* p_row_split_tloc, - const Column& column, - bst_int split_cond, - bool default_left) { - std::vector& row_split_tloc = *p_row_split_tloc; - constexpr int kUnroll = 8; // loop unrolling factor - const size_t nrows = rowset.end - rowset.begin; - const size_t rest = nrows % kUnroll; - -#pragma omp parallel for num_threads(nthread_) schedule(static) - for (bst_omp_uint i = 0; i < nrows - rest; i += kUnroll) { - const bst_uint tid = omp_get_thread_num(); - auto& left = row_split_tloc[tid].left; - auto& right = row_split_tloc[tid].right; - size_t rid[kUnroll]; - uint32_t rbin[kUnroll]; - for (int k = 0; k < kUnroll; ++k) { - rid[k] = rowset.begin[i + k]; - } - for (int k = 0; k < kUnroll; ++k) { - rbin[k] = column.GetFeatureBinIdx(rid[k]); - } - for (int k = 0; k < kUnroll; ++k) { // NOLINT - if (rbin[k] == std::numeric_limits::max()) { // missing value - if (default_left) { - left.push_back(rid[k]); - } else { - right.push_back(rid[k]); - } - } else { - if (static_cast(rbin[k] + column.GetBaseIdx()) <= split_cond) { - left.push_back(rid[k]); - } else { - right.push_back(rid[k]); - } + if (compute_backward) { + this->EnumerateSplit(-1, gmat, hist_[nid], snode, info, + &splits[i].first, fid, nid); } } - } - for (size_t i = nrows - rest; i < nrows; ++i) { - auto& left = row_split_tloc[nthread_-1].left; - auto& right = row_split_tloc[nthread_-1].right; - const size_t rid = rowset.begin[i]; - const uint32_t rbin = column.GetFeatureBinIdx(rid); - if (rbin == std::numeric_limits::max()) { // missing value - if (default_left) { - left.push_back(rid); - } else { - right.push_back(rid); - } - } else { - if (static_cast(rbin + column.GetBaseIdx()) <= split_cond) { - left.push_back(rid); - } else { - right.push_back(rid); + + if (sibling_nid > -1 && spliteval_->CheckFeatureConstraint(sibling_nid, fid)) { + auto& snode = snode_[sibling_nid]; + + const bool compute_backward = this->EnumerateSplit(+1, gmat, hist_[sibling_nid], snode, + info, &splits[i].second, fid, sibling_nid); + + if (compute_backward) { + this->EnumerateSplit(-1, gmat, hist_[sibling_nid], snode, info, + &splits[i].second, fid, sibling_nid); } } } -} - -void QuantileHistMaker::Builder::ApplySplitSparseData( - const RowSetCollection::Elem rowset, - const GHistIndexMatrix& gmat, - std::vector* p_row_split_tloc, - const Column& column, - bst_uint lower_bound, - bst_uint upper_bound, - bst_int split_cond, - bool default_left) { - std::vector& row_split_tloc = *p_row_split_tloc; - const size_t nrows = rowset.end - rowset.begin; - -#pragma omp parallel num_threads(nthread_) - { - const auto tid = static_cast(omp_get_thread_num()); - 
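In the non-distributed path, EvaluateSplitsBatch above merges only the histogram slice of the feature it is about to scan, so those bins are still cache-hot when EnumerateSplit runs, and a sibling's slice comes for free as parent minus node. A sketch of what that per-feature reduction amounts to (GradType is float in this patch and bins are interleaved (grad, hess) pairs, hence the doubled offsets; names are illustrative):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    void ReduceFeatureSlice(const std::vector<const float*>& partials,
                            const std::vector<bool>& initialized,
                            const uint32_t* cut_ptr, uint32_t fid,
                            float* hist, const float* parent_hist,
                            float* sibling_hist) {
      const size_t ibegin = 2 * cut_ptr[fid];
      const size_t iend = 2 * cut_ptr[fid + 1];
      bool first = true;
      for (size_t p = 0; p < partials.size(); ++p) {
        if (!initialized[p]) continue;  // this buffer never touched the node
        for (size_t i = ibegin; i < iend; ++i)
          hist[i] = first ? partials[p][i] : hist[i] + partials[p][i];
        first = false;
      }
      if (sibling_hist != nullptr)
        for (size_t i = ibegin; i < iend; ++i)
          sibling_hist[i] = parent_hist[i] - hist[i];
    }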
const size_t ibegin = tid * nrows / nthread_; - const size_t iend = (tid + 1) * nrows / nthread_; - if (ibegin < iend) { // ensure that [ibegin, iend) is nonempty range - // search first nonzero row with index >= rowset[ibegin] - const size_t* p = std::lower_bound(column.GetRowData(), - column.GetRowData() + column.Size(), - rowset.begin[ibegin]); - - auto& left = row_split_tloc[tid].left; - auto& right = row_split_tloc[tid].right; - if (p != column.GetRowData() + column.Size() && *p <= rowset.begin[iend - 1]) { - size_t cursor = p - column.GetRowData(); - for (size_t i = ibegin; i < iend; ++i) { - const size_t rid = rowset.begin[i]; - while (cursor < column.Size() - && column.GetRowIdx(cursor) < rid - && column.GetRowIdx(cursor) <= rowset.begin[iend - 1]) { - ++cursor; - } - if (cursor < column.Size() && column.GetRowIdx(cursor) == rid) { - const uint32_t rbin = column.GetFeatureBinIdx(cursor); - if (static_cast(rbin + column.GetBaseIdx()) <= split_cond) { - left.push_back(rid); - } else { - right.push_back(rid); - } - ++cursor; - } else { - // missing value - if (default_left) { - left.push_back(rid); - } else { - right.push_back(rid); - } - } - } - } else { // all rows in [ibegin, iend) have missing values - if (default_left) { - for (size_t i = ibegin; i < iend; ++i) { - const size_t rid = rowset.begin[i]; - left.push_back(rid); - } - } else { - for (size_t i = ibegin; i < iend; ++i) { - const size_t rid = rowset.begin[i]; - right.push_back(rid); - } - } - } + // choice of the best splits + for (size_t i = 0; i < splits.size(); ++i) { + const int32_t node_idx = tasks[i].first; + const int32_t nid = nodes[node_idx].nid; + const int32_t sibling_nid = nodes[node_idx].sibling_nid; + snode_[nid].best.Update(splits[i].first); + if (sibling_nid > -1) { + snode_[sibling_nid].best.Update(splits[i].second); } } + + perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::EVALUATE_SPLIT); } void QuantileHistMaker::Builder::InitNewNode(int nid, const GHistIndexMatrix& gmat, const std::vector& gpair, const DMatrix& fmat, - const RegTree& tree) { - builder_monitor_.Start("InitNewNode"); - { - snode_.resize(tree.param.num_nodes, NodeEntry(param_)); - } - - { - auto& stats = snode_[nid].stats; - GHistRow hist = hist_[nid]; - if (tree[nid].IsRoot()) { - if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) { - const std::vector& row_ptr = gmat.cut.row_ptr; - const uint32_t ibegin = row_ptr[fid_least_bins_]; - const uint32_t iend = row_ptr[fid_least_bins_ + 1]; - auto begin = hist.data(); - for (uint32_t i = ibegin; i < iend; ++i) { - const GradStats et = begin[i]; - stats.Add(et.sum_grad, et.sum_hess); - } - } else { - const RowSetCollection::Elem e = row_set_collection_[nid]; - for (const size_t* it = e.begin; it < e.end; ++it) { - stats.Add(gpair[*it]); - } - } - histred_.Allreduce(&snode_[nid].stats, 1); - } else { - int parent_id = tree[nid].Parent(); - if (tree[nid].IsLeftChild()) { - snode_[nid].stats = snode_[parent_id].best.left_sum; - } else { - snode_[nid].stats = snode_[parent_id].best.right_sum; - } - } - } - + RegTree* tree, + QuantileHistMaker::NodeEntry* snode, + int32_t parentid) { // calculating the weights { - bst_uint parentid = tree[nid].Parent(); - snode_[nid].weight = static_cast( - spliteval_->ComputeWeight(parentid, snode_[nid].stats)); - snode_[nid].root_gain = static_cast( - spliteval_->ComputeScore(parentid, snode_[nid].stats, snode_[nid].weight)); + snode->weight = static_cast( + spliteval_->ComputeWeight(parentid, snode->stats)); + 
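For reference, with no monotone constraints and only L2 regularization active, the two evaluator calls in InitNewNode reduce to the familiar closed forms; a sketch under those assumptions (alpha and max_delta_step handling omitted, so this approximates the evaluator rather than reproducing it):

    // optimal leaf weight and its unpenalized quality for summed statistics
    double WeightSketch(double sum_grad, double sum_hess, double lambda) {
      return -sum_grad / (sum_hess + lambda);
    }
    double GainSketch(double sum_grad, double sum_hess, double lambda) {
      return (sum_grad * sum_grad) / (sum_hess + lambda);
    }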
snode->root_gain = static_cast( + spliteval_->ComputeScore(parentid, snode->stats, + snode->weight)); } - builder_monitor_.Stop("InitNewNode"); } // enumerate the split values of specific feature -void QuantileHistMaker::Builder::EnumerateSplit(int d_step, +bool QuantileHistMaker::Builder::EnumerateSplit(int d_step, const GHistIndexMatrix& gmat, const GHistRow& hist, const NodeEntry& snode, @@ -871,27 +1141,45 @@ void QuantileHistMaker::Builder::EnumerateSplit(int d_step, iend = static_cast(cut_ptr[fid]) - 1; } - for (int32_t i = ibegin; i != iend; i += d_step) { - // start working - // try to find a split - e.Add(hist[i].GetGrad(), hist[i].GetHess()); - if (e.sum_hess >= param_.min_child_weight) { - c.SetSubstract(snode.stats, e); - if (c.sum_hess >= param_.min_child_weight) { - bst_float loss_chg; - bst_float split_pt; - if (d_step > 0) { - // forward enumeration: split at right bound of each bin - loss_chg = static_cast( - spliteval_->ComputeSplitScore(nodeID, fid, e, c) - - snode.root_gain); + if (d_step == 1) { + for (int32_t i = ibegin; i < iend; i++) { + e.Add(hist[i].GetGrad(), hist[i].GetHess()); + if (e.sum_hess >= param_.min_child_weight) { + c.SetSubstract(snode.stats, e); + if (c.sum_hess >= param_.min_child_weight) { + bst_float loss_chg; + bst_float split_pt; + { + loss_chg = static_cast(spliteval_->ComputeSplitScore(nodeID, + fid, e, c) - snode.root_gain); + } + split_pt = cut_val[i]; best.Update(loss_chg, fid, split_pt, d_step == -1, e, c); - } else { + } + } + } + p_best->Update(best); + + if (e.GetGrad() == snode.stats.GetGrad() && e.GetHess() == snode.stats.GetHess()) { + return false; + } + } else { + for (int32_t i = ibegin; i != iend; i--) { + e.Add(hist[i].GetGrad(), hist[i].GetHess()); + if (e.sum_hess >= param_.min_child_weight) { + c.SetSubstract(snode.stats, e); + if (c.sum_hess >= param_.min_child_weight) { + bst_float loss_chg; + bst_float split_pt; + // backward enumeration: split at left bound of each bin - loss_chg = static_cast( - spliteval_->ComputeSplitScore(nodeID, fid, c, e) - - snode.root_gain); + { + loss_chg = static_cast( + spliteval_->ComputeSplitScore(nodeID, fid, c, e) - + snode.root_gain); + } + if (i == imin) { // for leftmost bin, left bound is the smallest feature value split_pt = gmat.cut.min_val[fid]; @@ -902,8 +1190,14 @@ void QuantileHistMaker::Builder::EnumerateSplit(int d_step, } } } + p_best->Update(best); + + if (e.GetGrad() == snode.stats.GetGrad() && e.GetHess() == snode.stats.GetHess()) { + return false; + } } - p_best->Update(best); + + return true; } XGBOOST_REGISTER_TREE_UPDATER(FastHistMaker, "grow_fast_histmaker") diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index 17688f86afcb..27b87b974e7e 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -1,8 +1,8 @@ /*! - * Copyright 2017-2018 by Contributors + * Copyright 2017-2019 by Contributors * \file updater_quantile_hist.h * \brief use quantized feature values to construct a tree - * \author Philip Cho, Tianqi Chen + * \author Philip Cho, Tianqi Chen, Egor Smirnov */ #ifndef XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_ #define XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_ @@ -15,54 +15,23 @@ #include #include #include +#include #include #include #include +#include #include "./param.h" #include "./split_evaluator.h" #include "../common/random.h" -#include "../common/timer.h" #include "../common/hist_util.h" #include "../common/row_set.h" #include "../common/column_matrix.h" namespace xgboost { - -/*! 
- * \brief A C-style array with in-stack allocation. As long as the array is smaller than MaxStackSize, it will be allocated inside the stack. Otherwise, it will be heap-allocated. - */ -template -class MemStackAllocator { - public: - explicit MemStackAllocator(size_t required_size): required_size_(required_size) { - } - - T* Get() { - if (!ptr_) { - if (MaxStackSize >= required_size_) { - ptr_ = stack_mem_; - } else { - ptr_ = reinterpret_cast(malloc(required_size_ * sizeof(T))); - do_free_ = true; - } - } - - return ptr_; - } - - ~MemStackAllocator() { - if (do_free_) free(ptr_); - } - - - private: - T* ptr_ = nullptr; - bool do_free_ = false; - size_t required_size_; - T stack_mem_[MaxStackSize]; -}; - +namespace common { + struct GradStatHist; +} namespace tree { using xgboost::common::HistCutMatrix; @@ -88,6 +57,7 @@ class QuantileHistMaker: public TreeUpdater { bool UpdatePredictionCache(const DMatrix* data, HostDeviceVector* out_preds) override; + protected: // training parameter TrainParam param_; @@ -100,6 +70,7 @@ class QuantileHistMaker: public TreeUpdater { bool is_gmat_initialized_; // data structure + public: struct NodeEntry { /*! \brief statics for node entry */ GradStats stats; @@ -111,7 +82,8 @@ class QuantileHistMaker: public TreeUpdater { SplitEntry best; // constructor explicit NodeEntry(const TrainParam& param) - : root_gain(0.0f), weight(0.0f) {} + : root_gain(0.0f), weight(0.0f) { + } }; // actual builder that runs the algorithm @@ -121,11 +93,8 @@ class QuantileHistMaker: public TreeUpdater { explicit Builder(const TrainParam& param, std::unique_ptr pruner, std::unique_ptr spliteval) - : param_(param), pruner_(std::move(pruner)), - spliteval_(std::move(spliteval)), p_last_tree_(nullptr), - p_last_fmat_(nullptr) { - builder_monitor_.Init("Quantile::Builder"); - } + : param_(param), pruner_(std::move(pruner)), spliteval_(std::move(spliteval)), + p_last_tree_(nullptr), p_last_fmat_(nullptr) { } // update one tree, growing virtual void Update(const GHistIndexMatrix& gmat, const GHistIndexBlockMatrix& gmatb, @@ -134,42 +103,104 @@ class QuantileHistMaker: public TreeUpdater { DMatrix* p_fmat, RegTree* p_tree); - inline void BuildHist(const std::vector& gpair, - const RowSetCollection::Elem row_indices, - const GHistIndexMatrix& gmat, - const GHistIndexBlockMatrix& gmatb, - GHistRow hist, - bool sync_hist) { - builder_monitor_.Start("BuildHist"); - if (param_.enable_feature_grouping > 0) { - hist_builder_.BuildBlockHist(gpair, row_indices, gmatb, hist); - } else { - hist_builder_.BuildHist(gpair, row_indices, gmat, hist); - } - if (sync_hist) { - this->histred_.Allreduce(hist.data(), hist_builder_.GetNumBins()); - } - builder_monitor_.Stop("BuildHist"); - } - - inline void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) { - builder_monitor_.Start("SubtractionTrick"); - hist_builder_.SubtractionTrick(self, sibling, parent); - builder_monitor_.Stop("SubtractionTrick"); - } - bool UpdatePredictionCache(const DMatrix* data, HostDeviceVector* p_out_preds); + std::tuple + GetHistBuffer(std::vector* hist_is_init, + std::vector* grad_stats, size_t block_id, size_t nthread, + size_t tid, std::vector* data_hist, size_t hist_size); + protected: /* tree growing policies */ struct ExpandEntry { int nid; + int sibling_nid; + int parent_nid; int depth; bst_float loss_chg; unsigned timestamp; - ExpandEntry(int nid, int depth, bst_float loss_chg, unsigned tstmp) - : nid(nid), depth(depth), loss_chg(loss_chg), timestamp(tstmp) {} + ExpandEntry(int nid, int sibling_nid, 
int parent_nid, int depth, bst_float loss_chg, + unsigned tstmp) : nid(nid), sibling_nid(sibling_nid), parent_nid(parent_nid), + depth(depth), loss_chg(loss_chg), timestamp(tstmp) {} + }; + + struct TreeGrowingPerfMonitor { + enum timer_name {INIT_DATA, INIT_NEW_NODE, BUILD_HIST, EVALUATE_SPLIT, APPLY_SPLIT}; + + double global_start; + + // performance counters + double tstart; + double time_init_data = 0; + double time_init_new_node = 0; + double time_build_hist = 0; + double time_evaluate_split = 0; + double time_apply_split = 0; + + inline void StartPerfMonitor() { + global_start = dmlc::GetTime(); + } + + inline void EndPerfMonitor() { + CHECK_GT(global_start, 0); + double total_time = dmlc::GetTime() - global_start; + LOG(INFO) << "\nInitData: " + << std::fixed << std::setw(6) << std::setprecision(4) << time_init_data + << " (" << std::fixed << std::setw(5) << std::setprecision(2) + << time_init_data / total_time * 100 << "%)\n" + << "InitNewNode: " + << std::fixed << std::setw(6) << std::setprecision(4) << time_init_new_node + << " (" << std::fixed << std::setw(5) << std::setprecision(2) + << time_init_new_node / total_time * 100 << "%)\n" + << "BuildHist: " + << std::fixed << std::setw(6) << std::setprecision(4) << time_build_hist + << " (" << std::fixed << std::setw(5) << std::setprecision(2) + << time_build_hist / total_time * 100 << "%)\n" + << "EvaluateSplit: " + << std::fixed << std::setw(6) << std::setprecision(4) << time_evaluate_split + << " (" << std::fixed << std::setw(5) << std::setprecision(2) + << time_evaluate_split / total_time * 100 << "%)\n" + << "ApplySplit: " + << std::fixed << std::setw(6) << std::setprecision(4) << time_apply_split + << " (" << std::fixed << std::setw(5) << std::setprecision(2) + << time_apply_split / total_time * 100 << "%)\n" + << "========================================\n" + << "Total: " + << std::fixed << std::setw(6) << std::setprecision(4) << total_time << std::endl; + // clear performance counters + time_init_data = 0; + time_init_new_node = 0; + time_build_hist = 0; + time_evaluate_split = 0; + time_apply_split = 0; + } + + inline void TickStart() { + tstart = dmlc::GetTime(); + } + + inline void UpdatePerfTimer(const timer_name &timer_name) { + // CHECK_GT(tstart, 0); // TODO Fix + switch (timer_name) { + case INIT_DATA: + time_init_data += dmlc::GetTime() - tstart; + break; + case INIT_NEW_NODE: + time_init_new_node += dmlc::GetTime() - tstart; + break; + case BUILD_HIST: + time_build_hist += dmlc::GetTime() - tstart; + break; + case EVALUATE_SPLIT: + time_evaluate_split += dmlc::GetTime() - tstart; + break; + case APPLY_SPLIT: + time_apply_split += dmlc::GetTime() - tstart; + break; + } + tstart = -1; + } }; // initialize temp data structure @@ -178,43 +209,16 @@ class QuantileHistMaker: public TreeUpdater { const DMatrix& fmat, const RegTree& tree); - void EvaluateSplit(const int nid, - const GHistIndexMatrix& gmat, - const HistCollection& hist, - const DMatrix& fmat, - const RegTree& tree); - - void ApplySplit(int nid, - const GHistIndexMatrix& gmat, - const ColumnMatrix& column_matrix, - const HistCollection& hist, - const DMatrix& fmat, - RegTree* p_tree); - - void ApplySplitDenseData(const RowSetCollection::Elem rowset, - const GHistIndexMatrix& gmat, - std::vector* p_row_split_tloc, - const Column& column, - bst_int split_cond, - bool default_left); - - void ApplySplitSparseData(const RowSetCollection::Elem rowset, - const GHistIndexMatrix& gmat, - std::vector* p_row_split_tloc, - const Column& column, - bst_uint lower_bound, - 
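TreeGrowingPerfMonitor above replaces the generic common::Monitor with fixed per-phase counters. Its call pattern, as Update() in updater_quantile_hist.cc uses it: bracket each phase with TickStart()/UpdatePerfTimer(phase), and frame the whole tree build with StartPerfMonitor()/EndPerfMonitor(), which logs the breakdown and resets the counters.

    perf_monitor.StartPerfMonitor();
    perf_monitor.TickStart();
    this->InitData(gmat, gpair_h, *p_fmat, *p_tree);
    perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::INIT_DATA);
    // ... BUILD_HIST, EVALUATE_SPLIT and APPLY_SPLIT phases follow the same shape ...
    perf_monitor.EndPerfMonitor();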
bst_uint upper_bound, - bst_int split_cond, - bool default_left); - void InitNewNode(int nid, const GHistIndexMatrix& gmat, const std::vector& gpair, const DMatrix& fmat, - const RegTree& tree); + RegTree* tree, + QuantileHistMaker::NodeEntry* snode, + int32_t parentid); // enumerate the split values of specific feature - void EnumerateSplit(int d_step, + bool EnumerateSplit(int d_step, const GHistIndexMatrix& gmat, const GHistRow& hist, const NodeEntry& snode, @@ -223,6 +227,30 @@ class QuantileHistMaker: public TreeUpdater { bst_uint fid, bst_uint nodeID); + void EvaluateSplitsBatch(const std::vector& nodes, + const GHistIndexMatrix& gmat, + const DMatrix& fmat, + const std::vector>& hist_is_init, + const std::vector>& hist_buffers, + RegTree* p_tree); + + void ReduceHistograms( + common::GradStatHist::GradType* hist_data, + common::GradStatHist::GradType* sibling_hist_data, + common::GradStatHist::GradType* parent_hist_data, + const size_t ibegin, + const size_t iend, + const size_t inode, + const std::vector>& hist_is_init, + const std::vector>& hist_buffers); + + void SyncHistograms( + RegTree* p_tree, + const std::vector& nodes, + std::vector>* hist_buffers, + std::vector>* hist_is_init, + const std::vector>& grad_stats); + void ExpandWithDepthWidth(const GHistIndexMatrix &gmat, const GHistIndexBlockMatrix &gmatb, const ColumnMatrix &column_matrix, @@ -230,30 +258,6 @@ class QuantileHistMaker: public TreeUpdater { RegTree *p_tree, const std::vector &gpair_h); - void BuildLocalHistograms(int *starting_index, - int *sync_count, - const GHistIndexMatrix &gmat, - const GHistIndexBlockMatrix &gmatb, - RegTree *p_tree, - const std::vector &gpair_h); - - void SyncHistograms(int starting_index, - int sync_count, - RegTree *p_tree); - - void BuildNodeStats(const GHistIndexMatrix &gmat, - DMatrix *p_fmat, - RegTree *p_tree, - const std::vector &gpair_h); - - void EvaluateSplits(const GHistIndexMatrix &gmat, - const ColumnMatrix &column_matrix, - DMatrix *p_fmat, - RegTree *p_tree, - int *num_leaves, - int depth, - unsigned *timestamp, - std::vector *temp_qexpand_depth); void ExpandWithLossGuide(const GHistIndexMatrix& gmat, const GHistIndexBlockMatrix& gmatb, @@ -262,6 +266,54 @@ class QuantileHistMaker: public TreeUpdater { RegTree* p_tree, const std::vector& gpair_h); + + void BuildHistsBatch(const std::vector& nodes, RegTree* tree, + const GHistIndexMatrix &gmat, const std::vector& gpair, + std::vector>* hist_buffers, + std::vector>* hist_is_init); + + void BuildNodeStat(const GHistIndexMatrix &gmat, + DMatrix *p_fmat, + RegTree *p_tree, + const std::vector &gpair_h, + int32_t nid); + + void BuildNodeStatBatch( + const GHistIndexMatrix &gmat, + DMatrix *p_fmat, + RegTree *p_tree, + const std::vector &gpair_h, + const std::vector& nodes); + + void CreateNewNodes(const GHistIndexMatrix &gmat, + const ColumnMatrix &column_matrix, + DMatrix *p_fmat, + RegTree *p_tree, + int *num_leaves, + int depth, + unsigned *timestamp, + std::vector *temp_qexpand_depth, + int32_t nid, + std::mutex* mutex_add_nodes, + const QuantileHistMaker::NodeEntry& snode, + RegTree::Node node); + + int32_t FindSplitCond(int32_t nid, + RegTree *p_tree, + const GHistIndexMatrix &gmat); + + void CreateNewNodesBatch( + const std::vector& nodes, + const GHistIndexMatrix &gmat, + const ColumnMatrix &column_matrix, + DMatrix *p_fmat, + RegTree *p_tree, + int *num_leaves, + int depth, + unsigned *timestamp, + std::vector *temp_qexpand_depth); + + inline static bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) { if (lhs.loss_chg 
== rhs.loss_chg) {
       return lhs.timestamp > rhs.timestamp;  // favor small timestamp
@@ -270,6 +322,8 @@ class QuantileHistMaker: public TreeUpdater {
       }
     }

+    HistCollection hist_buff_;
+
     // --data fields--
     const TrainParam& param_;
     // number of omp thread used during training
@@ -280,6 +334,7 @@ class QuantileHistMaker: public TreeUpdater {
     // the temp space for split
     std::vector row_split_tloc_;
     std::vector best_split_tloc_;
+    std::vector buffer_for_partition_;
     /*! \brief TreeNode Data: statistics for each constructed node */
     std::vector snode_;
     /*! \brief cumulative histogram of gradients. */
@@ -311,8 +366,8 @@ class QuantileHistMaker: public TreeUpdater {
     enum DataLayout { kDenseDataZeroBased, kDenseDataOneBased, kSparseData };
     DataLayout data_layout_;

-    common::Monitor builder_monitor_;
-    rabit::Reducer histred_;
+    TreeGrowingPerfMonitor perf_monitor;
+    rabit::Reducer histred_;
   };

   std::unique_ptr builder_;

diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc
index f1f56719863d..40a28100d4a4 100644
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@@ -101,8 +101,13 @@ class QuantileHistMock : public QuantileHistMaker {
       RealImpl::InitData(gmat, gpair, fmat, tree);
       GHistIndexBlockMatrix dummy;
       hist_.AddHistRow(nid);
-      BuildHist(gpair, row_set_collection_[nid],
-                gmat, dummy, hist_[nid], false);
+
+      std::vector> hist_buffers;
+      std::vector> hist_is_init;
+      std::vector nodes = {ExpandEntry(nid, -1, -1, tree.GetDepth(0), 0.0, 0)};
+      BuildHistsBatch(nodes, const_cast(&tree), gmat, gpair, &hist_buffers, &hist_is_init);
+      RealImpl::InitNewNode(nid, gmat, gpair, fmat, const_cast(&tree), &snode_[0], tree[0].Parent());
+      EvaluateSplitsBatch(nodes, gmat, fmat, hist_is_init, hist_buffers, const_cast(&tree));

       // Check if number of histogram bins is correct
       ASSERT_EQ(hist_[nid].size(), gmat.cut.row_ptr.back());
@@ -143,10 +148,12 @@ class QuantileHistMock : public QuantileHistMaker {
       RealImpl::InitData(gmat, row_gpairs, *(*dmat), tree);
       hist_.AddHistRow(0);
-      BuildHist(row_gpairs, row_set_collection_[0],
-                gmat, quantile_index_block, hist_[0], false);
-
-      RealImpl::InitNewNode(0, gmat, row_gpairs, *(*dmat), tree);
+      std::vector nodes = {ExpandEntry(0, -1, -1, tree.GetDepth(0), 0.0, 0)};
+      std::vector> hist_buffers;
+      std::vector> hist_is_init;
+      BuildHistsBatch(nodes, const_cast(&tree), gmat, row_gpairs, &hist_buffers, &hist_is_init);
+      RealImpl::InitNewNode(0, gmat, row_gpairs, *(*dmat), const_cast(&tree), &snode_[0], tree[0].Parent());
+      EvaluateSplitsBatch(nodes, gmat, **dmat, hist_is_init, hist_buffers, const_cast(&tree));

       /* Compute correct split (best_split) using the computed histogram */
       const size_t num_row = dmat->get()->Info().num_row_;
@@ -197,6 +204,7 @@ class QuantileHistMock : public QuantileHistMaker {
           const auto split_gain = evaluator->ComputeSplitScore(0, fid,
                                                                GradStats(left_sum),
                                                                GradStats(right_sum));
+
           if (split_gain > best_split_gain) {
             best_split_gain = split_gain;
             best_split_feature = fid;
@@ -206,7 +214,8 @@ class QuantileHistMock : public QuantileHistMaker {
       }

       /* Now compare against result given by EvaluateSplit() */
-      RealImpl::EvaluateSplit(0, gmat, hist_, *(*dmat), tree);
+      EvaluateSplitsBatch(nodes, gmat, **dmat, hist_is_init, hist_buffers, const_cast(&tree));
+
       ASSERT_EQ(snode_[0].best.SplitIndex(), best_split_feature);
       ASSERT_EQ(snode_[0].best.split_value, gmat.cut.cut[best_split_threshold]);

@@ -289,7 +298,7 @@ TEST(Updater, QuantileHist_EvalSplits) {
   std::vector> cfg
       {{"num_feature", std::to_string(QuantileHistMock::GetNumColumns())},
        {"split_evaluator", "elastic_net"},
-       {"reg_lambda", "0"}, {"reg_alpha", "0"}, {"max_delta_step", "0"},
+       {"reg_lambda", "1.0f"}, {"reg_alpha", "0"}, {"max_delta_step", "0"},
        {"min_child_weight", "0"}};
   QuantileHistMock maker(cfg);
   maker.TestEvaluateSplit();
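As a quick sanity check of the LossGuide ordering above (larger loss change expands first; among ties, the smaller timestamp wins), the comparator drops straight into a std::priority_queue. A self-contained sketch with illustrative types:

    #include <cstdio>
    #include <queue>
    #include <vector>

    struct Entry { int nid; float loss_chg; unsigned timestamp; };

    struct LossGuideCmp {
      // returns true when `a` has lower expansion priority than `b`
      bool operator()(const Entry& a, const Entry& b) const {
        if (a.loss_chg == b.loss_chg) return a.timestamp > b.timestamp;  // favor small timestamp
        return a.loss_chg < b.loss_chg;  // favor large loss change
      }
    };

    int main() {
      std::priority_queue<Entry, std::vector<Entry>, LossGuideCmp> qexpand;
      qexpand.push({0, 0.5f, 0});
      qexpand.push({1, 0.5f, 1});
      qexpand.push({2, 0.9f, 2});
      while (!qexpand.empty()) {  // prints 2, then 0, then 1
        std::printf("%d\n", qexpand.top().nid);
        qexpand.pop();
      }
      return 0;
    }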
From f4dd54a7fad719bb55121e65fca2e8cc2e566841 Mon Sep 17 00:00:00 2001
From: egor
Date: Sun, 2 Jun 2019 21:22:33 +0300
Subject: [PATCH 17/31] applying review comments

---
 src/common/hist_util.cc              |   4 +
 src/common/hist_util.h               | 118 ++++++++++++---------------
 src/tree/updater_quantile_hist.cc    |  18 ++--
 src/tree/updater_quantile_hist.h     |   4 +-
 tests/cpp/tree/test_quantile_hist.cc |   6 +-
 5 files changed, 67 insertions(+), 83 deletions(-)

diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index 1bbe3747c86a..edcff201e0ad 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -496,6 +496,8 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
   }
 }

+// used when the data layout is kDenseDataZeroBased or kDenseDataOneBased,
+// meaning that "row_ptr" is not needed for hist computations
 void BuildHistLocalDense(size_t istart, size_t iend, size_t nrows, const size_t* rid,
     const uint32_t* index, const GradientPair::ValueT* pgh, const size_t* row_ptr,
     GradStatHist::GradType* data_local_hist, GradStatHist* grad_stat_global) {
@@ -548,6 +550,8 @@ void BuildHistLocalDense(size_t istart, size_t iend, size_t nrows, const size_t*
   grad_stat_global->Add(grad_stat);
 }

+// used when the data layout is kSparseData,
+// meaning that "row_ptr" is needed for hist computations
 void BuildHistLocalSparse(size_t istart, size_t iend, size_t nrows, const size_t* rid,
     const uint32_t* index, const GradientPair::ValueT* pgh, const size_t* row_ptr,
     GradStatHist::GradType* data_local_hist, GradStatHist* grad_stat_global) {
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index 25efe7bbb2ce..72cbebc717b7 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -55,10 +55,6 @@ class MemStackAllocator {
   T stack_mem_[MaxStackSize];
 };

-namespace tree {
-class SplitEvaluator;
-}
-
 namespace common {

 /*
@@ -155,7 +151,7 @@ struct HistCutMatrix {
 };

 /*! \brief Builds the cut matrix on the GPU.
- *
+ *
  * \return The row stride across the entire dataset.
  */
 size_t DeviceSketch
@@ -314,10 +310,8 @@ class HistCollection {
  public:
   // access histogram for i-th node
   inline GHistRow operator[](bst_uint nid) {
-    if (nid >= data_arr_.size()) {
-      AddHistRow(nid);
-    }
-    return { const_cast(dmlc::BeginPtr(*data_arr_[nid])), nbins_};
+    AddHistRow(nid);
+    return { const_cast(dmlc::BeginPtr(data_arr_[nid])), nbins_};
   }

   // have we computed a histogram for i-th node?
@@ -328,39 +322,27 @@ class HistCollection {
   // initialize histogram collection
   inline void Init(uint32_t nbins) {
     if (nbins_ != nbins) {
-      for (size_t i = 0; i < data_arr_.size(); ++i) {
-        delete data_arr_[i];
-      }
       data_arr_.clear();
       nbins_ = nbins;
     }
   }

-  ~HistCollection() {
-    for (size_t i = 0; i < data_arr_.size(); ++i) {
-      delete data_arr_[i];
-    }
-  }
-
   // create an empty histogram for i-th node
   inline void AddHistRow(bst_uint nid) {
     if (data_arr_.size() <= nid) {
-      data_arr_.resize(nid + 1, nullptr);
-    }
-
-    if (data_arr_[nid] == nullptr) {
-      data_arr_[nid] = new std::vector;
-    }
+      size_t prev = data_arr_.size();
+      data_arr_.resize(nid + 1);

-    if (data_arr_[nid]->size() == 0) {
-      data_arr_[nid]->resize(nbins_);
+      for(size_t i = prev; i < data_arr_.size(); ++i) {
+        data_arr_[i].resize(nbins_);
+      }
     }
   }

 private:
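Why the value-semantics rewrite above is enough on its own is easier to see in isolation. A stripped-down sketch of the same storage scheme (the class name and the float element type are placeholders for this illustration):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    class HistCollectionSketch {
     public:
      explicit HistCollectionSketch(uint32_t nbins) : nbins_(nbins) {}

      // grows the per-node storage lazily, so a caller no longer has to pair
      // every access with an explicit AddHistRow()
      float* operator[](uint32_t nid) {
        if (data_arr_.size() <= nid) {
          const size_t prev = data_arr_.size();
          data_arr_.resize(nid + 1);
          for (size_t i = prev; i < data_arr_.size(); ++i) {
            data_arr_[i].resize(2 * nbins_);  // one grad and one hess slot per bin
          }
        }
        return data_arr_[nid].data();
      }

     private:
      uint32_t nbins_ = 0;
      std::vector<std::vector<float>> data_arr_;  // owns its memory: no destructor needed
    };

Holding the vectors by value rather than behind raw pointers is exactly what lets the patch delete ~HistCollection() and the delete loops in Init().

  /*! 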
\brief number of all bins over all features */ uint32_t nbins_ = 0; - std::vector*> data_arr_; + std::vector> data_arr_; }; @@ -375,53 +357,53 @@ class GHistBuilder { nbins_ = nbins; } -void BuildBlockHist(const std::vector& gpair, - const RowSetCollection::Elem row_indices, - const GHistIndexBlockMatrix& gmatb, - GHistRow hist) { - constexpr int kUnroll = 8; // loop unrolling factor - const int32_t nblock = gmatb.GetNumBlock(); - const size_t nrows = row_indices.end - row_indices.begin; - const size_t rest = nrows % kUnroll; - - #pragma omp parallel for - for (int32_t bid = 0; bid < nblock; ++bid) { - auto gmat = gmatb[bid]; - - for (size_t i = 0; i < nrows - rest; i += kUnroll) { - size_t rid[kUnroll]; - size_t ibegin[kUnroll]; - size_t iend[kUnroll]; - GradientPair stat[kUnroll]; - for (int k = 0; k < kUnroll; ++k) { - rid[k] = row_indices.begin[i + k]; - } - for (int k = 0; k < kUnroll; ++k) { - ibegin[k] = gmat.row_ptr[rid[k]]; - iend[k] = gmat.row_ptr[rid[k] + 1]; - } - for (int k = 0; k < kUnroll; ++k) { - stat[k] = gpair[rid[k]]; + void BuildBlockHist(const std::vector& gpair, + const RowSetCollection::Elem row_indices, + const GHistIndexBlockMatrix& gmatb, + GHistRow hist) { + constexpr int kUnroll = 8; // loop unrolling factor + const int32_t nblock = gmatb.GetNumBlock(); + const size_t nrows = row_indices.end - row_indices.begin; + const size_t rest = nrows % kUnroll; + + #pragma omp parallel for + for (int32_t bid = 0; bid < nblock; ++bid) { + auto gmat = gmatb[bid]; + + for (size_t i = 0; i < nrows - rest; i += kUnroll) { + size_t rid[kUnroll]; + size_t ibegin[kUnroll]; + size_t iend[kUnroll]; + GradientPair stat[kUnroll]; + for (int k = 0; k < kUnroll; ++k) { + rid[k] = row_indices.begin[i + k]; + } + for (int k = 0; k < kUnroll; ++k) { + ibegin[k] = gmat.row_ptr[rid[k]]; + iend[k] = gmat.row_ptr[rid[k] + 1]; + } + for (int k = 0; k < kUnroll; ++k) { + stat[k] = gpair[rid[k]]; + } + for (int k = 0; k < kUnroll; ++k) { + for (size_t j = ibegin[k]; j < iend[k]; ++j) { + const uint32_t bin = gmat.index[j]; + hist[bin].Add(stat[k]); + } + } } - for (int k = 0; k < kUnroll; ++k) { - for (size_t j = ibegin[k]; j < iend[k]; ++j) { + for (size_t i = nrows - rest; i < nrows; ++i) { + const size_t rid = row_indices.begin[i]; + const size_t ibegin = gmat.row_ptr[rid]; + const size_t iend = gmat.row_ptr[rid + 1]; + const GradientPair stat = gpair[rid]; + for (size_t j = ibegin; j < iend; ++j) { const uint32_t bin = gmat.index[j]; - hist[bin].Add(stat[k]); + hist[bin].Add(stat); } } } - for (size_t i = nrows - rest; i < nrows; ++i) { - const size_t rid = row_indices.begin[i]; - const size_t ibegin = gmat.row_ptr[rid]; - const size_t iend = gmat.row_ptr[rid + 1]; - const GradientPair stat = gpair[rid]; - for (size_t j = ibegin; j < iend; ++j) { - const uint32_t bin = gmat.index[j]; - hist[bin].Add(stat); - } - } } -} uint32_t GetNumBins() { return nbins_; diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 7f6ac26b2b1e..e5b01919651f 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -457,7 +457,7 @@ void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector& hist_is_init->resize(nodes.size()); // input data for tasks - int32_t n_tasks = 0; + int32_t n_hist_buidling_tasks = 0; std::vector task_nid; std::vector task_node_idx; std::vector task_block_idx; @@ -479,12 +479,13 @@ void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector& const size_t n_local_blocks = nrows / block_size_rows + !!(nrows % 
block_size_rows); const size_t n_local_histograms = std::min(nthread, n_local_blocks); + + task_nid.resize(task_nid.size() + n_local_blocks, nid); for (size_t j = 0; j < n_local_blocks; ++j) { - task_nid.push_back(nid); task_node_idx.push_back(i); task_block_idx.push_back(j); } - n_tasks += n_local_blocks; + n_hist_buidling_tasks += n_local_blocks; (*hist_buffers)[i].clear(); for (size_t j = 0; j < n_local_histograms; j++) { @@ -500,7 +501,7 @@ void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector& // execute tasks in parallel #pragma omp parallel for schedule(guided) - for (int32_t itask = 0; itask < n_tasks; ++itask) { + for (int32_t itask = 0; itask < n_hist_buidling_tasks; ++itask) { const size_t tid = omp_get_thread_num(); const int32_t nid = task_nid[itask]; const int32_t block_id = task_block_idx[itask]; @@ -654,7 +655,7 @@ void QuantileHistMaker::Builder::ExpandWithDepthWidth( BuildHistsBatch(qexpand_depth_wise_, p_tree, gmat, gpair_h, &hist_buffers, &hist_is_init); BuildNodeStatBatch(gmat, p_fmat, p_tree, gpair_h, qexpand_depth_wise_); - EvaluateSplitsBatch(qexpand_depth_wise_, gmat, *p_fmat, hist_is_init, hist_buffers, p_tree); + EvaluateSplitsBatch(qexpand_depth_wise_, gmat, *p_fmat, hist_is_init, hist_buffers); CreateNewNodesBatch(qexpand_depth_wise_, gmat, column_matrix, p_fmat, p_tree, &num_leaves, depth, ×tamp, &temp_qexpand_depth); @@ -691,7 +692,7 @@ void QuantileHistMaker::Builder::ExpandWithLossGuide( BuildHistsBatch(nodes_to_build, p_tree, gmat, gpair_h, &hist_buffers, &hist_is_init); BuildNodeStatBatch(gmat, p_fmat, p_tree, gpair_h, nodes_to_build); - EvaluateSplitsBatch(nodes_to_build, gmat, *p_fmat, hist_is_init, hist_buffers, p_tree); + EvaluateSplitsBatch(nodes_to_build, gmat, *p_fmat, hist_is_init, hist_buffers); qexpand_loss_guided_->push(ExpandEntry(nid, -1, -1, p_tree->GetDepth(nid), snode_[nid].best.loss_chg, @@ -713,7 +714,7 @@ void QuantileHistMaker::Builder::ExpandWithLossGuide( if (!successors.empty()) { BuildHistsBatch(successors, p_tree, gmat, gpair_h, &hist_buffers, &hist_is_init); BuildNodeStatBatch(gmat, p_fmat, p_tree, gpair_h, successors); - EvaluateSplitsBatch(successors, gmat, *p_fmat, hist_is_init, hist_buffers, p_tree); + EvaluateSplitsBatch(successors, gmat, *p_fmat, hist_is_init, hist_buffers); const int32_t cleft = (*p_tree)[nid].LeftChild(); const int32_t cright = (*p_tree)[nid].RightChild(); @@ -1002,8 +1003,7 @@ void QuantileHistMaker::Builder::EvaluateSplitsBatch( const GHistIndexMatrix& gmat, const DMatrix& fmat, const std::vector>& hist_is_init, - const std::vector>& hist_buffers, - RegTree* p_tree) { + const std::vector>& hist_buffers) { perf_monitor.TickStart(); const MetaInfo& info = fmat.Info(); // prepare tasks diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index 27b87b974e7e..3d7f5b2427b9 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -231,8 +230,7 @@ class QuantileHistMaker: public TreeUpdater { const GHistIndexMatrix& gmat, const DMatrix& fmat, const std::vector>& hist_is_init, - const std::vector>& hist_buffers, - RegTree* p_tree); + const std::vector>& hist_buffers); void ReduceHistograms( common::GradStatHist::GradType* hist_data, diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index 40a28100d4a4..3d0c09e6a7ba 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -107,7 +107,7 
@@ class QuantileHistMock : public QuantileHistMaker {
       std::vector nodes = {ExpandEntry(nid, -1, -1, tree.GetDepth(0), 0.0, 0)};
       BuildHistsBatch(nodes, const_cast(&tree), gmat, gpair, &hist_buffers, &hist_is_init);
       RealImpl::InitNewNode(nid, gmat, gpair, fmat, const_cast(&tree), &snode_[0], tree[0].Parent());
-      EvaluateSplitsBatch(nodes, gmat, fmat, hist_is_init, hist_buffers, const_cast(&tree));
+      EvaluateSplitsBatch(nodes, gmat, fmat, hist_is_init, hist_buffers);

       // Check if number of histogram bins is correct
       ASSERT_EQ(hist_[nid].size(), gmat.cut.row_ptr.back());
@@ -153,7 +153,7 @@ class QuantileHistMock : public QuantileHistMaker {
       std::vector> hist_is_init;
       BuildHistsBatch(nodes, const_cast(&tree), gmat, row_gpairs, &hist_buffers, &hist_is_init);
       RealImpl::InitNewNode(0, gmat, row_gpairs, *(*dmat), const_cast(&tree), &snode_[0], tree[0].Parent());
-      EvaluateSplitsBatch(nodes, gmat, **dmat, hist_is_init, hist_buffers, const_cast(&tree));
+      EvaluateSplitsBatch(nodes, gmat, **dmat, hist_is_init, hist_buffers);

       /* Compute correct split (best_split) using the computed histogram */
       const size_t num_row = dmat->get()->Info().num_row_;
@@ -214,7 +214,7 @@ class QuantileHistMock : public QuantileHistMaker {
       }

       /* Now compare against result given by EvaluateSplit() */
-      EvaluateSplitsBatch(nodes, gmat, **dmat, hist_is_init, hist_buffers, const_cast(&tree));
+      EvaluateSplitsBatch(nodes, gmat, **dmat, hist_is_init, hist_buffers);

       ASSERT_EQ(snode_[0].best.SplitIndex(), best_split_feature);
       ASSERT_EQ(snode_[0].best.split_value, gmat.cut.cut[best_split_threshold]);

From d2fc8781d9c579e5f17be16843d6764dfe82bc8c Mon Sep 17 00:00:00 2001
From: egor
Date: Wed, 12 Jun 2019 23:57:06 +0300
Subject: [PATCH 18/31] add some comments, code refactoring

---
 src/common/hist_util.cc           |  12 +-
 src/common/hist_util.h            |   2 +-
 src/tree/updater_quantile_hist.cc | 254 +++++++++++++++++-------------
 src/tree/updater_quantile_hist.h  |  21 +++
 4 files changed, 179 insertions(+), 110 deletions(-)

diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index edcff201e0ad..349759ee7f13 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -511,7 +511,11 @@ void BuildHistLocalDense(size_t istart, size_t iend, size_t nrows, const size_t*
   size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid);
   no_prefetch_size = no_prefetch_size > nrows ? nrows : no_prefetch_size;

-  if (iend < nrows - no_prefetch_size) {
+  // if every row of this block of the bin-matrix is read, the block is dense
+  // and we don't need software prefetching in this case
+  const bool denseBlock = (rid[iend-1] - rid[istart]) == (iend - istart - 1);
+
+  if (iend < nrows - no_prefetch_size && !denseBlock) {
     for (size_t i = istart; i < iend; ++i) {
       const size_t icol_start = rid[i] * n_features;
       const size_t icol_start_prefetch = rid[i+prefetch_offset] * n_features;
@@ -564,7 +568,11 @@ void BuildHistLocalSparse(size_t istart, size_t iend, size_t nrows, const size_t*
   size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid);
   no_prefetch_size = no_prefetch_size > nrows ? nrows : no_prefetch_size;

-  if (iend < nrows - no_prefetch_size) {
+  // if every row of this block of the bin-matrix is read, the block is dense
+  // and we don't need software prefetching in this case
+  const bool denseBlock = (rid[iend-1] - rid[istart]) == (iend - istart - 1);
+
+  if (iend < nrows - no_prefetch_size && !denseBlock) {
     for (size_t i = istart; i < iend; ++i) {
       const size_t icol_start = row_ptr[rid[i]];
      const size_t icol_end = row_ptr[rid[i]+1];
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index 72cbebc717b7..72a6958528b4 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -333,7 +333,7 @@ class HistCollection {
       size_t prev = data_arr_.size();
       data_arr_.resize(nid + 1);

-      for(size_t i = prev; i < data_arr_.size(); ++i) {
+      for (size_t i = prev; i < data_arr_.size(); ++i) {
         data_arr_[i].resize(nbins_);
       }
diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index e5b01919651f..a08c0d549c4f 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -41,7 +41,7 @@ void QuantileHistMaker::Init(const std::vector& nodes,
-    const GHistIndexMatrix &gmat,
-    const ColumnMatrix &column_matrix,
-    DMatrix *p_fmat,
-    RegTree *p_tree,
-    int *num_leaves,
-    int depth,
-    unsigned *timestamp,
-    std::vector *temp_qexpand_depth) {
-  perf_monitor.TickStart();
-  const size_t block_size = 2048;
-  // inputs for tasks
-  std::vector nids_to_split;
-  std::vector cond_to_split;
-  std::vector n_blocks_vec;
-  std::vector> tasks;
+// split the rows of each node into blocks of rows
+// for subsequent parallel execution
+template
+void QuantileHistMaker::Builder::CreateTasksForApplySplit(
+    const std::vector& nodes,
+    const GHistIndexMatrix &gmat,
+    RegTree *p_tree,
+    int *num_leaves,
+    const int depth,
+    const size_t block_size,
+    std::vector* tasks,
+    std::vector>* nodes_bounds) {
   size_t* buffer = buffer_for_partition_.data();
   size_t cur_buff_offset = 0;

-  // buffers to store partial results
-  std::vector>> buffers_by_nids;
-  std::vector>> sizes_by_nids;
-
   auto create_nodes = [&](int32_t this_nid) {
     if (snode_[this_nid].best.loss_chg < kRtEps ||
         (param_.max_depth > 0 && depth == param_.max_depth) ||
         (param_.max_leaves > 0 && (*num_leaves) == param_.max_leaves)) {
       (*p_tree)[this_nid].SetLeaf(snode_[this_nid].weight * param_.learning_rate);
     } else {
-      nids_to_split.push_back(this_nid);
-      cond_to_split.push_back(FindSplitCond(this_nid, p_tree, gmat));
-
       const size_t nrows = row_set_collection_[this_nid].Size();
       const size_t n_blocks = nrows / block_size + !!(nrows % block_size);
-      n_blocks_vec.push_back(n_blocks);
-      buffers_by_nids.resize(buffers_by_nids.size() + 1);
+      nodes_bounds->emplace_back(tasks->size(), tasks->size() + n_blocks);
+
+      int32_t split_cond = FindSplitCond(this_nid, p_tree, gmat);

       for (size_t i = 0; i < n_blocks; ++i) {
         const size_t istart = i*block_size;
         const size_t iend = (i == n_blocks-1) ? 
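The contiguity test introduced above relies on row ids within a node being stored in increasing order: a block istart..iend-1 is contiguous exactly when rid[iend-1] - rid[istart] == iend - istart - 1, and contiguous reads are already served well by the hardware prefetcher. Below is a self-contained sketch of the check combined with the PREFETCH_READ_T0 macro defined at the start of the series; the real code splits this into two separate loops, and the caller guarantees that i + prefetch_offset stays in bounds via the no_prefetch_size logic:

    #include <cstddef>

    #if defined(_MSC_VER) || defined(__INTEL_COMPILER)
      #include <xmmintrin.h>
      #define PREFETCH_READ_T0(addr) _mm_prefetch((char *)(addr), _MM_HINT_T0)
    #else
      #define PREFETCH_READ_T0(addr) __builtin_prefetch((char *)(addr), 0, 3)
    #endif

    void WalkRows(const size_t* rid, const size_t* row_ptr,
                  size_t istart, size_t iend, size_t prefetch_offset) {
      // dense block: every row between rid[istart] and rid[iend-1] is present
      const bool dense_block = (rid[iend - 1] - rid[istart]) == (iend - istart - 1);
      for (size_t i = istart; i < iend; ++i) {
        if (!dense_block) {
          PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]);  // software prefetch
        }
        // ... accumulate histogram bins for row rid[i] ...
      }
    }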
nrows : istart + block_size; - buffers_by_nids.back().push_back({ buffer + cur_buff_offset, - buffer + cur_buff_offset + (iend-istart) }); + + TaskType task {this_nid, split_cond, n_blocks, i, istart, iend, nodes_bounds.size()-1, + buffer + cur_buff_offset, buffer + cur_buff_offset + (iend-istart), 0, 0}; + tasks->push_back(task); cur_buff_offset += 2*(iend-istart); - tasks.emplace_back(cond_to_split.size() - 1, i); } - sizes_by_nids.emplace_back(n_blocks); } }; for (const auto& node : nodes) { @@ -321,27 +310,59 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( create_nodes(sibling_nid); } } +} + +void QuantileHistMaker::Builder::CreateNewNodesBatch( + const std::vector& nodes, + const GHistIndexMatrix &gmat, + const ColumnMatrix &column_matrix, + DMatrix *p_fmat, + RegTree *p_tree, + int *num_leaves, + int depth, + unsigned *timestamp, + std::vector *temp_qexpand_depth) { + perf_monitor.TickStart(); + const size_t block_size = 2048; + + struct ApplySplitTaskInfo { + // input + int32_t nid; + int32_t split_cond; + size_t n_blocks_this_node; + size_t i_block_this_node; + size_t istart; + size_t iend; + size_t inode; + // result + size_t* left; + size_t* right; + size_t n_left; + size_t n_right; + }; + + // create tasks for partition of row_set_collection_ + std::vector tasks; + std::vector> nodes_bounds; + CreateTasksForApplySplit(nodes, gmat, p_tree, num_leaves, depth, + block_size, &tasks, &nodes_bounds); // buffer to store # of rows in left part for each row-block std::vector left_sizes; + left_sizes.reserve(nodes_bounds.size()); - const int32_t size = tasks.size(); + // execute tasks in parallel #pragma omp parallel { // compute partial partitions #pragma omp for schedule(guided) - for (int32_t i = 0; i < size; ++i) { - const size_t node_idx = tasks[i].first; - const size_t iblock = tasks[i].second; - const int32_t split_cond = cond_to_split[node_idx]; - const int32_t nid = nids_to_split[node_idx]; - const bst_uint fid = (*p_tree)[nid].SplitIndex(); - - const size_t nrows = row_set_collection_[nid].Size(); - const size_t nblocks = n_blocks_vec[node_idx]; - const size_t istart = iblock*block_size; - const size_t iend = (iblock == nblocks-1) ? 
nrows : istart + block_size; + for (int32_t i = 0; i < tasks.size(); ++i) { + const int32_t nid = tasks[i].nid; + const int32_t split_cond = tasks[i].split_cond; + const size_t istart = tasks[i].istart; + const size_t iend = tasks[i].iend; + const bst_uint fid = (*p_tree)[nid].SplitIndex(); const bool default_left = (*p_tree)[nid].DefaultLeft(); const Column column = column_matrix.GetColumn(fid); @@ -350,67 +371,78 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( if (column.GetType() == xgboost::common::kDenseColumn) { if (default_left) { - sizes_by_nids[node_idx][iblock] = PartitionDenseLeftDefaultKernel( + auto res = PartitionDenseLeftDefaultKernel( rid, idx, column.GetBaseIdx(), split_cond, istart, iend, - buffers_by_nids[node_idx][iblock].first, buffers_by_nids[node_idx][iblock].second); + tasks[i].left, tasks[i].right); + tasks[i].n_left = res.first; + tasks[i].n_right = res.second; } else { - sizes_by_nids[node_idx][iblock] = PartitionDenseRightDefaultKernel( + auto res = PartitionDenseRightDefaultKernel( rid, idx, column.GetBaseIdx(), split_cond, istart, iend, - buffers_by_nids[node_idx][iblock].first, buffers_by_nids[node_idx][iblock].second); + tasks[i].left, tasks[i].right); + tasks[i].n_left = res.first; + tasks[i].n_right = res.second; } } else { - sizes_by_nids[node_idx][iblock] = PartitionSparseKernel( - rid, idx, split_cond, istart, iend, buffers_by_nids[node_idx][iblock].first, - buffers_by_nids[node_idx][iblock].second, column, default_left); + auto res = PartitionSparseKernel( + rid, idx, split_cond, istart, iend, tasks[i].left, tasks[i].right, column, default_left); + tasks[i].n_left = res.first; + tasks[i].n_right = res.second; } } // calculate sizes of left parts in each block #pragma omp single { - for (size_t inode = 0; inode < nids_to_split.size(); ++inode) { - size_t nLeft = 0; - for (auto& size : sizes_by_nids[inode]) { - nLeft += size.first; + for (size_t inode = 0; inode < nodes_bounds.size(); ++inode) { + size_t n_left = 0; + + size_t begin = nodes_bounds[inode].first; + size_t end = nodes_bounds[inode].second; + + for (size_t i = begin; i < end; ++i) { + n_left += tasks[i].n_left; } - left_sizes.push_back(nLeft); + left_sizes.push_back(n_left); } } // merge partial results to one #pragma omp for schedule(guided) - for (int32_t i = 0; i < size; ++i) { - const size_t node_idx = tasks[i].first; - const size_t iblock = tasks[i].second; + for (int32_t i = 0; i < tasks.size(); ++i) { + const size_t node_idx = tasks[i].inode; + const int32_t nid = tasks[i].nid; + const int32_t iblock = tasks[i].i_block_this_node; - const int32_t nid = nids_to_split[node_idx]; auto* rid = const_cast(row_set_collection_[nid].begin); - size_t iLeft = 0; - size_t iRight = 0; + size_t ileft = 0; + size_t iright = 0; - const size_t nLeft = left_sizes[node_idx]; + const size_t n_left = left_sizes[node_idx]; - for (size_t j = 0; j < iblock; ++j) { - iLeft += sizes_by_nids[node_idx][j].first; - iRight += sizes_by_nids[node_idx][j].second; + for (size_t j = nodes_bounds[node_idx].first; + j < nodes_bounds[node_idx].first + iblock; ++j) { + ileft += tasks[j].n_left; + iright += tasks[j].n_right; } - memcpy(rid + iLeft, buffers_by_nids[node_idx][iblock].first, - sizes_by_nids[node_idx][iblock].first * sizeof(rid[0])); - memcpy(rid + nLeft + iRight, buffers_by_nids[node_idx][iblock].second, - sizes_by_nids[node_idx][iblock].second * sizeof(rid[0])); + std::memcpy(rid + ileft, tasks[i].left, + tasks[i].n_left * sizeof(rid[0])); + std::memcpy(rid + n_left + iright, tasks[i].right, + 
tasks[i].n_right * sizeof(rid[0])); } } + // register new nodes - for (size_t i = 0; i < nids_to_split.size(); ++i) { - const int32_t nid = nids_to_split[i]; - const size_t nLeft = left_sizes[i]; + for (size_t i = 0; i < nodes_bounds.size(); ++i) { + const int32_t nid = tasks[nodes_bounds[i].first].nid; + const size_t n_left = left_sizes[i]; RegTree::Node node = (*p_tree)[nid]; const int32_t left_id = node.LeftChild(); const int32_t right_id = node.RightChild(); - row_set_collection_.AddSplit(nid, nLeft, left_id, right_id); + row_set_collection_.AddSplit(nid, n_left, left_id, right_id); if (rabit::IsDistributed() || row_set_collection_[left_id].Size() < row_set_collection_[right_id].Size()) { @@ -443,31 +475,19 @@ std::tuple return std::make_tuple(local_data_hist, &(*grad_stats)[hist_id]); } -void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector& nodes, - RegTree* p_tree, const GHistIndexMatrix &gmat, const std::vector& gpair, - std::vector>* hist_buffers, - std::vector>* hist_is_init) { - perf_monitor.TickStart(); - const size_t block_size_rows = 256; - const size_t nthread = static_cast(this->nthread_); - const size_t nbins = gmat.cut.row_ptr.back(); - const size_t hist_size = 2 * nbins; - - hist_buffers->resize(nodes.size()); - hist_is_init->resize(nodes.size()); - - // input data for tasks - int32_t n_hist_buidling_tasks = 0; - std::vector task_nid; - std::vector task_node_idx; - std::vector task_block_idx; - - // result vector - std::vector> grad_stats(nodes.size()); - +void QuantileHistMaker::Builder::CreateTasksForBuildHist( + size_t block_size_rows, + size_t nthread, + const std::vector& nodes, + std::vector>* hist_buffers, + std::vector>* hist_is_init, + std::vector>* grad_stats, + std::vector* task_nid, + std::vector* task_node_idx, + std::vector* task_block_idx) { size_t i_hist = 0; - // prepare tasks for parallel exection + // prepare tasks for parallel execution for (size_t i = 0; i < nodes.size(); ++i) { const int32_t nid = nodes[i].nid; const int32_t sibling_nid = nodes[i].sibling_nid; @@ -479,13 +499,11 @@ void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector& const size_t n_local_blocks = nrows / block_size_rows + !!(nrows % block_size_rows); const size_t n_local_histograms = std::min(nthread, n_local_blocks); - - task_nid.resize(task_nid.size() + n_local_blocks, nid); + task_nid->resize(task_nid->size() + n_local_blocks, nid); for (size_t j = 0; j < n_local_blocks; ++j) { - task_node_idx.push_back(i); - task_block_idx.push_back(j); + task_node_idx->push_back(i); + task_block_idx->push_back(j); } - n_hist_buidling_tasks += n_local_blocks; (*hist_buffers)[i].clear(); for (size_t j = 0; j < n_local_histograms; j++) { @@ -494,8 +512,34 @@ void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector& } (*hist_is_init)[i].clear(); (*hist_is_init)[i].resize(n_local_histograms, false); - grad_stats[i].resize(n_local_histograms); + (*grad_stats)[i].resize(n_local_histograms); } +} + +void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector& nodes, + RegTree* p_tree, const GHistIndexMatrix &gmat, const std::vector& gpair, + std::vector>* hist_buffers, + std::vector>* hist_is_init) { + perf_monitor.TickStart(); + const size_t block_size_rows = 256; + const size_t nthread = static_cast(this->nthread_); + const size_t nbins = gmat.cut.row_ptr.back(); + const size_t hist_size = 2 * nbins; + + hist_buffers->resize(nodes.size()); + hist_is_init->resize(nodes.size()); + + // input data for tasks + std::vector task_nid; + std::vector task_node_idx; 
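The three task_* vectors being declared here linearize every (node, row-block) pair so that a single parallel-for can load-balance the whole batch. Reduced to one node with freshly allocated per-thread buffers — a sketch under those simplifications, not the patched code itself — the scheme they drive looks like this:

    #include <omp.h>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // histogram layout as in the patch: hist[2*bin] = grad sum, hist[2*bin+1] = hess sum
    void BuildHistBlocked(const size_t* rid, size_t nrows,
                          const uint32_t* index, const size_t* row_ptr,
                          const float* pgh,  // interleaved grad/hess, two floats per row
                          size_t nbins, std::vector<float>* out) {
      const int nthread = omp_get_max_threads();
      std::vector<std::vector<float>> partial(
          nthread, std::vector<float>(2 * nbins, 0.0f));

      #pragma omp parallel for schedule(guided)
      for (int64_t i = 0; i < static_cast<int64_t>(nrows); ++i) {
        float* h = partial[omp_get_thread_num()].data();  // this thread's private histogram
        for (size_t j = row_ptr[rid[i]]; j < row_ptr[rid[i] + 1]; ++j) {
          h[2 * index[j]]     += pgh[2 * rid[i]];      // gradient
          h[2 * index[j] + 1] += pgh[2 * rid[i] + 1];  // hessian
        }
      }

      // reduction: sum the per-thread partials into the node's histogram
      out->assign(2 * nbins, 0.0f);
      for (int t = 0; t < nthread; ++t) {
        for (size_t b = 0; b < 2 * nbins; ++b) (*out)[b] += partial[t][b];
      }
    }

The real code avoids the allocations by reusing hist_buffers, and consults hist_is_init so that per-thread buffers which were never touched are skipped during the reduction.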
+ std::vector task_block_idx; + + // result vector + std::vector> grad_stats(nodes.size()); + CreateTasksForBuildHist(block_size_rows, nthread, nodes, hist_buffers, hist_is_init, &grad_stats, + &task_nid, &task_node_idx, &task_block_idx); + int32_t n_hist_buidling_tasks = task_node_idx.size(); + const GradientPair::ValueT* const pgh = reinterpret_cast(gpair.data()); @@ -625,10 +669,6 @@ void QuantileHistMaker::Builder::ReduceHistograms( } } -// void QuantileHistMaker::Builder::SyncHistograms() { - -// } - void QuantileHistMaker::Builder::ExpandWithDepthWidth( const GHistIndexMatrix &gmat, const GHistIndexBlockMatrix &gmatb, @@ -1037,7 +1077,7 @@ void QuantileHistMaker::Builder::EvaluateSplitsBatch( common::GradStatHist::GradType* parent_hist_data = sibling_nid > -1 ? reinterpret_cast(hist_[parent_nid].data()) : nullptr; - // reduce needed part of a hist here to have it in cache before enumeratation + // reduce needed part of a hist here to have it in cache before enumeration if (!rabit::IsDistributed()) { const std::vector& cut_ptr = gmat.cut.row_ptr; const size_t ibegin = 2 * cut_ptr[fid]; diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index 3d7f5b2427b9..92fd9fcccee2 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -311,6 +311,27 @@ class QuantileHistMaker: public TreeUpdater { unsigned *timestamp, std::vector *temp_qexpand_depth); + template + void CreateTasksForApplySplit( + const std::vector& nodes, + const GHistIndexMatrix &gmat, + RegTree *p_tree, + int *num_leaves, + const int depth, + const size_t block_size, + std::vector* tasks, + std::vector>* nodes_bounds); + + void CreateTasksForBuildHist( + size_t block_size_rows, + size_t nthread, + const std::vector& nodes, + std::vector>* hist_buffers, + std::vector>* hist_is_init, + std::vector>* grad_stats, + std::vector* task_nid, + std::vector* task_node_idx, + std::vector* task_block_idx); inline static bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) { if (lhs.loss_chg == rhs.loss_chg) { From 9301edeb312397dc80f2aa1d834820f5068abf6f Mon Sep 17 00:00:00 2001 From: egor Date: Thu, 13 Jun 2019 00:41:05 +0300 Subject: [PATCH 19/31] fixing issues in CI --- src/tree/updater_quantile_hist.cc | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index a08c0d549c4f..dfbb69316634 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -294,7 +294,7 @@ void QuantileHistMaker::Builder::CreateTasksForApplySplit( const size_t istart = i*block_size; const size_t iend = (i == n_blocks-1) ? 
nrows : istart + block_size; - TaskType task {this_nid, split_cond, n_blocks, i, istart, iend, nodes_bounds.size()-1, + TaskType task {this_nid, split_cond, n_blocks, i, istart, iend, nodes_bounds->size()-1, buffer + cur_buff_offset, buffer + cur_buff_offset + (iend-istart), 0, 0}; tasks->push_back(task); cur_buff_offset += 2*(iend-istart); @@ -350,13 +350,14 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( // buffer to store # of rows in left part for each row-block std::vector left_sizes; left_sizes.reserve(nodes_bounds.size()); + const size_t size = tasks.size(); // execute tasks in parallel #pragma omp parallel { // compute partial partitions - #pragma omp for schedule(guided) - for (int32_t i = 0; i < tasks.size(); ++i) { + #pragma omp for + for (int32_t i = 0; i < size; ++i) { const int32_t nid = tasks[i].nid; const int32_t split_cond = tasks[i].split_cond; const size_t istart = tasks[i].istart; @@ -394,11 +395,11 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( // calculate sizes of left parts in each block #pragma omp single { - for (size_t inode = 0; inode < nodes_bounds.size(); ++inode) { + for (auto& node : nodes_bounds) { size_t n_left = 0; - size_t begin = nodes_bounds[inode].first; - size_t end = nodes_bounds[inode].second; + size_t begin = node.first; + size_t end = node.second; for (size_t i = begin; i < end; ++i) { n_left += tasks[i].n_left; @@ -408,8 +409,8 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( } // merge partial results to one - #pragma omp for schedule(guided) - for (int32_t i = 0; i < tasks.size(); ++i) { + #pragma omp for + for (int32_t i = 0; i < size; ++i) { const size_t node_idx = tasks[i].inode; const int32_t nid = tasks[i].nid; const int32_t iblock = tasks[i].i_block_this_node; From b065ee2ee13e8c6446b5f07f5eb6836c35fb7c44 Mon Sep 17 00:00:00 2001 From: egor Date: Sat, 15 Jun 2019 14:56:36 +0300 Subject: [PATCH 20/31] adding runtime checks --- src/tree/updater_quantile_hist.cc | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index dfbb69316634..d27911ad3749 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -285,10 +285,11 @@ void QuantileHistMaker::Builder::CreateTasksForApplySplit( } else { const size_t nrows = row_set_collection_[this_nid].Size(); const size_t n_blocks = nrows / block_size + !!(nrows % block_size); + CHECK_GT(n_blocks, 0U); nodes_bounds->emplace_back(tasks->size(), tasks->size() + n_blocks); - int32_t split_cond = FindSplitCond(this_nid, p_tree, gmat); + const int32_t split_cond = FindSplitCond(this_nid, p_tree, gmat); for (size_t i = 0; i < n_blocks; ++i) { const size_t istart = i*block_size; @@ -298,6 +299,7 @@ void QuantileHistMaker::Builder::CreateTasksForApplySplit( buffer + cur_buff_offset, buffer + cur_buff_offset + (iend-istart), 0, 0}; tasks->push_back(task); cur_buff_offset += 2*(iend-istart); + CHECK_LE(cur_buff_offset, buffer_for_partition_.capacity()); } } }; @@ -344,6 +346,7 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( // create tasks for partition of row_set_collection_ std::vector tasks; std::vector> nodes_bounds; + CreateTasksForApplySplit(nodes, gmat, p_tree, num_leaves, depth, block_size, &tasks, &nodes_bounds); @@ -399,11 +402,12 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( size_t n_left = 0; size_t begin = node.first; - size_t end = node.second; + size_t end = node.second; for (size_t i = begin; i < end; ++i) { n_left += 
tasks[i].n_left; } + CHECK_GT(n_left, 0U); left_sizes.push_back(n_left); } } @@ -421,13 +425,16 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( size_t iright = 0; const size_t n_left = left_sizes[node_idx]; + const size_t block_offset = nodes_bounds[node_idx].first; - for (size_t j = nodes_bounds[node_idx].first; - j < nodes_bounds[node_idx].first + iblock; ++j) { + for (size_t j = block_offset; j < block_offset + iblock; ++j) { ileft += tasks[j].n_left; iright += tasks[j].n_right; } + CHECK_LE(ileft + tasks[i].n_left, row_set_collection_[nid].Size()); + CHECK_LE(n_left + iright + tasks[i].n_right, row_set_collection_[nid].Size()); + std::memcpy(rid + ileft, tasks[i].left, tasks[i].n_left * sizeof(rid[0])); std::memcpy(rid + n_left + iright, tasks[i].right, @@ -540,6 +547,9 @@ void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector& CreateTasksForBuildHist(block_size_rows, nthread, nodes, hist_buffers, hist_is_init, &grad_stats, &task_nid, &task_node_idx, &task_block_idx); int32_t n_hist_buidling_tasks = task_node_idx.size(); + CHECK_GT(n_hist_buidling_tasks, 0U); + CHECK_GT(task_nid.size(), 0U); + CHECK_GT(task_block_idx.size(), 0U); const GradientPair::ValueT* const pgh = reinterpret_cast(gpair.data()); From 8e399d50ab9b34b5eae674ecffcd2ddb16be081a Mon Sep 17 00:00:00 2001 From: egor Date: Sat, 15 Jun 2019 15:19:11 +0300 Subject: [PATCH 21/31] remove 1 extra check --- src/tree/updater_quantile_hist.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index d27911ad3749..7f97b4e920fe 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -407,7 +407,6 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( for (size_t i = begin; i < end; ++i) { n_left += tasks[i].n_left; } - CHECK_GT(n_left, 0U); left_sizes.push_back(n_left); } } From 1f8f0ee1e98b20e82027873f800409fcb9d70016 Mon Sep 17 00:00:00 2001 From: egor Date: Sat, 15 Jun 2019 16:23:43 +0300 Subject: [PATCH 22/31] remove extra checks in BuildHist --- src/tree/updater_quantile_hist.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 7f97b4e920fe..58232d087bfa 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -546,9 +546,6 @@ void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector& CreateTasksForBuildHist(block_size_rows, nthread, nodes, hist_buffers, hist_is_init, &grad_stats, &task_nid, &task_node_idx, &task_block_idx); int32_t n_hist_buidling_tasks = task_node_idx.size(); - CHECK_GT(n_hist_buidling_tasks, 0U); - CHECK_GT(task_nid.size(), 0U); - CHECK_GT(task_block_idx.size(), 0U); const GradientPair::ValueT* const pgh = reinterpret_cast(gpair.data()); From 7b282897dbe81c60ac8f9e99416a406cfaeb175f Mon Sep 17 00:00:00 2001 From: egor Date: Sat, 15 Jun 2019 17:34:02 +0300 Subject: [PATCH 23/31] remove checks --- src/tree/updater_quantile_hist.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 58232d087bfa..597996e52acc 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -285,7 +285,6 @@ void QuantileHistMaker::Builder::CreateTasksForApplySplit( } else { const size_t nrows = row_set_collection_[this_nid].Size(); const size_t n_blocks = nrows / block_size + !!(nrows % block_size); - CHECK_GT(n_blocks, 0U); nodes_bounds->emplace_back(tasks->size(), tasks->size() + 
n_blocks); @@ -299,7 +298,6 @@ void QuantileHistMaker::Builder::CreateTasksForApplySplit( buffer + cur_buff_offset, buffer + cur_buff_offset + (iend-istart), 0, 0}; tasks->push_back(task); cur_buff_offset += 2*(iend-istart); - CHECK_LE(cur_buff_offset, buffer_for_partition_.capacity()); } } }; @@ -431,9 +429,6 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( iright += tasks[j].n_right; } - CHECK_LE(ileft + tasks[i].n_left, row_set_collection_[nid].Size()); - CHECK_LE(n_left + iright + tasks[i].n_right, row_set_collection_[nid].Size()); - std::memcpy(rid + ileft, tasks[i].left, tasks[i].n_left * sizeof(rid[0])); std::memcpy(rid + n_left + iright, tasks[i].right, From ff7d946e8da1cdd9467d559de8a4c4bf1cf5b7b3 Mon Sep 17 00:00:00 2001 From: egor Date: Sat, 15 Jun 2019 18:19:52 +0300 Subject: [PATCH 24/31] add debug info --- CMakeLists.txt | 1 + Makefile | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 10d191d9ff3f..b7e7ccfef09b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,7 @@ endif (MSVC) set_default_configuration_release() #-- Options +add_compile_options(-g) option(BUILD_C_DOC "Build documentation for C APIs using Doxygen." OFF) option(USE_OPENMP "Build with OpenMP support." ON) ## Bindings diff --git a/Makefile b/Makefile index 42d3bfe1a0ca..68b15a51df65 100644 --- a/Makefile +++ b/Makefile @@ -76,7 +76,7 @@ export JAVAINCFLAGS = -I${JAVA_HOME}/include -I./java ifeq ($(TEST_COVER), 1) CFLAGS += -g -O0 -fprofile-arcs -ftest-coverage else - CFLAGS += -O3 -funroll-loops + CFLAGS += -g -O3 -funroll-loops ifeq ($(USE_SSE), 1) CFLAGS += -msse2 endif From f333d2bb43b3ebe8aaaf8af9276b66ad5c3d6c88 Mon Sep 17 00:00:00 2001 From: egor Date: Sat, 15 Jun 2019 18:19:52 +0300 Subject: [PATCH 25/31] added debug info --- CMakeLists.txt | 5 +++++ Makefile | 2 +- src/tree/updater_quantile_hist.cc | 8 +++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 10d191d9ff3f..9cca78d9a73d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,11 @@ endif (MSVC) set_default_configuration_release() #-- Options +add_compile_options(-g) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g") + option(BUILD_C_DOC "Build documentation for C APIs using Doxygen." OFF) option(USE_OPENMP "Build with OpenMP support." 
ON) ## Bindings diff --git a/Makefile b/Makefile index 42d3bfe1a0ca..68b15a51df65 100644 --- a/Makefile +++ b/Makefile @@ -76,7 +76,7 @@ export JAVAINCFLAGS = -I${JAVA_HOME}/include -I./java ifeq ($(TEST_COVER), 1) CFLAGS += -g -O0 -fprofile-arcs -ftest-coverage else - CFLAGS += -O3 -funroll-loops + CFLAGS += -g -O3 -funroll-loops ifeq ($(USE_SSE), 1) CFLAGS += -msse2 endif diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 597996e52acc..fb1bf5fa850e 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -345,8 +345,11 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( std::vector tasks; std::vector> nodes_bounds; + LOG(WARNING) << "before CreateTasksForApplySplit"; + CreateTasksForApplySplit(nodes, gmat, p_tree, num_leaves, depth, block_size, &tasks, &nodes_bounds); + LOG(WARNING) << "after CreateTasksForApplySplit"; // buffer to store # of rows in left part for each row-block std::vector left_sizes; @@ -396,6 +399,7 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( // calculate sizes of left parts in each block #pragma omp single { + LOG(WARNING) << "start of merge"; for (auto& node : nodes_bounds) { size_t n_left = 0; @@ -407,6 +411,7 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( } left_sizes.push_back(n_left); } + LOG(WARNING) << "end of merge"; } // merge partial results to one @@ -435,6 +440,7 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( tasks[i].n_right * sizeof(rid[0])); } } + LOG(WARNING) << "end of partition"; // register new nodes for (size_t i = 0; i < nodes_bounds.size(); ++i) { @@ -455,7 +461,7 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( depth + 1, 0.0, (*timestamp)++)); } } - + LOG(WARNING) << "end of APPLY_SPLIT"; perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::APPLY_SPLIT); } From c868ab150f9b5ebf71a3a31afc610f4d616d8be3 Mon Sep 17 00:00:00 2001 From: egor Date: Sat, 15 Jun 2019 22:40:19 +0300 Subject: [PATCH 26/31] revert changes --- CMakeLists.txt | 3 - Makefile | 2 +- src/tree/updater_quantile_hist.cc | 93 +++++++++++++++++-------------- src/tree/updater_quantile_hist.h | 4 +- 4 files changed, 53 insertions(+), 49 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 470faeb2e394..10d191d9ff3f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,9 +12,6 @@ endif (MSVC) set_default_configuration_release() #-- Options -add_compile_options(-g) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g") option(BUILD_C_DOC "Build documentation for C APIs using Doxygen." OFF) option(USE_OPENMP "Build with OpenMP support." 
ON) ## Bindings diff --git a/Makefile b/Makefile index 68b15a51df65..42d3bfe1a0ca 100644 --- a/Makefile +++ b/Makefile @@ -76,7 +76,7 @@ export JAVAINCFLAGS = -I${JAVA_HOME}/include -I./java ifeq ($(TEST_COVER), 1) CFLAGS += -g -O0 -fprofile-arcs -ftest-coverage else - CFLAGS += -g -O3 -funroll-loops + CFLAGS += -O3 -funroll-loops ifeq ($(USE_SSE), 1) CFLAGS += -msse2 endif diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index fb1bf5fa850e..419e2023ef4c 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -144,14 +144,17 @@ std::pair PartitionDenseLeftDefaultKernel(const RowIdxType* rid, size_t ileft = 0; size_t iright = 0; + const int32_t cmp = split_cond - offset; + const IdxType max_val = std::numeric_limits::max(); + for (size_t i = istart; i < iend; i++) { - if ( idx[rid[i]] == std::numeric_limits::max() || - static_cast(idx[rid[i]] + offset) <= split_cond) { + if (idx[rid[i]] == max_val || static_cast(idx[rid[i]] + offset) <= split_cond) { p_left[ileft++] = rid[i]; } else { p_right[iright++] = rid[i]; } } + return { ileft, iright }; } @@ -162,9 +165,10 @@ std::pair PartitionDenseRightDefaultKernel(const RowIdxType* rid size_t ileft = 0; size_t iright = 0; + const IdxType max_val = std::numeric_limits::max(); + for (size_t i = istart; i < iend; i++) { - if (idx[rid[i]] == std::numeric_limits::max() || - static_cast(idx[rid[i]] + offset) > split_cond) { + if (idx[rid[i]] == max_val || static_cast(idx[rid[i]] + offset) > split_cond) { p_right[iright++] = rid[i]; } else { p_left[ileft++] = rid[i]; @@ -264,7 +268,7 @@ int32_t QuantileHistMaker::Builder::FindSplitCond(int32_t nid, // split rows in each node to blocks of rows // for future parallel execution -template +template void QuantileHistMaker::Builder::CreateTasksForApplySplit( const std::vector& nodes, const GHistIndexMatrix &gmat, @@ -273,7 +277,7 @@ void QuantileHistMaker::Builder::CreateTasksForApplySplit( const int depth, const size_t block_size, std::vector* tasks, - std::vector>* nodes_bounds) { + std::vector* nodes_bounds) { size_t* buffer = buffer_for_partition_.data(); size_t cur_buff_offset = 0; @@ -286,7 +290,7 @@ void QuantileHistMaker::Builder::CreateTasksForApplySplit( const size_t nrows = row_set_collection_[this_nid].Size(); const size_t n_blocks = nrows / block_size + !!(nrows % block_size); - nodes_bounds->emplace_back(tasks->size(), tasks->size() + n_blocks); + nodes_bounds->emplace_back(this_nid, tasks->size(), tasks->size() + n_blocks); const int32_t split_cond = FindSplitCond(this_nid, p_tree, gmat); @@ -295,7 +299,7 @@ void QuantileHistMaker::Builder::CreateTasksForApplySplit( const size_t iend = (i == n_blocks-1) ? 
nrows : istart + block_size; TaskType task {this_nid, split_cond, n_blocks, i, istart, iend, nodes_bounds->size()-1, - buffer + cur_buff_offset, buffer + cur_buff_offset + (iend-istart), 0, 0}; + buffer + cur_buff_offset, buffer + cur_buff_offset + (iend-istart), 0, 0, 0, 0}; tasks->push_back(task); cur_buff_offset += 2*(iend-istart); } @@ -339,27 +343,40 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( size_t* right; size_t n_left; size_t n_right; + size_t ileft; + size_t iright; + }; + + struct NodeBoundsInfo { + NodeBoundsInfo(int32_t nid, size_t begin, size_t end): + nid(nid), begin(begin), end(end) { + } + + int32_t nid; + size_t begin; + size_t end; }; // create tasks for partition of row_set_collection_ std::vector tasks; - std::vector> nodes_bounds; + std::vector nodes_bounds; - LOG(WARNING) << "before CreateTasksForApplySplit"; - - CreateTasksForApplySplit(nodes, gmat, p_tree, num_leaves, depth, - block_size, &tasks, &nodes_bounds); - LOG(WARNING) << "after CreateTasksForApplySplit"; + // 1. Split row-indexes in each nodes to blocks of rows + CreateTasksForApplySplit(nodes, gmat, p_tree, num_leaves, + depth, block_size, &tasks, &nodes_bounds); // buffer to store # of rows in left part for each row-block std::vector left_sizes; left_sizes.reserve(nodes_bounds.size()); - const size_t size = tasks.size(); + const int size = tasks.size(); // execute tasks in parallel #pragma omp parallel { - // compute partial partitions + // 2. For each block of rows: + // a) Write row-indexes which should come to the left child - to 1th buffer + // b) Write row-indexes which should come to the right child - to 2th buffer + // values in each buffer - sorted in original order #pragma omp for for (int32_t i = 0; i < size; ++i) { const int32_t nid = tasks[i].nid; @@ -396,55 +413,46 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( } } - // calculate sizes of left parts in each block + // 3. For each node - find number of elements in left the part #pragma omp single { - LOG(WARNING) << "start of merge"; for (auto& node : nodes_bounds) { size_t n_left = 0; + size_t n_right = 0; - size_t begin = node.first; - size_t end = node.second; + for (size_t i = node.begin; i < node.end; ++i) { + tasks[i].ileft = n_left; + tasks[i].iright = n_right; - for (size_t i = begin; i < end; ++i) { - n_left += tasks[i].n_left; + n_left += tasks[i].n_left; + n_right += tasks[i].n_right; } left_sizes.push_back(n_left); } - LOG(WARNING) << "end of merge"; } - // merge partial results to one + // 4. 
Copy data from buffers to original row_set_collection_ #pragma omp for for (int32_t i = 0; i < size; ++i) { const size_t node_idx = tasks[i].inode; - const int32_t nid = tasks[i].nid; - const int32_t iblock = tasks[i].i_block_this_node; - - auto* rid = const_cast(row_set_collection_[nid].begin); + const int32_t nid = tasks[i].nid; + const int32_t iblock = tasks[i].i_block_this_node; + const size_t n_left = left_sizes[node_idx]; - size_t ileft = 0; - size_t iright = 0; + CHECK_LE(tasks[i].ileft + tasks[i].n_left, row_set_collection_[nid].Size()); + CHECK_LE(n_left + tasks[i].iright + tasks[i].n_right, row_set_collection_[nid].Size()); - const size_t n_left = left_sizes[node_idx]; - const size_t block_offset = nodes_bounds[node_idx].first; - - for (size_t j = block_offset; j < block_offset + iblock; ++j) { - ileft += tasks[j].n_left; - iright += tasks[j].n_right; - } - - std::memcpy(rid + ileft, tasks[i].left, + auto* rid = const_cast(row_set_collection_[nid].begin); + std::memcpy(rid + tasks[i].ileft, tasks[i].left, tasks[i].n_left * sizeof(rid[0])); - std::memcpy(rid + n_left + iright, tasks[i].right, + std::memcpy(rid + n_left + tasks[i].iright, tasks[i].right, tasks[i].n_right * sizeof(rid[0])); } } - LOG(WARNING) << "end of partition"; // register new nodes for (size_t i = 0; i < nodes_bounds.size(); ++i) { - const int32_t nid = tasks[nodes_bounds[i].first].nid; + const int32_t nid = nodes_bounds[i].nid; const size_t n_left = left_sizes[i]; RegTree::Node node = (*p_tree)[nid]; @@ -461,7 +469,6 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( depth + 1, 0.0, (*timestamp)++)); } } - LOG(WARNING) << "end of APPLY_SPLIT"; perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::APPLY_SPLIT); } diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index 92fd9fcccee2..7e2458862f43 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -311,7 +311,7 @@ class QuantileHistMaker: public TreeUpdater { unsigned *timestamp, std::vector *temp_qexpand_depth); - template + template void CreateTasksForApplySplit( const std::vector& nodes, const GHistIndexMatrix &gmat, @@ -320,7 +320,7 @@ class QuantileHistMaker: public TreeUpdater { const int depth, const size_t block_size, std::vector* tasks, - std::vector>* nodes_bounds); + std::vector* nodes_bounds); void CreateTasksForBuildHist( size_t block_size_rows, From 886753cb077e13a66e0f8969e5d7065fa6079fd3 Mon Sep 17 00:00:00 2001 From: egor Date: Sun, 16 Jun 2019 01:25:41 +0300 Subject: [PATCH 27/31] added comments --- src/tree/updater_quantile_hist.cc | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 419e2023ef4c..0afd999fa98e 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -144,7 +144,6 @@ std::pair PartitionDenseLeftDefaultKernel(const RowIdxType* rid, size_t ileft = 0; size_t iright = 0; - const int32_t cmp = split_cond - offset; const IdxType max_val = std::numeric_limits::max(); for (size_t i = istart; i < iend; i++) { @@ -265,7 +264,6 @@ int32_t QuantileHistMaker::Builder::FindSplitCond(int32_t nid, return split_cond; } - // split rows in each node to blocks of rows // for future parallel execution template @@ -436,7 +434,6 @@ void QuantileHistMaker::Builder::CreateNewNodesBatch( for (int32_t i = 0; i < size; ++i) { const size_t node_idx = tasks[i].inode; const int32_t nid = tasks[i].nid; - const int32_t iblock = 
tasks[i].i_block_this_node; const size_t n_left = left_sizes[node_idx]; CHECK_LE(tasks[i].ileft + tasks[i].n_left, row_set_collection_[nid].Size()); @@ -551,6 +548,8 @@ void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector& // result vector std::vector> grad_stats(nodes.size()); + + // 1. Create tasks for hist construction by block of rows for each node CreateTasksForBuildHist(block_size_rows, nthread, nodes, hist_buffers, hist_is_init, &grad_stats, &task_nid, &task_node_idx, &task_block_idx); int32_t n_hist_buidling_tasks = task_node_idx.size(); @@ -558,7 +557,7 @@ void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector& const GradientPair::ValueT* const pgh = reinterpret_cast(gpair.data()); - // execute tasks in parallel + // 2. Build partial histograms for each node #pragma omp parallel for schedule(guided) for (int32_t itask = 0; itask < n_hist_buidling_tasks; ++itask) { const size_t tid = omp_get_thread_num(); @@ -579,6 +578,7 @@ void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector& const size_t istart = block_id * block_size_rows; const size_t iend = (((block_id+1)*block_size_rows > nrows) ? nrows : istart + block_size_rows); + // call hist building kernel depending on bin-matrix layout if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) { common::BuildHistLocalDense(istart, iend, nrows, rid, gmat.index.data(), pgh, row_ptr, data_local_hist, grad_stat); @@ -588,6 +588,8 @@ void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector& } } + // 3. Merge grad stats for each node + // Sync histograms in case of distributed computation SyncHistograms(p_tree, nodes, hist_buffers, hist_is_init, grad_stats); perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::BUILD_HIST); @@ -600,8 +602,9 @@ void QuantileHistMaker::Builder::SyncHistograms( std::vector>* hist_is_init, const std::vector>& grad_stats) { if (rabit::IsDistributed()) { + const int size = nodes.size(); #pragma omp parallel for // TODO(egorsmir): replace to n_features * nodes.size() - for (int i = 0; i < nodes.size(); ++i) { + for (int i = 0; i < size; ++i) { const int32_t nid = nodes[i].nid; common::GradStatHist::GradType* hist_data = reinterpret_cast(hist_[nid].data()); @@ -653,6 +656,7 @@ void QuantileHistMaker::Builder::SyncHistograms( } } +// merge some block of partial histograms void QuantileHistMaker::Builder::ReduceHistograms( common::GradStatHist::GradType* hist_data, common::GradStatHist::GradType* sibling_hist_data, From 6f54a91cf7bef1831dca7644ec0cc735fceebf46 Mon Sep 17 00:00:00 2001 From: Egor Smirnov Date: Wed, 26 Jun 2019 14:35:21 +0300 Subject: [PATCH 28/31] Apply suggestions from code review Co-Authored-By: Philip Hyunsu Cho --- src/common/hist_util.cc | 2 +- src/common/hist_util.h | 4 ++-- src/tree/updater_quantile_hist.cc | 14 +++++++++++--- src/tree/updater_quantile_hist.h | 2 +- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 349759ee7f13..af420db4d296 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -1,6 +1,6 @@ /*! 
* Copyright 2017-2019 by Contributors - * \file hist_util.h + * \file hist_util.cc */ #include "./hist_util.h" #include diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 72a6958528b4..0cef1878e491 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -271,11 +271,11 @@ struct GradStatHist { sum_hess += b.sum_hess; } - inline void Add(GradientPair p) { + inline void Add(const GradientPair& p) { this->Add(p.GetGrad(), p.GetHess()); } - inline void Add(GradType grad, GradType hess) { + inline void Add(const GradType& grad, const GradType& hess) { sum_grad += grad; sum_hess += hess; } diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 0afd999fa98e..9d1f69d8b6e4 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -566,7 +566,7 @@ void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector& const int32_t node_idx = task_node_idx[itask]; common::GradStatHist::GradType* data_local_hist; - common::GradStatHist* grad_stat; + common::GradStatHist* grad_stat; // total gradient/hessian value for node `nid` std::tie(data_local_hist, grad_stat) = GetHistBuffer(&(*hist_is_init)[node_idx], &grad_stats[node_idx], block_id, nthread, tid, &(*hist_buffers)[node_idx], hist_size); @@ -1082,9 +1082,10 @@ void QuantileHistMaker::Builder::EvaluateSplitsBatch( // parallel enumeration #pragma omp parallel for schedule(guided) for (int32_t i = 0; i < tasks.size(); ++i) { + // node_idx : offset within `nodes` list const int32_t node_idx = tasks[i].first; const size_t fid = tasks[i].second; - const int32_t nid = nodes[node_idx].nid; + const int32_t nid = nodes[node_idx].nid; // usually node_idx != nid const int32_t sibling_nid = nodes[node_idx].sibling_nid; const int32_t parent_nid = nodes[node_idx].parent_nid; @@ -1110,6 +1111,9 @@ void QuantileHistMaker::Builder::EvaluateSplitsBatch( const bool compute_backward = this->EnumerateSplit(+1, gmat, hist_[nid], snode, info, &splits[i].first, fid, nid); + // Sometimes, we don't need to enumerate backward because forward and backward + // enumeration will give same loss values. This is the case if the particular feature + // column contains no missing values. So enumerate backward only if it's necessary. if (compute_backward) { this->EnumerateSplit(-1, gmat, hist_[nid], snode, info, &splits[i].first, fid, nid); @@ -1161,6 +1165,10 @@ void QuantileHistMaker::Builder::InitNewNode(int nid, } // enumerate the split values of specific feature +// d_step: +1 or -1, indicating the direction in which we scan candidate thresholds +// fid: feature for which we seek to pick the best threshold +// Returns false if we don't need to enumerate in the opposite direction. +// This is the case if the particular feature (fid) column contains no missing values.
bool QuantileHistMaker::Builder::EnumerateSplit(int d_step, const GHistIndexMatrix& gmat, const GHistRow& hist, @@ -1214,7 +1222,7 @@ bool QuantileHistMaker::Builder::EnumerateSplit(int d_step, } split_pt = cut_val[i]; - best.Update(loss_chg, fid, split_pt, d_step == -1, e, c); + best.Update(loss_chg, fid, split_pt, false, e, c); } } } diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index 7e2458862f43..5a4b6e5047bd 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -249,7 +249,7 @@ class QuantileHistMaker: public TreeUpdater { std::vector>* hist_is_init, const std::vector>& grad_stats); - void ExpandWithDepthWidth(const GHistIndexMatrix &gmat, + void ExpandWithDepthWise(const GHistIndexMatrix &gmat, const GHistIndexBlockMatrix &gmatb, const ColumnMatrix &column_matrix, DMatrix *p_fmat, From 556a11f6e6a80ecfa46d8006b2252b1b6cb0db6c Mon Sep 17 00:00:00 2001 From: egor Date: Wed, 26 Jun 2019 15:19:47 +0300 Subject: [PATCH 29/31] apply review comments --- src/tree/updater_quantile_hist.cc | 39 +++++++++++-------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 9d1f69d8b6e4..2af5fe57dc21 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -103,10 +103,8 @@ void QuantileHistMaker::Builder::BuildNodeStat( auto left_sibling_id = (*p_tree)[parent_id].LeftChild(); auto parent_split_feature_id = snode_[parent_id].best.SplitIndex(); - { - spliteval_->AddSplit(parent_id, left_sibling_id, nid, parent_split_feature_id, - snode_[left_sibling_id].weight, snode_[nid].weight); - } + spliteval_->AddSplit(parent_id, left_sibling_id, nid, parent_split_feature_id, + snode_[left_sibling_id].weight, snode_[nid].weight); } } @@ -138,7 +136,7 @@ void QuantileHistMaker::Builder::BuildNodeStatBatch( } template -std::pair PartitionDenseLeftDefaultKernel(const RowIdxType* rid, +inline std::pair PartitionDenseLeftDefaultKernel(const RowIdxType* rid, const IdxType* idx, const IdxType offset, const int32_t split_cond, const size_t istart, const size_t iend, RowIdxType* p_left, RowIdxType* p_right) { size_t ileft = 0; @@ -158,7 +156,7 @@ std::pair PartitionDenseLeftDefaultKernel(const RowIdxType* rid, } template -std::pair PartitionDenseRightDefaultKernel(const RowIdxType* rid, +inline std::pair PartitionDenseRightDefaultKernel(const RowIdxType* rid, const IdxType* idx, const IdxType offset, const int32_t split_cond, const size_t istart, const size_t iend, RowIdxType* p_left, RowIdxType* p_right) { size_t ileft = 0; @@ -177,7 +175,7 @@ std::pair PartitionDenseRightDefaultKernel(const RowIdxType* rid } template -std::pair PartitionSparseKernel(const RowIdxType* rowid, +inline std::pair PartitionSparseKernel(const RowIdxType* rowid, const IdxType* idx, const int32_t split_cond, const size_t ibegin, const size_t iend, RowIdxType* p_left, RowIdxType* p_right, Column column, bool default_left) { @@ -688,7 +686,7 @@ void QuantileHistMaker::Builder::ReduceHistograms( } } -void QuantileHistMaker::Builder::ExpandWithDepthWidth( +void QuantileHistMaker::Builder::ExpandWithDepthWise( const GHistIndexMatrix &gmat, const GHistIndexBlockMatrix &gmatb, const ColumnMatrix &column_matrix, @@ -807,7 +805,7 @@ void QuantileHistMaker::Builder::Update(const GHistIndexMatrix& gmat, if (param_.grow_policy == TrainParam::kLossGuide) { ExpandWithLossGuide(gmat, gmatb, column_matrix, p_fmat, p_tree, gpair_h); } else { - 
ExpandWithDepthWidth(gmat, gmatb, column_matrix, p_fmat, p_tree, gpair_h); + ExpandWithDepthWise(gmat, gmatb, column_matrix, p_fmat, p_tree, gpair_h); } for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) { @@ -1112,7 +1110,7 @@ void QuantileHistMaker::Builder::EvaluateSplitsBatch( info, &splits[i].first, fid, nid); // Sometimes, we don't need to enumerate backward because forward and backward - // enumeration will give same loss values. This is the case if the particular feature + // enumeration will give same loss values. This is the case if the particular feature // column contains no missing values. So enumerate backward only if it's necessary. if (compute_backward) { this->EnumerateSplit(-1, gmat, hist_[nid], snode, info, @@ -1214,14 +1212,9 @@ bool QuantileHistMaker::Builder::EnumerateSplit(int d_step, if (e.sum_hess >= param_.min_child_weight) { c.SetSubstract(snode.stats, e); if (c.sum_hess >= param_.min_child_weight) { - bst_float loss_chg; - bst_float split_pt; - { - loss_chg = static_cast(spliteval_->ComputeSplitScore(nodeID, + bst_float loss_chg = static_cast(spliteval_->ComputeSplitScore(nodeID, fid, e, c) - snode.root_gain); - } - - split_pt = cut_val[i]; + bst_float split_pt = cut_val[i]; best.Update(loss_chg, fid, split_pt, false, e, c); } } @@ -1237,15 +1230,11 @@ bool QuantileHistMaker::Builder::EnumerateSplit(int d_step, if (e.sum_hess >= param_.min_child_weight) { c.SetSubstract(snode.stats, e); if (c.sum_hess >= param_.min_child_weight) { - bst_float loss_chg; bst_float split_pt; - // backward enumeration: split at left bound of each bin - { - loss_chg = static_cast( - spliteval_->ComputeSplitScore(nodeID, fid, c, e) - - snode.root_gain); - } + bst_float loss_chg = static_cast( + spliteval_->ComputeSplitScore(nodeID, fid, c, e) - + snode.root_gain); if (i == imin) { // for leftmost bin, left bound is the smallest feature value @@ -1253,7 +1242,7 @@ bool QuantileHistMaker::Builder::EnumerateSplit(int d_step, } else { split_pt = cut_val[i - 1]; } - best.Update(loss_chg, fid, split_pt, d_step == -1, c, e); + best.Update(loss_chg, fid, split_pt, true, c, e); } } } From e8a7940101d719fae4ec17695cf06c4d5b3af5f4 Mon Sep 17 00:00:00 2001 From: Philip Cho Date: Wed, 26 Jun 2019 14:55:35 -0700 Subject: [PATCH 30/31] Remove unused function CreateNewNodes() --- src/tree/updater_quantile_hist.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index 5a4b6e5047bd..592257321553 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -283,19 +283,6 @@ class QuantileHistMaker: public TreeUpdater { const std::vector &gpair_h, const std::vector& nodes); - void CreateNewNodes(const GHistIndexMatrix &gmat, - const ColumnMatrix &column_matrix, - DMatrix *p_fmat, - RegTree *p_tree, - int *num_leaves, - int depth, - unsigned *timestamp, - std::vector *temp_qexpand_depth, - int32_t nid, - std::mutex* mutex_add_nodes, - const QuantileHistMaker::NodeEntry& snode, - RegTree::Node node); - int32_t FindSplitCond(int32_t nid, RegTree *p_tree, const GHistIndexMatrix &gmat); From adf00125574f6e924bb83c35a8bc0936ab458611 Mon Sep 17 00:00:00 2001 From: Philip Cho Date: Wed, 26 Jun 2019 14:57:14 -0700 Subject: [PATCH 31/31] Add descriptive comment on node_idx variable in QuantileHistMaker::Builder::BuildHistsBatch() --- src/tree/updater_quantile_hist.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 
2af5fe57dc21..9899ea61de4e 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -561,6 +561,7 @@ void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector& const size_t tid = omp_get_thread_num(); const int32_t nid = task_nid[itask]; const int32_t block_id = task_block_idx[itask]; + // node_idx : location of node `nid` within the `nodes` list. In general, node_idx != nid const int32_t node_idx = task_node_idx[itask]; common::GradStatHist::GradType* data_local_hist;
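The comments added in PATCH 27 describe the three-step scheme behind BuildHistsBatch: (1) split each node's rows into fixed-size blocks and turn the blocks into tasks, (2) let each thread accumulate into a private histogram buffer, (3) merge the partial histograms in parallel across bins. The following is a minimal standalone sketch of that scheme, not the patch's actual code; the names (BuildHistBlocked, GradPair) and the fixed block size of 512 are illustrative assumptions, and the CSR-style bin matrix stands in for GHistIndexMatrix.

#include <omp.h>
#include <algorithm>
#include <cstddef>
#include <vector>

struct GradPair { float grad = 0.f, hess = 0.f; };

// rows     : row ids belonging to the node
// row_ptr  : CSR offsets into `index` (one entry per row, plus one)
// index    : bin id of each non-zero feature value
// gpair    : per-row gradient pairs
// out_hist : nbins accumulated entries for the node
void BuildHistBlocked(const std::vector<std::size_t>& rows,
                      const std::vector<std::size_t>& row_ptr,
                      const std::vector<unsigned>& index,
                      const std::vector<GradPair>& gpair,
                      std::vector<GradPair>* out_hist) {
  const std::size_t nbins = out_hist->size();
  const std::size_t block_size = 512;  // rows per task, as in the patches
  const long n_blocks = static_cast<long>((rows.size() + block_size - 1) / block_size);
  const int nthread = omp_get_max_threads();

  // steps 1-2: tasks are blocks of rows; each thread fills a private histogram
  std::vector<std::vector<GradPair>> partial(nthread, std::vector<GradPair>(nbins));
  #pragma omp parallel for schedule(guided)
  for (long iblock = 0; iblock < n_blocks; ++iblock) {
    std::vector<GradPair>& h = partial[omp_get_thread_num()];
    const std::size_t ibegin = static_cast<std::size_t>(iblock) * block_size;
    const std::size_t iend = std::min(ibegin + block_size, rows.size());
    for (std::size_t i = ibegin; i < iend; ++i) {
      const std::size_t rid = rows[i];
      for (std::size_t j = row_ptr[rid]; j < row_ptr[rid + 1]; ++j) {
        h[index[j]].grad += gpair[rid].grad;
        h[index[j]].hess += gpair[rid].hess;
      }
    }
  }

  // step 3: merge the partial histograms, parallelized across bins
  #pragma omp parallel for schedule(static)
  for (long bin = 0; bin < static_cast<long>(nbins); ++bin) {
    for (int tid = 0; tid < nthread; ++tid) {
      (*out_hist)[bin].grad += partial[tid][bin].grad;
      (*out_hist)[bin].hess += partial[tid][bin].hess;
    }
  }
}

Compared with atomics on one shared histogram, this trades memory (one buffer per thread) for contention-free accumulation; the real code additionally tracks which buffers a thread actually touched so untouched ones are skipped during the merge.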
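The PartitionDense*DefaultKernel functions in the diffs split a block of rows between the two children by comparing each row's bin id against the split condition, writing into per-task buffers that CreateNewNodesBatch later memcpy's back into row_set_collection_ at the precomputed offsets (tasks[i].ileft, tasks[i].iright). A simplified sketch of one such kernel, with the templates stripped and an assumed missing-value encoding (the max representable bin id, as in the dense column layout):

#include <cstddef>
#include <cstdint>
#include <limits>
#include <utility>

// Returns (#left, #right). Rows whose bin id is "missing" go to the default side.
std::pair<std::size_t, std::size_t> PartitionDenseKernel(
    const std::size_t* rid, const std::uint32_t* bins, std::int32_t split_cond,
    bool default_left, std::size_t istart, std::size_t iend,
    std::size_t* p_left, std::size_t* p_right) {
  const std::uint32_t kMissing = std::numeric_limits<std::uint32_t>::max();
  std::size_t ileft = 0, iright = 0;
  for (std::size_t i = istart; i < iend; ++i) {
    const std::uint32_t bin = bins[rid[i]];
    const bool go_left =
        (bin == kMissing) ? default_left : (static_cast<std::int32_t>(bin) <= split_cond);
    if (go_left) {
      p_left[ileft++] = rid[i];   // compacted into a per-task left buffer
    } else {
      p_right[iright++] = rid[i]; // compacted into a per-task right buffer
    }
  }
  return {ileft, iright};
}

Returning the two counts is what lets the caller compute exclusive prefix sums over a node's tasks and copy every buffer back in parallel without synchronization, which is exactly what the ileft/iright fields added to the task struct encode.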
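The comment added to EnumerateSplit in PATCH 28 explains when the backward scan is redundant: if a feature column has no missing values, the forward scan already enumerates every possible partition, so scanning from the right would reproduce the same loss values. A toy sketch of the idea follows; the gain formula is a placeholder (the real code calls spliteval_->ComputeSplitScore) and all names here are illustrative.

#include <cstddef>

struct Stats { double grad = 0, hess = 0; };

// bin_stats : per-bin gradient sums for one feature; parent : stats of the node.
// Returns true if a backward scan is still needed (the column has missing values).
bool EnumerateForward(const Stats* bin_stats, std::size_t nbins,
                      const Stats& parent, double* best_gain) {
  Stats e;                  // rows routed left so far
  double covered_hess = 0;  // hessian mass present in this feature column
  for (std::size_t i = 0; i < nbins; ++i) {
    e.grad += bin_stats[i].grad;
    e.hess += bin_stats[i].hess;
    covered_hess += bin_stats[i].hess;
    Stats c;                // complement, routed right
    c.grad = parent.grad - e.grad;
    c.hess = parent.hess - e.hess;
    // placeholder gain; xgboost uses the regularized structure score instead
    const double gain =
        e.grad * e.grad / (e.hess + 1.0) + c.grad * c.grad / (c.hess + 1.0);
    if (gain > *best_gain) *best_gain = gain;
  }
  // if some hessian mass never appeared in any bin, those rows are missing for
  // this feature, and routing them the other way (backward scan) can differ
  return covered_hess < parent.hess;
}

This also clarifies the related change in the series: once the scan direction is fixed, the default-left flag passed to best.Update can be a constant (false for the forward scan, true for the backward scan) instead of being derived from d_step at every candidate.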