Commit

Optimized ApplySplit, BuildHist and UpdatePredictCache functions on CPU (#5244)

* Split up sparse and dense build hist kernels.
* Add `PartitionBuilder`.
SmirnovEgorRu authored Feb 29, 2020
1 parent b81f8cb commit 1b97eaf
Showing 9 changed files with 683 additions and 376 deletions.
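
The first bullet above refers to specializing the histogram-building loop for dense and sparse row layouts: in a dense matrix every row holds exactly n_features bin indices, so the kernel can address a row's slice directly instead of going through row_ptr. The following is a minimal standalone sketch of that idea under simplified, hypothetical types (GradPair, BuildHistDenseSketch, BuildHistSparseSketch); the actual committed kernels appear in src/common/hist_util.cc below and additionally template on the floating-point type and add software prefetching.

#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified stand-in for XGBoost's gradient pair (illustration only).
struct GradPair { double grad, hess; };

// Dense case: every row stores exactly n_features bin indices, so the slice of
// row r is [r * n_features, (r + 1) * n_features) and no row_ptr lookup is needed.
void BuildHistDenseSketch(const std::vector<GradPair>& gpair,
                          const std::vector<uint32_t>& bin_index,
                          size_t n_features,
                          std::vector<GradPair>* hist) {
  const size_t n_rows = bin_index.size() / n_features;
  for (size_t r = 0; r < n_rows; ++r) {
    for (size_t j = r * n_features; j < (r + 1) * n_features; ++j) {
      (*hist)[bin_index[j]].grad += gpair[r].grad;
      (*hist)[bin_index[j]].hess += gpair[r].hess;
    }
  }
}

// Sparse case: rows have variable length, so a CSR-style row_ptr is required.
void BuildHistSparseSketch(const std::vector<GradPair>& gpair,
                           const std::vector<uint32_t>& bin_index,
                           const std::vector<size_t>& row_ptr,
                           std::vector<GradPair>* hist) {
  for (size_t r = 0; r + 1 < row_ptr.size(); ++r) {
    for (size_t j = row_ptr[r]; j < row_ptr[r + 1]; ++j) {
      (*hist)[bin_index[j]].grad += gpair[r].grad;
      (*hist)[bin_index[j]].hess += gpair[r].hess;
    }
  }
}

int main() {
  // Two rows, two features, four histogram bins; both paths produce the same totals.
  std::vector<GradPair> gpair = {{0.5, 1.0}, {-0.25, 1.0}};
  std::vector<uint32_t> bin_index = {0, 2, 1, 3};  // row-major, n_features == 2
  std::vector<size_t> row_ptr = {0, 2, 4};         // CSR view of the same data
  std::vector<GradPair> hist_dense(4), hist_sparse(4);
  BuildHistDenseSketch(gpair, bin_index, /*n_features=*/2, &hist_dense);
  BuildHistSparseSketch(gpair, bin_index, row_ptr, &hist_sparse);
  return 0;
}
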
5 changes: 3 additions & 2 deletions src/common/column_matrix.h
@@ -37,6 +37,7 @@ class Column {
size_t Size() const { return len_; }
uint32_t GetGlobalBinIdx(size_t idx) const { return index_base_ + index_[idx]; }
uint32_t GetFeatureBinIdx(size_t idx) const { return index_[idx]; }
common::Span<const uint32_t> GetFeatureBinIdxPtr() const { return { index_, len_ }; }
// column.GetFeatureBinIdx(idx) + column.GetBaseIdx(idx) ==
// column.GetGlobalBinIdx(idx)
uint32_t GetBaseIdx() const { return index_base_; }
@@ -186,8 +187,8 @@ class ColumnMatrix {

std::vector<size_t> feature_counts_;
std::vector<ColumnType> type_;
SimpleArray<uint32_t> index_; // index_: may store smaller integers; needs padding
SimpleArray<size_t> row_ind_;
std::vector<uint32_t> index_; // index_: may store smaller integers; needs padding
std::vector<size_t> row_ind_;
std::vector<ColumnBoundary> boundary_;

// index_base_[fid]: least bin id for feature fid
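
Besides swapping the hand-rolled SimpleArray for std::vector, the column_matrix.h hunk adds GetFeatureBinIdxPtr(), which exposes a column's bin indices as a common::Span so hot loops can walk the raw buffer instead of calling GetFeatureBinIdx() per element. A rough sketch of that access pattern; SpanView and CountBin are hypothetical stand-ins used here because common::Span itself is not part of this diff.

#include <cstddef>
#include <cstdint>
#include <vector>

// Minimal stand-in for common::Span<const uint32_t> (illustration only).
template <typename T>
class SpanView {
 public:
  SpanView(const T* data, size_t size) : data_(data), size_(size) {}
  const T* begin() const { return data_; }
  const T* end() const { return data_ + size_; }
 private:
  const T* data_;
  size_t size_;
};

// Count how many entries of a column fall into a given bin by scanning the
// span-like view directly rather than through per-element accessors.
size_t CountBin(SpanView<uint32_t> feature_bins, uint32_t bin) {
  size_t n = 0;
  for (uint32_t b : feature_bins) {
    if (b == bin) ++n;
  }
  return n;
}

int main() {
  std::vector<uint32_t> bins = {1, 3, 3, 7, 3};
  SpanView<uint32_t> view(bins.data(), bins.size());
  return CountBin(view, 3) == 3 ? 0 : 1;
}
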
147 changes: 124 additions & 23 deletions src/common/hist_util.cc
@@ -672,7 +672,7 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
}

/*!
* \brief fill a histogram by zeroes
* \brief fill a histogram by zeros in range [begin, end)
*/
void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end) {
memset(hist.data() + begin, '\0', (end-begin)*sizeof(tree::GradStats));
@@ -719,43 +719,144 @@ void SubtractionHist(GHistRow dst, const GHistRow src1, const GHistRow src2,
}
}


void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow hist) {
const size_t* rid = row_indices.begin;
const size_t nrows = row_indices.Size();
const uint32_t* index = gmat.index.data();
const size_t* row_ptr = gmat.row_ptr.data();
struct Prefetch {
public:
static constexpr size_t kCacheLineSize = 64;
static constexpr size_t kPrefetchOffset = 10;
static constexpr size_t kPrefetchStep =
kCacheLineSize / sizeof(decltype(GHistIndexMatrix::index)::value_type);

private:
static constexpr size_t kNoPrefetchSize =
kPrefetchOffset + kCacheLineSize /
sizeof(decltype(GHistIndexMatrix::row_ptr)::value_type);

public:
static size_t NoPrefetchSize(size_t rows) {
return std::min(rows, kNoPrefetchSize);
}
};

constexpr size_t Prefetch::kNoPrefetchSize;

template<typename FPType, bool do_prefetch>
void BuildHistDenseKernel(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
const size_t n_features,
GHistRow hist) {
const size_t size = row_indices.Size();
const size_t* rid = row_indices.begin;
const float* pgh = reinterpret_cast<const float*>(gpair.data());
const uint32_t* gradient_index = gmat.index.data();
FPType* hist_data = reinterpret_cast<FPType*>(hist.data());

const uint32_t two {2}; // Each element from 'gpair' and 'hist' contains
// 2 FP values: gradient and hessian.
// So we need to multiply each row-index/bin-index by 2
// to work with gradient pairs as a single row FP array

for (size_t i = 0; i < size; ++i) {
const size_t icol_start = rid[i] * n_features;
const size_t idx_gh = two * rid[i];

if (do_prefetch) {
const size_t icol_start_prefetch = rid[i + Prefetch::kPrefetchOffset] * n_features;

double* hist_data = reinterpret_cast<double*>(hist.data());
PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
for (size_t j = icol_start_prefetch; j < icol_start_prefetch + n_features;
j += Prefetch::kPrefetchStep) {
PREFETCH_READ_T0(gradient_index + j);
}
}

for (size_t j = icol_start; j < icol_start + n_features; ++j) {
const uint32_t idx_bin = two * gradient_index[j];

hist_data[idx_bin] += pgh[idx_gh];
hist_data[idx_bin+1] += pgh[idx_gh+1];
}
}
}

template<typename FPType, bool do_prefetch>
void BuildHistSparseKernel(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow hist) {
const size_t size = row_indices.Size();
const size_t* rid = row_indices.begin;
const float* pgh = reinterpret_cast<const float*>(gpair.data());
const uint32_t* gradient_index = gmat.index.data();
const size_t* row_ptr = gmat.row_ptr.data();
FPType* hist_data = reinterpret_cast<FPType*>(hist.data());

const size_t cache_line_size = 64;
const size_t prefetch_offset = 10;
size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid);
no_prefetch_size = no_prefetch_size > nrows ? nrows : no_prefetch_size;
const uint32_t two {2}; // Each element from 'gpair' and 'hist' contains
// 2 FP values: gradient and hessian.
// So we need to multiply each row-index/bin-index by 2
// to work with gradient pairs as a single row FP array

for (size_t i = 0; i < nrows; ++i) {
for (size_t i = 0; i < size; ++i) {
const size_t icol_start = row_ptr[rid[i]];
const size_t icol_end = row_ptr[rid[i]+1];
const size_t idx_gh = two * rid[i];

if (do_prefetch) {
const size_t icol_start_prftch = row_ptr[rid[i+Prefetch::kPrefetchOffset]];
const size_t icol_end_prefect = row_ptr[rid[i+Prefetch::kPrefetchOffset]+1];

if (i < nrows - no_prefetch_size) {
PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]);
PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]);
PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
for (size_t j = icol_start_prftch; j < icol_end_prefect; j+=Prefetch::kPrefetchStep) {
PREFETCH_READ_T0(gradient_index + j);
}
}

for (size_t j = icol_start; j < icol_end; ++j) {
const uint32_t idx_bin = 2*index[j];
const size_t idx_gh = 2*rid[i];

hist_data[idx_bin] += pgh[idx_gh];
const uint32_t idx_bin = two * gradient_index[j];
hist_data[idx_bin] += pgh[idx_gh];
hist_data[idx_bin+1] += pgh[idx_gh+1];
}
}
}

template<typename FPType, bool do_prefetch>
void BuildHistKernel(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat, const bool isDense, GHistRow hist) {
if (row_indices.Size() && isDense) {
const size_t* row_ptr = gmat.row_ptr.data();
const size_t n_features = row_ptr[row_indices.begin[0]+1] - row_ptr[row_indices.begin[0]];
BuildHistDenseKernel<FPType, do_prefetch>(gpair, row_indices, gmat, n_features, hist);
} else {
BuildHistSparseKernel<FPType, do_prefetch>(gpair, row_indices, gmat, hist);
}
}

void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow hist,
bool isDense) {
using FPType = decltype(tree::GradStats::sum_grad);
const size_t nrows = row_indices.Size();
const size_t no_prefetch_size = Prefetch::NoPrefetchSize(nrows);

// if we need to work with all rows from the bin-matrix (e.g. the root node)
const bool contiguousBlock = (row_indices.begin[nrows - 1] - row_indices.begin[0]) == (nrows - 1);

if (contiguousBlock) {
// contiguous memory access, built-in HW prefetching is enough
BuildHistKernel<FPType, false>(gpair, row_indices, gmat, isDense, hist);
} else {
const RowSetCollection::Elem span1(row_indices.begin, row_indices.end - no_prefetch_size);
const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size, row_indices.end);

BuildHistKernel<FPType, true>(gpair, span1, gmat, isDense, hist);
// no prefetching to avoid loading extra memory
BuildHistKernel<FPType, false>(gpair, span2, gmat, isDense, hist);
}
}

void GHistBuilder::BuildBlockHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexBlockMatrix& gmatb,
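
The new kernels above multiply every row and bin index by two because each element of gpair and hist packs two floating-point values, the gradient and the hessian, and the kernels address them through a flat pointer. A tiny self-contained illustration of that layout assumption (a standard-layout struct of two floats viewed as a flat float array, the same trick the kernels rely on):

#include <cassert>
#include <cstddef>
#include <vector>

// Each logical element packs {gradient, hessian}; viewed as flat floats,
// element k occupies positions 2*k and 2*k + 1, hence the factor of two.
struct PackedPair { float grad; float hess; };

int main() {
  std::vector<PackedPair> gpair = {{0.5f, 1.0f}, {-0.25f, 2.0f}};
  const float* pgh = reinterpret_cast<const float*>(gpair.data());
  for (size_t k = 0; k < gpair.size(); ++k) {
    const size_t idx_gh = 2 * k;          // same pattern as idx_gh = two * rid[i]
    assert(pgh[idx_gh] == gpair[k].grad);
    assert(pgh[idx_gh + 1] == gpair[k].hess);
  }
  return 0;
}
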
90 changes: 10 additions & 80 deletions src/common/hist_util.h
@@ -1,5 +1,5 @@
/*!
* Copyright 2017 by Contributors
* Copyright 2017-2020 by Contributors
* \file hist_util.h
* \brief Utility for fast histogram aggregation
* \author Philip Cho, Tianqi Chen
@@ -25,75 +25,6 @@

namespace xgboost {
namespace common {

/*
* \brief A thin wrapper around dynamically allocated C-style array.
* Make sure to call resize() before use.
*/
template<typename T>
struct SimpleArray {
~SimpleArray() {
std::free(ptr_);
ptr_ = nullptr;
}

void resize(size_t n) {
T* ptr = static_cast<T*>(std::malloc(n * sizeof(T)));
CHECK(ptr) << "Failed to allocate memory";
if (ptr_) {
std::memcpy(ptr, ptr_, n_ * sizeof(T));
std::free(ptr_);
}
ptr_ = ptr;
n_ = n;
}

T& operator[](size_t idx) {
return ptr_[idx];
}

T& operator[](size_t idx) const {
return ptr_[idx];
}

size_t size() const {
return n_;
}

T back() const {
return ptr_[n_-1];
}

T* data() {
return ptr_;
}

const T* data() const {
return ptr_;
}


T* begin() {
return ptr_;
}

const T* begin() const {
return ptr_;
}

T* end() {
return ptr_ + n_;
}

const T* end() const {
return ptr_ + n_;
}

private:
T* ptr_ = nullptr;
size_t n_ = 0;
};

/*!
* \brief A single row in global histogram index.
* Directly represent the global index in the histogram entry.
@@ -161,7 +92,7 @@ class HistogramCuts {
return idx;
}

BinIdx SearchBin(Entry const& e) {
BinIdx SearchBin(Entry const& e) const {
return SearchBin(e.fvalue, e.index);
}
};
@@ -261,8 +192,9 @@ size_t DeviceSketch(int device,

/*!
* \brief preprocessed global index matrix, in CSR format
* Transform floating values to integer index in histogram
* This is a global histogram index.
*
* Transform floating values to integer indices in the histogram. This is a global
* histogram index for the CPU; on the GPU an ellpack page is used instead.
*/
struct GHistIndexMatrix {
/*! \brief row pointer to rows by element position */
@@ -606,17 +538,15 @@ class ParallelGHistBuilder {
*/
class GHistBuilder {
public:
// initialize builder
inline void Init(size_t nthread, uint32_t nbins) {
nthread_ = nthread;
nbins_ = nbins;
}
GHistBuilder() : nthread_{0}, nbins_{0} {}
GHistBuilder(size_t nthread, uint32_t nbins) : nthread_{nthread}, nbins_{nbins} {}

// construct a histogram via histogram aggregation
void BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow hist);
GHistRow hist,
bool isDense);
// same, with feature grouping
void BuildBlockHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
@@ -625,7 +555,7 @@ class GHistBuilder {
// construct a histogram via subtraction trick
void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent);

uint32_t GetNumBins() {
uint32_t GetNumBins() const {
return nbins_;
}

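
GHistBuilder::BuildHist, declared above with the new isDense flag, splits the row set into a main range that is processed with software prefetching at a fixed look-ahead offset and a short tail processed without it, so the prefetch never indexes past the end of the row list (the span1/span2 split in src/common/hist_util.cc). Below is a standalone sketch of that pattern under simplified types; PREFETCH_READ_T0 in the real code wraps a platform prefetch intrinsic, and __builtin_prefetch (GCC/Clang) is used here as an assumption.

#include <algorithm>
#include <cstddef>
#include <vector>

// Gather-style accumulation with software prefetching at a fixed offset,
// split into a main range (prefetching is safe) and a short tail (no prefetch).
double GatherSum(const std::vector<size_t>& rid, const std::vector<double>& values) {
  constexpr size_t kPrefetchOffset = 10;  // look this many iterations ahead
  const size_t n = rid.size();
  const size_t no_prefetch = std::min(n, kPrefetchOffset);
  double sum = 0.0;
  // Main range: rid[i + kPrefetchOffset] is guaranteed to be in bounds here.
  for (size_t i = 0; i + no_prefetch < n; ++i) {
    __builtin_prefetch(&values[rid[i + kPrefetchOffset]], /*rw=*/0, /*locality=*/3);
    sum += values[rid[i]];
  }
  // Tail: processed without prefetching, so we never read past the end of rid.
  for (size_t i = n - no_prefetch; i < n; ++i) {
    sum += values[rid[i]];
  }
  return sum;
}

int main() {
  std::vector<size_t> rid = {0, 2, 4, 1, 3};
  std::vector<double> values = {1.0, 2.0, 3.0, 4.0, 5.0};
  return GatherSum(rid, values) == 15.0 ? 0 : 1;
}

The tail length matching the prefetch distance is the same reason Prefetch::NoPrefetchSize() clamps against the number of rows in the committed code.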