From c6df959ffbe027b3929080b66a318646cc9efc85 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 10 Feb 2020 11:56:36 +0000 Subject: [PATCH 01/13] Extend derivatives aggregation and solving for best split to multi-parameter loss functions --- include/maths/CBoostedTree.h | 86 +++-- .../maths/CBoostedTreeLeafNodeStatistics.h | 325 ++++++++++++++---- include/maths/CBoostedTreeUtils.h | 58 +++- include/maths/CLinearAlgebraEigen.h | 22 ++ include/maths/CLinearAlgebraShims.h | 40 ++- lib/maths/CBoostedTree.cc | 101 +++--- lib/maths/CBoostedTreeImpl.cc | 53 +-- lib/maths/CBoostedTreeLeafNodeStatistics.cc | 189 ++++++---- lib/maths/CBoostedTreeUtils.cc | 91 +++-- .../CBoostedTreeLeafNodeStatisticsTest.cc | 297 ++++++++++++++++ lib/maths/unittest/CBoostedTreeTest.cc | 99 ++++-- lib/maths/unittest/Makefile | 157 +++------ 12 files changed, 1109 insertions(+), 409 deletions(-) create mode 100644 lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc diff --git a/include/maths/CBoostedTree.h b/include/maths/CBoostedTree.h index f6556779e7..fb770b1d53 100644 --- a/include/maths/CBoostedTree.h +++ b/include/maths/CBoostedTree.h @@ -35,7 +35,8 @@ class CEncodedDataFrameRowRef; namespace boosted_tree_detail { class MATHS_EXPORT CArgMinLossImpl { public: - using TDouble1Vec = core::CSmallVector; + using TDoubleVector = CDenseVector; + using TMemoryMappedFloatVector = CMemoryMappedDenseVector; public: CArgMinLossImpl(double lambda); @@ -43,9 +44,11 @@ class MATHS_EXPORT CArgMinLossImpl { virtual std::unique_ptr clone() const = 0; virtual bool nextPass() = 0; - virtual void add(TDouble1Vec prediction, double actual, double weight = 1.0) = 0; + virtual void add(const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0) = 0; virtual void merge(const CArgMinLossImpl& other) = 0; - virtual TDouble1Vec value() const = 0; + virtual TDoubleVector value() const = 0; protected: double lambda() const; @@ -61,9 +64,9 @@ class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl { CArgMinMseImpl(double lambda); std::unique_ptr clone() const override; bool nextPass() override; - void add(TDouble1Vec prediction, double actual, double weight = 1.0) override; + void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override; void merge(const CArgMinLossImpl& other) override; - TDouble1Vec value() const override; + TDoubleVector value() const override; private: using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; @@ -79,14 +82,14 @@ class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl { CArgMinLogisticImpl(double lambda); std::unique_ptr clone() const override; bool nextPass() override; - void add(TDouble1Vec prediction, double actual, double weight = 1.0) override; + void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override; void merge(const CArgMinLossImpl& other) override; - TDouble1Vec value() const override; + TDoubleVector value() const override; private: using TMinMaxAccumulator = CBasicStatistics::CMinMax; - using TVector = CVectorNx1; - using TVectorVec = std::vector; + using TDoubleVector2x1 = CVectorNx1; + using TDoubleVector2x1Vec = std::vector; private: std::size_t bucket(double prediction) const { @@ -108,8 +111,8 @@ class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl { private: std::size_t m_CurrentPass = 0; TMinMaxAccumulator m_PredictionMinMax; - TVector m_CategoryCounts; - TVectorVec m_BucketCategoryCounts; + TDoubleVector2x1 m_CategoryCounts; + 
TDoubleVector2x1Vec m_BucketCategoryCounts; }; } @@ -118,7 +121,8 @@ namespace boosted_tree { //! \brief Computes the leaf value which minimizes the loss function. class MATHS_EXPORT CArgMinLoss { public: - using TDouble1Vec = core::CSmallVector; + using TDoubleVector = CDenseVector; + using TMemoryMappedFloatVector = CMemoryMappedDenseVector; public: CArgMinLoss(const CArgMinLoss& other); @@ -133,7 +137,7 @@ class MATHS_EXPORT CArgMinLoss { bool nextPass() const; //! Update with a point prediction and actual value. - void add(TDouble1Vec prediction, double actual, double weight = 1.0); + void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0); //! Get the minimiser over the predictions and actual values added to both //! this and \p other. @@ -144,7 +148,7 @@ class MATHS_EXPORT CArgMinLoss { //! //! Formally, returns \f$x^* = arg\min_x\{\sum_i{L(p_i + x, a_i)}\}\f$ //! for predictions and actuals \f$p_i\f$ and \f$a_i\f$, respectively. - TDouble1Vec value() const; + TDoubleVector value() const; private: using TArgMinLossImplUPtr = std::unique_ptr; @@ -161,7 +165,9 @@ class MATHS_EXPORT CArgMinLoss { //! \brief Defines the loss function for the regression problem. class MATHS_EXPORT CLoss { public: - using TDouble1Vec = core::CSmallVector; + using TDoubleVector = CDenseVector; + using TMemoryMappedFloatVector = CMemoryMappedDenseVector; + using TWriter = std::function; public: virtual ~CLoss() = default; @@ -170,17 +176,23 @@ class MATHS_EXPORT CLoss { //! The number of parameters to the loss function. virtual std::size_t numberParameters() const = 0; //! The value of the loss function. - virtual double value(TDouble1Vec prediction, double actual, double weight = 1.0) const = 0; + virtual double value(const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0) const = 0; //! The gradient of the loss function. - virtual TDouble1Vec - gradient(TDouble1Vec prediction, double actual, double weight = 1.0) const = 0; + virtual void gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const = 0; //! The Hessian of the loss function (flattened). - virtual TDouble1Vec - curvature(TDouble1Vec prediction, double actual, double weight = 1.0) const = 0; + virtual void curvature(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const = 0; //! Returns true if the loss curvature is constant. virtual bool isCurvatureConstant() const = 0; //! Transforms a prediction from the forest to the target space. - virtual TDouble1Vec transform(TDouble1Vec prediction) const = 0; + virtual TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const = 0; //! Get an object which computes the leaf value that minimises loss. virtual CArgMinLoss minimizer(double lambda) const = 0; //! 
Get the name of the loss function @@ -198,11 +210,19 @@ class MATHS_EXPORT CMse final : public CLoss { public: std::unique_ptr clone() const override; std::size_t numberParameters() const override; - double value(TDouble1Vec prediction, double actual, double weight = 1.0) const override; - TDouble1Vec gradient(TDouble1Vec prediction, double actual, double weight = 1.0) const override; - TDouble1Vec curvature(TDouble1Vec prediction, double actual, double weight = 1.0) const override; + double value(const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0) const override; + void gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const override; + void curvature(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const override; bool isCurvatureConstant() const override; - TDouble1Vec transform(TDouble1Vec prediction) const override; + TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override; CArgMinLoss minimizer(double lambda) const override; const std::string& name() const override; }; @@ -223,11 +243,19 @@ class MATHS_EXPORT CBinomialLogistic final : public CLoss { public: std::unique_ptr clone() const override; std::size_t numberParameters() const override; - double value(TDouble1Vec prediction, double actual, double weight = 1.0) const override; - TDouble1Vec gradient(TDouble1Vec prediction, double actual, double weight = 1.0) const override; - TDouble1Vec curvature(TDouble1Vec prediction, double actual, double weight = 1.0) const override; + double value(const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0) const override; + void gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const override; + void curvature(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const override; bool isCurvatureConstant() const override; - TDouble1Vec transform(TDouble1Vec prediction) const override; + TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override; CArgMinLoss minimizer(double lambda) const override; const std::string& name() const override; }; diff --git a/include/maths/CBoostedTreeLeafNodeStatistics.h b/include/maths/CBoostedTreeLeafNodeStatistics.h index 74f6588510..90cc57376e 100644 --- a/include/maths/CBoostedTreeLeafNodeStatistics.h +++ b/include/maths/CBoostedTreeLeafNodeStatistics.h @@ -13,12 +13,19 @@ #include #include +#include +#include +#include #include +#include +#include #include #include +#include #include +#include #include namespace ml { @@ -37,14 +44,260 @@ class CEncodedDataFrameRowRef; //! The regression tree is grown top down by greedily selecting the split with //! the maximum gain (in the loss). This finds and scores the maximum gain split //! of a single leaf of the tree. 
-class CBoostedTreeLeafNodeStatistics final { +class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { public: + using TDoubleVec = std::vector; using TSizeVec = std::vector; using TSizeDoublePr = std::pair; using TRegularization = CBoostedTreeRegularization; - using TImmutableRadixSetVec = std::vector>; + using TImmutableRadixSet = core::CImmutableRadixSet; + using TImmutableRadixSetVec = std::vector; using TPtr = std::shared_ptr; using TPtrPtrPr = std::pair; + using TMemoryMappedFloatVector = CMemoryMappedDenseVector; + using TMemoryMappedDoubleVector = CMemoryMappedDenseVector; + using TDoubleVector = CDenseVector; + using TDoubleMatrix = CDenseMatrix; + + //! \brief Aggregate derivatives (gradient and Hessian). + struct MATHS_EXPORT SDerivatives { + SDerivatives(std::size_t count, TDoubleVector gradient, TDoubleMatrix curvature) + : s_Count{count}, s_Gradient{std::move(gradient)}, s_Curvature{std::move(curvature)} { + } + //! \warning This assumes that the curvature is stored flat row major. + SDerivatives(std::size_t count, + const TMemoryMappedDoubleVector& gradient, + const TMemoryMappedDoubleVector& curvature) + : s_Count{count}, s_Gradient{gradient.size()}, s_Curvature{gradient.size(), + gradient.size()} { + for (int i = 0; i < gradient.size(); ++i) { + s_Gradient[i] = gradient[i]; + } + // We only copy the upper triangle and always use selfadjointView + // when working with the actual Hessian. + for (int i = 0, k = 0; i < gradient.size(); ++i) { + for (int j = i; j < gradient.size(); ++j, ++k) { + s_Curvature(i, j) = curvature(k); + } + } + } + + static std::size_t estimateMemoryUsage(std::size_t numberLossParameters) { + return sizeof(SDerivatives) + + las::estimateMemoryUsage(numberLossParameters) + + las::estimateMemoryUsage(numberLossParameters, + numberLossParameters); + } + + std::size_t s_Count; + TDoubleVector s_Gradient; + //! Note only the upper triangle is initialized since this is symmetric. + TDoubleMatrix s_Curvature; + }; + + using TDerivativesVec = std::vector; + using TDerivativesVecVec = std::vector; + + //! \brief Accumulates aggregate derivatives. + class MATHS_EXPORT CDerivativesAccumulator { + public: + CDerivativesAccumulator(const TMemoryMappedDoubleVector& gradient, + const TMemoryMappedDoubleVector& curvature) + : CDerivativesAccumulator{0, gradient, curvature} {} + CDerivativesAccumulator(std::size_t count, + const TMemoryMappedDoubleVector& gradient, + const TMemoryMappedDoubleVector& curvature) + : m_Count{count}, m_Gradient{gradient}, m_Curvature{curvature} {} + + //! Get the accumulated count. + std::size_t count() const { return m_Count; } + + //! Get the accumulated gradient. + const TMemoryMappedDoubleVector& gradient() const { return m_Gradient; } + + //! Get the accumulated curvature. + const TMemoryMappedDoubleVector& curvature() const { + return m_Curvature; + } + + //! Add \p count, \p gradient and \p curvature to the accumulator. + void add(std::size_t count, + const TMemoryMappedFloatVector& gradient, + const TMemoryMappedFloatVector& curvature) { + m_Count += count; + m_Gradient += gradient; + m_Curvature += curvature; + } + + //! Compute the accumulation of both collections of derivatives. + void merge(const CDerivativesAccumulator& other) { + m_Count += other.m_Count; + m_Gradient += other.m_Gradient; + m_Curvature += other.m_Curvature; + } + + //! Get a checksum of this object. 
+ std::uint64_t checksum(std::uint64_t seed = 0) const { + seed = CChecksum::calculate(seed, m_Count); + seed = CChecksum::calculate(seed, m_Gradient); + return CChecksum::calculate(seed, m_Curvature); + } + + private: + std::size_t m_Count = 0; + TMemoryMappedDoubleVector m_Gradient; + TMemoryMappedDoubleVector m_Curvature; + }; + + using TDerivativesAccumulatorVec = std::vector; + using TDerivativesAccumulatorVecVec = std::vector; + + //! \brief A collection of aggregate derivatives for candidate feature splits. + class MATHS_EXPORT CSplitDerivativesAccumulator { + public: + CSplitDerivativesAccumulator(const TImmutableRadixSetVec& candidateSplits, + std::size_t numberLossParameters) + : m_NumberLossParameters{numberLossParameters} { + + std::size_t totalNumberCandidateSplits{std::accumulate( + candidateSplits.begin(), candidateSplits.end(), std::size_t{0}, + [](std::size_t size, const TImmutableRadixSet& splits) { + return size + splits.size() + 1; + })}; + int numberGradients{static_cast(numberLossParameters)}; + int numberCurvatures{static_cast( + boosted_tree_detail::lossHessianStoredSize(numberLossParameters))}; + int numberDerivatives{numberGradients + numberCurvatures}; + m_Derivatives.resize(candidateSplits.size()); + m_DerivativesStorage.resize(totalNumberCandidateSplits * numberDerivatives, 0.0); + m_MissingDerivatives.reserve(candidateSplits.size()); + m_MissingDerivativesStorage.resize(candidateSplits.size() * numberDerivatives, 0.0); + + double* storage{nullptr}; + for (std::size_t i = 0, m = 0, n = 0; i < candidateSplits.size(); + ++i, m += numberDerivatives) { + + m_Derivatives[i].reserve(candidateSplits[i].size() + 1); + for (std::size_t j = 0; j <= candidateSplits[i].size(); + ++j, n += numberDerivatives) { + storage = &m_DerivativesStorage[n]; + m_Derivatives[i].emplace_back( + TMemoryMappedDoubleVector{storage, numberGradients}, + TMemoryMappedDoubleVector{storage + numberGradients, numberCurvatures}); + } + + storage = &m_MissingDerivativesStorage[m]; + m_MissingDerivatives.emplace_back( + TMemoryMappedDoubleVector{storage, numberGradients}, + TMemoryMappedDoubleVector{storage + numberGradients, numberCurvatures}); + } + } + + CSplitDerivativesAccumulator(const CSplitDerivativesAccumulator& other) + : m_NumberLossParameters{other.m_NumberLossParameters}, + m_DerivativesStorage{other.m_DerivativesStorage}, + m_MissingDerivativesStorage{other.m_MissingDerivativesStorage} { + + int numberGradients{static_cast(m_NumberLossParameters)}; + int numberCurvatures{static_cast( + boosted_tree_detail::lossHessianStoredSize(m_NumberLossParameters))}; + int numberDerivatives{numberGradients + numberCurvatures}; + + m_Derivatives.resize(other.m_Derivatives.size()); + m_MissingDerivatives.reserve(other.m_MissingDerivatives.size()); + + double* storage{nullptr}; + for (std::size_t i = 0, m = 0, n = 0; + i < other.m_Derivatives.size(); ++i, m += numberDerivatives) { + + m_Derivatives[i].reserve(other.m_Derivatives[i].size()); + for (std::size_t j = 0; j < other.m_Derivatives[i].size(); + ++j, n += numberDerivatives) { + storage = &m_DerivativesStorage[n]; + m_Derivatives[i].emplace_back( + other.m_Derivatives[i][j].count(), + TMemoryMappedDoubleVector{storage, numberGradients}, + TMemoryMappedDoubleVector{storage + numberGradients, numberCurvatures}); + } + + storage = &m_MissingDerivativesStorage[m]; + m_MissingDerivatives.emplace_back( + other.m_MissingDerivatives[i].count(), + TMemoryMappedDoubleVector{storage, numberGradients}, + TMemoryMappedDoubleVector{storage + numberGradients, 
numberCurvatures}); + } + } + + CSplitDerivativesAccumulator(CSplitDerivativesAccumulator&&) = default; + + CSplitDerivativesAccumulator& + operator=(const CSplitDerivativesAccumulator& other) = delete; + CSplitDerivativesAccumulator& operator=(CSplitDerivativesAccumulator&&) = default; + + //! Compute the accumulation of both collections of per split derivatives. + void merge(const CSplitDerivativesAccumulator& other) { + for (std::size_t i = 0; i < m_Derivatives.size(); ++i) { + for (std::size_t j = 0; j < m_Derivatives[i].size(); ++j) { + m_Derivatives[i][j].merge(other.m_Derivatives[i][j]); + } + m_MissingDerivatives[i].merge(other.m_MissingDerivatives[i]); + } + } + + //! Add \p gradient and \p curvature to the accumulated derivatives for + //! the split \p split of feature \p feature. + void addDerivatives(std::size_t feature, + std::size_t split, + const TMemoryMappedFloatVector& gradient, + const TMemoryMappedFloatVector& curvature) { + m_Derivatives[feature][split].add(1, gradient, curvature); + } + + //! Add \p gradient and \p curvature to the accumulated derivatives for + //! missing values of \p feature. + void addMissingDerivatives(std::size_t feature, + const TMemoryMappedFloatVector& gradient, + const TMemoryMappedFloatVector& curvature) { + m_MissingDerivatives[feature].add(1, gradient, curvature); + } + + //! Read out the accumulated derivatives. + auto read() const { + TDerivativesVecVec derivatives; + derivatives.resize(m_Derivatives.size()); + for (std::size_t i = 0; i < m_Derivatives.size(); ++i) { + derivatives[i].reserve(m_Derivatives[i].size()); + for (const auto& derivative : m_Derivatives[i]) { + derivatives[i].emplace_back(derivative.count(), derivative.gradient(), + derivative.curvature()); + } + } + + TDerivativesVec missingDerivatives; + missingDerivatives.reserve(m_MissingDerivatives.size()); + for (const auto& derivative : m_MissingDerivatives) { + missingDerivatives.emplace_back(derivative.count(), derivative.gradient(), + derivative.curvature()); + } + + return std::make_pair(std::move(derivatives), std::move(missingDerivatives)); + } + + //! Get a checksum of this object. + std::uint64_t checksum(std::uint64_t seed = 0) const { + seed = CChecksum::calculate(seed, m_NumberLossParameters); + seed = CChecksum::calculate(seed, m_Derivatives); + seed = CChecksum::calculate(seed, m_MissingDerivatives); + return seed; + } + + private: + std::size_t m_NumberLossParameters; + TDerivativesAccumulatorVecVec m_Derivatives; + TDoubleVec m_DerivativesStorage; + TDerivativesAccumulatorVec m_MissingDerivatives; + TDoubleVec m_MissingDerivativesStorage; + }; public: CBoostedTreeLeafNodeStatistics(std::size_t id, @@ -134,13 +387,13 @@ class CBoostedTreeLeafNodeStatistics final { //! and \p numberSplitsPerFeature. static std::size_t estimateMemoryUsage(std::size_t numberRows, std::size_t numberCols, - std::size_t numberSplitsPerFeature); + std::size_t numberSplitsPerFeature, + std::size_t numberLossParameters); private: - using TDouble1Vec = core::CSmallVector; - //! \brief Statistics relating to a split of the node. - struct SSplitStatistics : private boost::less_than_comparable { + struct MATHS_EXPORT SSplitStatistics + : private boost::less_than_comparable { SSplitStatistics() = default; SSplitStatistics(double gain, double curvature, @@ -172,60 +425,6 @@ class CBoostedTreeLeafNodeStatistics final { bool s_AssignMissingToLeft = true; }; - //! \brief Aggregate derivatives. 
- struct SAggregateDerivatives { - void add(std::size_t count, const TDouble1Vec& gradient, const TDouble1Vec& curvature) { - s_Count += count; - s_Gradient += gradient[0]; - s_Curvature += curvature[0]; - } - - void merge(const SAggregateDerivatives& other) { - this->add(other.s_Count, {other.s_Gradient}, {other.s_Curvature}); - } - - std::string print() const { - std::ostringstream result; - result << "count = " << s_Count << ", gradient = " << s_Gradient - << ", curvature = " << s_Curvature; - return result.str(); - } - - std::size_t s_Count = 0; - double s_Gradient = 0.0; - double s_Curvature = 0.0; - }; - - using TAggregateDerivativesVec = std::vector; - using TAggregateDerivativesVecVec = std::vector; - - //! \brief A collection of aggregate derivatives for candidate feature splits. - struct SSplitAggregateDerivatives { - SSplitAggregateDerivatives(const TImmutableRadixSetVec& candidateSplits) - : s_Derivatives(candidateSplits.size()), - s_MissingDerivatives(candidateSplits.size()) { - for (std::size_t i = 0; i < candidateSplits.size(); ++i) { - s_Derivatives[i].resize(candidateSplits[i].size() + 1); - } - } - - void merge(const SSplitAggregateDerivatives& other) { - for (std::size_t i = 0; i < s_Derivatives.size(); ++i) { - for (std::size_t j = 0; j < s_Derivatives[i].size(); ++j) { - s_Derivatives[i][j].merge(other.s_Derivatives[i][j]); - } - s_MissingDerivatives[i].merge(other.s_MissingDerivatives[i]); - } - } - - auto move() { - return std::make_pair(std::move(s_Derivatives), std::move(s_MissingDerivatives)); - } - - TAggregateDerivativesVecVec s_Derivatives; - TAggregateDerivativesVec s_MissingDerivatives; - }; - private: void computeAggregateLossDerivatives(std::size_t numberThreads, const core::CDataFrame& frame, @@ -237,7 +436,7 @@ class CBoostedTreeLeafNodeStatistics final { const CBoostedTreeNode& split, const core::CPackedBitVector& parentRowMask); void addRowDerivatives(const CEncodedDataFrameRowRef& row, - SSplitAggregateDerivatives& splitAggregateDerivatives) const; + CSplitDerivativesAccumulator& splitDerivativesAccumulator) const; SSplitStatistics computeBestSplitStatistics(const TRegularization& regularization, const TSizeVec& featureBag) const; @@ -248,8 +447,8 @@ class CBoostedTreeLeafNodeStatistics final { std::size_t m_NumberLossParameters; const TImmutableRadixSetVec& m_CandidateSplits; core::CPackedBitVector m_RowMask; - TAggregateDerivativesVecVec m_Derivatives; - TAggregateDerivativesVec m_MissingDerivatives; + TDerivativesVecVec m_Derivatives; + TDerivativesVec m_MissingDerivatives; SSplitStatistics m_BestSplit; }; } diff --git a/include/maths/CBoostedTreeUtils.h b/include/maths/CBoostedTreeUtils.h index 57cbac7284..700ee07019 100644 --- a/include/maths/CBoostedTreeUtils.h +++ b/include/maths/CBoostedTreeUtils.h @@ -8,26 +8,27 @@ #define INCLUDED_ml_maths_CBoostedTreeUtils_h #include -#include + +#include +#include +#include #include #include namespace ml { namespace maths { +namespace boosted_tree { +class CLoss; +} namespace boosted_tree_detail { -using TDouble1Vec = core::CSmallVector; using TRowRef = core::CDataFrame::TRowRef; +using TMemoryMappedFloatVector = CMemoryMappedDenseVector; inline std::size_t lossHessianStoredSize(std::size_t numberLossParameters) { return numberLossParameters * (numberLossParameters + 1) / 2; } -inline std::size_t numberLossParametersForHessianStoredSize(std::size_t lossHessianStoredSize) { - return static_cast( - (std::sqrt(8.0 * static_cast(lossHessianStoredSize) + 1.0) - 1.0) / 2.0 + 0.5); -} - inline std::size_t 
predictionColumn(std::size_t numberInputColumns) { return numberInputColumns; } @@ -48,31 +49,50 @@ inline std::size_t exampleWeightColumn(std::size_t numberInputColumns, lossHessianStoredSize(numberLossParameters); } +//! Read the prediction from \p row. +MATHS_EXPORT +TMemoryMappedFloatVector readPrediction(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParamaters); + +//! Zero the prediction of \p row. +MATHS_EXPORT +void zeroPrediction(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParamaters); + +//! Read the loss gradient from \p row. MATHS_EXPORT -TDouble1Vec readPrediction(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParamaters); +TMemoryMappedFloatVector readLossGradient(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParameters); +//! Zero the loss gradient of \p row. MATHS_EXPORT -void writePrediction(const TRowRef& row, std::size_t numberInputColumns, const TDouble1Vec& prediction); +void zeroLossGradient(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters); +//! Write the loss gradient to \p row. MATHS_EXPORT -TDouble1Vec readLossGradient(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParameters); +void writeLossGradient(const TRowRef& row, + std::size_t numberInputColumns, + const boosted_tree::CLoss& loss, + const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0); MATHS_EXPORT -void writeLossGradient(const TRowRef& row, std::size_t numberInputColumns, const TDouble1Vec& gradient); +TMemoryMappedFloatVector readLossCurvature(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParameters); MATHS_EXPORT -TDouble1Vec readLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParameters); +void zeroLossCurvature(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters); MATHS_EXPORT void writeLossCurvature(const TRowRef& row, std::size_t numberInputColumns, - const TDouble1Vec& curvature); + const boosted_tree::CLoss& curvature, + const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0); MATHS_EXPORT double readExampleWeight(const TRowRef& row, diff --git a/include/maths/CLinearAlgebraEigen.h b/include/maths/CLinearAlgebraEigen.h index 0e296f52da..72b51319d8 100644 --- a/include/maths/CLinearAlgebraEigen.h +++ b/include/maths/CLinearAlgebraEigen.h @@ -370,6 +370,16 @@ class CMemoryMappedDenseMatrix } //@} + //! Get a checksum of this object. 
+ std::uint64_t checksum(std::uint64_t seed = 0) const { + for (std::ptrdiff_t i = 0; i < this->rows(); ++i) { + for (std::ptrdiff_t j = 0; j < this->rows(); ++j) { + seed = CChecksum::calculate(seed, (*this)(i, j)); + } + } + return seed; + } + private: void reseat(const CMemoryMappedDenseMatrix& other) { TBase* base{static_cast(this)}; @@ -668,4 +678,16 @@ CVector::CVector(const CDenseVectorInitializer& v) { } } +namespace Eigen { +template +struct ScalarBinaryOpTraits { + using ReturnType = double; +}; + +template +struct ScalarBinaryOpTraits { + using ReturnType = double; +}; +} + #endif // INCLUDED_ml_maths_CLinearAlgebraEigen_h diff --git a/include/maths/CLinearAlgebraShims.h b/include/maths/CLinearAlgebraShims.h index 70c961e902..914d128abf 100644 --- a/include/maths/CLinearAlgebraShims.h +++ b/include/maths/CLinearAlgebraShims.h @@ -335,37 +335,63 @@ outer(const CAnnotatedVector& x) { namespace las_detail { template -struct SEstimateMemoryUsage { +struct SEstimateVectorMemoryUsage { static std::size_t value(std::size_t) { return 0; } }; template -struct SEstimateMemoryUsage> { +struct SEstimateVectorMemoryUsage> { static std::size_t value(std::size_t dimension) { return dimension * sizeof(T); } }; template -struct SEstimateMemoryUsage> { +struct SEstimateVectorMemoryUsage> { static std::size_t value(std::size_t dimension) { // Ignore pad for alignment. return dimension * sizeof(SCALAR); } }; template -struct SEstimateMemoryUsage> { +struct SEstimateVectorMemoryUsage> { static std::size_t value(std::size_t dimension) { // Ignore any dynamic memory used by the annotation: we don't know how to // compute this here. It will be up to the calling code to estimate this // correctly. - return SEstimateMemoryUsage::value(dimension); + return SEstimateVectorMemoryUsage::value(dimension); + } +}; + +template +struct SEstimateMatrixMemoryUsage { + static std::size_t value(std::size_t, std::size_t) { return 0; } +}; +template +struct SEstimateMatrixMemoryUsage> { + static std::size_t value(std::size_t rows, std::size_t) { + return sizeof(T) * rows * (rows + 1) / 2; + } +}; +template +struct SEstimateMatrixMemoryUsage> { + static std::size_t value(std::size_t rows, std::size_t columns) { + // Ignore pad for alignment. + return sizeof(SCALAR) * rows * columns; } }; } -//! Estimate the amount of memory a point of type VECTOR and \p dimension will use. +//! Estimate the amount of memory a vector of type VECTOR and size \p dimension +//! will use. template std::size_t estimateMemoryUsage(std::size_t dimension) { - return las_detail::SEstimateMemoryUsage::value(dimension); + return las_detail::SEstimateVectorMemoryUsage::value(dimension); +} + +//! Estimate the amount of memory a matrix of type MATRIX and size \p rows by +//! \p columns will use. 
+template +std::size_t estimateMemoryUsage(std::size_t rows, std::size_t columns) { + return las_detail::SEstimateMatrixMemoryUsage::value(rows, columns); } } } diff --git a/lib/maths/CBoostedTree.cc b/lib/maths/CBoostedTree.cc index e6ccdabd69..a967808e3c 100644 --- a/lib/maths/CBoostedTree.cc +++ b/lib/maths/CBoostedTree.cc @@ -67,8 +67,8 @@ bool CArgMinMseImpl::nextPass() { return false; } -void CArgMinMseImpl::add(TDouble1Vec prediction, double actual, double weight) { - m_MeanError.add(actual - prediction[0], weight); +void CArgMinMseImpl::add(const TMemoryMappedFloatVector& prediction, double actual, double weight) { + m_MeanError.add(actual - prediction(0), weight); } void CArgMinMseImpl::merge(const CArgMinLossImpl& other) { @@ -78,7 +78,7 @@ void CArgMinMseImpl::merge(const CArgMinLossImpl& other) { } } -CArgMinMseImpl::TDouble1Vec CArgMinMseImpl::value() const { +CArgMinMseImpl::TDoubleVector CArgMinMseImpl::value() const { // We searching for the value x which minimises // @@ -89,12 +89,16 @@ CArgMinMseImpl::TDouble1Vec CArgMinMseImpl::value() const { // error m = 1/n sum_i{ a_i - p_i } we have x^* = n / (n + lambda) m. double count{CBasicStatistics::count(m_MeanError)}; - return {count == 0.0 ? 0.0 : count / (count + this->lambda()) * CBasicStatistics::mean(m_MeanError)}; + double meanError{CBasicStatistics::mean(m_MeanError)}; + + TDoubleVector result(1); + result(0) = count == 0.0 ? 0.0 : count / (count + this->lambda()) * meanError; + return result; } CArgMinLogisticImpl::CArgMinLogisticImpl(double lambda) : CArgMinLossImpl{lambda}, m_CategoryCounts{0}, - m_BucketCategoryCounts(128, TVector{0.0}) { + m_BucketCategoryCounts(128, TDoubleVector2x1{0.0}) { } std::unique_ptr CArgMinLogisticImpl::clone() const { @@ -106,15 +110,17 @@ bool CArgMinLogisticImpl::nextPass() { return m_CurrentPass < 2; } -void CArgMinLogisticImpl::add(TDouble1Vec prediction, double actual, double weight) { +void CArgMinLogisticImpl::add(const TMemoryMappedFloatVector& prediction, + double actual, + double weight) { switch (m_CurrentPass) { case 0: { - m_PredictionMinMax.add(prediction[0]); + m_PredictionMinMax.add(prediction(0)); m_CategoryCounts(static_cast(actual)) += weight; break; } case 1: { - auto& count = m_BucketCategoryCounts[this->bucket(prediction[0])]; + auto& count = m_BucketCategoryCounts[this->bucket(prediction(0))]; count(static_cast(actual)) += weight; break; } @@ -142,7 +148,7 @@ void CArgMinLogisticImpl::merge(const CArgMinLossImpl& other) { } } -CArgMinLogisticImpl::TDouble1Vec CArgMinLogisticImpl::value() const { +CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const { std::function objective; double minWeight; @@ -190,8 +196,11 @@ CArgMinLogisticImpl::TDouble1Vec CArgMinLogisticImpl::value() const { maxWeight = -m_PredictionMinMax.min() + 5.0; } + TDoubleVector result(1); + if (minWeight == maxWeight) { - return {minWeight}; + result(0) = minWeight; + return result; } double minimum; @@ -201,7 +210,8 @@ CArgMinLogisticImpl::TDouble1Vec CArgMinLogisticImpl::value() const { objective, 1e-3, maxIterations, minimum, objectiveAtMinimum); LOG_TRACE(<< "minimum = " << minimum << " objective(minimum) = " << objectiveAtMinimum); - return {minimum}; + result(0) = minimum; + return result; } } @@ -223,7 +233,7 @@ bool CArgMinLoss::nextPass() const { return m_Impl->nextPass(); } -void CArgMinLoss::add(TDouble1Vec prediction, double actual, double weight) { +void CArgMinLoss::add(const TMemoryMappedFloatVector& prediction, double actual, double weight) { return 
m_Impl->add(prediction, actual, weight); } @@ -231,7 +241,7 @@ void CArgMinLoss::merge(CArgMinLoss& other) { return m_Impl->merge(*other.m_Impl); } -CArgMinLoss::TDouble1Vec CArgMinLoss::value() const { +CArgMinLoss::TDoubleVector CArgMinLoss::value() const { return m_Impl->value(); } @@ -250,25 +260,30 @@ std::size_t CMse::numberParameters() const { return 1; } -double CMse::value(TDouble1Vec prediction, double actual, double weight) const { - return weight * CTools::pow2(prediction[0] - actual); +double CMse::value(const TMemoryMappedFloatVector& prediction, double actual, double weight) const { + return weight * CTools::pow2(prediction(0) - actual); } -CMse::TDouble1Vec CMse::gradient(TDouble1Vec prediction, double actual, double weight) const { - return {2.0 * weight * (prediction[0] - actual)}; +void CMse::gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight) const { + writer(0, 2.0 * weight * (prediction(0) - actual)); } -CMse::TDouble1Vec -CMse::curvature(TDouble1Vec /*prediction*/, double /*actual*/, double weight) const { - return {2.0 * weight}; +void CMse::curvature(const TMemoryMappedFloatVector& /*prediction*/, + double /*actual*/, + TWriter writer, + double weight) const { + writer(0, 2.0 * weight); } bool CMse::isCurvatureConstant() const { return true; } -CMse::TDouble1Vec CMse::transform(TDouble1Vec prediction) const { - return prediction; +CMse::TDoubleVector CMse::transform(const TMemoryMappedFloatVector& prediction) const { + return TDoubleVector{prediction}; } CArgMinLoss CMse::minimizer(double lambda) const { @@ -289,35 +304,43 @@ std::size_t CBinomialLogistic::numberParameters() const { return 1; } -double CBinomialLogistic::value(TDouble1Vec prediction, double actual, double weight) const { - // Cross entropy - return -weight * ((1.0 - actual) * logOneMinusLogistic(prediction[0]) + - actual * logLogistic(prediction[0])); +double CBinomialLogistic::value(const TMemoryMappedFloatVector& prediction, + double actual, + double weight) const { + return -weight * ((1.0 - actual) * logOneMinusLogistic(prediction(0)) + + actual * logLogistic(prediction(0))); } -CBinomialLogistic::TDouble1Vec -CBinomialLogistic::gradient(TDouble1Vec prediction, double actual, double weight) const { - if (prediction[0] > -LOG_EPSILON && actual == 1.0) { - return {-weight * std::exp(-prediction[0])}; +void CBinomialLogistic::gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight) const { + if (prediction(0) > -LOG_EPSILON && actual == 1.0) { + writer(0, -weight * std::exp(-prediction(0))); } - return {weight * CTools::logisticFunction(prediction[0]) - actual}; + writer(0, weight * (CTools::logisticFunction(prediction(0)) - actual)); } -CBinomialLogistic::TDouble1Vec -CBinomialLogistic::curvature(TDouble1Vec prediction, double /*actual*/, double weight) const { - if (prediction[0] > -LOG_EPSILON) { - return {weight * std::exp(-prediction[0])}; +void CBinomialLogistic::curvature(const TMemoryMappedFloatVector& prediction, + double /*actual*/, + TWriter writer, + double weight) const { + if (prediction(0) > -LOG_EPSILON) { + writer(0, weight * std::exp(-prediction(0))); } - double probability{CTools::logisticFunction(prediction[0])}; - return {weight * probability * (1.0 - probability)}; + double probability{CTools::logisticFunction(prediction(0))}; + writer(0, weight * probability * (1.0 - probability)); } bool CBinomialLogistic::isCurvatureConstant() const { return false; } 
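
[Illustrative note, not part of the patch] Several pieces of this change (the SDerivatives constructor, lossHessianStoredSize, readLossCurvature and the new trace helper in CBoostedTreeImpl.cc) all rely on the same storage convention: for n loss parameters the Hessian is symmetric, so only its upper triangle is kept, flattened row major into n * (n + 1) / 2 values. A minimal standalone sketch of that index mapping, using hypothetical pack/unpack helpers (not names from this patch), is:

#include <cstddef>
#include <vector>

// Number of values needed to store the upper triangle of an n x n symmetric
// matrix (mirrors lossHessianStoredSize in this patch).
std::size_t upperTriangleSize(std::size_t n) {
    return n * (n + 1) / 2;
}

// Flatten the upper triangle of a symmetric matrix row major.
std::vector<double> packUpperTriangle(const std::vector<std::vector<double>>& h) {
    std::size_t n{h.size()};
    std::vector<double> flat;
    flat.reserve(upperTriangleSize(n));
    for (std::size_t i = 0; i < n; ++i) {
        for (std::size_t j = i; j < n; ++j) {
            flat.push_back(h[i][j]);
        }
    }
    return flat;
}

// Rebuild the full symmetric matrix from its flattened upper triangle. This is
// the same loop structure used by the SDerivatives constructor in this patch.
std::vector<std::vector<double>> unpackUpperTriangle(const std::vector<double>& flat, std::size_t n) {
    std::vector<std::vector<double>> h(n, std::vector<double>(n, 0.0));
    for (std::size_t i = 0, k = 0; i < n; ++i) {
        for (std::size_t j = i; j < n; ++j, ++k) {
            h[i][j] = h[j][i] = flat[k];
        }
    }
    return h;
}

For example, with n = 2 the three stored values are h(0,0), h(0,1), h(1,1); the trace helper added to CBoostedTreeImpl.cc below reads the diagonal of this layout by visiting flat indices 0 and 2.
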
-CBinomialLogistic::TDouble1Vec CBinomialLogistic::transform(TDouble1Vec prediction) const { - return {CTools::logisticFunction(prediction[0])}; +CBinomialLogistic::TDoubleVector +CBinomialLogistic::transform(const TMemoryMappedFloatVector& prediction) const { + TDoubleVector result{prediction}; + result(0) = CTools::logisticFunction(result(0)); + return result; } CArgMinLoss CBinomialLogistic::minimizer(double lambda) const { diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 323e78d2eb..437ef27019 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -39,11 +38,6 @@ namespace { // by only refreshing once every MINIMUM_SPLIT_REFRESH_INTERVAL trees we add. const double MINIMUM_SPLIT_REFRESH_INTERVAL{3.0}; -double lossAtNSigma(double n, const TMeanVarAccumulator& lossMoments) { - return CBasicStatistics::mean(lossMoments) + - n * std::sqrt(CBasicStatistics::variance(lossMoments)); -} - //! \brief Record the memory used by a supplied object using the RAII idiom. class CScopeRecordMemoryUsage { public: @@ -124,6 +118,21 @@ class CTrainForestStoppingCondition { std::size_t m_MaximumNumberTreesWithoutImprovement; TDoubleSizePrMinAccumulator m_BestTestLoss; }; + +double lossAtNSigma(double n, const TMeanVarAccumulator& lossMoments) { + return CBasicStatistics::mean(lossMoments) + + n * std::sqrt(CBasicStatistics::variance(lossMoments)); +} + +double trace(std::size_t columns, const TMemoryMappedFloatVector& upperTriangle) { + // This assumes the upper triangle of the matrix is stored row major. + double result{0.0}; + for (int i = 0, j = static_cast(columns); + i < upperTriangle.size() && j > 0; i += j, --j) { + result += upperTriangle(i); + } + return result; +} } CBoostedTreeImpl::CBoostedTreeImpl(std::size_t numberThreads, @@ -273,9 +282,10 @@ void CBoostedTreeImpl::predict(core::CDataFrame& frame) const { bool successful; std::tie(std::ignore, successful) = frame.writeColumns( m_NumberThreads, 0, frame.numberRows(), [&](TRowItr beginRows, TRowItr endRows) { + std::size_t numberLossParameters{m_Loss->numberParameters()}; for (auto row = beginRows; row != endRows; ++row) { - writePrediction(*row, m_NumberInputColumns, - {predictRow(m_Encoder->encode(*row), m_BestForest)}); + auto prediction = readPrediction(*row, m_NumberInputColumns, numberLossParameters); + prediction(0) = predictRow(m_Encoder->encode(*row), m_BestForest); } }); if (successful == false) { @@ -298,7 +308,8 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, std::size_t hyperparametersMemoryUsage{numberColumns * sizeof(double)}; std::size_t leafNodeStatisticsMemoryUsage{ maximumNumberLeaves * CBoostedTreeLeafNodeStatistics::estimateMemoryUsage( - numberRows, numberColumns, m_NumberSplitsPerFeature)}; + numberRows, numberColumns, m_NumberSplitsPerFeature, + m_Loss->numberParameters())}; std::size_t dataTypeMemoryUsage{numberColumns * sizeof(CDataFrameUtils::SDataType)}; std::size_t featureSampleProbabilities{numberColumns * sizeof(double)}; std::size_t missingFeatureMaskMemoryUsage{ @@ -440,12 +451,10 @@ CBoostedTreeImpl::TNodeVec CBoostedTreeImpl::initializePredictionsAndLossDerivat m_NumberThreads, 0, frame.numberRows(), [this](TRowItr beginRows, TRowItr endRows) { std::size_t numberLossParameters{m_Loss->numberParameters()}; - TDouble1Vec zero(numberLossParameters, 0.0); - TDouble1Vec zeroHessian(lossHessianStoredSize(numberLossParameters), 0.0); for (auto row = 
beginRows; row != endRows; ++row) { - writePrediction(*row, m_NumberInputColumns, zero); - writeLossGradient(*row, m_NumberInputColumns, zero); - writeLossCurvature(*row, m_NumberInputColumns, zeroHessian); + zeroPrediction(*row, m_NumberInputColumns, numberLossParameters); + zeroLossGradient(*row, m_NumberInputColumns, numberLossParameters); + zeroLossCurvature(*row, m_NumberInputColumns, numberLossParameters); } }, &updateRowMask); @@ -595,8 +604,9 @@ CBoostedTreeImpl::candidateSplits(const core::CDataFrame& frame, m_Encoder.get(), [this](const TRowRef& row) { // TODO Think about what scalar measure of the Hessian to use here? - return readLossCurvature(row, m_NumberInputColumns, - m_Loss->numberParameters())[0]; + std::size_t numberLossParameters{m_Loss->numberParameters()}; + return trace(numberLossParameters, + readLossCurvature(row, m_NumberInputColumns, numberLossParameters)); }) .first; @@ -949,14 +959,13 @@ void CBoostedTreeImpl::refreshPredictionsAndLossDerivatives(core::CDataFrame& fr std::size_t numberLossParameters{m_Loss->numberParameters()}; for (auto row = beginRows; row != endRows; ++row) { auto prediction = readPrediction(*row, m_NumberInputColumns, numberLossParameters); - prediction[0] += root(tree).value(m_Encoder->encode(*row), tree); double actual{readActual(*row, m_DependentVariable)}; double weight{readExampleWeight(*row, m_NumberInputColumns, numberLossParameters)}; - writePrediction(*row, m_NumberInputColumns, prediction); - writeLossGradient(*row, m_NumberInputColumns, - m_Loss->gradient(prediction, actual, weight)); - writeLossCurvature(*row, m_NumberInputColumns, - m_Loss->curvature(prediction, actual, weight)); + prediction(0) += root(tree).value(m_Encoder->encode(*row), tree); + writeLossGradient(*row, m_NumberInputColumns, *m_Loss, + prediction, actual, weight); + writeLossCurvature(*row, m_NumberInputColumns, *m_Loss, + prediction, actual, weight); } }, &updateRowMask); diff --git a/lib/maths/CBoostedTreeLeafNodeStatistics.cc b/lib/maths/CBoostedTreeLeafNodeStatistics.cc index 9bab4402ca..ddbbd942df 100644 --- a/lib/maths/CBoostedTreeLeafNodeStatistics.cc +++ b/lib/maths/CBoostedTreeLeafNodeStatistics.cc @@ -15,6 +15,8 @@ #include #include +#include + namespace ml { namespace maths { using namespace boosted_tree_detail; @@ -79,11 +81,11 @@ CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( m_CandidateSplits{sibling.m_CandidateSplits}, m_RowMask{std::move(rowMask)} { m_Derivatives.resize(m_CandidateSplits.size()); - m_MissingDerivatives.resize(m_CandidateSplits.size()); + m_MissingDerivatives.reserve(m_CandidateSplits.size()); for (std::size_t i = 0; i < m_CandidateSplits.size(); ++i) { std::size_t numberSplits{m_CandidateSplits[i].size() + 1}; - m_Derivatives[i].resize(numberSplits); + m_Derivatives[i].reserve(numberSplits); for (std::size_t j = 0; j < numberSplits; ++j) { // Numeric errors mean that it's possible the sum curvature for a candidate // split is identically zero while the gradient is epsilon. 
This can cause @@ -97,29 +99,44 @@ CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( std::size_t count{parent.m_Derivatives[i][j].s_Count - sibling.m_Derivatives[i][j].s_Count}; if (count > 0) { - double gradient{parent.m_Derivatives[i][j].s_Gradient - - sibling.m_Derivatives[i][j].s_Gradient}; - double curvature{parent.m_Derivatives[i][j].s_Curvature - - sibling.m_Derivatives[i][j].s_Curvature}; - curvature = std::max(curvature, SMALLEST_RELATIVE_CURVATURE * - std::fabs(gradient)); - m_Derivatives[i][j] = SAggregateDerivatives{count, gradient, curvature}; + TDoubleVector gradient{parent.m_Derivatives[i][j].s_Gradient - + sibling.m_Derivatives[i][j].s_Gradient}; + TDoubleMatrix curvature{parent.m_Derivatives[i][j].s_Curvature - + sibling.m_Derivatives[i][j].s_Curvature}; + for (int k = 0; k < gradient.size(); ++k) { + curvature(k, k) = + std::max(curvature(k, k), SMALLEST_RELATIVE_CURVATURE * + std::fabs(gradient(k))); + } + m_Derivatives[i].emplace_back(count, std::move(gradient), + std::move(curvature)); + } else { + m_Derivatives[i].emplace_back( + 0, TDoubleVector{TDoubleVector::Zero(m_NumberLossParameters)}, + TDoubleMatrix{TDoubleMatrix::Zero(m_NumberLossParameters, + m_NumberLossParameters)}); } } std::size_t count{parent.m_MissingDerivatives[i].s_Count - sibling.m_MissingDerivatives[i].s_Count}; if (count > 0) { - double gradient{parent.m_MissingDerivatives[i].s_Gradient - - sibling.m_MissingDerivatives[i].s_Gradient}; - double curvature{parent.m_MissingDerivatives[i].s_Curvature - - sibling.m_MissingDerivatives[i].s_Curvature}; - curvature = std::max(curvature, SMALLEST_RELATIVE_CURVATURE * std::fabs(gradient)); - m_MissingDerivatives[i] = SAggregateDerivatives{count, gradient, curvature}; + TDoubleVector gradient{parent.m_MissingDerivatives[i].s_Gradient - + sibling.m_MissingDerivatives[i].s_Gradient}; + TDoubleVector curvature{parent.m_MissingDerivatives[i].s_Curvature - + sibling.m_MissingDerivatives[i].s_Curvature}; + for (int k = 0; k < gradient.size(); ++k) { + curvature(k, k) = + std::max(curvature(k, k), + SMALLEST_RELATIVE_CURVATURE * std::fabs(gradient(k))); + } + m_MissingDerivatives.emplace_back(count, std::move(gradient), + std::move(curvature)); + } else { + m_MissingDerivatives.emplace_back( + 0, TDoubleVector{TDoubleVector::Zero(m_NumberLossParameters)}, + TDoubleMatrix{TDoubleMatrix::Zero(m_NumberLossParameters, m_NumberLossParameters)}); } } - LOG_TRACE(<< "derivatives = " << core::CContainerPrinter::print(m_Derivatives)); - LOG_TRACE(<< "missing derivatives = " - << core::CContainerPrinter::print(m_MissingDerivatives)); m_BestSplit = this->computeBestSplitStatistics(regularization, featureBag); } @@ -201,17 +218,20 @@ std::size_t CBoostedTreeLeafNodeStatistics::memoryUsage() const { return mem; } -std::size_t CBoostedTreeLeafNodeStatistics::estimateMemoryUsage(std::size_t numberRows, - std::size_t numberCols, - std::size_t numberSplitsPerFeature) { +std::size_t +CBoostedTreeLeafNodeStatistics::estimateMemoryUsage(std::size_t numberRows, + std::size_t numberCols, + std::size_t numberSplitsPerFeature, + std::size_t numberLossParameters) { // We will typically get the close to the best compression for most of the // leaves when the set of splits becomes large, corresponding to the worst // case for memory usage. This is because the rows will be spread over many // rows so the masks will mainly contain 0 bits in this case. 
std::size_t rowMaskSize{numberRows / PACKED_BIT_VECTOR_MAXIMUM_ROWS_PER_BYTE}; std::size_t derivativesSize{(numberCols - 1) * numberSplitsPerFeature * - sizeof(SAggregateDerivatives)}; - std::size_t missingDerivativesSize{(numberCols - 1) * sizeof(SAggregateDerivatives)}; + SDerivatives::estimateMemoryUsage(numberLossParameters)}; + std::size_t missingDerivativesSize{ + (numberCols - 1) * SDerivatives::estimateMemoryUsage(numberLossParameters)}; return sizeof(CBoostedTreeLeafNodeStatistics) + rowMaskSize + derivativesSize + missingDerivativesSize; } @@ -224,26 +244,22 @@ void CBoostedTreeLeafNodeStatistics::computeAggregateLossDerivatives( auto result = frame.readRows( numberThreads, 0, frame.numberRows(), core::bindRetrievableState( - [&](SSplitAggregateDerivatives& splitAggregateDerivatives, + [&](CSplitDerivativesAccumulator& splitDerivativesAccumulator, TRowItr beginRows, TRowItr endRows) { for (auto row = beginRows; row != endRows; ++row) { - this->addRowDerivatives(encoder.encode(*row), splitAggregateDerivatives); + this->addRowDerivatives(encoder.encode(*row), splitDerivativesAccumulator); } }, - SSplitAggregateDerivatives{m_CandidateSplits}), + CSplitDerivativesAccumulator{m_CandidateSplits, m_NumberLossParameters}), &m_RowMask); auto& state = result.first; - SSplitAggregateDerivatives derivatives{std::move(state[0].s_FunctionState)}; + CSplitDerivativesAccumulator accumulatedDerivatives{std::move(state[0].s_FunctionState)}; for (std::size_t i = 1; i < state.size(); ++i) { - derivatives.merge(state[i].s_FunctionState); + accumulatedDerivatives.merge(state[i].s_FunctionState); } - std::tie(m_Derivatives, m_MissingDerivatives) = derivatives.move(); - - LOG_TRACE(<< "derivatives = " << core::CContainerPrinter::print(m_Derivatives)); - LOG_TRACE(<< "missing derivatives = " - << core::CContainerPrinter::print(m_MissingDerivatives)); + std::tie(m_Derivatives, m_MissingDerivatives) = accumulatedDerivatives.read(); } void CBoostedTreeLeafNodeStatistics::computeRowMaskAndAggregateLossDerivatives( @@ -257,21 +273,22 @@ void CBoostedTreeLeafNodeStatistics::computeRowMaskAndAggregateLossDerivatives( auto result = frame.readRows( numberThreads, 0, frame.numberRows(), core::bindRetrievableState( - [&](std::pair& state, + [&](std::pair& state, TRowItr beginRows, TRowItr endRows) { auto& mask = state.first; - auto& splitAggregateDerivatives = state.second; + auto& splitDerivativesAccumulator = state.second; for (auto row = beginRows; row != endRows; ++row) { auto encodedRow = encoder.encode(*row); if (split.assignToLeft(encodedRow) == isLeftChild) { std::size_t index{row->index()}; mask.extend(false, index - mask.size()); mask.extend(true); - this->addRowDerivatives(encodedRow, splitAggregateDerivatives); + this->addRowDerivatives(encodedRow, splitDerivativesAccumulator); } } }, - std::make_pair(core::CPackedBitVector{}, SSplitAggregateDerivatives{m_CandidateSplits})), + std::make_pair(core::CPackedBitVector{}, + CSplitDerivativesAccumulator{m_CandidateSplits, m_NumberLossParameters})), &parentRowMask); auto& state = result.first; @@ -281,23 +298,18 @@ void CBoostedTreeLeafNodeStatistics::computeRowMaskAndAggregateLossDerivatives( } m_RowMask = std::move(state[0].s_FunctionState.first); - SSplitAggregateDerivatives derivatives{std::move(state[0].s_FunctionState.second)}; + CSplitDerivativesAccumulator derivatives{std::move(state[0].s_FunctionState.second)}; for (std::size_t i = 1; i < state.size(); ++i) { m_RowMask |= state[i].s_FunctionState.first; 
derivatives.merge(state[i].s_FunctionState.second); } - std::tie(m_Derivatives, m_MissingDerivatives) = derivatives.move(); - - LOG_TRACE(<< "row mask = " << m_RowMask); - LOG_TRACE(<< "derivatives = " << core::CContainerPrinter::print(m_Derivatives)); - LOG_TRACE(<< "missing derivatives = " - << core::CContainerPrinter::print(m_MissingDerivatives)); + std::tie(m_Derivatives, m_MissingDerivatives) = derivatives.read(); } void CBoostedTreeLeafNodeStatistics::addRowDerivatives( const CEncodedDataFrameRowRef& row, - SSplitAggregateDerivatives& splitAggregateDerivatives) const { + CSplitDerivativesAccumulator& splitAggregateDerivatives) const { const TRowRef& unencodedRow{row.unencodedRow()}; auto gradient = readLossGradient(unencodedRow, m_NumberInputColumns, m_NumberLossParameters); @@ -306,10 +318,10 @@ void CBoostedTreeLeafNodeStatistics::addRowDerivatives( for (std::size_t i = 0; i < m_CandidateSplits.size(); ++i) { double featureValue{row[i]}; if (CDataFrameUtils::isMissing(featureValue)) { - splitAggregateDerivatives.s_MissingDerivatives[i].add(1, gradient, curvature); + splitAggregateDerivatives.addMissingDerivatives(i, gradient, curvature); } else { std::ptrdiff_t j{m_CandidateSplits[i].upperBound(featureValue)}; - splitAggregateDerivatives.s_Derivatives[i][j].add(1, gradient, curvature); + splitAggregateDerivatives.addDerivatives(i, j, gradient, curvature); } } } @@ -325,18 +337,58 @@ CBoostedTreeLeafNodeStatistics::computeBestSplitStatistics(const TRegularization SSplitStatistics result; + // We seek to find the value at the minimum of the quadratic expansion of the + // regularized loss function. For a given leaf this expansion is + // + // L(w) = 1/2 w^t H(\lambda) w + g^t w + // + // where H(\lambda) = \sum_i H_i + \lambda I, g = \sum_i g_i and w is the leaf's + // weight. Here, g_i and H_i denote an example's loss gradient and Hessian and i + // ranges over the examples in the leaf. Completing the square and noting that + // the minimum is given when the quadratic form is zero we have + // + // L(w^*) = -1/2 g^t H(\lambda) g + // + // where w^* = arg\min_w{ L(w) }. + + using TMinimumLoss = + std::function; + + TMinimumLoss minimumLoss; + + double lambda{regularization.leafWeightPenaltyMultiplier()}; + if (m_NumberLossParameters == 1) { + minimumLoss = [lambda](const TDoubleVector& g, const TDoubleMatrix& h) -> double { + return CTools::pow2(g(0)) / (h(0, 0) + lambda); + }; + } else { + // TODO this turns out to be extremely expensive even when m_NumberLossParameters + // is one. I'm not sure why yet. Maybe solving in-place would help. 
+ minimumLoss = [lambda](const TDoubleVector& g, const TDoubleMatrix& h) -> double { + return g.transpose() * + (h + lambda * TDoubleMatrix::Identity(h.rows(), h.cols())) + .selfadjointView() + .ldlt() + .solve(g); + }; + } + for (auto i : featureBag) { std::size_t c{m_MissingDerivatives[i].s_Count}; - double g{m_MissingDerivatives[i].s_Gradient}; - double h{m_MissingDerivatives[i].s_Curvature}; + TDoubleVector g{m_MissingDerivatives[i].s_Gradient}; + TDoubleMatrix h{m_MissingDerivatives[i].s_Curvature}; for (const auto& derivatives : m_Derivatives[i]) { c += derivatives.s_Count; g += derivatives.s_Gradient; h += derivatives.s_Curvature; } std::size_t cl[]{m_MissingDerivatives[i].s_Count, 0}; - double gl[]{m_MissingDerivatives[i].s_Gradient, 0.0}; - double hl[]{m_MissingDerivatives[i].s_Curvature, 0.0}; + TDoubleVector gl[]{m_MissingDerivatives[i].s_Gradient, + TDoubleVector::Zero(g.rows())}; + TDoubleMatrix hl[]{m_MissingDerivatives[i].s_Curvature, + TDoubleMatrix::Zero(h.rows(), h.cols())}; + TDoubleVector gr[]{g - m_MissingDerivatives[i].s_Gradient, g}; + TDoubleMatrix hr[]{h - m_MissingDerivatives[i].s_Curvature, h}; double maximumGain{-INF}; double splitAt{-INF}; @@ -344,25 +396,26 @@ CBoostedTreeLeafNodeStatistics::computeBestSplitStatistics(const TRegularization bool assignMissingToLeft{true}; for (std::size_t j = 0; j + 1 < m_Derivatives[i].size(); ++j) { + cl[ASSIGN_MISSING_TO_LEFT] += m_Derivatives[i][j].s_Count; gl[ASSIGN_MISSING_TO_LEFT] += m_Derivatives[i][j].s_Gradient; + gr[ASSIGN_MISSING_TO_LEFT] -= m_Derivatives[i][j].s_Gradient; hl[ASSIGN_MISSING_TO_LEFT] += m_Derivatives[i][j].s_Curvature; + hr[ASSIGN_MISSING_TO_LEFT] -= m_Derivatives[i][j].s_Curvature; + cl[ASSIGN_MISSING_TO_RIGHT] += m_Derivatives[i][j].s_Count; gl[ASSIGN_MISSING_TO_RIGHT] += m_Derivatives[i][j].s_Gradient; + gr[ASSIGN_MISSING_TO_RIGHT] -= m_Derivatives[i][j].s_Gradient; hl[ASSIGN_MISSING_TO_RIGHT] += m_Derivatives[i][j].s_Curvature; + hr[ASSIGN_MISSING_TO_RIGHT] -= m_Derivatives[i][j].s_Curvature; - double gain[]{CTools::pow2(gl[ASSIGN_MISSING_TO_LEFT]) / - (hl[ASSIGN_MISSING_TO_LEFT] + - regularization.leafWeightPenaltyMultiplier()) + - CTools::pow2(g - gl[ASSIGN_MISSING_TO_LEFT]) / - (h - hl[ASSIGN_MISSING_TO_LEFT] + - regularization.leafWeightPenaltyMultiplier()), - CTools::pow2(gl[ASSIGN_MISSING_TO_RIGHT]) / - (hl[ASSIGN_MISSING_TO_RIGHT] + - regularization.leafWeightPenaltyMultiplier()) + - CTools::pow2(g - gl[ASSIGN_MISSING_TO_RIGHT]) / - (h - hl[ASSIGN_MISSING_TO_RIGHT] + - regularization.leafWeightPenaltyMultiplier())}; + double gain[2]; + gain[ASSIGN_MISSING_TO_LEFT] = + minimumLoss(gl[ASSIGN_MISSING_TO_LEFT], hl[ASSIGN_MISSING_TO_LEFT]) + + minimumLoss(gr[ASSIGN_MISSING_TO_LEFT], hr[ASSIGN_MISSING_TO_LEFT]); + gain[ASSIGN_MISSING_TO_RIGHT] = + minimumLoss(gl[ASSIGN_MISSING_TO_RIGHT], hl[ASSIGN_MISSING_TO_RIGHT]) + + minimumLoss(gr[ASSIGN_MISSING_TO_RIGHT], hr[ASSIGN_MISSING_TO_RIGHT]); if (gain[ASSIGN_MISSING_TO_LEFT] > maximumGain) { maximumGain = gain[ASSIGN_MISSING_TO_LEFT]; @@ -380,13 +433,17 @@ CBoostedTreeLeafNodeStatistics::computeBestSplitStatistics(const TRegularization double penaltyForDepth{regularization.penaltyForDepth(m_Depth)}; double penaltyForDepthPlusOne{regularization.penaltyForDepth(m_Depth + 1)}; - double gain{0.5 * (maximumGain - CTools::pow2(g) / (h + regularization.leafWeightPenaltyMultiplier())) - + double gain{0.5 * (maximumGain - minimumLoss(g, h)) - regularization.treeSizePenaltyMultiplier() - regularization.depthPenaltyMultiplier() * (2.0 * penaltyForDepthPlusOne 
- penaltyForDepth)}; - SSplitStatistics candidate{ - gain, h, i, splitAt, leftChildHasFewerRows, assignMissingToLeft}; + SSplitStatistics candidate{gain, + h.trace() / static_cast(m_NumberLossParameters), + i, + splitAt, + leftChildHasFewerRows, + assignMissingToLeft}; LOG_TRACE(<< "candidate split: " << candidate.print()); if (candidate > result) { diff --git a/lib/maths/CBoostedTreeUtils.cc b/lib/maths/CBoostedTreeUtils.cc index f204231e65..e138ca6b3d 100644 --- a/lib/maths/CBoostedTreeUtils.cc +++ b/lib/maths/CBoostedTreeUtils.cc @@ -6,57 +6,88 @@ #include +#include + namespace ml { namespace maths { namespace boosted_tree_detail { +using namespace boosted_tree; -TDouble1Vec readPrediction(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParamaters) { - const auto* start = row.data() + predictionColumn(numberInputColumns); - return TDouble1Vec(start, start + numberLossParamaters); +TMemoryMappedFloatVector readPrediction(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParamaters) { + return {row.data() + predictionColumn(numberInputColumns), + static_cast(numberLossParamaters)}; } -void writePrediction(const TRowRef& row, std::size_t numberInputColumns, const TDouble1Vec& prediction) { +void zeroPrediction(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParamaters) { std::size_t offset{predictionColumn(numberInputColumns)}; - for (std::size_t i = 0; i < prediction.size(); ++i) { - row.writeColumn(offset + i, prediction[i]); + for (std::size_t i = 0; i < numberLossParamaters; ++i) { + row.writeColumn(offset + i, 0.0); } } -TDouble1Vec readLossGradient(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParameters) { - const auto* start = row.data() + lossGradientColumn(numberInputColumns, numberLossParameters); - return TDouble1Vec(start, start + numberLossParameters); +TMemoryMappedFloatVector readLossGradient(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParameters) { + return {row.data() + lossGradientColumn(numberInputColumns, numberLossParameters), + static_cast(numberLossParameters)}; } -void writeLossGradient(const TRowRef& row, std::size_t numberInputColumns, const TDouble1Vec& gradient) { - std::size_t offset{lossGradientColumn(numberInputColumns, gradient.size())}; - for (std::size_t i = 0; i < gradient.size(); ++i) { - row.writeColumn(offset + i, gradient[i]); +void zeroLossGradient(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters) { + std::size_t offset{lossGradientColumn(numberInputColumns, numberLossParameters)}; + for (std::size_t i = 0; i < numberLossParameters; ++i) { + row.writeColumn(offset + i, 0.0); } } -TDouble1Vec readLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParameters) { - const auto* start = row.data() + lossCurvatureColumn(numberInputColumns, numberLossParameters); - return TDouble1Vec(start, start + lossHessianStoredSize(numberLossParameters)); +void writeLossGradient(const TRowRef& row, + std::size_t numberInputColumns, + const CLoss& loss, + const TMemoryMappedFloatVector& prediction, + double actual, + double weight) { + std::size_t offset{lossGradientColumn(numberInputColumns, prediction.size())}; + auto writer = [&row, offset](std::size_t i, double value) { + row.writeColumn(offset + i, value); + }; + // We wrap the writer in another lambda which we know takes advantage + // of std::function small size optimization to avoid heap 
allocations. + loss.gradient(prediction, actual, + [&writer](std::size_t i, double value) { writer(i, value); }, weight); } -void writeLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, - const TDouble1Vec& curvature) { - // This comes from solving x(x+1)/2 = n. - std::size_t numberLossParameters{ - numberLossParametersForHessianStoredSize(curvature.size())}; +TMemoryMappedFloatVector readLossCurvature(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParameters) { + return {row.data() + lossCurvatureColumn(numberInputColumns, numberLossParameters), + static_cast(lossHessianStoredSize(numberLossParameters))}; +} + +void zeroLossCurvature(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters) { std::size_t offset{lossCurvatureColumn(numberInputColumns, numberLossParameters)}; - for (std::size_t i = 0; i < curvature.size(); ++i) { - row.writeColumn(offset + i, curvature[i]); + for (std::size_t i = 0, size = lossHessianStoredSize(numberLossParameters); + i < size; ++i) { + row.writeColumn(offset + i, 0.0); } } +void writeLossCurvature(const TRowRef& row, + std::size_t numberInputColumns, + const CLoss& loss, + const TMemoryMappedFloatVector& prediction, + double actual, + double weight) { + std::size_t offset{lossCurvatureColumn(numberInputColumns, prediction.size())}; + auto writer = [&row, offset](std::size_t i, double value) { + row.writeColumn(offset + i, value); + }; + // We wrap the writer in another lambda which we know takes advantage + // of std::function small size optimization to avoid heap allocations. + loss.curvature(prediction, actual, + [&writer](std::size_t i, double value) { writer(i, value); }, weight); +} + double readExampleWeight(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters) { diff --git a/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc b/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc new file mode 100644 index 0000000000..5c7b02fd79 --- /dev/null +++ b/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc @@ -0,0 +1,297 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +#include + +#include + +#include +#include + +#include + +#include + +BOOST_AUTO_TEST_SUITE(CBoostedTreeLeafNodeStatisticsTest) + +using namespace ml; +using TDoubleVec = std::vector; +using TDoubleVecVec = std::vector; +using TSizeVec = std::vector; +using TSizeVecVec = std::vector; +using TFloatVec = std::vector; +using TVector = maths::CDenseVector; +using TVectorVec = std::vector; +using TVectorVecVec = std::vector; +using TMatrix = maths::CDenseMatrix; +using TMatrixVec = std::vector; +using TMatrixVecVec = std::vector; +using TImmutableRadixSet = maths::CBoostedTreeLeafNodeStatistics::TImmutableRadixSet; +using TImmutableRadixSetVec = maths::CBoostedTreeLeafNodeStatistics::TImmutableRadixSetVec; +using TDerivativesVec = maths::CBoostedTreeLeafNodeStatistics::TDerivativesVec; +using TDerivativesVecVec = maths::CBoostedTreeLeafNodeStatistics::TDerivativesVecVec; +using TDerivativesAccumulator = maths::CBoostedTreeLeafNodeStatistics::CDerivativesAccumulator; +using TSplitDerivativesAccumulator = maths::CBoostedTreeLeafNodeStatistics::CSplitDerivativesAccumulator; + +namespace { + +template +maths::CMemoryMappedDenseVector makeGradient(T* storage, std::size_t n) { + return maths::CMemoryMappedDenseVector{storage, static_cast(n)}; +} + +template +maths::CMemoryMappedDenseVector makeCurvature(T* storage, std::size_t n) { + return maths::CMemoryMappedDenseVector(storage, static_cast(n)); +} + +template +TMatrix rowMajorHessian(std::size_t n, const maths::CMemoryMappedDenseVector& curvatures) { + TMatrix result{n, n}; + for (std::size_t i = 0, k = 0; i < n; ++i) { + for (std::size_t j = i; j < n; ++j, ++k) { + result(i, j) = result(j, i) = curvatures(k); + } + } + return result; +} + +void testDerivativesAccumulatorFor(std::size_t numberParameters) { + + LOG_DEBUG(<< "Testing " << numberParameters << " parameters"); + + test::CRandomNumbers rng; + + std::size_t numberGradients{numberParameters}; + std::size_t numberCurvatures{ + maths::boosted_tree_detail::lossHessianStoredSize(numberParameters)}; + + TDoubleVecVec gradients(numberGradients); + TDoubleVecVec curvatures(numberCurvatures); + for (std::size_t i = 0; i < numberGradients; ++i) { + rng.generateUniformSamples(-1.0, 1.5, 20, gradients[i]); + } + for (std::size_t i = 0; i < numberCurvatures; ++i) { + rng.generateUniformSamples(0.1, 0.5, 20, curvatures[i]); + } + + TDoubleVec storage1(numberGradients + numberCurvatures, 0.0); + auto totalGradient1 = makeGradient(&storage1[0], numberGradients); + auto totalCurvature1 = makeCurvature(&storage1[numberGradients], numberCurvatures); + TDerivativesAccumulator accumulator1{totalGradient1, totalCurvature1}; + + for (std::size_t j = 0; j < 10; ++j) { + TFloatVec storage; + for (std::size_t i = 0; i < numberGradients; ++i) { + storage.push_back(gradients[i][j]); + } + for (std::size_t i = 0; i < numberCurvatures; ++i) { + storage.push_back(curvatures[i][j]); + } + auto gradient = makeGradient(&storage[0], numberGradients); + auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); + accumulator1.add(1, gradient, curvature); + } + + BOOST_REQUIRE_EQUAL(10, accumulator1.count()); + for (std::size_t i = 0; i < numberGradients; ++i) { + BOOST_REQUIRE_CLOSE( + std::accumulate(gradients[i].begin(), gradients[i].begin() + 10, 0.0), + accumulator1.gradient()(i), 1e-4); + } + for (std::size_t i = 0; i < numberCurvatures; ++i) { + BOOST_REQUIRE_CLOSE(std::accumulate(curvatures[i].begin(), + curvatures[i].begin() + 10, 0.0), + accumulator1.curvature()(i), 1e-4); + } + + 
TDoubleVec storage2(numberGradients + numberCurvatures, 0.0); + auto totalGradient2 = makeGradient(&storage2[0], numberGradients); + auto totalCurvature2 = makeCurvature(&storage2[numberGradients], numberCurvatures); + TDerivativesAccumulator accumulator2{totalGradient2, totalCurvature2}; + + for (std::size_t j = 10; j < 20; ++j) { + TFloatVec storage; + for (std::size_t i = 0; i < numberGradients; ++i) { + storage.push_back(gradients[i][j]); + } + for (std::size_t i = 0; i < numberCurvatures; ++i) { + storage.push_back(curvatures[i][j]); + } + auto gradient = makeGradient(&storage[0], numberGradients); + auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); + accumulator2.add(1, gradient, curvature); + } + + BOOST_REQUIRE_EQUAL(10, accumulator2.count()); + for (std::size_t i = 0; i < numberGradients; ++i) { + BOOST_REQUIRE_CLOSE( + std::accumulate(gradients[i].begin() + 10, gradients[i].end(), 0.0), + accumulator2.gradient()(i), 1e-4); + } + for (std::size_t i = 0; i < numberCurvatures; ++i) { + BOOST_REQUIRE_CLOSE( + std::accumulate(curvatures[i].begin() + 10, curvatures[i].end(), 0.0), + accumulator2.curvature()(i), 1e-4); + } + + accumulator1.merge(accumulator2); + BOOST_REQUIRE_EQUAL(20, accumulator1.count()); + for (std::size_t i = 0; i < numberGradients; ++i) { + BOOST_REQUIRE_CLOSE(std::accumulate(gradients[i].begin(), gradients[i].end(), 0.0), + accumulator1.gradient()(i), 1e-4); + } + for (std::size_t i = 0; i < numberCurvatures; ++i) { + BOOST_REQUIRE_CLOSE(std::accumulate(curvatures[i].begin(), curvatures[i].end(), 0.0), + accumulator1.curvature()(i), 1e-4); + } +} + +void testSplitDerivativesAccumulatorFor(std::size_t numberParameters) { + + LOG_DEBUG(<< "Testing " << numberParameters << " parameters"); + + TImmutableRadixSetVec featureSplits; + featureSplits.push_back(TImmutableRadixSet{1.0, 2.0, 3.0}); + featureSplits.push_back(TImmutableRadixSet{0.1, 0.7, 1.1, 1.4}); + + test::CRandomNumbers rng; + + std::size_t numberSamples{20}; + std::size_t numberGradients{numberParameters}; + std::size_t numberCurvatures{ + maths::boosted_tree_detail::lossHessianStoredSize(numberParameters)}; + + for (std::size_t t = 0; t < 100; ++t) { + TSizeVec features; + TSizeVec splits[2]; + TDoubleVec uniform01; + TDoubleVec gradients; + TDoubleVec curvatures; + rng.generateUniformSamples(0, 2, numberSamples, features); + rng.generateUniformSamples(0, featureSplits[0].size() + 1, + numberSamples, splits[0]); + rng.generateUniformSamples(0, featureSplits[1].size() + 1, + numberSamples, splits[1]); + rng.generateUniformSamples(0.0, 1.0, numberSamples, uniform01); + rng.generateUniformSamples(-1.5, 1.0, numberSamples * numberGradients, gradients); + rng.generateUniformSamples(0.1, 0.5, numberSamples * numberCurvatures, curvatures); + + TSizeVecVec expectedCounts(2); + TVectorVecVec expectedGradients(2); + TMatrixVecVec expectedCurvatures(2); + TSizeVec expectedMissingCounts(2, 0); + TVectorVec expectedMissingGradients(2, TVector::Zero(numberParameters)); + TMatrixVec expectedMissingCurvatures(2, TMatrix::Zero(numberParameters, numberParameters)); + for (std::size_t i = 0; i < 2; ++i) { + expectedCounts[i].resize(featureSplits[i].size() + 1, 0); + expectedGradients[i].resize(featureSplits[i].size() + 1, + TVector::Zero(numberParameters)); + expectedCurvatures[i].resize(featureSplits[i].size() + 1, + TMatrix::Zero(numberParameters, numberParameters)); + } + + auto addDerivatives = [&](TSplitDerivativesAccumulator& accumulator) { + for (std::size_t i = 0, j = 0, k = 0; i < 
numberSamples;
+                 ++i, j += numberGradients, k += numberCurvatures) {
+
+                TFloatVec storage;
+                storage.insert(storage.end(), &gradients[j], &gradients[j + numberGradients]);
+                storage.insert(storage.end(), &curvatures[k], &curvatures[k + numberCurvatures]);
+                auto gradient = makeGradient(&storage[0], numberGradients);
+                auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures);
+
+                if (uniform01[i] < 0.1) {
+                    accumulator.addMissingDerivatives(features[i], gradient, curvature);
+                    ++expectedMissingCounts[features[i]];
+                    expectedMissingGradients[features[i]] += gradient;
+                    expectedMissingCurvatures[features[i]] += rowMajorHessian(numberParameters, curvature);
+                } else {
+                    accumulator.addDerivatives(features[i], splits[features[i]][i],
+                                               gradient, curvature);
+                    ++expectedCounts[features[i]][splits[features[i]][i]];
+                    expectedGradients[features[i]][splits[features[i]][i]] += gradient;
+                    expectedCurvatures[features[i]][splits[features[i]][i]] +=
+                        rowMajorHessian(numberParameters, curvature);
+                }
+            }
+        };
+
+        auto validate = [&](const TDerivativesVecVec& derivatives,
+                            const TDerivativesVec& missingDerivatives) {
+            BOOST_REQUIRE_EQUAL(expectedCounts.size(), derivatives.size());
+            for (std::size_t i = 0; i < expectedCounts.size(); ++i) {
+                BOOST_REQUIRE_EQUAL(expectedCounts[i].size(), derivatives[i].size());
+                for (std::size_t j = 0; j < expectedGradients[i].size(); ++j) {
+                    TMatrix curvature{
+                        derivatives[i][j].s_Curvature.selfadjointView<Eigen::Upper>()};
+                    BOOST_REQUIRE_EQUAL(expectedCounts[i][j], derivatives[i][j].s_Count);
+                    BOOST_REQUIRE_EQUAL(expectedGradients[i][j], derivatives[i][j].s_Gradient);
+                    BOOST_REQUIRE_EQUAL(expectedCurvatures[i][j], curvature);
+                }
+            }
+            BOOST_REQUIRE_EQUAL(expectedMissingCounts.size(), missingDerivatives.size());
+            for (std::size_t i = 0; i < expectedMissingCounts.size(); ++i) {
+                TMatrix curvature{
+                    missingDerivatives[i].s_Curvature.selfadjointView<Eigen::Upper>()};
+                BOOST_REQUIRE_EQUAL(expectedMissingCounts[i], missingDerivatives[i].s_Count);
+                BOOST_REQUIRE_EQUAL(expectedMissingGradients[i], missingDerivatives[i].s_Gradient);
+                BOOST_REQUIRE_EQUAL(expectedMissingCurvatures[i], curvature);
+            }
+        };
+
+        LOG_TRACE(<< "Test accumulation");
+
+        TSplitDerivativesAccumulator accumulator1{featureSplits, numberParameters};
+        addDerivatives(accumulator1);
+
+        TDerivativesVecVec derivatives;
+        TDerivativesVec missingDerivatives;
+        std::tie(derivatives, missingDerivatives) = accumulator1.read();
+
+        validate(derivatives, missingDerivatives);
+
+        LOG_TRACE(<< "Test merge");
+
+        rng.generateUniformSamples(0.0, 1.0, numberSamples, uniform01);
+        rng.generateUniformSamples(-1.5, 1.0, numberSamples * numberGradients, gradients);
+        rng.generateUniformSamples(0.1, 0.5, numberSamples * numberCurvatures, curvatures);
+
+        TSplitDerivativesAccumulator accumulator2{featureSplits, numberParameters};
+        addDerivatives(accumulator2);
+        accumulator1.merge(accumulator2);
+
+        std::tie(derivatives, missingDerivatives) = accumulator1.read();
+
+        validate(derivatives, missingDerivatives);
+
+        LOG_TRACE(<< "Test copy");
+
+        TSplitDerivativesAccumulator accumulator3{accumulator1};
+        BOOST_REQUIRE_EQUAL(accumulator1.checksum(), accumulator3.checksum());
+    }
+}
+}
+
+BOOST_AUTO_TEST_CASE(testDerivativesAccumulator) {
+
+    // Test individual derivatives accumulation for single and multi parameter
+    // loss functions.
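+    // Each sample passed to add() carries a gradient of length n and the upper
+    // triangle of its Hessian packed row major, i.e. n (n + 1) / 2 values; the
+    // accumulator sums these elementwise together with the count.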
+ + testDerivativesAccumulatorFor(1 /*loss function parameter*/); + testDerivativesAccumulatorFor(3 /*loss function parameters*/); +} + +BOOST_AUTO_TEST_CASE(testSplitDerivativesAccumulator) { + + // Test per split derivatives accumulation for single and multi parameter + // loss functions. + + testSplitDerivativesAccumulatorFor(1 /*loss function parameter*/); + testSplitDerivativesAccumulatorFor(3 /*loss function parameters*/); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index aa1a7c9ca6..74bbe04ca0 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -42,6 +42,7 @@ using TRowRef = core::CDataFrame::TRowRef; using TRowItr = core::CDataFrame::TRowItr; using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; +using TMemoryMappedFloatVector = maths::boosted_tree::CLoss::TMemoryMappedFloatVector; namespace { @@ -879,10 +880,12 @@ BOOST_AUTO_TEST_CASE(testLogisticMinimizerEdgeCases) { // All predictions equal and zero. { CArgMinLogisticImpl argmin{0.0}; - argmin.add({0.0}, 0.0); - argmin.add({0.0}, 1.0); - argmin.add({0.0}, 1.0); - argmin.add({0.0}, 0.0); + maths::CFloatStorage storage[]{0.0}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, 0.0); + argmin.add(prediction, 1.0); + argmin.add(prediction, 1.0); + argmin.add(prediction, 0.0); argmin.nextPass(); BOOST_REQUIRE_EQUAL(0.0, argmin.value()[0]); } @@ -906,7 +909,9 @@ BOOST_AUTO_TEST_CASE(testLogisticMinimizerEdgeCases) { do { ++numberPasses; for (std::size_t i = 0; i < labels.size(); ++i) { - argmin.add({weights[i]}, labels[i]); + maths::CFloatStorage storage[]{weights[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); ++counts[static_cast(labels[i])]; } } while (argmin.nextPass()); @@ -927,7 +932,9 @@ BOOST_AUTO_TEST_CASE(testLogisticMinimizerEdgeCases) { TDoubleVec actuals{1.0, 1.0, 0.0, 1.0}; do { for (std::size_t i = 0; i < predictions.size(); ++i) { - argmin.add({predictions[i]}, actuals[i]); + maths::CFloatStorage storage[]{predictions[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, actuals[i]); } } while (argmin.nextPass()); @@ -939,7 +946,9 @@ BOOST_AUTO_TEST_CASE(testLogisticMinimizerEdgeCases) { for (double eps : {-10.0, 0.0, 10.0}) { double lossAtEps{0.0}; for (std::size_t i = 0; i < predictions.size(); ++i) { - lossAtEps += loss.value({predictions[i] + minimizer + eps}, actuals[i]); + maths::CFloatStorage storage[]{predictions[i] + minimizer + eps}; + TMemoryMappedFloatVector probe{storage, 1}; + lossAtEps += loss.value(probe, actuals[i]); } losses.push_back(lossAtEps); } @@ -1015,19 +1024,23 @@ BOOST_AUTO_TEST_CASE(testLogisticMinimizerRandom) { do { for (std::size_t i = 0; i < labels.size() / 2; ++i) { - argmin.add({weights[i]}, labels[i]); - argminPartition[0].add({weights[i]}, labels[i]); + maths::CFloatStorage storage[]{weights[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); + argminPartition[0].add(prediction, labels[i]); } for (std::size_t i = labels.size() / 2; i < labels.size(); ++i) { - argmin.add({weights[i]}, labels[i]); - argminPartition[1].add({weights[i]}, labels[i]); + maths::CFloatStorage storage[]{weights[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); + argminPartition[1].add(prediction, 
labels[i]); } argminPartition[0].merge(argminPartition[1]); argminPartition[1] = argminPartition[0]; } while (nextPass()); - double actual{argmin.value()[0]}; - double actualPartition{argminPartition[0].value()[0]}; + double actual{argmin.value()(0)}; + double actualPartition{argminPartition[0].value()(0)}; LOG_DEBUG(<< "actual = " << actual << " objective at actual = " << objective(actual)); @@ -1053,13 +1066,17 @@ BOOST_AUTO_TEST_CASE(testLogisticLossForUnderflow) { // Losses should be very nearly linear function of log-odds when they're large. { - TDoubleVec lastLoss{loss.value({1.0 - std::log(eps)}, 0.0), - loss.value({1.0 + std::log(eps)}, 1.0)}; + maths::CFloatStorage predictions[]{1.0 - std::log(eps), 1.0 + std::log(eps)}; + TMemoryMappedFloatVector prediction0{&predictions[0], 1}; + TMemoryMappedFloatVector prediction1{&predictions[1], 1}; + TDoubleVec lastLoss{loss.value(prediction0, 0.0), loss.value(prediction1, 1.0)}; for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { - TDoubleVec currentLoss{loss.value({scale - std::log(eps)}, 0.0), - loss.value({scale + std::log(eps)}, 1.0)}; - BOOST_REQUIRE_CLOSE_ABSOLUTE(0.25, lastLoss[0] - currentLoss[0], 5e-3); - BOOST_REQUIRE_CLOSE_ABSOLUTE(-0.25, lastLoss[1] - currentLoss[1], 5e-3); + predictions[0] = scale - std::log(eps); + predictions[1] = scale + std::log(eps); + TDoubleVec currentLoss{loss.value(prediction0, 0.0), + loss.value(prediction1, 1.0)}; + BOOST_REQUIRE_CLOSE_ABSOLUTE(0.25, lastLoss[0] - currentLoss[0], 0.005); + BOOST_REQUIRE_CLOSE_ABSOLUTE(-0.25, lastLoss[1] - currentLoss[1], 0.005); lastLoss = currentLoss; } } @@ -1067,24 +1084,42 @@ BOOST_AUTO_TEST_CASE(testLogisticLossForUnderflow) { // The gradient and curvature should be proportional to the exponential of the // log-odds when they're small. 
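    // (For the logistic loss the gradient with respect to the log-odds z is
    // sigmoid(z) - y and the curvature is sigmoid(z) * (1 - sigmoid(z)); deep in
    // either tail both decay like exp(-|z|), so shifting the prediction by 0.25
    // changes them by a factor of about exp(0.25), which is what the assertions
    // below check.)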
{ - TDoubleVec lastGradient{loss.gradient({1.0 + std::log(eps)}, 0.0)[0], - loss.gradient({1.0 - std::log(eps)}, 1.0)[0]}; - TDoubleVec lastCurvature{loss.curvature({1.0 + std::log(eps)}, 0.0)[0], - loss.curvature({1.0 - std::log(eps)}, 1.0)[0]}; + auto readDerivatives = [&](double prediction, TDoubleVec& gradients, + TDoubleVec& curvatures) { + maths::CFloatStorage predictions[]{prediction + std::log(eps), + prediction - std::log(eps)}; + TMemoryMappedFloatVector prediction0{&predictions[0], 1}; + TMemoryMappedFloatVector prediction1{&predictions[1], 1}; + loss.gradient(prediction0, 0.0, [&](std::size_t, double value) { + gradients[0] = value; + }); + loss.gradient(prediction1, 1.0, [&](std::size_t, double value) { + gradients[1] = value; + }); + loss.curvature(prediction0, 0.0, [&](std::size_t, double value) { + curvatures[0] = value; + }); + loss.curvature(prediction1, 1.0, [&](std::size_t, double value) { + curvatures[1] = value; + }); + }; + + TDoubleVec lastGradient(2); + TDoubleVec lastCurvature(2); + readDerivatives(1.0, lastGradient, lastCurvature); + for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { - TDoubleVec currentGradient{loss.gradient({scale + std::log(eps)}, 0.0)[0], - loss.gradient({scale - std::log(eps)}, 1.0)[0]}; - TDoubleVec currentCurvature{ - loss.curvature({scale + std::log(eps)}, 0.0)[0], - loss.curvature({scale - std::log(eps)}, 1.0)[0]}; + TDoubleVec currentGradient(2); + TDoubleVec currentCurvature(2); + readDerivatives(scale, currentGradient, currentCurvature); BOOST_REQUIRE_CLOSE_ABSOLUTE(std::exp(0.25), - lastGradient[0] / currentGradient[0], 5e-3); + lastGradient[0] / currentGradient[0], 0.01); BOOST_REQUIRE_CLOSE_ABSOLUTE(std::exp(-0.25), - lastGradient[1] / currentGradient[1], 5e-3); + lastGradient[1] / currentGradient[1], 0.01); BOOST_REQUIRE_CLOSE_ABSOLUTE( - std::exp(0.25), lastCurvature[0] / currentCurvature[0], 5e-3); + std::exp(0.25), lastCurvature[0] / currentCurvature[0], 0.01); BOOST_REQUIRE_CLOSE_ABSOLUTE( - std::exp(-0.25), lastCurvature[1] / currentCurvature[1], 5e-3); + std::exp(-0.25), lastCurvature[1] / currentCurvature[1], 0.01); lastGradient = currentGradient; lastCurvature = currentCurvature; } diff --git a/lib/maths/unittest/Makefile b/lib/maths/unittest/Makefile index dcab0474aa..f84bb0e1e2 100644 --- a/lib/maths/unittest/Makefile +++ b/lib/maths/unittest/Makefile @@ -1,110 +1,63 @@ # -# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -# or more contributor license agreements. Licensed under the Elastic License; -# you may not use this file except in compliance with the Elastic License. +#Copyright Elasticsearch B.V.and / or licensed to Elasticsearch B.V.under one +# or more contributor license agreements.Licensed under the Elastic License; +#you may not use this file except in compliance with the Elastic License. 
# -include $(CPP_SRC_HOME)/mk/defines.mk +include $(CPP_SRC_HOME) / mk / + defines.mk -TARGET=ml_test$(EXE_EXT) + TARGET = ml_test$(EXE_EXT) -USE_BOOST=1 -USE_BOOST_FILESYSTEM_LIBS=1 -USE_BOOST_TEST_LIBS=1 -USE_RAPIDJSON=1 -USE_EIGEN=1 + USE_BOOST = 1 USE_BOOST_FILESYSTEM_LIBS = 1 USE_BOOST_TEST_LIBS = 1 USE_RAPIDJSON = 1 USE_EIGEN = 1 -LIBS:=$(LIB_ML_MATHS) + LIBS : = $(LIB_ML_MATHS) -all: build + all + : build -SRCS=\ - Main.cc \ - CAgglomerativeClustererTest.cc \ - CAssignmentTest.cc \ - CBasicStatisticsTest.cc \ - CBayesianOptimisationTest.cc \ - CBjkstUniqueValuesTest.cc \ - CBoostedTreeTest.cc \ - CBootstrapClustererTest.cc \ - CBoundingBoxTest.cc \ - CCalendarComponentAdaptiveBucketingTest.cc \ - CCalendarCyclicTestTest.cc \ - CCalendarFeatureTest.cc \ - CCategoricalToolsTest.cc \ - CChecksumTest.cc \ - CClustererTest.cc \ - CCountMinSketchTest.cc \ - CDataFrameCategoryEncoderTest.cc \ - CDataFrameUtilsTest.cc \ - CDecayRateControllerTest.cc \ - CEntropySketchTest.cc \ - CEqualWithToleranceTest.cc \ - CExpandingWindowTest.cc \ - CForecastTest.cc \ - CGammaRateConjugateTest.cc \ - CInformationCriteriaTest.cc \ - CIntegerToolsTest.cc \ - CIntegrationTest.cc \ - CKdTreeTest.cc \ - CKMeansTest.cc \ - CKMeansOnlineTest.cc \ - CKMostCorrelatedTest.cc \ - CLassoLogisticRegressionTest.cc \ - CLbfgsTest.cc \ - CLeastSquaresOnlineRegressionTest.cc \ - CLinearAlgebraTest.cc \ - CLogNormalMeanPrecConjugateTest.cc \ - CLogTDistributionTest.cc \ - CMathsFuncsTest.cc \ - CMathsMemoryTest.cc \ - CMicTest.cc \ - CMixtureDistributionTest.cc \ - CModelTest.cc \ - CMultimodalPriorTest.cc \ - CMultinomialConjugateTest.cc \ - CMultivariateConstantPriorTest.cc \ - CMultivariateMultimodalPriorTest.cc \ - CMultivariateNormalConjugateTest.cc \ - CMultivariateOneOfNPriorTest.cc \ - CNaiveBayesTest.cc \ - CNaturalBreaksClassifierTest.cc \ - CNormalMeanPrecConjugateTest.cc \ - COneOfNPriorTest.cc \ - COrderingsTest.cc \ - COrdinalTest.cc \ - COrthogonaliserTest.cc \ - COutliersTest.cc \ - CPeriodicityHypothesisTestsTest.cc \ - CPoissonMeanConjugateTest.cc \ - CPriorTest.cc \ - CPRNGTest.cc \ - CProbabilityAggregatorsTest.cc \ - CProbabilityCalibratorTest.cc \ - CQDigestTest.cc \ - CQuantileSketchTest.cc \ - CRadialBasisFunctionTest.cc \ - CRandomizedPeriodicityTestTest.cc \ - CRandomProjectionClustererTest.cc \ - CSamplingTest.cc \ - CSeasonalComponentTest.cc \ - CSeasonalComponentAdaptiveBucketingTest.cc \ - CSeasonalTimeTest.cc \ - CSetToolsTest.cc \ - CSignalTest.cc \ - CSolversTest.cc \ - CSplineTest.cc \ - CStatisticalTestsTest.cc \ - CTimeSeriesChangeDetectorTest.cc \ - CTimeSeriesDecompositionTest.cc \ - CTimeSeriesModelTest.cc \ - CTimeSeriesMultibucketFeaturesTest.cc \ - CTimeSeriesSegmentationTest.cc \ - CToolsTest.cc \ - CTreeShapFeatureImportanceTest.cc \ - CTrendComponentTest.cc \ - CXMeansOnlineTest.cc \ - CXMeansOnline1dTest.cc \ - CXMeansTest.cc \ - TestUtils.cc \ + SRCS = + Main.cc CAgglomerativeClustererTest.cc + CAssignmentTest.cc CBasicStatisticsTest.cc CBayesianOptimisationTest + .cc CBjkstUniqueValuesTest.cc CBoostedTreeLeafNodeStatisticsTest + .cc CBoostedTreeTest.cc CBootstrapClustererTest + .cc CBoundingBoxTest.cc CCalendarComponentAdaptiveBucketingTest + .cc CCalendarCyclicTestTest.cc CCalendarFeatureTest + .cc CCategoricalToolsTest.cc CChecksumTest.cc + CClustererTest.cc CCountMinSketchTest.cc CDataFrameCategoryEncoderTest + .cc CDataFrameUtilsTest.cc CDecayRateControllerTest + .cc CEntropySketchTest.cc + CEqualWithToleranceTest.cc CExpandingWindowTest.cc + CForecastTest.cc 
CGammaRateConjugateTest.cc CInformationCriteriaTest + .cc CIntegerToolsTest.cc CIntegrationTest.cc CKdTreeTest + .cc CKMeansTest.cc CKMeansOnlineTest.cc + CKMostCorrelatedTest.cc CLassoLogisticRegressionTest.cc + CLbfgsTest.cc CLeastSquaresOnlineRegressionTest.cc CLinearAlgebraTest + .cc CLogNormalMeanPrecConjugateTest.cc CLogTDistributionTest + .cc CMathsFuncsTest.cc CMathsMemoryTest.cc CMicTest + .cc CMixtureDistributionTest.cc CModelTest.cc CMultimodalPriorTest + .cc CMultinomialConjugateTest.cc + CMultivariateConstantPriorTest.cc + CMultivariateMultimodalPriorTest.cc CMultivariateNormalConjugateTest + .cc CMultivariateOneOfNPriorTest.cc CNaiveBayesTest + .cc CNaturalBreaksClassifierTest.cc + CNormalMeanPrecConjugateTest.cc COneOfNPriorTest.cc + COrderingsTest.cc COrdinalTest.cc COrthogonaliserTest.cc COutliersTest + .cc CPeriodicityHypothesisTestsTest.cc CPoissonMeanConjugateTest + .cc CPriorTest.cc CPRNGTest.cc CProbabilityAggregatorsTest + .cc CProbabilityCalibratorTest.cc + CQDigestTest.cc CQuantileSketchTest.cc + CRadialBasisFunctionTest.cc CRandomizedPeriodicityTestTest + .cc CRandomProjectionClustererTest.cc + CSamplingTest.cc CSeasonalComponentTest + .cc CSeasonalComponentAdaptiveBucketingTest.cc CSeasonalTimeTest + .cc CSetToolsTest.cc CSignalTest.cc CSolversTest.cc + CSplineTest.cc CStatisticalTestsTest.cc CTimeSeriesChangeDetectorTest + .cc CTimeSeriesDecompositionTest.cc + CTimeSeriesModelTest.cc CTimeSeriesMultibucketFeaturesTest + .cc CTimeSeriesSegmentationTest.cc + CToolsTest.cc CTreeShapFeatureImportanceTest.cc + CTrendComponentTest.cc CXMeansOnlineTest.cc + CXMeansOnline1dTest.cc CXMeansTest.cc TestUtils.cc -include $(CPP_SRC_HOME)/mk/stdboosttest.mk + include $(CPP_SRC_HOME) / + mk / stdboosttest.mk From 570e0f3a42729957a205eedd5900fc3281ad3711 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 11 Feb 2020 10:35:12 +0000 Subject: [PATCH 02/13] Rework derivatives storage to cut down performance impact of heap allocations --- .../maths/CBoostedTreeLeafNodeStatistics.h | 361 ++++++++++-------- lib/maths/CBoostedTreeImpl.cc | 8 +- lib/maths/CBoostedTreeLeafNodeStatistics.cc | 219 ++++------- .../CBoostedTreeLeafNodeStatisticsTest.cc | 170 +++++---- lib/maths/unittest/Makefile | 158 +++++--- 5 files changed, 475 insertions(+), 441 deletions(-) diff --git a/include/maths/CBoostedTreeLeafNodeStatistics.h b/include/maths/CBoostedTreeLeafNodeStatistics.h index 90cc57376e..94c729ffec 100644 --- a/include/maths/CBoostedTreeLeafNodeStatistics.h +++ b/include/maths/CBoostedTreeLeafNodeStatistics.h @@ -8,6 +8,7 @@ #define INCLUDED_ml_maths_CBoostedTreeLeafNodeStatistics_h #include +#include #include #include @@ -56,58 +57,23 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { using TPtrPtrPr = std::pair; using TMemoryMappedFloatVector = CMemoryMappedDenseVector; using TMemoryMappedDoubleVector = CMemoryMappedDenseVector; - using TDoubleVector = CDenseVector; - using TDoubleMatrix = CDenseMatrix; + using TMemoryMappedDoubleMatrix = CMemoryMappedDenseMatrix; - //! \brief Aggregate derivatives (gradient and Hessian). - struct MATHS_EXPORT SDerivatives { - SDerivatives(std::size_t count, TDoubleVector gradient, TDoubleMatrix curvature) - : s_Count{count}, s_Gradient{std::move(gradient)}, s_Curvature{std::move(curvature)} { - } - //! \warning This assumes that the curvature is stored flat row major. 
- SDerivatives(std::size_t count, - const TMemoryMappedDoubleVector& gradient, - const TMemoryMappedDoubleVector& curvature) - : s_Count{count}, s_Gradient{gradient.size()}, s_Curvature{gradient.size(), - gradient.size()} { - for (int i = 0; i < gradient.size(); ++i) { - s_Gradient[i] = gradient[i]; - } - // We only copy the upper triangle and always use selfadjointView - // when working with the actual Hessian. - for (int i = 0, k = 0; i < gradient.size(); ++i) { - for (int j = i; j < gradient.size(); ++j, ++k) { - s_Curvature(i, j) = curvature(k); - } - } - } - - static std::size_t estimateMemoryUsage(std::size_t numberLossParameters) { - return sizeof(SDerivatives) + - las::estimateMemoryUsage(numberLossParameters) + - las::estimateMemoryUsage(numberLossParameters, - numberLossParameters); - } - - std::size_t s_Count; - TDoubleVector s_Gradient; - //! Note only the upper triangle is initialized since this is symmetric. - TDoubleMatrix s_Curvature; - }; + //! \brief Accumulates aggregate derivatives. + class MATHS_EXPORT CDerivatives { + public: + //! Bounds the minimum diagonal of the Hessian. + static constexpr double SMALLEST_RELATIVE_CURVATURE{1e-20}; - using TDerivativesVec = std::vector; - using TDerivativesVecVec = std::vector; + //! See core::CMemory. + static bool dynamicSizeAlwaysZero() { return true; } - //! \brief Accumulates aggregate derivatives. - class MATHS_EXPORT CDerivativesAccumulator { public: - CDerivativesAccumulator(const TMemoryMappedDoubleVector& gradient, - const TMemoryMappedDoubleVector& curvature) - : CDerivativesAccumulator{0, gradient, curvature} {} - CDerivativesAccumulator(std::size_t count, - const TMemoryMappedDoubleVector& gradient, - const TMemoryMappedDoubleVector& curvature) - : m_Count{count}, m_Gradient{gradient}, m_Curvature{curvature} {} + CDerivatives(std::size_t numberLossParameters, double* storage) + : m_Count{0}, m_Gradient{storage, static_cast(numberLossParameters)}, + m_Curvature{storage + numberLossParameters, + static_cast(numberLossParameters), + static_cast(numberLossParameters)} {} //! Get the accumulated count. std::size_t count() const { return m_Count; } @@ -116,7 +82,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { const TMemoryMappedDoubleVector& gradient() const { return m_Gradient; } //! Get the accumulated curvature. - const TMemoryMappedDoubleVector& curvature() const { + const TMemoryMappedDoubleMatrix& curvature() const { return m_Curvature; } @@ -126,16 +92,56 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { const TMemoryMappedFloatVector& curvature) { m_Count += count; m_Gradient += gradient; - m_Curvature += curvature; + this->curvatureTriangleView() += curvature; } //! Compute the accumulation of both collections of derivatives. - void merge(const CDerivativesAccumulator& other) { + void merge(const CDerivatives& other) { m_Count += other.m_Count; m_Gradient += other.m_Gradient; m_Curvature += other.m_Curvature; } + //! Set to the difference of \p lhs and \p rhs. + void assignDifference(const CDerivatives& lhs, const CDerivatives& rhs) { + // Numeric errors mean that it's possible the sum curvature for a candidate + // split is identically zero while the gradient is epsilon. This can cause + // the node gain to appear infinite (when there is no weight regularisation) + // which in turns causes problems initialising the region we search for optimal + // hyperparameter values. We can safely force the gradient and curvature to + // be zero if we detect that the count is zero. 
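+            // (Leaving the mapped storage untouched when the count is zero is
+            // enough here: difference() maps freshly allocated storage which is
+            // zero initialised, so the gradient and curvature stay zero.)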
+            std::size_t count{lhs.m_Count - rhs.m_Count};
+            if (count > 0) {
+                m_Count = count;
+                m_Gradient = lhs.m_Gradient - rhs.m_Gradient;
+                m_Curvature = lhs.m_Curvature - rhs.m_Curvature;
+                // None of our loss functions have negative curvature therefore we
+                // shouldn't allow the cumulative curvature to be negative either.
+                // In this case we force it to be a v.small multiple of the magnitude
+                // of the gradient since this is the closest feasible estimate.
+                for (int i = 0; i < m_Gradient.size(); ++i) {
+                    m_Curvature(i, i) = std::max(m_Curvature(i, i),
+                                                 SMALLEST_RELATIVE_CURVATURE *
+                                                     std::fabs(m_Gradient(i)));
+                }
+            }
+        }
+
+        //! Remap the accumulated curvature to lower triangle row major format.
+        void remapCurvature() {
+            // We accumulate curvatures in the first n (n + 1) / 2 elements however
+            // users of TMemoryMappedDoubleMatrix expect them stored column major
+            // in the lower triangle of n x n matrix. This copies them backwards
+            // to their correct positions.
+            for (std::ptrdiff_t j = m_Curvature.cols() - 1,
+                                k = m_Curvature.rows() * (m_Curvature.rows() + 1) / 2 - 1;
+                 j >= 0; --j) {
+                for (std::ptrdiff_t i = m_Curvature.rows() - 1; i >= j; --i, --k) {
+                    m_Curvature(i, j) = m_Curvature.array()(k);
+                }
+            }
+        }
+
         //! Get a checksum of this object.
         std::uint64_t checksum(std::uint64_t seed = 0) const {
             seed = CChecksum::calculate(seed, m_Count);
@@ -143,105 +149,73 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final {
             return CChecksum::calculate(seed, m_Curvature);
         }

+    private:
+        TMemoryMappedDoubleVector curvatureTriangleView() {
+            return {m_Curvature.data(), m_Curvature.rows() * (m_Curvature.rows() + 1) / 2};
+        }
+
     private:
         std::size_t m_Count = 0;
         TMemoryMappedDoubleVector m_Gradient;
-        TMemoryMappedDoubleVector m_Curvature;
+        TMemoryMappedDoubleMatrix m_Curvature;
     };

-    using TDerivativesAccumulatorVec = std::vector;
-    using TDerivativesAccumulatorVecVec = std::vector;
-
     //! \brief A collection of aggregate derivatives for candidate feature splits.
- class MATHS_EXPORT CSplitDerivativesAccumulator { + class MATHS_EXPORT CPerSplitDerivatives { public: - CSplitDerivativesAccumulator(const TImmutableRadixSetVec& candidateSplits, - std::size_t numberLossParameters) + using TDerivativesVec = std::vector; + + public: + explicit CPerSplitDerivatives(std::size_t numberLossParameters = 0) + : m_NumberLossParameters{numberLossParameters} {} + CPerSplitDerivatives(const TImmutableRadixSetVec& candidateSplits, + std::size_t numberLossParameters) : m_NumberLossParameters{numberLossParameters} { + this->map(candidateSplits); + } + CPerSplitDerivatives(const CPerSplitDerivatives& other) + : m_NumberLossParameters{other.m_NumberLossParameters} { + this->map(other.m_Derivatives); + this->merge(other); + } + CPerSplitDerivatives(CPerSplitDerivatives&&) = default; - std::size_t totalNumberCandidateSplits{std::accumulate( - candidateSplits.begin(), candidateSplits.end(), std::size_t{0}, - [](std::size_t size, const TImmutableRadixSet& splits) { - return size + splits.size() + 1; - })}; - int numberGradients{static_cast(numberLossParameters)}; - int numberCurvatures{static_cast( - boosted_tree_detail::lossHessianStoredSize(numberLossParameters))}; - int numberDerivatives{numberGradients + numberCurvatures}; - m_Derivatives.resize(candidateSplits.size()); - m_DerivativesStorage.resize(totalNumberCandidateSplits * numberDerivatives, 0.0); - m_MissingDerivatives.reserve(candidateSplits.size()); - m_MissingDerivativesStorage.resize(candidateSplits.size() * numberDerivatives, 0.0); - - double* storage{nullptr}; - for (std::size_t i = 0, m = 0, n = 0; i < candidateSplits.size(); - ++i, m += numberDerivatives) { - - m_Derivatives[i].reserve(candidateSplits[i].size() + 1); - for (std::size_t j = 0; j <= candidateSplits[i].size(); - ++j, n += numberDerivatives) { - storage = &m_DerivativesStorage[n]; - m_Derivatives[i].emplace_back( - TMemoryMappedDoubleVector{storage, numberGradients}, - TMemoryMappedDoubleVector{storage + numberGradients, numberCurvatures}); - } + CPerSplitDerivatives& operator=(const CPerSplitDerivatives& other) = delete; + CPerSplitDerivatives& operator=(CPerSplitDerivatives&&) = default; - storage = &m_MissingDerivativesStorage[m]; - m_MissingDerivatives.emplace_back( - TMemoryMappedDoubleVector{storage, numberGradients}, - TMemoryMappedDoubleVector{storage + numberGradients, numberCurvatures}); - } + //! \return The aggregate count for \p feature and \p split. + std::size_t count(std::size_t feature, std::size_t split) const { + return m_Derivatives[feature][split].count(); } - CSplitDerivativesAccumulator(const CSplitDerivativesAccumulator& other) - : m_NumberLossParameters{other.m_NumberLossParameters}, - m_DerivativesStorage{other.m_DerivativesStorage}, - m_MissingDerivativesStorage{other.m_MissingDerivativesStorage} { - - int numberGradients{static_cast(m_NumberLossParameters)}; - int numberCurvatures{static_cast( - boosted_tree_detail::lossHessianStoredSize(m_NumberLossParameters))}; - int numberDerivatives{numberGradients + numberCurvatures}; + //! \return The aggregate gradient for \p feature and \p split. 
+ const TMemoryMappedDoubleVector& gradient(std::size_t feature, std::size_t split) const { + return m_Derivatives[feature][split].gradient(); + } - m_Derivatives.resize(other.m_Derivatives.size()); - m_MissingDerivatives.reserve(other.m_MissingDerivatives.size()); - - double* storage{nullptr}; - for (std::size_t i = 0, m = 0, n = 0; - i < other.m_Derivatives.size(); ++i, m += numberDerivatives) { - - m_Derivatives[i].reserve(other.m_Derivatives[i].size()); - for (std::size_t j = 0; j < other.m_Derivatives[i].size(); - ++j, n += numberDerivatives) { - storage = &m_DerivativesStorage[n]; - m_Derivatives[i].emplace_back( - other.m_Derivatives[i][j].count(), - TMemoryMappedDoubleVector{storage, numberGradients}, - TMemoryMappedDoubleVector{storage + numberGradients, numberCurvatures}); - } + //! \return The aggregate curvature for \p feature and \p split. + const TMemoryMappedDoubleMatrix& curvature(std::size_t feature, std::size_t split) const { + return m_Derivatives[feature][split].curvature(); + } - storage = &m_MissingDerivativesStorage[m]; - m_MissingDerivatives.emplace_back( - other.m_MissingDerivatives[i].count(), - TMemoryMappedDoubleVector{storage, numberGradients}, - TMemoryMappedDoubleVector{storage + numberGradients, numberCurvatures}); - } + //! \return All the split aggregate derivatives for \p feature. + const TDerivativesVec& derivatives(std::size_t feature) const { + return m_Derivatives[feature]; } - CSplitDerivativesAccumulator(CSplitDerivativesAccumulator&&) = default; + //! \return The count for missing \p feature. + std::size_t missingCount(std::size_t feature) const { + return m_MissingDerivatives[feature].count(); + } - CSplitDerivativesAccumulator& - operator=(const CSplitDerivativesAccumulator& other) = delete; - CSplitDerivativesAccumulator& operator=(CSplitDerivativesAccumulator&&) = default; + //! \return The aggregate gradients for missing \p feature. + const TMemoryMappedDoubleVector& missingGradient(std::size_t feature) const { + return m_MissingDerivatives[feature].gradient(); + } - //! Compute the accumulation of both collections of per split derivatives. - void merge(const CSplitDerivativesAccumulator& other) { - for (std::size_t i = 0; i < m_Derivatives.size(); ++i) { - for (std::size_t j = 0; j < m_Derivatives[i].size(); ++j) { - m_Derivatives[i][j].merge(other.m_Derivatives[i][j]); - } - m_MissingDerivatives[i].merge(other.m_MissingDerivatives[i]); - } + //! \return The aggregate curvatures for missing \p feature. + const TMemoryMappedDoubleMatrix& missingCurvature(std::size_t feature) const { + return m_MissingDerivatives[feature].curvature(); } //! Add \p gradient and \p curvature to the accumulated derivatives for @@ -261,26 +235,61 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { m_MissingDerivatives[feature].add(1, gradient, curvature); } - //! Read out the accumulated derivatives. - auto read() const { - TDerivativesVecVec derivatives; - derivatives.resize(m_Derivatives.size()); - for (std::size_t i = 0; i < m_Derivatives.size(); ++i) { - derivatives[i].reserve(m_Derivatives[i].size()); - for (const auto& derivative : m_Derivatives[i]) { - derivatives[i].emplace_back(derivative.count(), derivative.gradient(), - derivative.curvature()); + //! Compute the accumulation of both collections of per split derivatives. 
+ void merge(const CPerSplitDerivatives& other) { + for (std::size_t i = 0; i < other.m_Derivatives.size(); ++i) { + for (std::size_t j = 0; j < other.m_Derivatives[i].size(); ++j) { + m_Derivatives[i][j].merge(other.m_Derivatives[i][j]); } + m_MissingDerivatives[i].merge(other.m_MissingDerivatives[i]); } + } - TDerivativesVec missingDerivatives; - missingDerivatives.reserve(m_MissingDerivatives.size()); - for (const auto& derivative : m_MissingDerivatives) { - missingDerivatives.emplace_back(derivative.count(), derivative.gradient(), - derivative.curvature()); + //! Set to the difference of \p lhs and \p rhs. + static CPerSplitDerivatives difference(const CPerSplitDerivatives& lhs, + const CPerSplitDerivatives& rhs) { + CPerSplitDerivatives result{lhs.m_NumberLossParameters}; + result.map(lhs.m_Derivatives); + for (std::size_t i = 0; i < lhs.m_Derivatives.size(); ++i) { + for (std::size_t j = 0; j < lhs.m_Derivatives[i].size(); ++j) { + result.m_Derivatives[i][j].assignDifference( + lhs.m_Derivatives[i][j], rhs.m_Derivatives[i][j]); + } + result.m_MissingDerivatives[i].assignDifference( + lhs.m_MissingDerivatives[i], rhs.m_MissingDerivatives[i]); + } + return result; + } + + //! Remap the accumulated curvature to lower triangle row major format. + void remapCurvature() { + for (std::size_t i = 0; i < m_Derivatives.size(); ++i) { + for (std::size_t j = 0; j < m_Derivatives[i].size(); ++j) { + m_Derivatives[i][j].remapCurvature(); + } + m_MissingDerivatives[i].remapCurvature(); } + } + + //! Get the memory used by this object. + std::size_t memoryUsage() const { + return core::CMemory::dynamicSize(m_Derivatives) + + core::CMemory::dynamicSize(m_MissingDerivatives) + + core::CMemory::dynamicSize(m_Storage); + } - return std::make_pair(std::move(derivatives), std::move(missingDerivatives)); + //! Estimate the split derivatives' memory usage for a data frame with + //! \p numberCols columns using \p numberSplitsPerFeature for a loss + //! function with \p numberLossParameters parameters. + static std::size_t estimateMemoryUsage(std::size_t numberCols, + std::size_t numberSplitsPerFeature, + std::size_t numberLossParameters) { + std::size_t derivativesSize{(numberCols - 1) * (numberSplitsPerFeature + 1) * + sizeof(CDerivatives)}; + std::size_t storageSize{(numberCols - 1) * (numberSplitsPerFeature + 1) * + numberLossParameters * + (numberLossParameters + 1) * sizeof(double)}; + return sizeof(CPerSplitDerivatives) + derivativesSize + storageSize; } //! Get a checksum of this object. 
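To make the layout behind CPerSplitDerivatives concrete: every (feature, split) pair owns one contiguous block of n gradient slots followed by n * n slots backing its mapped curvature matrix, and while derivatives are being accumulated only the first n (n + 1) / 2 of the matrix slots are written, holding the packed symmetric Hessian in the order the unit test's rowMajorHessian helper implies. The following is only a sketch of that remapping scheme as read from this patch; it uses a plain array and Eigen::Map in place of the library's memory mapped vector and matrix types.

    #include <Eigen/Core>

    #include <iostream>

    int main() {
        constexpr int n = 3;

        // One per-split block for n = 3 loss parameters: n gradient slots followed
        // by n * n slots backing the column major curvature matrix. During
        // accumulation only the first n (n + 1) / 2 matrix slots are used, holding
        // the packed symmetric curvature (0,0), (0,1), (0,2), (1,1), (1,2), (2,2).
        double block[n + n * n] = {0.1, -0.2, 0.3,                // gradient
                                   1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; // packed curvature
        double* curvature = block + n;

        // In-place remap: walk the packed values from the back and scatter them
        // into the lower triangle of the column major n x n matrix.
        for (int j = n - 1, k = n * (n + 1) / 2 - 1; j >= 0; --j) {
            for (int i = n - 1; i >= j; --i, --k) {
                curvature[i + j * n] = curvature[k];
            }
        }

        // A self-adjoint view of the lower triangle now recovers the full Hessian
        // whose upper triangle was packed above.
        Eigen::Map<Eigen::MatrixXd> hessian(curvature, n, n);
        Eigen::MatrixXd full = hessian.selfadjointView<Eigen::Lower>();
        std::cout << full << '\n';
        return 0;
    }

Walking the packed values from the back matters because the k-th packed value lands at index k + j (j + 1) / 2 of the column major matrix for column j, i.e. never before index k, so every packed value is read before its slot can be overwritten. That is what lets remapCurvature run in place on the shared storage after the per-thread results have been merged.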
@@ -292,11 +301,47 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { } private: - std::size_t m_NumberLossParameters; - TDerivativesAccumulatorVecVec m_Derivatives; - TDoubleVec m_DerivativesStorage; - TDerivativesAccumulatorVec m_MissingDerivatives; - TDoubleVec m_MissingDerivativesStorage; + using TDerivativesVecVec = std::vector; + + private: + static std::size_t number(const TDerivativesVec& derivatives) { + return derivatives.size(); + } + static std::size_t number(const TImmutableRadixSet& splits) { + return splits.size() + 1; + } + template + void map(const SPLITS& splits) { + std::size_t totalNumberSplits{ + std::accumulate(splits.begin(), splits.end(), std::size_t{0}, + [](std::size_t size, const auto& featureSplits) { + return size + number(featureSplits); + })}; + + int numberGradients{static_cast(m_NumberLossParameters)}; + int numberCurvatures{numberGradients * numberGradients}; + int numberDerivatives{numberGradients + numberCurvatures}; + + m_Derivatives.resize(splits.size()); + m_MissingDerivatives.reserve(splits.size()); + m_Storage.resize((totalNumberSplits + splits.size()) * numberDerivatives, 0.0); + + double* storage{&m_Storage[0]}; + for (std::size_t i = 0; i < splits.size(); ++i, storage += numberDerivatives) { + std::size_t size{number(splits[i])}; + m_Derivatives[i].reserve(size); + for (std::size_t j = 0; j < size; ++j, storage += numberDerivatives) { + m_Derivatives[i].emplace_back(m_NumberLossParameters, storage); + } + m_MissingDerivatives.emplace_back(m_NumberLossParameters, storage); + } + } + + private: + std::size_t m_NumberLossParameters = 0; + TDerivativesVecVec m_Derivatives; + TDerivativesVec m_MissingDerivatives; + TDoubleVec m_Storage; }; public: @@ -351,8 +396,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { const TRegularization& regularization, const TImmutableRadixSetVec& candidateSplits, const TSizeVec& featureBag, - const CBoostedTreeNode& split, - bool leftChildHasFewerRows); + const CBoostedTreeNode& split); //! Order two leaves by decreasing gain in splitting them. bool operator<(const CBoostedTreeLeafNodeStatistics& rhs) const; @@ -383,8 +427,8 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { std::size_t memoryUsage() const; //! Estimate the maximum leaf statistics' memory usage training on a data frame - //! with \p numberRows rows and \p numberCols columns using \p featureBagFraction - //! and \p numberSplitsPerFeature. + //! with \p numberRows rows and \p numberCols columns using \p numberSplitsPerFeature + //! for a loss function with \p numberLossParameters parameters. 
static std::size_t estimateMemoryUsage(std::size_t numberRows, std::size_t numberCols, std::size_t numberSplitsPerFeature, @@ -436,7 +480,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { const CBoostedTreeNode& split, const core::CPackedBitVector& parentRowMask); void addRowDerivatives(const CEncodedDataFrameRowRef& row, - CSplitDerivativesAccumulator& splitDerivativesAccumulator) const; + CPerSplitDerivatives& splitDerivatives) const; SSplitStatistics computeBestSplitStatistics(const TRegularization& regularization, const TSizeVec& featureBag) const; @@ -447,8 +491,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { std::size_t m_NumberLossParameters; const TImmutableRadixSetVec& m_CandidateSplits; core::CPackedBitVector m_RowMask; - TDerivativesVecVec m_Derivatives; - TDerivativesVec m_MissingDerivatives; + CPerSplitDerivatives m_Derivatives; SSplitStatistics m_BestSplit; }; } diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 437ef27019..782f28f638 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -718,7 +718,6 @@ CBoostedTreeImpl::trainTree(core::CDataFrame& frame, double splitValue; std::tie(splitFeature, splitValue) = leaf->bestSplit(); - bool leftChildHasFewerRows{leaf->leftChildHasFewerRows()}; bool assignMissingToLeft{leaf->assignMissingToLeft()}; std::size_t leftChildId, rightChildId; @@ -728,10 +727,9 @@ CBoostedTreeImpl::trainTree(core::CDataFrame& frame, TLeafNodeStatisticsPtr leftChild; TLeafNodeStatisticsPtr rightChild; - std::tie(leftChild, rightChild) = - leaf->split(leftChildId, rightChildId, m_NumberThreads, frame, *m_Encoder, - m_Regularization, candidateSplits, this->featureBag(), - tree[leaf->id()], leftChildHasFewerRows); + std::tie(leftChild, rightChild) = leaf->split( + leftChildId, rightChildId, m_NumberThreads, frame, *m_Encoder, + m_Regularization, candidateSplits, this->featureBag(), tree[leaf->id()]); scopeMemoryUsage.add(leftChild); scopeMemoryUsage.add(rightChild); diff --git a/lib/maths/CBoostedTreeLeafNodeStatistics.cc b/lib/maths/CBoostedTreeLeafNodeStatistics.cc index ddbbd942df..203da767a5 100644 --- a/lib/maths/CBoostedTreeLeafNodeStatistics.cc +++ b/lib/maths/CBoostedTreeLeafNodeStatistics.cc @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -25,7 +24,6 @@ using TRowItr = core::CDataFrame::TRowItr; namespace { const std::size_t ASSIGN_MISSING_TO_LEFT{0}; const std::size_t ASSIGN_MISSING_TO_RIGHT{1}; -const double SMALLEST_RELATIVE_CURVATURE{1e-20}; } CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( @@ -78,65 +76,9 @@ CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( core::CPackedBitVector rowMask) : m_Id{id}, m_Depth{sibling.m_Depth}, m_NumberInputColumns{sibling.m_NumberInputColumns}, m_NumberLossParameters{sibling.m_NumberLossParameters}, - m_CandidateSplits{sibling.m_CandidateSplits}, m_RowMask{std::move(rowMask)} { - - m_Derivatives.resize(m_CandidateSplits.size()); - m_MissingDerivatives.reserve(m_CandidateSplits.size()); - - for (std::size_t i = 0; i < m_CandidateSplits.size(); ++i) { - std::size_t numberSplits{m_CandidateSplits[i].size() + 1}; - m_Derivatives[i].reserve(numberSplits); - for (std::size_t j = 0; j < numberSplits; ++j) { - // Numeric errors mean that it's possible the sum curvature for a candidate - // split is identically zero while the gradient is epsilon. 
This can cause - // the node gain to appear infinite (when there is no weight regularisation) - // which in turns causes problems initialising the region we search for optimal - // hyperparameter values. We can safely force the gradient and curvature to - // be zero if we detect that the count is zero. Also, none of our loss functions - // have negative curvature therefore we shouldn't allow the cumulative curvature - // to be negative either. In this case we force it to be a v.small multiple - // of the magnitude of the gradient since this is the closest feasible estimate. - std::size_t count{parent.m_Derivatives[i][j].s_Count - - sibling.m_Derivatives[i][j].s_Count}; - if (count > 0) { - TDoubleVector gradient{parent.m_Derivatives[i][j].s_Gradient - - sibling.m_Derivatives[i][j].s_Gradient}; - TDoubleMatrix curvature{parent.m_Derivatives[i][j].s_Curvature - - sibling.m_Derivatives[i][j].s_Curvature}; - for (int k = 0; k < gradient.size(); ++k) { - curvature(k, k) = - std::max(curvature(k, k), SMALLEST_RELATIVE_CURVATURE * - std::fabs(gradient(k))); - } - m_Derivatives[i].emplace_back(count, std::move(gradient), - std::move(curvature)); - } else { - m_Derivatives[i].emplace_back( - 0, TDoubleVector{TDoubleVector::Zero(m_NumberLossParameters)}, - TDoubleMatrix{TDoubleMatrix::Zero(m_NumberLossParameters, - m_NumberLossParameters)}); - } - } - std::size_t count{parent.m_MissingDerivatives[i].s_Count - - sibling.m_MissingDerivatives[i].s_Count}; - if (count > 0) { - TDoubleVector gradient{parent.m_MissingDerivatives[i].s_Gradient - - sibling.m_MissingDerivatives[i].s_Gradient}; - TDoubleVector curvature{parent.m_MissingDerivatives[i].s_Curvature - - sibling.m_MissingDerivatives[i].s_Curvature}; - for (int k = 0; k < gradient.size(); ++k) { - curvature(k, k) = - std::max(curvature(k, k), - SMALLEST_RELATIVE_CURVATURE * std::fabs(gradient(k))); - } - m_MissingDerivatives.emplace_back(count, std::move(gradient), - std::move(curvature)); - } else { - m_MissingDerivatives.emplace_back( - 0, TDoubleVector{TDoubleVector::Zero(m_NumberLossParameters)}, - TDoubleMatrix{TDoubleMatrix::Zero(m_NumberLossParameters, m_NumberLossParameters)}); - } - } + m_CandidateSplits{sibling.m_CandidateSplits}, m_RowMask{std::move(rowMask)}, + m_Derivatives{CPerSplitDerivatives::difference(parent.m_Derivatives, + sibling.m_Derivatives)} { m_BestSplit = this->computeBestSplitStatistics(regularization, featureBag); } @@ -150,9 +92,8 @@ CBoostedTreeLeafNodeStatistics::split(std::size_t leftChildId, const TRegularization& regularization, const TImmutableRadixSetVec& candidateSplits, const TSizeVec& featureBag, - const CBoostedTreeNode& split, - bool leftChildHasFewerRows) { - if (leftChildHasFewerRows) { + const CBoostedTreeNode& split) { + if (this->leftChildHasFewerRows()) { auto leftChild = std::make_shared( leftChildId, m_NumberInputColumns, m_NumberLossParameters, numberThreads, frame, encoder, regularization, candidateSplits, @@ -212,10 +153,7 @@ core::CPackedBitVector& CBoostedTreeLeafNodeStatistics::rowMask() { } std::size_t CBoostedTreeLeafNodeStatistics::memoryUsage() const { - std::size_t mem{core::CMemory::dynamicSize(m_RowMask)}; - mem += core::CMemory::dynamicSize(m_Derivatives); - mem += core::CMemory::dynamicSize(m_MissingDerivatives); - return mem; + return core::CMemory::dynamicSize(m_RowMask) + core::CMemory::dynamicSize(m_Derivatives); } std::size_t @@ -228,12 +166,9 @@ CBoostedTreeLeafNodeStatistics::estimateMemoryUsage(std::size_t numberRows, // case for memory usage. 
This is because the rows will be spread over many // rows so the masks will mainly contain 0 bits in this case. std::size_t rowMaskSize{numberRows / PACKED_BIT_VECTOR_MAXIMUM_ROWS_PER_BYTE}; - std::size_t derivativesSize{(numberCols - 1) * numberSplitsPerFeature * - SDerivatives::estimateMemoryUsage(numberLossParameters)}; - std::size_t missingDerivativesSize{ - (numberCols - 1) * SDerivatives::estimateMemoryUsage(numberLossParameters)}; - return sizeof(CBoostedTreeLeafNodeStatistics) + rowMaskSize + - derivativesSize + missingDerivativesSize; + std::size_t perSplitDerivativesSize{CPerSplitDerivatives::estimateMemoryUsage( + numberCols, numberSplitsPerFeature, numberLossParameters)}; + return sizeof(CBoostedTreeLeafNodeStatistics) + rowMaskSize + perSplitDerivativesSize; } void CBoostedTreeLeafNodeStatistics::computeAggregateLossDerivatives( @@ -244,22 +179,19 @@ void CBoostedTreeLeafNodeStatistics::computeAggregateLossDerivatives( auto result = frame.readRows( numberThreads, 0, frame.numberRows(), core::bindRetrievableState( - [&](CSplitDerivativesAccumulator& splitDerivativesAccumulator, - TRowItr beginRows, TRowItr endRows) { + [&](CPerSplitDerivatives& perSplitDerivatives, TRowItr beginRows, TRowItr endRows) { for (auto row = beginRows; row != endRows; ++row) { - this->addRowDerivatives(encoder.encode(*row), splitDerivativesAccumulator); + this->addRowDerivatives(encoder.encode(*row), perSplitDerivatives); } }, - CSplitDerivativesAccumulator{m_CandidateSplits, m_NumberLossParameters}), + CPerSplitDerivatives{m_CandidateSplits, m_NumberLossParameters}), &m_RowMask); - auto& state = result.first; - CSplitDerivativesAccumulator accumulatedDerivatives{std::move(state[0].s_FunctionState)}; - for (std::size_t i = 1; i < state.size(); ++i) { - accumulatedDerivatives.merge(state[i].s_FunctionState); + m_Derivatives = std::move(result.first[0].s_FunctionState); + for (std::size_t i = 1; i < result.first.size(); ++i) { + m_Derivatives.merge(result.first[i].s_FunctionState); } - - std::tie(m_Derivatives, m_MissingDerivatives) = accumulatedDerivatives.read(); + m_Derivatives.remapCurvature(); } void CBoostedTreeLeafNodeStatistics::computeRowMaskAndAggregateLossDerivatives( @@ -273,55 +205,51 @@ void CBoostedTreeLeafNodeStatistics::computeRowMaskAndAggregateLossDerivatives( auto result = frame.readRows( numberThreads, 0, frame.numberRows(), core::bindRetrievableState( - [&](std::pair& state, + [&](std::pair& state, TRowItr beginRows, TRowItr endRows) { auto& mask = state.first; - auto& splitDerivativesAccumulator = state.second; + auto& perSplitDerivatives = state.second; for (auto row = beginRows; row != endRows; ++row) { auto encodedRow = encoder.encode(*row); if (split.assignToLeft(encodedRow) == isLeftChild) { std::size_t index{row->index()}; mask.extend(false, index - mask.size()); mask.extend(true); - this->addRowDerivatives(encodedRow, splitDerivativesAccumulator); + this->addRowDerivatives(encodedRow, perSplitDerivatives); } } }, std::make_pair(core::CPackedBitVector{}, - CSplitDerivativesAccumulator{m_CandidateSplits, m_NumberLossParameters})), + CPerSplitDerivatives{m_CandidateSplits, m_NumberLossParameters})), &parentRowMask); - auto& state = result.first; - for (auto& mask_ : state) { + for (auto& mask_ : result.first) { auto& mask = mask_.s_FunctionState.first; mask.extend(false, parentRowMask.size() - mask.size()); } - m_RowMask = std::move(state[0].s_FunctionState.first); - CSplitDerivativesAccumulator derivatives{std::move(state[0].s_FunctionState.second)}; - for (std::size_t i = 1; 
i < state.size(); ++i) { - m_RowMask |= state[i].s_FunctionState.first; - derivatives.merge(state[i].s_FunctionState.second); + m_RowMask = std::move(result.first[0].s_FunctionState.first); + m_Derivatives = std::move(result.first[0].s_FunctionState.second); + for (std::size_t i = 1; i < result.first.size(); ++i) { + m_RowMask |= result.first[i].s_FunctionState.first; + m_Derivatives.merge(result.first[i].s_FunctionState.second); } - - std::tie(m_Derivatives, m_MissingDerivatives) = derivatives.read(); + m_Derivatives.remapCurvature(); } -void CBoostedTreeLeafNodeStatistics::addRowDerivatives( - const CEncodedDataFrameRowRef& row, - CSplitDerivativesAccumulator& splitAggregateDerivatives) const { +void CBoostedTreeLeafNodeStatistics::addRowDerivatives(const CEncodedDataFrameRowRef& row, + CPerSplitDerivatives& perSplitDerivatives) const { const TRowRef& unencodedRow{row.unencodedRow()}; auto gradient = readLossGradient(unencodedRow, m_NumberInputColumns, m_NumberLossParameters); auto curvature = readLossCurvature(unencodedRow, m_NumberInputColumns, m_NumberLossParameters); - - for (std::size_t i = 0; i < m_CandidateSplits.size(); ++i) { - double featureValue{row[i]}; + for (std::size_t feature = 0; feature < m_CandidateSplits.size(); ++feature) { + double featureValue{row[feature]}; if (CDataFrameUtils::isMissing(featureValue)) { - splitAggregateDerivatives.addMissingDerivatives(i, gradient, curvature); + perSplitDerivatives.addMissingDerivatives(feature, gradient, curvature); } else { - std::ptrdiff_t j{m_CandidateSplits[i].upperBound(featureValue)}; - splitAggregateDerivatives.addDerivatives(i, j, gradient, curvature); + std::ptrdiff_t split{m_CandidateSplits[feature].upperBound(featureValue)}; + perSplitDerivatives.addDerivatives(feature, split, gradient, curvature); } } } @@ -351,63 +279,72 @@ CBoostedTreeLeafNodeStatistics::computeBestSplitStatistics(const TRegularization // // where w^* = arg\min_w{ L(w) }. - using TMinimumLoss = - std::function; + using TDoubleVector = CDenseVector; + using TDoubleMatrix = CDenseMatrix; + using TMinimumLoss = std::function; TMinimumLoss minimumLoss; - + double lambda{regularization.leafWeightPenaltyMultiplier()}; if (m_NumberLossParameters == 1) { - minimumLoss = [lambda](const TDoubleVector& g, const TDoubleMatrix& h) -> double { + minimumLoss = [&](const TDoubleVector& g, const TDoubleMatrix& h) -> double { return CTools::pow2(g(0)) / (h(0, 0) + lambda); }; } else { - // TODO this turns out to be extremely expensive even when m_NumberLossParameters - // is one. I'm not sure why yet. Maybe solving in-place would help. - minimumLoss = [lambda](const TDoubleVector& g, const TDoubleMatrix& h) -> double { + // TODO this turns out to be extremely expensive even when g and h are scalar. + // I'm not sure why yet. Maybe solving in-place would help. 
+ minimumLoss = [&](const TDoubleVector& g, const TDoubleMatrix& h) -> double { return g.transpose() * (h + lambda * TDoubleMatrix::Identity(h.rows(), h.cols())) - .selfadjointView() + .selfadjointView() .ldlt() .solve(g); }; } - for (auto i : featureBag) { - std::size_t c{m_MissingDerivatives[i].s_Count}; - TDoubleVector g{m_MissingDerivatives[i].s_Gradient}; - TDoubleMatrix h{m_MissingDerivatives[i].s_Curvature}; - for (const auto& derivatives : m_Derivatives[i]) { - c += derivatives.s_Count; - g += derivatives.s_Gradient; - h += derivatives.s_Curvature; + for (auto feature : featureBag) { + std::size_t c{m_Derivatives.missingCount(feature)}; + TDoubleVector g{m_Derivatives.missingGradient(feature)}; + TDoubleMatrix h{m_Derivatives.missingCurvature(feature)}; + for (const auto& derivatives : m_Derivatives.derivatives(feature)) { + c += derivatives.count(); + g += derivatives.gradient(); + h += derivatives.curvature(); } - std::size_t cl[]{m_MissingDerivatives[i].s_Count, 0}; - TDoubleVector gl[]{m_MissingDerivatives[i].s_Gradient, + std::size_t cl[]{m_Derivatives.missingCount(feature), 0}; + TDoubleVector gl[]{m_Derivatives.missingGradient(feature), TDoubleVector::Zero(g.rows())}; - TDoubleMatrix hl[]{m_MissingDerivatives[i].s_Curvature, + TDoubleMatrix hl[]{m_Derivatives.missingCurvature(feature), TDoubleMatrix::Zero(h.rows(), h.cols())}; - TDoubleVector gr[]{g - m_MissingDerivatives[i].s_Gradient, g}; - TDoubleMatrix hr[]{h - m_MissingDerivatives[i].s_Curvature, h}; + TDoubleVector gr[]{g - m_Derivatives.missingGradient(feature), g}; + TDoubleMatrix hr[]{h - m_Derivatives.missingCurvature(feature), h}; double maximumGain{-INF}; double splitAt{-INF}; bool leftChildHasFewerRows{true}; bool assignMissingToLeft{true}; + std::size_t size{m_Derivatives.derivatives(feature).size()}; + + for (std::size_t split = 0; split + 1 < size; ++split) { - for (std::size_t j = 0; j + 1 < m_Derivatives[i].size(); ++j) { + std::size_t count{m_Derivatives.count(feature, split)}; + const TMemoryMappedDoubleVector& gradient{m_Derivatives.gradient(feature, split)}; + const TMemoryMappedDoubleMatrix& curvature{m_Derivatives.curvature(feature, split)}; - cl[ASSIGN_MISSING_TO_LEFT] += m_Derivatives[i][j].s_Count; - gl[ASSIGN_MISSING_TO_LEFT] += m_Derivatives[i][j].s_Gradient; - gr[ASSIGN_MISSING_TO_LEFT] -= m_Derivatives[i][j].s_Gradient; - hl[ASSIGN_MISSING_TO_LEFT] += m_Derivatives[i][j].s_Curvature; - hr[ASSIGN_MISSING_TO_LEFT] -= m_Derivatives[i][j].s_Curvature; + if (m_Derivatives.count(feature, split) == 0) { + continue; + } - cl[ASSIGN_MISSING_TO_RIGHT] += m_Derivatives[i][j].s_Count; - gl[ASSIGN_MISSING_TO_RIGHT] += m_Derivatives[i][j].s_Gradient; - gr[ASSIGN_MISSING_TO_RIGHT] -= m_Derivatives[i][j].s_Gradient; - hl[ASSIGN_MISSING_TO_RIGHT] += m_Derivatives[i][j].s_Curvature; - hr[ASSIGN_MISSING_TO_RIGHT] -= m_Derivatives[i][j].s_Curvature; + cl[ASSIGN_MISSING_TO_LEFT] += count; + gl[ASSIGN_MISSING_TO_LEFT] += gradient; + gr[ASSIGN_MISSING_TO_LEFT] -= gradient; + hl[ASSIGN_MISSING_TO_LEFT] += curvature; + hr[ASSIGN_MISSING_TO_LEFT] -= curvature; + cl[ASSIGN_MISSING_TO_RIGHT] += count; + gl[ASSIGN_MISSING_TO_RIGHT] += gradient; + gr[ASSIGN_MISSING_TO_RIGHT] -= gradient; + hl[ASSIGN_MISSING_TO_RIGHT] += curvature; + hr[ASSIGN_MISSING_TO_RIGHT] -= curvature; double gain[2]; gain[ASSIGN_MISSING_TO_LEFT] = @@ -419,13 +356,13 @@ CBoostedTreeLeafNodeStatistics::computeBestSplitStatistics(const TRegularization if (gain[ASSIGN_MISSING_TO_LEFT] > maximumGain) { maximumGain = gain[ASSIGN_MISSING_TO_LEFT]; - splitAt 
= m_CandidateSplits[i][j]; + splitAt = m_CandidateSplits[feature][split]; leftChildHasFewerRows = (2 * cl[ASSIGN_MISSING_TO_LEFT] < c); assignMissingToLeft = true; } if (gain[ASSIGN_MISSING_TO_RIGHT] > maximumGain) { maximumGain = gain[ASSIGN_MISSING_TO_RIGHT]; - splitAt = m_CandidateSplits[i][j]; + splitAt = m_CandidateSplits[feature][split]; leftChildHasFewerRows = (2 * cl[ASSIGN_MISSING_TO_RIGHT] < c); assignMissingToLeft = false; } @@ -440,7 +377,7 @@ CBoostedTreeLeafNodeStatistics::computeBestSplitStatistics(const TRegularization SSplitStatistics candidate{gain, h.trace() / static_cast(m_NumberLossParameters), - i, + feature, splitAt, leftChildHasFewerRows, assignMissingToLeft}; diff --git a/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc b/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc index 5c7b02fd79..fc84057a24 100644 --- a/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc +++ b/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc @@ -31,10 +31,8 @@ using TMatrixVec = std::vector; using TMatrixVecVec = std::vector; using TImmutableRadixSet = maths::CBoostedTreeLeafNodeStatistics::TImmutableRadixSet; using TImmutableRadixSetVec = maths::CBoostedTreeLeafNodeStatistics::TImmutableRadixSetVec; -using TDerivativesVec = maths::CBoostedTreeLeafNodeStatistics::TDerivativesVec; -using TDerivativesVecVec = maths::CBoostedTreeLeafNodeStatistics::TDerivativesVecVec; -using TDerivativesAccumulator = maths::CBoostedTreeLeafNodeStatistics::CDerivativesAccumulator; -using TSplitDerivativesAccumulator = maths::CBoostedTreeLeafNodeStatistics::CSplitDerivativesAccumulator; +using TDerivatives = maths::CBoostedTreeLeafNodeStatistics::CDerivatives; +using TPerSplitDerivatives = maths::CBoostedTreeLeafNodeStatistics::CPerSplitDerivatives; namespace { @@ -59,15 +57,14 @@ TMatrix rowMajorHessian(std::size_t n, const maths::CMemoryMappedDenseVector& return result; } -void testDerivativesAccumulatorFor(std::size_t numberParameters) { +void testDerivativesFor(std::size_t numberParameters) { LOG_DEBUG(<< "Testing " << numberParameters << " parameters"); test::CRandomNumbers rng; std::size_t numberGradients{numberParameters}; - std::size_t numberCurvatures{ - maths::boosted_tree_detail::lossHessianStoredSize(numberParameters)}; + std::size_t numberCurvatures{numberParameters * (numberParameters + 1) / 2}; TDoubleVecVec gradients(numberGradients); TDoubleVecVec curvatures(numberCurvatures); @@ -78,10 +75,10 @@ void testDerivativesAccumulatorFor(std::size_t numberParameters) { rng.generateUniformSamples(0.1, 0.5, 20, curvatures[i]); } - TDoubleVec storage1(numberGradients + numberCurvatures, 0.0); - auto totalGradient1 = makeGradient(&storage1[0], numberGradients); - auto totalCurvature1 = makeCurvature(&storage1[numberGradients], numberCurvatures); - TDerivativesAccumulator accumulator1{totalGradient1, totalCurvature1}; + LOG_DEBUG(<< "Accumulate"); + + TDoubleVec storage1(numberGradients * (numberGradients + 1), 0.0); + TDerivatives derivatives1{numberParameters, &storage1[0]}; for (std::size_t j = 0; j < 10; ++j) { TFloatVec storage; @@ -93,25 +90,28 @@ void testDerivativesAccumulatorFor(std::size_t numberParameters) { } auto gradient = makeGradient(&storage[0], numberGradients); auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); - accumulator1.add(1, gradient, curvature); + derivatives1.add(1, gradient, curvature); } + derivatives1.remapCurvature(); - BOOST_REQUIRE_EQUAL(10, accumulator1.count()); + BOOST_REQUIRE_EQUAL(10, derivatives1.count()); for 
(std::size_t i = 0; i < numberGradients; ++i) { BOOST_REQUIRE_CLOSE( std::accumulate(gradients[i].begin(), gradients[i].begin() + 10, 0.0), - accumulator1.gradient()(i), 1e-4); + derivatives1.gradient()(i), 1e-4); } - for (std::size_t i = 0; i < numberCurvatures; ++i) { - BOOST_REQUIRE_CLOSE(std::accumulate(curvatures[i].begin(), - curvatures[i].begin() + 10, 0.0), - accumulator1.curvature()(i), 1e-4); + for (std::size_t j = 0, k = 0; j < numberGradients; ++j) { + for (std::size_t i = j; i < numberGradients; ++i, ++k) { + BOOST_REQUIRE_CLOSE(std::accumulate(curvatures[k].begin(), + curvatures[k].begin() + 10, 0.0), + derivatives1.curvature()(i, j), 1e-4); + } } - TDoubleVec storage2(numberGradients + numberCurvatures, 0.0); - auto totalGradient2 = makeGradient(&storage2[0], numberGradients); - auto totalCurvature2 = makeCurvature(&storage2[numberGradients], numberCurvatures); - TDerivativesAccumulator accumulator2{totalGradient2, totalCurvature2}; + LOG_DEBUG(<< "Merge"); + + TDoubleVec storage2(numberGradients * (numberGradients + 1), 0.0); + TDerivatives derivatives2{numberParameters, &storage2[0]}; for (std::size_t j = 10; j < 20; ++j) { TFloatVec storage; @@ -123,34 +123,48 @@ void testDerivativesAccumulatorFor(std::size_t numberParameters) { } auto gradient = makeGradient(&storage[0], numberGradients); auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); - accumulator2.add(1, gradient, curvature); + derivatives2.add(1, gradient, curvature); } + derivatives2.remapCurvature(); + + derivatives1.merge(derivatives2); - BOOST_REQUIRE_EQUAL(10, accumulator2.count()); + BOOST_REQUIRE_EQUAL(20, derivatives1.count()); for (std::size_t i = 0; i < numberGradients; ++i) { - BOOST_REQUIRE_CLOSE( - std::accumulate(gradients[i].begin() + 10, gradients[i].end(), 0.0), - accumulator2.gradient()(i), 1e-4); + BOOST_REQUIRE_CLOSE(std::accumulate(gradients[i].begin(), gradients[i].end(), 0.0), + derivatives1.gradient()(i), 1e-4); } - for (std::size_t i = 0; i < numberCurvatures; ++i) { - BOOST_REQUIRE_CLOSE( - std::accumulate(curvatures[i].begin() + 10, curvatures[i].end(), 0.0), - accumulator2.curvature()(i), 1e-4); + for (std::size_t j = 0, k = 0; j < numberGradients; ++j) { + for (std::size_t i = j; i < numberGradients; ++i, ++k) { + BOOST_REQUIRE_CLOSE( + std::accumulate(curvatures[k].begin(), curvatures[k].end(), 0.0), + derivatives1.curvature()(i, j), 1e-4); + } } - accumulator1.merge(accumulator2); - BOOST_REQUIRE_EQUAL(20, accumulator1.count()); + LOG_DEBUG(<< "Difference"); + + TDoubleVec storage3(numberGradients * (numberGradients + 1), 0.0); + TDerivatives derivatives3{numberParameters, &storage2[0]}; + + derivatives3.assignDifference(derivatives1, derivatives2); + + BOOST_REQUIRE_EQUAL(10, derivatives3.count()); for (std::size_t i = 0; i < numberGradients; ++i) { - BOOST_REQUIRE_CLOSE(std::accumulate(gradients[i].begin(), gradients[i].end(), 0.0), - accumulator1.gradient()(i), 1e-4); + BOOST_REQUIRE_CLOSE( + std::accumulate(gradients[i].begin(), gradients[i].begin() + 10, 0.0), + derivatives3.gradient()(i), 1e-4); } - for (std::size_t i = 0; i < numberCurvatures; ++i) { - BOOST_REQUIRE_CLOSE(std::accumulate(curvatures[i].begin(), curvatures[i].end(), 0.0), - accumulator1.curvature()(i), 1e-4); + for (std::size_t j = 0, k = 0; j < numberGradients; ++j) { + for (std::size_t i = j; i < numberGradients; ++i, ++k) { + BOOST_REQUIRE_CLOSE(std::accumulate(curvatures[k].begin(), + curvatures[k].begin() + 10, 0.0), + derivatives3.curvature()(i, j), 1e-4); + } } } -void 
testSplitDerivativesAccumulatorFor(std::size_t numberParameters) { +void testPerSplitDerivativesFor(std::size_t numberParameters) { LOG_DEBUG(<< "Testing " << numberParameters << " parameters"); @@ -162,8 +176,7 @@ void testSplitDerivativesAccumulatorFor(std::size_t numberParameters) { std::size_t numberSamples{20}; std::size_t numberGradients{numberParameters}; - std::size_t numberCurvatures{ - maths::boosted_tree_detail::lossHessianStoredSize(numberParameters)}; + std::size_t numberCurvatures{numberParameters * (numberParameters + 1) / 2}; for (std::size_t t = 0; t < 100; ++t) { TSizeVec features; @@ -189,28 +202,30 @@ void testSplitDerivativesAccumulatorFor(std::size_t numberParameters) { for (std::size_t i = 0; i < 2; ++i) { expectedCounts[i].resize(featureSplits[i].size() + 1, 0); expectedGradients[i].resize(featureSplits[i].size() + 1, - TVector::Zero(numberParameters)); + TVector::Zero(numberParameters)); expectedCurvatures[i].resize(featureSplits[i].size() + 1, - TMatrix::Zero(numberParameters, numberParameters)); + TMatrix::Zero(numberParameters, numberParameters)); } - auto addDerivatives = [&](TSplitDerivativesAccumulator& accumulator) { + auto addDerivatives = [&](TPerSplitDerivatives& derivatives) { for (std::size_t i = 0, j = 0, k = 0; i < numberSamples; - ++i, j += numberGradients, k += numberCurvatures) { + ++i, j += numberGradients, k += numberCurvatures) { TFloatVec storage; storage.insert(storage.end(), &gradients[j], &gradients[j + numberGradients]); - storage.insert(storage.end(), &curvatures[j], &curvatures[k + numberCurvatures]); + storage.insert(storage.end(), &curvatures[j], + &curvatures[k + numberCurvatures]); auto gradient = makeGradient(&storage[0], numberGradients); auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); if (uniform01[i] < 0.1) { - accumulator.addMissingDerivatives(features[i], gradient, curvature); + derivatives.addMissingDerivatives(features[i], gradient, curvature); ++expectedMissingCounts[features[i]]; expectedMissingGradients[features[i]] += gradient; - expectedMissingCurvatures[features[i]] += rowMajorHessian(numberParameters, curvature); + expectedMissingCurvatures[features[i]] += + rowMajorHessian(numberParameters, curvature); } else { - accumulator.addDerivatives(features[i], splits[features[i]][i], + derivatives.addDerivatives(features[i], splits[features[i]][i], gradient, curvature); ++expectedCounts[features[i]][splits[features[i]][i]]; expectedGradients[features[i]][splits[features[i]][i]] += gradient; @@ -218,41 +233,36 @@ void testSplitDerivativesAccumulatorFor(std::size_t numberParameters) { rowMajorHessian(numberParameters, curvature); } } + derivatives.remapCurvature(); }; - auto validate = [&](const TDerivativesVecVec& derivatives, - const TDerivativesVec& missingDerivatives) { - BOOST_REQUIRE_EQUAL(expectedCounts.size(), derivatives.size()); + auto validate = [&](const TPerSplitDerivatives& derivatives) { for (std::size_t i = 0; i < expectedCounts.size(); ++i) { - BOOST_REQUIRE_EQUAL(expectedCounts[i].size(), derivatives[i].size()); for (std::size_t j = 0; j < expectedGradients[i].size(); ++j) { TMatrix curvature{ - derivatives[i][j].s_Curvature.selfadjointView()}; - BOOST_REQUIRE_EQUAL(expectedCounts[i][j], derivatives[i][j].s_Count); - BOOST_REQUIRE_EQUAL(expectedGradients[i][j], derivatives[i][j].s_Gradient); + derivatives.curvature(i, j).selfadjointView()}; + BOOST_REQUIRE_EQUAL(expectedCounts[i][j], derivatives.count(i, j)); + BOOST_REQUIRE_EQUAL(expectedGradients[i][j], + derivatives.gradient(i, 
j)); BOOST_REQUIRE_EQUAL(expectedCurvatures[i][j], curvature); } } - BOOST_REQUIRE_EQUAL(expectedMissingCounts.size(), missingDerivatives.size()); for (std::size_t i = 0; i < expectedMissingCounts.size(); ++i) { TMatrix curvature{ - missingDerivatives[i].s_Curvature.selfadjointView()}; - BOOST_REQUIRE_EQUAL(expectedMissingCounts[i], missingDerivatives[i].s_Count); - BOOST_REQUIRE_EQUAL(expectedMissingGradients[i], missingDerivatives[i].s_Gradient); + derivatives.missingCurvature(i).selfadjointView()}; + BOOST_REQUIRE_EQUAL(expectedMissingCounts[i], derivatives.missingCount(i)); + BOOST_REQUIRE_EQUAL(expectedMissingGradients[i], + derivatives.missingGradient(i)); BOOST_REQUIRE_EQUAL(expectedMissingCurvatures[i], curvature); } }; LOG_TRACE(<< "Test accumulation"); - TSplitDerivativesAccumulator accumulator1{featureSplits, numberParameters}; - addDerivatives(accumulator1); + TPerSplitDerivatives derivatives1{featureSplits, numberParameters}; - TDerivativesVecVec derivatives; - TDerivativesVec missingDerivatives; - std::tie(derivatives, missingDerivatives) = accumulator1.read(); - - validate(derivatives, missingDerivatives); + addDerivatives(derivatives1); + validate(derivatives1); LOG_TRACE(<< "Test merge"); @@ -260,38 +270,36 @@ void testSplitDerivativesAccumulatorFor(std::size_t numberParameters) { rng.generateUniformSamples(-1.5, 1.0, numberSamples * numberGradients, gradients); rng.generateUniformSamples(0.1, 0.5, numberSamples * numberCurvatures, curvatures); - TSplitDerivativesAccumulator accumulator2{featureSplits, numberParameters}; - addDerivatives(accumulator2); - accumulator1.merge(accumulator2); - - std::tie(derivatives, missingDerivatives) = accumulator1.read(); + TPerSplitDerivatives derivatives2{featureSplits, numberParameters}; - validate(derivatives, missingDerivatives); + addDerivatives(derivatives2); + derivatives1.merge(derivatives2); + validate(derivatives1); LOG_TRACE(<< "Test copy"); - TSplitDerivativesAccumulator accumulator3{accumulator1}; - BOOST_REQUIRE_EQUAL(accumulator1.checksum(), accumulator3.checksum()); + TPerSplitDerivatives derivatives3{derivatives1}; + BOOST_REQUIRE_EQUAL(derivatives1.checksum(), derivatives3.checksum()); } } } -BOOST_AUTO_TEST_CASE(testDerivativesAccumulator) { +BOOST_AUTO_TEST_CASE(testDerivatives) { // Test individual derivatives accumulation for single and multi parameter // loss functions. - testDerivativesAccumulatorFor(1 /*loss function parameter*/); - testDerivativesAccumulatorFor(3 /*loss function parameters*/); + testDerivativesFor(1 /*loss function parameter*/); + testDerivativesFor(3 /*loss function parameters*/); } -BOOST_AUTO_TEST_CASE(testSplitDerivativesAccumulator) { +BOOST_AUTO_TEST_CASE(testPerSplitDerivatives) { // Test per split derivatives accumulation for single and multi parameter // loss functions. - testSplitDerivativesAccumulatorFor(1 /*loss function parameter*/); - testSplitDerivativesAccumulatorFor(3 /*loss function parameters*/); + testPerSplitDerivativesFor(1 /*loss function parameter*/); + testPerSplitDerivativesFor(3 /*loss function parameters*/); } BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/maths/unittest/Makefile b/lib/maths/unittest/Makefile index f84bb0e1e2..a6dfab62a6 100644 --- a/lib/maths/unittest/Makefile +++ b/lib/maths/unittest/Makefile @@ -1,63 +1,111 @@ # -#Copyright Elasticsearch B.V.and / or licensed to Elasticsearch B.V.under one -# or more contributor license agreements.Licensed under the Elastic License; -#you may not use this file except in compliance with the Elastic License. 
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License; +# you may not use this file except in compliance with the Elastic License. # -include $(CPP_SRC_HOME) / mk / - defines.mk +include $(CPP_SRC_HOME)/mk/defines.mk - TARGET = ml_test$(EXE_EXT) +TARGET=ml_test$(EXE_EXT) - USE_BOOST = 1 USE_BOOST_FILESYSTEM_LIBS = 1 USE_BOOST_TEST_LIBS = 1 USE_RAPIDJSON = 1 USE_EIGEN = 1 +USE_BOOST=1 +USE_BOOST_FILESYSTEM_LIBS=1 +USE_BOOST_TEST_LIBS=1 +USE_RAPIDJSON=1 +USE_EIGEN=1 - LIBS : = $(LIB_ML_MATHS) +LIBS:=$(LIB_ML_MATHS) - all - : build +all: build - SRCS = - Main.cc CAgglomerativeClustererTest.cc - CAssignmentTest.cc CBasicStatisticsTest.cc CBayesianOptimisationTest - .cc CBjkstUniqueValuesTest.cc CBoostedTreeLeafNodeStatisticsTest - .cc CBoostedTreeTest.cc CBootstrapClustererTest - .cc CBoundingBoxTest.cc CCalendarComponentAdaptiveBucketingTest - .cc CCalendarCyclicTestTest.cc CCalendarFeatureTest - .cc CCategoricalToolsTest.cc CChecksumTest.cc - CClustererTest.cc CCountMinSketchTest.cc CDataFrameCategoryEncoderTest - .cc CDataFrameUtilsTest.cc CDecayRateControllerTest - .cc CEntropySketchTest.cc - CEqualWithToleranceTest.cc CExpandingWindowTest.cc - CForecastTest.cc CGammaRateConjugateTest.cc CInformationCriteriaTest - .cc CIntegerToolsTest.cc CIntegrationTest.cc CKdTreeTest - .cc CKMeansTest.cc CKMeansOnlineTest.cc - CKMostCorrelatedTest.cc CLassoLogisticRegressionTest.cc - CLbfgsTest.cc CLeastSquaresOnlineRegressionTest.cc CLinearAlgebraTest - .cc CLogNormalMeanPrecConjugateTest.cc CLogTDistributionTest - .cc CMathsFuncsTest.cc CMathsMemoryTest.cc CMicTest - .cc CMixtureDistributionTest.cc CModelTest.cc CMultimodalPriorTest - .cc CMultinomialConjugateTest.cc - CMultivariateConstantPriorTest.cc - CMultivariateMultimodalPriorTest.cc CMultivariateNormalConjugateTest - .cc CMultivariateOneOfNPriorTest.cc CNaiveBayesTest - .cc CNaturalBreaksClassifierTest.cc - CNormalMeanPrecConjugateTest.cc COneOfNPriorTest.cc - COrderingsTest.cc COrdinalTest.cc COrthogonaliserTest.cc COutliersTest - .cc CPeriodicityHypothesisTestsTest.cc CPoissonMeanConjugateTest - .cc CPriorTest.cc CPRNGTest.cc CProbabilityAggregatorsTest - .cc CProbabilityCalibratorTest.cc - CQDigestTest.cc CQuantileSketchTest.cc - CRadialBasisFunctionTest.cc CRandomizedPeriodicityTestTest - .cc CRandomProjectionClustererTest.cc - CSamplingTest.cc CSeasonalComponentTest - .cc CSeasonalComponentAdaptiveBucketingTest.cc CSeasonalTimeTest - .cc CSetToolsTest.cc CSignalTest.cc CSolversTest.cc - CSplineTest.cc CStatisticalTestsTest.cc CTimeSeriesChangeDetectorTest - .cc CTimeSeriesDecompositionTest.cc - CTimeSeriesModelTest.cc CTimeSeriesMultibucketFeaturesTest - .cc CTimeSeriesSegmentationTest.cc - CToolsTest.cc CTreeShapFeatureImportanceTest.cc - CTrendComponentTest.cc CXMeansOnlineTest.cc - CXMeansOnline1dTest.cc CXMeansTest.cc TestUtils.cc +SRCS=\ + Main.cc \ + CAgglomerativeClustererTest.cc \ + CAssignmentTest.cc \ + CBasicStatisticsTest.cc \ + CBayesianOptimisationTest.cc \ + CBjkstUniqueValuesTest.cc \ + CBoostedTreeLeafNodeStatisticsTest.cc \ + CBoostedTreeTest.cc \ + CBootstrapClustererTest.cc \ + CBoundingBoxTest.cc \ + CCalendarComponentAdaptiveBucketingTest.cc \ + CCalendarCyclicTestTest.cc \ + CCalendarFeatureTest.cc \ + CCategoricalToolsTest.cc \ + CChecksumTest.cc \ + CClustererTest.cc \ + CCountMinSketchTest.cc \ + CDataFrameCategoryEncoderTest.cc \ + CDataFrameUtilsTest.cc \ + CDecayRateControllerTest.cc \ + CEntropySketchTest.cc 
\ + CEqualWithToleranceTest.cc \ + CExpandingWindowTest.cc \ + CForecastTest.cc \ + CGammaRateConjugateTest.cc \ + CInformationCriteriaTest.cc \ + CIntegerToolsTest.cc \ + CIntegrationTest.cc \ + CKdTreeTest.cc \ + CKMeansTest.cc \ + CKMeansOnlineTest.cc \ + CKMostCorrelatedTest.cc \ + CLassoLogisticRegressionTest.cc \ + CLbfgsTest.cc \ + CLeastSquaresOnlineRegressionTest.cc \ + CLinearAlgebraTest.cc \ + CLogNormalMeanPrecConjugateTest.cc \ + CLogTDistributionTest.cc \ + CMathsFuncsTest.cc \ + CMathsMemoryTest.cc \ + CMicTest.cc \ + CMixtureDistributionTest.cc \ + CModelTest.cc \ + CMultimodalPriorTest.cc \ + CMultinomialConjugateTest.cc \ + CMultivariateConstantPriorTest.cc \ + CMultivariateMultimodalPriorTest.cc \ + CMultivariateNormalConjugateTest.cc \ + CMultivariateOneOfNPriorTest.cc \ + CNaiveBayesTest.cc \ + CNaturalBreaksClassifierTest.cc \ + CNormalMeanPrecConjugateTest.cc \ + COneOfNPriorTest.cc \ + COrderingsTest.cc \ + COrdinalTest.cc \ + COrthogonaliserTest.cc \ + COutliersTest.cc \ + CPeriodicityHypothesisTestsTest.cc \ + CPoissonMeanConjugateTest.cc \ + CPriorTest.cc \ + CPRNGTest.cc \ + CProbabilityAggregatorsTest.cc \ + CProbabilityCalibratorTest.cc \ + CQDigestTest.cc \ + CQuantileSketchTest.cc \ + CRadialBasisFunctionTest.cc \ + CRandomizedPeriodicityTestTest.cc \ + CRandomProjectionClustererTest.cc \ + CSamplingTest.cc \ + CSeasonalComponentTest.cc \ + CSeasonalComponentAdaptiveBucketingTest.cc \ + CSeasonalTimeTest.cc \ + CSetToolsTest.cc \ + CSignalTest.cc \ + CSolversTest.cc \ + CSplineTest.cc \ + CStatisticalTestsTest.cc \ + CTimeSeriesChangeDetectorTest.cc \ + CTimeSeriesDecompositionTest.cc \ + CTimeSeriesModelTest.cc \ + CTimeSeriesMultibucketFeaturesTest.cc \ + CTimeSeriesSegmentationTest.cc \ + CToolsTest.cc \ + CTreeShapFeatureImportanceTest.cc \ + CTrendComponentTest.cc \ + CXMeansOnlineTest.cc \ + CXMeansOnline1dTest.cc \ + CXMeansTest.cc \ + TestUtils.cc \ - include $(CPP_SRC_HOME) / - mk / stdboosttest.mk +include $(CPP_SRC_HOME)/mk/stdboosttest.mk From 8240a0af798c9bf01536a66ddb64bc682316a27d Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 11 Feb 2020 10:56:34 +0000 Subject: [PATCH 03/13] Optimize computing derivatives difference --- .../maths/CBoostedTreeLeafNodeStatistics.h | 59 +++++++++---------- lib/maths/CBoostedTreeLeafNodeStatistics.cc | 17 +++--- .../CBoostedTreeLeafNodeStatisticsTest.cc | 15 ++--- 3 files changed, 43 insertions(+), 48 deletions(-) diff --git a/include/maths/CBoostedTreeLeafNodeStatistics.h b/include/maths/CBoostedTreeLeafNodeStatistics.h index 94c729ffec..ba07dd5cdc 100644 --- a/include/maths/CBoostedTreeLeafNodeStatistics.h +++ b/include/maths/CBoostedTreeLeafNodeStatistics.h @@ -96,25 +96,18 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { } //! Compute the accumulation of both collections of derivatives. - void merge(const CDerivatives& other) { + void add(const CDerivatives& other) { m_Count += other.m_Count; m_Gradient += other.m_Gradient; m_Curvature += other.m_Curvature; } //! Set to the difference of \p lhs and \p rhs. - void assignDifference(const CDerivatives& lhs, const CDerivatives& rhs) { - // Numeric errors mean that it's possible the sum curvature for a candidate - // split is identically zero while the gradient is epsilon. This can cause - // the node gain to appear infinite (when there is no weight regularisation) - // which in turns causes problems initialising the region we search for optimal - // hyperparameter values. 
We can safely force the gradient and curvature to - // be zero if we detect that the count is zero. - std::size_t count{lhs.m_Count - rhs.m_Count}; - if (count > 0) { - m_Count = count; - m_Gradient.array() = lhs.m_Gradient - rhs.m_Gradient; - m_Curvature.array() = lhs.m_Curvature - rhs.m_Curvature; + void subtract(const CDerivatives& rhs) { + m_Count -= rhs.m_Count; + if (m_Count > 0) { + m_Gradient -= rhs.m_Gradient; + m_Curvature -= rhs.m_Curvature; // None of our loss functions have negative curvature therefore we // shouldn't allow the cumulative curvature to be negative either. // In this case we force it to be a v.small multiple of the magnitude @@ -124,6 +117,16 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { SMALLEST_RELATIVE_CURVATURE * std::fabs(m_Gradient(i))); } + } else { + // Numeric errors mean that it's possible the sum curvature for a + // candidate split is identically zero while the gradient is epsilon. + // This can cause the node gain to appear infinite (when there is no + // weight regularisation) which in turns causes problems initialising + // the region we search for optimal hyperparameter values. We can + // safely force the gradient and curvature to be zero if we detect + // that the count is zero. + m_Gradient.setZero(); + m_Curvature.setZero(); } } @@ -166,7 +169,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { using TDerivativesVec = std::vector; public: - explicit CPerSplitDerivatives(std::size_t numberLossParameters = 0) + explicit CPerSplitDerivatives(std::size_t numberLossParameters = 0) : m_NumberLossParameters{numberLossParameters} {} CPerSplitDerivatives(const TImmutableRadixSetVec& candidateSplits, std::size_t numberLossParameters) @@ -176,7 +179,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { CPerSplitDerivatives(const CPerSplitDerivatives& other) : m_NumberLossParameters{other.m_NumberLossParameters} { this->map(other.m_Derivatives); - this->merge(other); + this->add(other); } CPerSplitDerivatives(CPerSplitDerivatives&&) = default; @@ -236,29 +239,23 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { } //! Compute the accumulation of both collections of per split derivatives. - void merge(const CPerSplitDerivatives& other) { + void add(const CPerSplitDerivatives& other) { for (std::size_t i = 0; i < other.m_Derivatives.size(); ++i) { for (std::size_t j = 0; j < other.m_Derivatives[i].size(); ++j) { - m_Derivatives[i][j].merge(other.m_Derivatives[i][j]); + m_Derivatives[i][j].add(other.m_Derivatives[i][j]); } - m_MissingDerivatives[i].merge(other.m_MissingDerivatives[i]); + m_MissingDerivatives[i].add(other.m_MissingDerivatives[i]); } } - //! Set to the difference of \p lhs and \p rhs. - static CPerSplitDerivatives difference(const CPerSplitDerivatives& lhs, - const CPerSplitDerivatives& rhs) { - CPerSplitDerivatives result{lhs.m_NumberLossParameters}; - result.map(lhs.m_Derivatives); - for (std::size_t i = 0; i < lhs.m_Derivatives.size(); ++i) { - for (std::size_t j = 0; j < lhs.m_Derivatives[i].size(); ++j) { - result.m_Derivatives[i][j].assignDifference( - lhs.m_Derivatives[i][j], rhs.m_Derivatives[i][j]); + //! Subtract \p rhs. 
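        //! \note When a leaf is split, only the child with fewer rows aggregates its
        //! derivatives directly from the data frame; the sibling's derivatives are
        //! then recovered as parent minus child using this method.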
+ void subtract(const CPerSplitDerivatives& rhs) { + for (std::size_t i = 0; i < m_Derivatives.size(); ++i) { + for (std::size_t j = 0; j < m_Derivatives[i].size(); ++j) { + m_Derivatives[i][j].subtract(rhs.m_Derivatives[i][j]); } - result.m_MissingDerivatives[i].assignDifference( - lhs.m_MissingDerivatives[i], rhs.m_MissingDerivatives[i]); + m_MissingDerivatives[i].subtract(rhs.m_MissingDerivatives[i]); } - return result; } //! Remap the accumulated curvature to lower triangle row major format. @@ -374,7 +371,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { //! Only called by split but is public so it's accessible to std::make_shared. CBoostedTreeLeafNodeStatistics(std::size_t id, - const CBoostedTreeLeafNodeStatistics& parent, + CBoostedTreeLeafNodeStatistics&& parent, const CBoostedTreeLeafNodeStatistics& sibling, const TRegularization& regularization, const TSizeVec& featureBag, diff --git a/lib/maths/CBoostedTreeLeafNodeStatistics.cc b/lib/maths/CBoostedTreeLeafNodeStatistics.cc index 203da767a5..d62a6578b7 100644 --- a/lib/maths/CBoostedTreeLeafNodeStatistics.cc +++ b/lib/maths/CBoostedTreeLeafNodeStatistics.cc @@ -69,7 +69,7 @@ CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( std::size_t id, - const CBoostedTreeLeafNodeStatistics& parent, + CBoostedTreeLeafNodeStatistics&& parent, const CBoostedTreeLeafNodeStatistics& sibling, const TRegularization& regularization, const TSizeVec& featureBag, @@ -77,9 +77,9 @@ CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( : m_Id{id}, m_Depth{sibling.m_Depth}, m_NumberInputColumns{sibling.m_NumberInputColumns}, m_NumberLossParameters{sibling.m_NumberLossParameters}, m_CandidateSplits{sibling.m_CandidateSplits}, m_RowMask{std::move(rowMask)}, - m_Derivatives{CPerSplitDerivatives::difference(parent.m_Derivatives, - sibling.m_Derivatives)} { + m_Derivatives{std::move(parent.m_Derivatives)} { + m_Derivatives.subtract(sibling.m_Derivatives); m_BestSplit = this->computeBestSplitStatistics(regularization, featureBag); } @@ -93,6 +93,7 @@ CBoostedTreeLeafNodeStatistics::split(std::size_t leftChildId, const TImmutableRadixSetVec& candidateSplits, const TSizeVec& featureBag, const CBoostedTreeNode& split) { + if (this->leftChildHasFewerRows()) { auto leftChild = std::make_shared( leftChildId, m_NumberInputColumns, m_NumberLossParameters, @@ -101,8 +102,8 @@ CBoostedTreeLeafNodeStatistics::split(std::size_t leftChildId, core::CPackedBitVector rightChildRowMask{m_RowMask}; rightChildRowMask ^= leftChild->rowMask(); auto rightChild = std::make_shared( - rightChildId, *this, *leftChild, regularization, featureBag, - std::move(rightChildRowMask)); + rightChildId, std::move(*this), *leftChild, regularization, + featureBag, std::move(rightChildRowMask)); return std::make_pair(leftChild, rightChild); } @@ -114,7 +115,7 @@ CBoostedTreeLeafNodeStatistics::split(std::size_t leftChildId, core::CPackedBitVector leftChildRowMask{m_RowMask}; leftChildRowMask ^= rightChild->rowMask(); auto leftChild = std::make_shared( - leftChildId, *this, *rightChild, regularization, featureBag, + leftChildId, std::move(*this), *rightChild, regularization, featureBag, std::move(leftChildRowMask)); return std::make_pair(leftChild, rightChild); @@ -189,7 +190,7 @@ void CBoostedTreeLeafNodeStatistics::computeAggregateLossDerivatives( m_Derivatives = std::move(result.first[0].s_FunctionState); for (std::size_t i = 1; i < result.first.size(); ++i) { - 
m_Derivatives.merge(result.first[i].s_FunctionState); + m_Derivatives.add(result.first[i].s_FunctionState); } m_Derivatives.remapCurvature(); } @@ -232,7 +233,7 @@ void CBoostedTreeLeafNodeStatistics::computeRowMaskAndAggregateLossDerivatives( m_Derivatives = std::move(result.first[0].s_FunctionState.second); for (std::size_t i = 1; i < result.first.size(); ++i) { m_RowMask |= result.first[i].s_FunctionState.first; - m_Derivatives.merge(result.first[i].s_FunctionState.second); + m_Derivatives.add(result.first[i].s_FunctionState.second); } m_Derivatives.remapCurvature(); } diff --git a/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc b/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc index fc84057a24..a0a2c28584 100644 --- a/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc +++ b/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc @@ -127,7 +127,7 @@ void testDerivativesFor(std::size_t numberParameters) { } derivatives2.remapCurvature(); - derivatives1.merge(derivatives2); + derivatives1.add(derivatives2); BOOST_REQUIRE_EQUAL(20, derivatives1.count()); for (std::size_t i = 0; i < numberGradients; ++i) { @@ -144,22 +144,19 @@ void testDerivativesFor(std::size_t numberParameters) { LOG_DEBUG(<< "Difference"); - TDoubleVec storage3(numberGradients * (numberGradients + 1), 0.0); - TDerivatives derivatives3{numberParameters, &storage2[0]}; + derivatives1.subtract(derivatives2); - derivatives3.assignDifference(derivatives1, derivatives2); - - BOOST_REQUIRE_EQUAL(10, derivatives3.count()); + BOOST_REQUIRE_EQUAL(10, derivatives1.count()); for (std::size_t i = 0; i < numberGradients; ++i) { BOOST_REQUIRE_CLOSE( std::accumulate(gradients[i].begin(), gradients[i].begin() + 10, 0.0), - derivatives3.gradient()(i), 1e-4); + derivatives1.gradient()(i), 1e-4); } for (std::size_t j = 0, k = 0; j < numberGradients; ++j) { for (std::size_t i = j; i < numberGradients; ++i, ++k) { BOOST_REQUIRE_CLOSE(std::accumulate(curvatures[k].begin(), curvatures[k].begin() + 10, 0.0), - derivatives3.curvature()(i, j), 1e-4); + derivatives1.curvature()(i, j), 1e-4); } } } @@ -273,7 +270,7 @@ void testPerSplitDerivativesFor(std::size_t numberParameters) { TPerSplitDerivatives derivatives2{featureSplits, numberParameters}; addDerivatives(derivatives2); - derivatives1.merge(derivatives2); + derivatives1.add(derivatives2); validate(derivatives1); LOG_TRACE(<< "Test copy"); From 2db432bf6f71c9fd4b408a67ada720b4ca69af2f Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 11 Feb 2020 12:08:17 +0000 Subject: [PATCH 04/13] Couple more performance tweaks --- lib/maths/CBoostedTreeLeafNodeStatistics.cc | 42 +++++++++++++-------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/lib/maths/CBoostedTreeLeafNodeStatistics.cc b/lib/maths/CBoostedTreeLeafNodeStatistics.cc index d62a6578b7..00d2bef730 100644 --- a/lib/maths/CBoostedTreeLeafNodeStatistics.cc +++ b/lib/maths/CBoostedTreeLeafNodeStatistics.cc @@ -303,22 +303,32 @@ CBoostedTreeLeafNodeStatistics::computeBestSplitStatistics(const TRegularization }; } + int d{static_cast(m_NumberLossParameters)}; + TDoubleVector g{d}; + TDoubleMatrix h{d, d}; + TDoubleVector gl[]{TDoubleVector{d}, TDoubleVector{d}}; + TDoubleVector gr[]{TDoubleVector{d}, TDoubleVector{d}}; + TDoubleMatrix hl[]{TDoubleMatrix{d, d}, TDoubleMatrix{d, d}}; + TDoubleMatrix hr[]{TDoubleMatrix{d, d}, TDoubleMatrix{d, d}}; + for (auto feature : featureBag) { std::size_t c{m_Derivatives.missingCount(feature)}; - TDoubleVector g{m_Derivatives.missingGradient(feature)}; - 
TDoubleMatrix h{m_Derivatives.missingCurvature(feature)}; + g = m_Derivatives.missingGradient(feature); + h = m_Derivatives.missingCurvature(feature); for (const auto& derivatives : m_Derivatives.derivatives(feature)) { c += derivatives.count(); g += derivatives.gradient(); h += derivatives.curvature(); } std::size_t cl[]{m_Derivatives.missingCount(feature), 0}; - TDoubleVector gl[]{m_Derivatives.missingGradient(feature), - TDoubleVector::Zero(g.rows())}; - TDoubleMatrix hl[]{m_Derivatives.missingCurvature(feature), - TDoubleMatrix::Zero(h.rows(), h.cols())}; - TDoubleVector gr[]{g - m_Derivatives.missingGradient(feature), g}; - TDoubleMatrix hr[]{h - m_Derivatives.missingCurvature(feature), h}; + gl[ASSIGN_MISSING_TO_LEFT] = m_Derivatives.missingGradient(feature); + gl[ASSIGN_MISSING_TO_RIGHT] = TDoubleVector::Zero(g.rows()); + gr[ASSIGN_MISSING_TO_LEFT] = g - m_Derivatives.missingGradient(feature); + gr[ASSIGN_MISSING_TO_RIGHT] = g; + hl[ASSIGN_MISSING_TO_LEFT] = m_Derivatives.missingCurvature(feature); + hl[ASSIGN_MISSING_TO_RIGHT] = TDoubleMatrix::Zero(h.rows(), h.cols()); + hr[ASSIGN_MISSING_TO_LEFT] = h - m_Derivatives.missingCurvature(feature); + hr[ASSIGN_MISSING_TO_RIGHT] = h; double maximumGain{-INF}; double splitAt{-INF}; @@ -329,22 +339,22 @@ CBoostedTreeLeafNodeStatistics::computeBestSplitStatistics(const TRegularization for (std::size_t split = 0; split + 1 < size; ++split) { std::size_t count{m_Derivatives.count(feature, split)}; - const TMemoryMappedDoubleVector& gradient{m_Derivatives.gradient(feature, split)}; - const TMemoryMappedDoubleMatrix& curvature{m_Derivatives.curvature(feature, split)}; - - if (m_Derivatives.count(feature, split) == 0) { + if (count == 0) { continue; } + const TMemoryMappedDoubleVector& gradient{m_Derivatives.gradient(feature, split)}; + const TMemoryMappedDoubleMatrix& curvature{m_Derivatives.curvature(feature, split)}; + cl[ASSIGN_MISSING_TO_LEFT] += count; - gl[ASSIGN_MISSING_TO_LEFT] += gradient; - gr[ASSIGN_MISSING_TO_LEFT] -= gradient; - hl[ASSIGN_MISSING_TO_LEFT] += curvature; - hr[ASSIGN_MISSING_TO_LEFT] -= curvature; cl[ASSIGN_MISSING_TO_RIGHT] += count; + gl[ASSIGN_MISSING_TO_LEFT] += gradient; gl[ASSIGN_MISSING_TO_RIGHT] += gradient; + gr[ASSIGN_MISSING_TO_LEFT] -= gradient; gr[ASSIGN_MISSING_TO_RIGHT] -= gradient; + hl[ASSIGN_MISSING_TO_LEFT] += curvature; hl[ASSIGN_MISSING_TO_RIGHT] += curvature; + hr[ASSIGN_MISSING_TO_LEFT] -= curvature; hr[ASSIGN_MISSING_TO_RIGHT] -= curvature; double gain[2]; From da33060151d339f61b9b0a48a9da2b0779e9716e Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 11 Feb 2020 15:47:12 +0000 Subject: [PATCH 05/13] Further tweak --- lib/maths/CBoostedTreeLeafNodeStatistics.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/maths/CBoostedTreeLeafNodeStatistics.cc b/lib/maths/CBoostedTreeLeafNodeStatistics.cc index 00d2bef730..8efd0a5a34 100644 --- a/lib/maths/CBoostedTreeLeafNodeStatistics.cc +++ b/lib/maths/CBoostedTreeLeafNodeStatistics.cc @@ -14,8 +14,6 @@ #include #include -#include - namespace ml { namespace maths { using namespace boosted_tree_detail; @@ -284,26 +282,28 @@ CBoostedTreeLeafNodeStatistics::computeBestSplitStatistics(const TRegularization using TDoubleMatrix = CDenseMatrix; using TMinimumLoss = std::function; + int d{static_cast(m_NumberLossParameters)}; + TMinimumLoss minimumLoss; double lambda{regularization.leafWeightPenaltyMultiplier()}; + Eigen::MatrixXd placeholder{d, d}; if (m_NumberLossParameters == 1) { + // There is a large fixed 
overhead for using ldl^t even when g and h are + // scalar so we have special case handling. minimumLoss = [&](const TDoubleVector& g, const TDoubleMatrix& h) -> double { return CTools::pow2(g(0)) / (h(0, 0) + lambda); }; } else { - // TODO this turns out to be extremely expensive even when g and h are scalar. - // I'm not sure why yet. Maybe solving in-place would help. + // TODO use Cholesky (but need to handle positive semi-definite case). minimumLoss = [&](const TDoubleVector& g, const TDoubleMatrix& h) -> double { - return g.transpose() * - (h + lambda * TDoubleMatrix::Identity(h.rows(), h.cols())) - .selfadjointView() - .ldlt() - .solve(g); + placeholder = + (h + lambda * TDoubleMatrix::Identity(d, d)).selfadjointView(); + Eigen::LDLT> ldlt{placeholder}; + return g.transpose() * ldlt.solve(g); }; } - int d{static_cast(m_NumberLossParameters)}; TDoubleVector g{d}; TDoubleMatrix h{d, d}; TDoubleVector gl[]{TDoubleVector{d}, TDoubleVector{d}}; From deea27076b2eea0a6b643ecc6007f0d2c2303e3a Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 11 Feb 2020 18:21:13 +0000 Subject: [PATCH 06/13] Improve comment --- include/maths/CBoostedTreeLeafNodeStatistics.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/maths/CBoostedTreeLeafNodeStatistics.h b/include/maths/CBoostedTreeLeafNodeStatistics.h index ba07dd5cdc..3e975c784c 100644 --- a/include/maths/CBoostedTreeLeafNodeStatistics.h +++ b/include/maths/CBoostedTreeLeafNodeStatistics.h @@ -132,10 +132,10 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { //! Remap the accumulated curvature to lower triangle row major format. void remapCurvature() { - // We accumulate curvatures in the first n (n + 1) / elements however - // users of TMemoryMappedDoubleMatrix expect them stored column major - // in the lower triangle of n x n matrix. This copies them backwards - // to their correct positions. + // For performance, we accumulate curvatures into the first n (n + 1) / 2 + // elements of the array backing m_Curvature. However, the memory mapped + // matrix class expects them to be stored column major in the lower triangle + // of n x n matrix. This copies them backwards to their correct positions. for (std::ptrdiff_t j = m_Curvature.cols() - 1, k = m_Curvature.rows() * (m_Curvature.rows() + 1) / 2 - 1; j >= 0; --j) { From 8e928e1b1adcb6a0986cc74073c6fcb09828c583 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Wed, 12 Feb 2020 10:42:21 +0000 Subject: [PATCH 07/13] Test fall out and some memory estimation and usage improvements --- .../maths/CBoostedTreeLeafNodeStatistics.h | 8 ++++---- ...CDataFrameAnalyzerFeatureImportanceTest.cc | 10 +++++----- .../CDataFrameAnalyzerTrainingTest.cc | 6 +++--- lib/maths/CBoostedTreeImpl.cc | 14 +++++++++----- lib/maths/CBoostedTreeLeafNodeStatistics.cc | 19 +++++++++++++++---- 5 files changed, 36 insertions(+), 21 deletions(-) diff --git a/include/maths/CBoostedTreeLeafNodeStatistics.h b/include/maths/CBoostedTreeLeafNodeStatistics.h index 3e975c784c..5b2dbb8a1f 100644 --- a/include/maths/CBoostedTreeLeafNodeStatistics.h +++ b/include/maths/CBoostedTreeLeafNodeStatistics.h @@ -278,13 +278,12 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { //! Estimate the split derivatives' memory usage for a data frame with //! \p numberCols columns using \p numberSplitsPerFeature for a loss //! function with \p numberLossParameters parameters. 
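    //! For n = numberLossParameters, each bucket of per split derivatives holds an
    //! n element gradient and an n x n curvature backed by n * (n + 1) doubles of
    //! mapped storage plus a CDerivatives object, and there are roughly
    //! numberSplitsPerFeature + 1 such buckets per feature; the two sizes computed
    //! below simply count these.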
- static std::size_t estimateMemoryUsage(std::size_t numberCols, + static std::size_t estimateMemoryUsage(std::size_t numberFeatures, std::size_t numberSplitsPerFeature, std::size_t numberLossParameters) { - std::size_t derivativesSize{(numberCols - 1) * (numberSplitsPerFeature + 1) * + std::size_t derivativesSize{numberFeatures * (numberSplitsPerFeature + 1) * sizeof(CDerivatives)}; - std::size_t storageSize{(numberCols - 1) * (numberSplitsPerFeature + 1) * - numberLossParameters * + std::size_t storageSize{numberFeatures * (numberSplitsPerFeature + 1) * numberLossParameters * (numberLossParameters + 1) * sizeof(double)}; return sizeof(CPerSplitDerivatives) + derivativesSize + storageSize; } @@ -467,6 +466,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { }; private: + void maybeRecoverMemory(); void computeAggregateLossDerivatives(std::size_t numberThreads, const core::CDataFrame& frame, const CDataFrameCategoryEncoder& encoder); diff --git a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc index c60a6bb4b3..16685315ae 100644 --- a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc @@ -136,7 +136,7 @@ struct SFixture { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::regression(), - "target", s_Rows, 5, 8000000, 0, 0, {"c1"}, s_Alpha, s_Lambda, + "target", s_Rows, 5, 24000000, 0, 0, {"c1"}, s_Alpha, s_Lambda, s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, s_Eta, s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), outputWriterFactory}; @@ -169,9 +169,9 @@ struct SFixture { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::classification(), - "target", s_Rows, 5, 8000000, 0, 0, {"target"}, s_Alpha, s_Lambda, - s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, s_Eta, - s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), + "target", s_Rows, 5, 24000000, 0, 0, {"target"}, s_Alpha, + s_Lambda, s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, + s_Eta, s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), outputWriterFactory}; TStrVec fieldNames{"target", "c1", "c2", "c3", "c4", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; @@ -197,7 +197,7 @@ struct SFixture { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::regression(), - "target", s_Rows, 5, 8000000, 0, 0, {}, s_Alpha, s_Lambda, + "target", s_Rows, 5, 24000000, 0, 0, {}, s_Alpha, s_Lambda, s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, s_Eta, s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), outputWriterFactory}; diff --git a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc index d176d1199c..cfd305b34a 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc @@ -752,7 +752,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeClassifierImbalanced) { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::classification(), - "target", numberExamples, 4, 14000000, 0, 0, {"target"}), + "target", numberExamples, 4, 16000000, 0, 0, {"target"}), 
outputWriterFactory}; TStrVec actuals; @@ -800,7 +800,7 @@ BOOST_AUTO_TEST_CASE(testCategoricalFields) { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::regression(), - "x5", 1000, 5, 19000000, 0, 0, {"x1", "x2"}), + "x5", 1000, 5, 24000000, 0, 0, {"x1", "x2"}), outputWriterFactory}; TStrVec x[]{{"x11", "x12", "x13", "x14", "x15"}, @@ -906,7 +906,7 @@ BOOST_AUTO_TEST_CASE(testCategoricalFieldsEmptyAsMissing) { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::classification(), - "x5", 1000, 5, 19000000, 0, 0, {"x1", "x2", "x5"}), + "x5", 1000, 5, 24000000, 0, 0, {"x1", "x2", "x5"}), outputWriterFactory}; TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."}; diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 782f28f638..8e83fcc1ab 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -300,18 +300,21 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, // A binary tree with n + 1 leaves has 2n + 1 nodes in total. std::size_t maximumNumberLeaves{this->maximumTreeSize(numberRows) + 1}; std::size_t maximumNumberNodes{2 * maximumNumberLeaves - 1}; + std::size_t maximumNumberFeatures{std::min(numberColumns - 1, numberRows / m_RowsPerFeature)}; std::size_t forestMemoryUsage{ m_MaximumNumberTrees * (sizeof(TNodeVec) + maximumNumberNodes * sizeof(CBoostedTreeNode))}; std::size_t extraColumnsMemoryUsage{numberExtraColumnsForTrain(m_Loss->numberParameters()) * numberRows * sizeof(CFloatStorage)}; + std::size_t foldRoundLossMemoryUsage{m_NumberFolds * m_NumberRounds * + sizeof(TOptionalDouble)}; std::size_t hyperparametersMemoryUsage{numberColumns * sizeof(double)}; std::size_t leafNodeStatisticsMemoryUsage{ maximumNumberLeaves * CBoostedTreeLeafNodeStatistics::estimateMemoryUsage( - numberRows, numberColumns, m_NumberSplitsPerFeature, + numberRows, maximumNumberFeatures, m_NumberSplitsPerFeature, m_Loss->numberParameters())}; - std::size_t dataTypeMemoryUsage{numberColumns * sizeof(CDataFrameUtils::SDataType)}; - std::size_t featureSampleProbabilities{numberColumns * sizeof(double)}; + std::size_t dataTypeMemoryUsage{maximumNumberFeatures * sizeof(CDataFrameUtils::SDataType)}; + std::size_t featureSampleProbabilities{maximumNumberFeatures * sizeof(double)}; std::size_t missingFeatureMaskMemoryUsage{ numberColumns * numberRows / PACKED_BIT_VECTOR_MAXIMUM_ROWS_PER_BYTE}; std::size_t trainTestMaskMemoryUsage{2 * m_NumberFolds * numberRows / @@ -319,8 +322,9 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, std::size_t bayesianOptimisationMemoryUsage{CBayesianOptimisation::estimateMemoryUsage( this->numberHyperparametersToTune(), m_NumberRounds)}; return sizeof(*this) + forestMemoryUsage + extraColumnsMemoryUsage + - hyperparametersMemoryUsage + leafNodeStatisticsMemoryUsage + - dataTypeMemoryUsage + featureSampleProbabilities + missingFeatureMaskMemoryUsage + + foldRoundLossMemoryUsage + hyperparametersMemoryUsage + + leafNodeStatisticsMemoryUsage + dataTypeMemoryUsage + + featureSampleProbabilities + missingFeatureMaskMemoryUsage + trainTestMaskMemoryUsage + bayesianOptimisationMemoryUsage; } diff --git a/lib/maths/CBoostedTreeLeafNodeStatistics.cc b/lib/maths/CBoostedTreeLeafNodeStatistics.cc index 8efd0a5a34..558791dd40 100644 --- a/lib/maths/CBoostedTreeLeafNodeStatistics.cc +++ 
b/lib/maths/CBoostedTreeLeafNodeStatistics.cc @@ -97,11 +97,13 @@ CBoostedTreeLeafNodeStatistics::split(std::size_t leftChildId, leftChildId, m_NumberInputColumns, m_NumberLossParameters, numberThreads, frame, encoder, regularization, candidateSplits, featureBag, true /*is left child*/, m_Depth + 1, split, m_RowMask); - core::CPackedBitVector rightChildRowMask{m_RowMask}; + core::CPackedBitVector rightChildRowMask{std::move(m_RowMask)}; rightChildRowMask ^= leftChild->rowMask(); auto rightChild = std::make_shared( rightChildId, std::move(*this), *leftChild, regularization, featureBag, std::move(rightChildRowMask)); + leftChild->maybeRecoverMemory(); + rightChild->maybeRecoverMemory(); return std::make_pair(leftChild, rightChild); } @@ -110,11 +112,13 @@ CBoostedTreeLeafNodeStatistics::split(std::size_t leftChildId, rightChildId, m_NumberInputColumns, m_NumberLossParameters, numberThreads, frame, encoder, regularization, candidateSplits, featureBag, false /*is left child*/, m_Depth + 1, split, m_RowMask); - core::CPackedBitVector leftChildRowMask{m_RowMask}; + core::CPackedBitVector leftChildRowMask{std::move(m_RowMask)}; leftChildRowMask ^= rightChild->rowMask(); auto leftChild = std::make_shared( leftChildId, std::move(*this), *rightChild, regularization, featureBag, std::move(leftChildRowMask)); + leftChild->maybeRecoverMemory(); + rightChild->maybeRecoverMemory(); return std::make_pair(leftChild, rightChild); } @@ -157,7 +161,7 @@ std::size_t CBoostedTreeLeafNodeStatistics::memoryUsage() const { std::size_t CBoostedTreeLeafNodeStatistics::estimateMemoryUsage(std::size_t numberRows, - std::size_t numberCols, + std::size_t numberFeatures, std::size_t numberSplitsPerFeature, std::size_t numberLossParameters) { // We will typically get the close to the best compression for most of the @@ -166,10 +170,17 @@ CBoostedTreeLeafNodeStatistics::estimateMemoryUsage(std::size_t numberRows, // rows so the masks will mainly contain 0 bits in this case. 
std::size_t rowMaskSize{numberRows / PACKED_BIT_VECTOR_MAXIMUM_ROWS_PER_BYTE}; std::size_t perSplitDerivativesSize{CPerSplitDerivatives::estimateMemoryUsage( - numberCols, numberSplitsPerFeature, numberLossParameters)}; + numberFeatures, numberSplitsPerFeature, numberLossParameters)}; return sizeof(CBoostedTreeLeafNodeStatistics) + rowMaskSize + perSplitDerivativesSize; } +void CBoostedTreeLeafNodeStatistics::maybeRecoverMemory() { + if (this->gain() <= 0.0) { + m_RowMask = core::CPackedBitVector{}; + m_Derivatives = CPerSplitDerivatives{}; + } +} + void CBoostedTreeLeafNodeStatistics::computeAggregateLossDerivatives( std::size_t numberThreads, const core::CDataFrame& frame, From f19e74cd1a77ebcf778ec0e95d83b56d5373c85c Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Wed, 12 Feb 2020 15:44:43 +0000 Subject: [PATCH 08/13] Increase memory limit for windows --- lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc | 6 +++--- lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc index 16685315ae..600b99643a 100644 --- a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc @@ -136,7 +136,7 @@ struct SFixture { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::regression(), - "target", s_Rows, 5, 24000000, 0, 0, {"c1"}, s_Alpha, s_Lambda, + "target", s_Rows, 5, 26000000, 0, 0, {"c1"}, s_Alpha, s_Lambda, s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, s_Eta, s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), outputWriterFactory}; @@ -169,7 +169,7 @@ struct SFixture { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::classification(), - "target", s_Rows, 5, 24000000, 0, 0, {"target"}, s_Alpha, + "target", s_Rows, 5, 26000000, 0, 0, {"target"}, s_Alpha, s_Lambda, s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, s_Eta, s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), outputWriterFactory}; @@ -197,7 +197,7 @@ struct SFixture { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::regression(), - "target", s_Rows, 5, 24000000, 0, 0, {}, s_Alpha, s_Lambda, + "target", s_Rows, 5, 26000000, 0, 0, {}, s_Alpha, s_Lambda, s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, s_Eta, s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), outputWriterFactory}; diff --git a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc index cfd305b34a..1a3d91c00c 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc @@ -752,7 +752,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeClassifierImbalanced) { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::classification(), - "target", numberExamples, 4, 16000000, 0, 0, {"target"}), + "target", numberExamples, 4, 17000000, 0, 0, {"target"}), outputWriterFactory}; TStrVec actuals; @@ -800,7 +800,7 @@ BOOST_AUTO_TEST_CASE(testCategoricalFields) { api::CDataFrameAnalyzer analyzer{ 
test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::regression(), - "x5", 1000, 5, 24000000, 0, 0, {"x1", "x2"}), + "x5", 1000, 5, 26000000, 0, 0, {"x1", "x2"}), outputWriterFactory}; TStrVec x[]{{"x11", "x12", "x13", "x14", "x15"}, @@ -906,7 +906,7 @@ BOOST_AUTO_TEST_CASE(testCategoricalFieldsEmptyAsMissing) { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::classification(), - "x5", 1000, 5, 24000000, 0, 0, {"x1", "x2", "x5"}), + "x5", 1000, 5, 26000000, 0, 0, {"x1", "x2", "x5"}), outputWriterFactory}; TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."}; From ac467180dcc2de553e61f44256ac58561b89f7e8 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 13 Feb 2020 09:59:57 +0000 Subject: [PATCH 09/13] Relax threshold for Windows --- lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc index 1a3d91c00c..5bd5526c6b 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc @@ -426,7 +426,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTraining) { BOOST_TEST_REQUIRE(core::CProgramCounters::counter( counter_t::E_DFTPMEstimatedPeakMemoryUsage) < 6000000); - BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1400000); + BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1500000); BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) > 0); BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) <= duration); } @@ -724,7 +724,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeClassifierTraining) { << "ms"); BOOST_TEST_REQUIRE(core::CProgramCounters::counter( counter_t::E_DFTPMEstimatedPeakMemoryUsage) < 6000000); - BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1400000); + BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1500000); BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) > 0); BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) <= duration); } From e1556ce641f9804d8dfbb78142d0d69124ba4035 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 14 Feb 2020 14:21:42 +0000 Subject: [PATCH 10/13] Explain the map function --- include/maths/CBoostedTreeLeafNodeStatistics.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/maths/CBoostedTreeLeafNodeStatistics.h b/include/maths/CBoostedTreeLeafNodeStatistics.h index 5b2dbb8a1f..379def2484 100644 --- a/include/maths/CBoostedTreeLeafNodeStatistics.h +++ b/include/maths/CBoostedTreeLeafNodeStatistics.h @@ -308,6 +308,15 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { } template void map(const SPLITS& splits) { + // This function maps the memory in a single presized buffer containing + // enough space to store all gradient vectors and curvatures. For each + // feature the layout in this buffer is as follows: + // + // "split grad" "split hessian" "missing grad" "missing hessian" + // | | | | + // V V V V + // | n | n^2 | ... 
| n | n^2 | + std::size_t totalNumberSplits{ std::accumulate(splits.begin(), splits.end(), std::size_t{0}, [](std::size_t size, const auto& featureSplits) { From 3009f4252f9b52adf2284b9cc06dca738ba57ab7 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 14 Feb 2020 14:33:43 +0000 Subject: [PATCH 11/13] Improve explanations in computeBestSplitStatistics --- lib/maths/CBoostedTreeLeafNodeStatistics.cc | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/lib/maths/CBoostedTreeLeafNodeStatistics.cc b/lib/maths/CBoostedTreeLeafNodeStatistics.cc index 558791dd40..caf3e02e29 100644 --- a/lib/maths/CBoostedTreeLeafNodeStatistics.cc +++ b/lib/maths/CBoostedTreeLeafNodeStatistics.cc @@ -282,12 +282,12 @@ CBoostedTreeLeafNodeStatistics::computeBestSplitStatistics(const TRegularization // // where H(\lambda) = \sum_i H_i + \lambda I, g = \sum_i g_i and w is the leaf's // weight. Here, g_i and H_i denote an example's loss gradient and Hessian and i - // ranges over the examples in the leaf. Completing the square and noting that - // the minimum is given when the quadratic form is zero we have + // ranges over the examples in the leaf. Writing this as the sum of a quadratic + // form and constant, i.e. x(w)^t H(\lambda) x(w) + constant, and noting that H + // is positive definite, we see that we'll minimise loss by choosing w such that + // x is zero, i.e. w^* = arg\min_w(L(w)) satisfies x(w) = 0. This gives // - // L(w^*) = -1/2 g^t H(\lambda) g - // - // where w^* = arg\min_w{ L(w) }. + // L(w^*) = -1/2 g^t H(\lambda)^{-1} g using TDoubleVector = CDenseVector; using TDoubleMatrix = CDenseMatrix; @@ -392,6 +392,9 @@ CBoostedTreeLeafNodeStatistics::computeBestSplitStatistics(const TRegularization double penaltyForDepth{regularization.penaltyForDepth(m_Depth)}; double penaltyForDepthPlusOne{regularization.penaltyForDepth(m_Depth + 1)}; + + // The gain is the difference between the quadratic minimum for loss with + // no split and the loss with the minimum loss split we found. double gain{0.5 * (maximumGain - minimumLoss(g, h)) - regularization.treeSizePenaltyMultiplier() - regularization.depthPenaltyMultiplier() * From e4cb8710e09baaaedaf4c2669b7ead4fafcee322 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 14 Feb 2020 14:37:50 +0000 Subject: [PATCH 12/13] Comment tweaks --- include/maths/CBoostedTreeLeafNodeStatistics.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/maths/CBoostedTreeLeafNodeStatistics.h b/include/maths/CBoostedTreeLeafNodeStatistics.h index 379def2484..b9dbc0a5bc 100644 --- a/include/maths/CBoostedTreeLeafNodeStatistics.h +++ b/include/maths/CBoostedTreeLeafNodeStatistics.h @@ -110,8 +110,9 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { m_Curvature -= rhs.m_Curvature; // None of our loss functions have negative curvature therefore we // shouldn't allow the cumulative curvature to be negative either. - // In this case we force it to be a v.small multiple of the magnitude - // of the gradient since this is the closest feasible estimate. + // In this case we force it to be a very small multiple of the + // magnitude of the gradient since this is the closest feasible + // estimate. 
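// A minimal self-contained sketch of the quadratic minimisation the comments
// in this change describe, assuming plain Eigen types; the function and
// variable names here are illustrative only and are not the library's own.
// For a leaf with aggregate gradient g and Hessian H the penalised loss in
// the leaf weight w is, up to a constant,
//
//     L(w) = 1/2 w^t H(lambda) w + g^t w,   H(lambda) = H + lambda I,
//
// which is minimised at w* = -H(lambda)^{-1} g with value
// L(w*) = -1/2 g^t H(lambda)^{-1} g.

#include <Eigen/Dense>

double leafMinimumLoss(const Eigen::VectorXd& g, // summed per-row gradients
                       const Eigen::MatrixXd& h, // summed per-row Hessians
                       double lambda) {          // leaf weight penalty
    Eigen::MatrixXd hLambda{
        h + lambda * Eigen::MatrixXd::Identity(g.size(), g.size())};
    // Solve rather than invert: H(lambda) is symmetric and, provided the
    // accumulated curvature is never allowed to go negative (see the
    // clamping above), positive definite for lambda > 0, so an LDL^T
    // factorisation suffices.
    Eigen::VectorXd wStar{hLambda.ldlt().solve(-g)};
    return 0.5 * g.dot(wStar); // equals -1/2 g^t H(lambda)^{-1} g
}

// Up to the tree size and depth penalties, the gain of a candidate split is
// then leafMinimumLoss(g, h, lambda) - leafMinimumLoss(gLeft, hLeft, lambda)
//                                    - leafMinimumLoss(gRight, hRight, lambda),
// which is non-negative because the two children can fit their residuals at
// least as well as the parent leaf does.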
for (int i = 0; i < m_Gradient.size(); ++i) { m_Curvature(i, i) = std::max(m_Curvature(i, i), SMALLEST_RELATIVE_CURVATURE * From 4d9b0acf313e09731c1b716f7f9f6d18e47080bf Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 14 Feb 2020 16:34:36 +0000 Subject: [PATCH 13/13] Review comment --- include/maths/CBoostedTreeLeafNodeStatistics.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/maths/CBoostedTreeLeafNodeStatistics.h b/include/maths/CBoostedTreeLeafNodeStatistics.h index b9dbc0a5bc..1373bd422e 100644 --- a/include/maths/CBoostedTreeLeafNodeStatistics.h +++ b/include/maths/CBoostedTreeLeafNodeStatistics.h @@ -223,7 +223,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { } //! Add \p gradient and \p curvature to the accumulated derivatives for - //! the split \p split of feature \p feature. + //! the \p split of \p feature. void addDerivatives(std::size_t feature, std::size_t split, const TMemoryMappedFloatVector& gradient,