diff --git a/include/maths/CBoostedTree.h b/include/maths/CBoostedTree.h index f6556779e7..fb770b1d53 100644 --- a/include/maths/CBoostedTree.h +++ b/include/maths/CBoostedTree.h @@ -35,7 +35,8 @@ class CEncodedDataFrameRowRef; namespace boosted_tree_detail { class MATHS_EXPORT CArgMinLossImpl { public: - using TDouble1Vec = core::CSmallVector; + using TDoubleVector = CDenseVector; + using TMemoryMappedFloatVector = CMemoryMappedDenseVector; public: CArgMinLossImpl(double lambda); @@ -43,9 +44,11 @@ class MATHS_EXPORT CArgMinLossImpl { virtual std::unique_ptr clone() const = 0; virtual bool nextPass() = 0; - virtual void add(TDouble1Vec prediction, double actual, double weight = 1.0) = 0; + virtual void add(const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0) = 0; virtual void merge(const CArgMinLossImpl& other) = 0; - virtual TDouble1Vec value() const = 0; + virtual TDoubleVector value() const = 0; protected: double lambda() const; @@ -61,9 +64,9 @@ class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl { CArgMinMseImpl(double lambda); std::unique_ptr clone() const override; bool nextPass() override; - void add(TDouble1Vec prediction, double actual, double weight = 1.0) override; + void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override; void merge(const CArgMinLossImpl& other) override; - TDouble1Vec value() const override; + TDoubleVector value() const override; private: using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; @@ -79,14 +82,14 @@ class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl { CArgMinLogisticImpl(double lambda); std::unique_ptr clone() const override; bool nextPass() override; - void add(TDouble1Vec prediction, double actual, double weight = 1.0) override; + void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override; void merge(const CArgMinLossImpl& other) override; - TDouble1Vec value() const override; + TDoubleVector value() const override; private: using TMinMaxAccumulator = CBasicStatistics::CMinMax; - using TVector = CVectorNx1; - using TVectorVec = std::vector; + using TDoubleVector2x1 = CVectorNx1; + using TDoubleVector2x1Vec = std::vector; private: std::size_t bucket(double prediction) const { @@ -108,8 +111,8 @@ class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl { private: std::size_t m_CurrentPass = 0; TMinMaxAccumulator m_PredictionMinMax; - TVector m_CategoryCounts; - TVectorVec m_BucketCategoryCounts; + TDoubleVector2x1 m_CategoryCounts; + TDoubleVector2x1Vec m_BucketCategoryCounts; }; } @@ -118,7 +121,8 @@ namespace boosted_tree { //! \brief Computes the leaf value which minimizes the loss function. class MATHS_EXPORT CArgMinLoss { public: - using TDouble1Vec = core::CSmallVector; + using TDoubleVector = CDenseVector; + using TMemoryMappedFloatVector = CMemoryMappedDenseVector; public: CArgMinLoss(const CArgMinLoss& other); @@ -133,7 +137,7 @@ class MATHS_EXPORT CArgMinLoss { bool nextPass() const; //! Update with a point prediction and actual value. - void add(TDouble1Vec prediction, double actual, double weight = 1.0); + void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0); //! Get the minimiser over the predictions and actual values added to both //! this and \p other. @@ -144,7 +148,7 @@ class MATHS_EXPORT CArgMinLoss { //! //! Formally, returns \f$x^* = arg\min_x\{\sum_i{L(p_i + x, a_i)}\}\f$ //! 
for predictions and actuals \f$p_i\f$ and \f$a_i\f$, respectively. - TDouble1Vec value() const; + TDoubleVector value() const; private: using TArgMinLossImplUPtr = std::unique_ptr; @@ -161,7 +165,9 @@ class MATHS_EXPORT CArgMinLoss { //! \brief Defines the loss function for the regression problem. class MATHS_EXPORT CLoss { public: - using TDouble1Vec = core::CSmallVector; + using TDoubleVector = CDenseVector; + using TMemoryMappedFloatVector = CMemoryMappedDenseVector; + using TWriter = std::function; public: virtual ~CLoss() = default; @@ -170,17 +176,23 @@ class MATHS_EXPORT CLoss { //! The number of parameters to the loss function. virtual std::size_t numberParameters() const = 0; //! The value of the loss function. - virtual double value(TDouble1Vec prediction, double actual, double weight = 1.0) const = 0; + virtual double value(const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0) const = 0; //! The gradient of the loss function. - virtual TDouble1Vec - gradient(TDouble1Vec prediction, double actual, double weight = 1.0) const = 0; + virtual void gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const = 0; //! The Hessian of the loss function (flattened). - virtual TDouble1Vec - curvature(TDouble1Vec prediction, double actual, double weight = 1.0) const = 0; + virtual void curvature(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const = 0; //! Returns true if the loss curvature is constant. virtual bool isCurvatureConstant() const = 0; //! Transforms a prediction from the forest to the target space. - virtual TDouble1Vec transform(TDouble1Vec prediction) const = 0; + virtual TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const = 0; //! Get an object which computes the leaf value that minimises loss. virtual CArgMinLoss minimizer(double lambda) const = 0; //! 
Get the name of the loss function @@ -198,11 +210,19 @@ class MATHS_EXPORT CMse final : public CLoss { public: std::unique_ptr clone() const override; std::size_t numberParameters() const override; - double value(TDouble1Vec prediction, double actual, double weight = 1.0) const override; - TDouble1Vec gradient(TDouble1Vec prediction, double actual, double weight = 1.0) const override; - TDouble1Vec curvature(TDouble1Vec prediction, double actual, double weight = 1.0) const override; + double value(const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0) const override; + void gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const override; + void curvature(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const override; bool isCurvatureConstant() const override; - TDouble1Vec transform(TDouble1Vec prediction) const override; + TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override; CArgMinLoss minimizer(double lambda) const override; const std::string& name() const override; }; @@ -223,11 +243,19 @@ class MATHS_EXPORT CBinomialLogistic final : public CLoss { public: std::unique_ptr clone() const override; std::size_t numberParameters() const override; - double value(TDouble1Vec prediction, double actual, double weight = 1.0) const override; - TDouble1Vec gradient(TDouble1Vec prediction, double actual, double weight = 1.0) const override; - TDouble1Vec curvature(TDouble1Vec prediction, double actual, double weight = 1.0) const override; + double value(const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0) const override; + void gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const override; + void curvature(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const override; bool isCurvatureConstant() const override; - TDouble1Vec transform(TDouble1Vec prediction) const override; + TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override; CArgMinLoss minimizer(double lambda) const override; const std::string& name() const override; }; diff --git a/include/maths/CBoostedTreeLeafNodeStatistics.h b/include/maths/CBoostedTreeLeafNodeStatistics.h index 74f6588510..1373bd422e 100644 --- a/include/maths/CBoostedTreeLeafNodeStatistics.h +++ b/include/maths/CBoostedTreeLeafNodeStatistics.h @@ -8,17 +8,25 @@ #define INCLUDED_ml_maths_CBoostedTreeLeafNodeStatistics_h #include +#include #include #include #include #include +#include +#include +#include #include +#include +#include #include #include +#include #include +#include #include namespace ml { @@ -37,14 +45,310 @@ class CEncodedDataFrameRowRef; //! The regression tree is grown top down by greedily selecting the split with //! the maximum gain (in the loss). This finds and scores the maximum gain split //! of a single leaf of the tree. 
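The class comment above describes finding and scoring the maximum-gain split of a single leaf. As a point of reference, here is a small self-contained sketch of that scoring for a single-parameter loss, using the standard gradient boosting gain g_l^2/(h_l + lambda) + g_r^2/(h_r + lambda) - g^2/(h + lambda); the factor of 1/2 from the loss expansion is dropped because it does not change which split wins. The toy data and the standalone main() are illustrative only and are not part of this header.

// Illustrative sketch: score every candidate split of one sorted feature and
// keep the best, the same quantity this class computes per leaf.
#include <cstddef>
#include <iostream>
#include <limits>
#include <vector>

int main() {
    std::vector<double> feature{0.1, 0.4, 0.5, 0.9};     // one feature, sorted
    std::vector<double> gradient{-2.0, -1.0, 1.5, 2.5};  // per-row loss gradients
    std::vector<double> curvature{1.0, 1.0, 1.0, 1.0};   // per-row loss curvatures
    double lambda{0.1};                                   // leaf weight penalty

    double gTotal{0.0};
    double hTotal{0.0};
    for (std::size_t i = 0; i < gradient.size(); ++i) {
        gTotal += gradient[i];
        hTotal += curvature[i];
    }

    double bestGain{-std::numeric_limits<double>::infinity()};
    double bestSplit{feature[0]};
    double gLeft{0.0};
    double hLeft{0.0};
    for (std::size_t i = 0; i + 1 < feature.size(); ++i) {
        gLeft += gradient[i];
        hLeft += curvature[i];
        double gRight{gTotal - gLeft};
        double hRight{hTotal - hLeft};
        double gain{gLeft * gLeft / (hLeft + lambda) +
                    gRight * gRight / (hRight + lambda) -
                    gTotal * gTotal / (hTotal + lambda)};
        if (gain > bestGain) {
            bestGain = gain;
            bestSplit = 0.5 * (feature[i] + feature[i + 1]);
        }
    }
    std::cout << "best split = " << bestSplit << ", gain = " << bestGain << '\n';
    return 0;
}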
-class CBoostedTreeLeafNodeStatistics final { +class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { public: + using TDoubleVec = std::vector; using TSizeVec = std::vector; using TSizeDoublePr = std::pair; using TRegularization = CBoostedTreeRegularization; - using TImmutableRadixSetVec = std::vector>; + using TImmutableRadixSet = core::CImmutableRadixSet; + using TImmutableRadixSetVec = std::vector; using TPtr = std::shared_ptr; using TPtrPtrPr = std::pair; + using TMemoryMappedFloatVector = CMemoryMappedDenseVector; + using TMemoryMappedDoubleVector = CMemoryMappedDenseVector; + using TMemoryMappedDoubleMatrix = CMemoryMappedDenseMatrix; + + //! \brief Accumulates aggregate derivatives. + class MATHS_EXPORT CDerivatives { + public: + //! Bounds the minimum diagonal of the Hessian. + static constexpr double SMALLEST_RELATIVE_CURVATURE{1e-20}; + + //! See core::CMemory. + static bool dynamicSizeAlwaysZero() { return true; } + + public: + CDerivatives(std::size_t numberLossParameters, double* storage) + : m_Count{0}, m_Gradient{storage, static_cast(numberLossParameters)}, + m_Curvature{storage + numberLossParameters, + static_cast(numberLossParameters), + static_cast(numberLossParameters)} {} + + //! Get the accumulated count. + std::size_t count() const { return m_Count; } + + //! Get the accumulated gradient. + const TMemoryMappedDoubleVector& gradient() const { return m_Gradient; } + + //! Get the accumulated curvature. + const TMemoryMappedDoubleMatrix& curvature() const { + return m_Curvature; + } + + //! Add \p count, \p gradient and \p curvature to the accumulator. + void add(std::size_t count, + const TMemoryMappedFloatVector& gradient, + const TMemoryMappedFloatVector& curvature) { + m_Count += count; + m_Gradient += gradient; + this->curvatureTriangleView() += curvature; + } + + //! Compute the accumulation of both collections of derivatives. + void add(const CDerivatives& other) { + m_Count += other.m_Count; + m_Gradient += other.m_Gradient; + m_Curvature += other.m_Curvature; + } + + //! Set to the difference of \p lhs and \p rhs. + void subtract(const CDerivatives& rhs) { + m_Count -= rhs.m_Count; + if (m_Count > 0) { + m_Gradient -= rhs.m_Gradient; + m_Curvature -= rhs.m_Curvature; + // None of our loss functions have negative curvature therefore we + // shouldn't allow the cumulative curvature to be negative either. + // In this case we force it to be a very small multiple of the + // magnitude of the gradient since this is the closest feasible + // estimate. + for (int i = 0; i < m_Gradient.size(); ++i) { + m_Curvature(i, i) = std::max(m_Curvature(i, i), + SMALLEST_RELATIVE_CURVATURE * + std::fabs(m_Gradient(i))); + } + } else { + // Numeric errors mean that it's possible the sum curvature for a + // candidate split is identically zero while the gradient is epsilon. + // This can cause the node gain to appear infinite (when there is no + // weight regularisation) which in turns causes problems initialising + // the region we search for optimal hyperparameter values. We can + // safely force the gradient and curvature to be zero if we detect + // that the count is zero. + m_Gradient.setZero(); + m_Curvature.setZero(); + } + } + + //! Remap the accumulated curvature to lower triangle row major format. + void remapCurvature() { + // For performance, we accumulate curvatures into the first n (n + 1) / 2 + // elements of the array backing m_Curvature. 
However, the memory mapped + // matrix class expects them to be stored column major in the lower triangle + // of n x n matrix. This copies them backwards to their correct positions. + for (std::ptrdiff_t j = m_Curvature.cols() - 1, + k = m_Curvature.rows() * (m_Curvature.rows() + 1) / 2 - 1; + j >= 0; --j) { + for (std::ptrdiff_t i = m_Curvature.rows() - 1; i >= j; --i, --k) { + m_Curvature(i, j) = m_Curvature.array()(k); + } + } + } + + //! Get a checksum of this object. + std::uint64_t checksum(std::uint64_t seed = 0) const { + seed = CChecksum::calculate(seed, m_Count); + seed = CChecksum::calculate(seed, m_Gradient); + return CChecksum::calculate(seed, m_Curvature); + } + + private: + TMemoryMappedDoubleVector curvatureTriangleView() { + return {m_Curvature.data(), m_Curvature.rows() * (m_Curvature.rows() + 1) / 2}; + } + + private: + std::size_t m_Count = 0; + TMemoryMappedDoubleVector m_Gradient; + TMemoryMappedDoubleMatrix m_Curvature; + }; + + //! \brief A collection of aggregate derivatives for candidate feature splits. + class MATHS_EXPORT CPerSplitDerivatives { + public: + using TDerivativesVec = std::vector; + + public: + explicit CPerSplitDerivatives(std::size_t numberLossParameters = 0) + : m_NumberLossParameters{numberLossParameters} {} + CPerSplitDerivatives(const TImmutableRadixSetVec& candidateSplits, + std::size_t numberLossParameters) + : m_NumberLossParameters{numberLossParameters} { + this->map(candidateSplits); + } + CPerSplitDerivatives(const CPerSplitDerivatives& other) + : m_NumberLossParameters{other.m_NumberLossParameters} { + this->map(other.m_Derivatives); + this->add(other); + } + CPerSplitDerivatives(CPerSplitDerivatives&&) = default; + + CPerSplitDerivatives& operator=(const CPerSplitDerivatives& other) = delete; + CPerSplitDerivatives& operator=(CPerSplitDerivatives&&) = default; + + //! \return The aggregate count for \p feature and \p split. + std::size_t count(std::size_t feature, std::size_t split) const { + return m_Derivatives[feature][split].count(); + } + + //! \return The aggregate gradient for \p feature and \p split. + const TMemoryMappedDoubleVector& gradient(std::size_t feature, std::size_t split) const { + return m_Derivatives[feature][split].gradient(); + } + + //! \return The aggregate curvature for \p feature and \p split. + const TMemoryMappedDoubleMatrix& curvature(std::size_t feature, std::size_t split) const { + return m_Derivatives[feature][split].curvature(); + } + + //! \return All the split aggregate derivatives for \p feature. + const TDerivativesVec& derivatives(std::size_t feature) const { + return m_Derivatives[feature]; + } + + //! \return The count for missing \p feature. + std::size_t missingCount(std::size_t feature) const { + return m_MissingDerivatives[feature].count(); + } + + //! \return The aggregate gradients for missing \p feature. + const TMemoryMappedDoubleVector& missingGradient(std::size_t feature) const { + return m_MissingDerivatives[feature].gradient(); + } + + //! \return The aggregate curvatures for missing \p feature. + const TMemoryMappedDoubleMatrix& missingCurvature(std::size_t feature) const { + return m_MissingDerivatives[feature].curvature(); + } + + //! Add \p gradient and \p curvature to the accumulated derivatives for + //! the \p split of \p feature. + void addDerivatives(std::size_t feature, + std::size_t split, + const TMemoryMappedFloatVector& gradient, + const TMemoryMappedFloatVector& curvature) { + m_Derivatives[feature][split].add(1, gradient, curvature); + } + + //! 
Add \p gradient and \p curvature to the accumulated derivatives for + //! missing values of \p feature. + void addMissingDerivatives(std::size_t feature, + const TMemoryMappedFloatVector& gradient, + const TMemoryMappedFloatVector& curvature) { + m_MissingDerivatives[feature].add(1, gradient, curvature); + } + + //! Compute the accumulation of both collections of per split derivatives. + void add(const CPerSplitDerivatives& other) { + for (std::size_t i = 0; i < other.m_Derivatives.size(); ++i) { + for (std::size_t j = 0; j < other.m_Derivatives[i].size(); ++j) { + m_Derivatives[i][j].add(other.m_Derivatives[i][j]); + } + m_MissingDerivatives[i].add(other.m_MissingDerivatives[i]); + } + } + + //! Subtract \p rhs. + void subtract(const CPerSplitDerivatives& rhs) { + for (std::size_t i = 0; i < m_Derivatives.size(); ++i) { + for (std::size_t j = 0; j < m_Derivatives[i].size(); ++j) { + m_Derivatives[i][j].subtract(rhs.m_Derivatives[i][j]); + } + m_MissingDerivatives[i].subtract(rhs.m_MissingDerivatives[i]); + } + } + + //! Remap the accumulated curvature to lower triangle row major format. + void remapCurvature() { + for (std::size_t i = 0; i < m_Derivatives.size(); ++i) { + for (std::size_t j = 0; j < m_Derivatives[i].size(); ++j) { + m_Derivatives[i][j].remapCurvature(); + } + m_MissingDerivatives[i].remapCurvature(); + } + } + + //! Get the memory used by this object. + std::size_t memoryUsage() const { + return core::CMemory::dynamicSize(m_Derivatives) + + core::CMemory::dynamicSize(m_MissingDerivatives) + + core::CMemory::dynamicSize(m_Storage); + } + + //! Estimate the split derivatives' memory usage for a data frame with + //! \p numberCols columns using \p numberSplitsPerFeature for a loss + //! function with \p numberLossParameters parameters. + static std::size_t estimateMemoryUsage(std::size_t numberFeatures, + std::size_t numberSplitsPerFeature, + std::size_t numberLossParameters) { + std::size_t derivativesSize{numberFeatures * (numberSplitsPerFeature + 1) * + sizeof(CDerivatives)}; + std::size_t storageSize{numberFeatures * (numberSplitsPerFeature + 1) * numberLossParameters * + (numberLossParameters + 1) * sizeof(double)}; + return sizeof(CPerSplitDerivatives) + derivativesSize + storageSize; + } + + //! Get a checksum of this object. + std::uint64_t checksum(std::uint64_t seed = 0) const { + seed = CChecksum::calculate(seed, m_NumberLossParameters); + seed = CChecksum::calculate(seed, m_Derivatives); + seed = CChecksum::calculate(seed, m_MissingDerivatives); + return seed; + } + + private: + using TDerivativesVecVec = std::vector; + + private: + static std::size_t number(const TDerivativesVec& derivatives) { + return derivatives.size(); + } + static std::size_t number(const TImmutableRadixSet& splits) { + return splits.size() + 1; + } + template + void map(const SPLITS& splits) { + // This function maps the memory in a single presized buffer containing + // enough space to store all gradient vectors and curvatures. For each + // feature the layout in this buffer is as follows: + // + // "split grad" "split hessian" "missing grad" "missing hessian" + // | | | | + // V V V V + // | n | n^2 | ... 
| n | n^2 | + + std::size_t totalNumberSplits{ + std::accumulate(splits.begin(), splits.end(), std::size_t{0}, + [](std::size_t size, const auto& featureSplits) { + return size + number(featureSplits); + })}; + + int numberGradients{static_cast(m_NumberLossParameters)}; + int numberCurvatures{numberGradients * numberGradients}; + int numberDerivatives{numberGradients + numberCurvatures}; + + m_Derivatives.resize(splits.size()); + m_MissingDerivatives.reserve(splits.size()); + m_Storage.resize((totalNumberSplits + splits.size()) * numberDerivatives, 0.0); + + double* storage{&m_Storage[0]}; + for (std::size_t i = 0; i < splits.size(); ++i, storage += numberDerivatives) { + std::size_t size{number(splits[i])}; + m_Derivatives[i].reserve(size); + for (std::size_t j = 0; j < size; ++j, storage += numberDerivatives) { + m_Derivatives[i].emplace_back(m_NumberLossParameters, storage); + } + m_MissingDerivatives.emplace_back(m_NumberLossParameters, storage); + } + } + + private: + std::size_t m_NumberLossParameters = 0; + TDerivativesVecVec m_Derivatives; + TDerivativesVec m_MissingDerivatives; + TDoubleVec m_Storage; + }; public: CBoostedTreeLeafNodeStatistics(std::size_t id, @@ -76,7 +380,7 @@ class CBoostedTreeLeafNodeStatistics final { //! Only called by split but is public so it's accessible to std::make_shared. CBoostedTreeLeafNodeStatistics(std::size_t id, - const CBoostedTreeLeafNodeStatistics& parent, + CBoostedTreeLeafNodeStatistics&& parent, const CBoostedTreeLeafNodeStatistics& sibling, const TRegularization& regularization, const TSizeVec& featureBag, @@ -98,8 +402,7 @@ class CBoostedTreeLeafNodeStatistics final { const TRegularization& regularization, const TImmutableRadixSetVec& candidateSplits, const TSizeVec& featureBag, - const CBoostedTreeNode& split, - bool leftChildHasFewerRows); + const CBoostedTreeNode& split); //! Order two leaves by decreasing gain in splitting them. bool operator<(const CBoostedTreeLeafNodeStatistics& rhs) const; @@ -130,17 +433,17 @@ class CBoostedTreeLeafNodeStatistics final { std::size_t memoryUsage() const; //! Estimate the maximum leaf statistics' memory usage training on a data frame - //! with \p numberRows rows and \p numberCols columns using \p featureBagFraction - //! and \p numberSplitsPerFeature. + //! with \p numberRows rows and \p numberCols columns using \p numberSplitsPerFeature + //! for a loss function with \p numberLossParameters parameters. static std::size_t estimateMemoryUsage(std::size_t numberRows, std::size_t numberCols, - std::size_t numberSplitsPerFeature); + std::size_t numberSplitsPerFeature, + std::size_t numberLossParameters); private: - using TDouble1Vec = core::CSmallVector; - //! \brief Statistics relating to a split of the node. - struct SSplitStatistics : private boost::less_than_comparable { + struct MATHS_EXPORT SSplitStatistics + : private boost::less_than_comparable { SSplitStatistics() = default; SSplitStatistics(double gain, double curvature, @@ -172,61 +475,8 @@ class CBoostedTreeLeafNodeStatistics final { bool s_AssignMissingToLeft = true; }; - //! \brief Aggregate derivatives. 
- struct SAggregateDerivatives { - void add(std::size_t count, const TDouble1Vec& gradient, const TDouble1Vec& curvature) { - s_Count += count; - s_Gradient += gradient[0]; - s_Curvature += curvature[0]; - } - - void merge(const SAggregateDerivatives& other) { - this->add(other.s_Count, {other.s_Gradient}, {other.s_Curvature}); - } - - std::string print() const { - std::ostringstream result; - result << "count = " << s_Count << ", gradient = " << s_Gradient - << ", curvature = " << s_Curvature; - return result.str(); - } - - std::size_t s_Count = 0; - double s_Gradient = 0.0; - double s_Curvature = 0.0; - }; - - using TAggregateDerivativesVec = std::vector; - using TAggregateDerivativesVecVec = std::vector; - - //! \brief A collection of aggregate derivatives for candidate feature splits. - struct SSplitAggregateDerivatives { - SSplitAggregateDerivatives(const TImmutableRadixSetVec& candidateSplits) - : s_Derivatives(candidateSplits.size()), - s_MissingDerivatives(candidateSplits.size()) { - for (std::size_t i = 0; i < candidateSplits.size(); ++i) { - s_Derivatives[i].resize(candidateSplits[i].size() + 1); - } - } - - void merge(const SSplitAggregateDerivatives& other) { - for (std::size_t i = 0; i < s_Derivatives.size(); ++i) { - for (std::size_t j = 0; j < s_Derivatives[i].size(); ++j) { - s_Derivatives[i][j].merge(other.s_Derivatives[i][j]); - } - s_MissingDerivatives[i].merge(other.s_MissingDerivatives[i]); - } - } - - auto move() { - return std::make_pair(std::move(s_Derivatives), std::move(s_MissingDerivatives)); - } - - TAggregateDerivativesVecVec s_Derivatives; - TAggregateDerivativesVec s_MissingDerivatives; - }; - private: + void maybeRecoverMemory(); void computeAggregateLossDerivatives(std::size_t numberThreads, const core::CDataFrame& frame, const CDataFrameCategoryEncoder& encoder); @@ -237,7 +487,7 @@ class CBoostedTreeLeafNodeStatistics final { const CBoostedTreeNode& split, const core::CPackedBitVector& parentRowMask); void addRowDerivatives(const CEncodedDataFrameRowRef& row, - SSplitAggregateDerivatives& splitAggregateDerivatives) const; + CPerSplitDerivatives& splitDerivatives) const; SSplitStatistics computeBestSplitStatistics(const TRegularization& regularization, const TSizeVec& featureBag) const; @@ -248,8 +498,7 @@ class CBoostedTreeLeafNodeStatistics final { std::size_t m_NumberLossParameters; const TImmutableRadixSetVec& m_CandidateSplits; core::CPackedBitVector m_RowMask; - TAggregateDerivativesVecVec m_Derivatives; - TAggregateDerivativesVec m_MissingDerivatives; + CPerSplitDerivatives m_Derivatives; SSplitStatistics m_BestSplit; }; } diff --git a/include/maths/CBoostedTreeUtils.h b/include/maths/CBoostedTreeUtils.h index 57cbac7284..700ee07019 100644 --- a/include/maths/CBoostedTreeUtils.h +++ b/include/maths/CBoostedTreeUtils.h @@ -8,26 +8,27 @@ #define INCLUDED_ml_maths_CBoostedTreeUtils_h #include -#include + +#include +#include +#include #include #include namespace ml { namespace maths { +namespace boosted_tree { +class CLoss; +} namespace boosted_tree_detail { -using TDouble1Vec = core::CSmallVector; using TRowRef = core::CDataFrame::TRowRef; +using TMemoryMappedFloatVector = CMemoryMappedDenseVector; inline std::size_t lossHessianStoredSize(std::size_t numberLossParameters) { return numberLossParameters * (numberLossParameters + 1) / 2; } -inline std::size_t numberLossParametersForHessianStoredSize(std::size_t lossHessianStoredSize) { - return static_cast( - (std::sqrt(8.0 * static_cast(lossHessianStoredSize) + 1.0) - 1.0) / 2.0 + 0.5); -} - inline 
std::size_t predictionColumn(std::size_t numberInputColumns) { return numberInputColumns; } @@ -48,31 +49,50 @@ inline std::size_t exampleWeightColumn(std::size_t numberInputColumns, lossHessianStoredSize(numberLossParameters); } +//! Read the prediction from \p row. +MATHS_EXPORT +TMemoryMappedFloatVector readPrediction(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParamaters); + +//! Zero the prediction of \p row. +MATHS_EXPORT +void zeroPrediction(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParamaters); + +//! Read the loss gradient from \p row. MATHS_EXPORT -TDouble1Vec readPrediction(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParamaters); +TMemoryMappedFloatVector readLossGradient(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParameters); +//! Zero the loss gradient of \p row. MATHS_EXPORT -void writePrediction(const TRowRef& row, std::size_t numberInputColumns, const TDouble1Vec& prediction); +void zeroLossGradient(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters); +//! Write the loss gradient to \p row. MATHS_EXPORT -TDouble1Vec readLossGradient(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParameters); +void writeLossGradient(const TRowRef& row, + std::size_t numberInputColumns, + const boosted_tree::CLoss& loss, + const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0); MATHS_EXPORT -void writeLossGradient(const TRowRef& row, std::size_t numberInputColumns, const TDouble1Vec& gradient); +TMemoryMappedFloatVector readLossCurvature(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParameters); MATHS_EXPORT -TDouble1Vec readLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParameters); +void zeroLossCurvature(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters); MATHS_EXPORT void writeLossCurvature(const TRowRef& row, std::size_t numberInputColumns, - const TDouble1Vec& curvature); + const boosted_tree::CLoss& curvature, + const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0); MATHS_EXPORT double readExampleWeight(const TRowRef& row, diff --git a/include/maths/CLinearAlgebraEigen.h b/include/maths/CLinearAlgebraEigen.h index 0e296f52da..72b51319d8 100644 --- a/include/maths/CLinearAlgebraEigen.h +++ b/include/maths/CLinearAlgebraEigen.h @@ -370,6 +370,16 @@ class CMemoryMappedDenseMatrix } //@} + //! Get a checksum of this object. 
+ std::uint64_t checksum(std::uint64_t seed = 0) const { + for (std::ptrdiff_t i = 0; i < this->rows(); ++i) { + for (std::ptrdiff_t j = 0; j < this->rows(); ++j) { + seed = CChecksum::calculate(seed, (*this)(i, j)); + } + } + return seed; + } + private: void reseat(const CMemoryMappedDenseMatrix& other) { TBase* base{static_cast(this)}; @@ -668,4 +678,16 @@ CVector::CVector(const CDenseVectorInitializer& v) { } } +namespace Eigen { +template +struct ScalarBinaryOpTraits { + using ReturnType = double; +}; + +template +struct ScalarBinaryOpTraits { + using ReturnType = double; +}; +} + #endif // INCLUDED_ml_maths_CLinearAlgebraEigen_h diff --git a/include/maths/CLinearAlgebraShims.h b/include/maths/CLinearAlgebraShims.h index 70c961e902..914d128abf 100644 --- a/include/maths/CLinearAlgebraShims.h +++ b/include/maths/CLinearAlgebraShims.h @@ -335,37 +335,63 @@ outer(const CAnnotatedVector& x) { namespace las_detail { template -struct SEstimateMemoryUsage { +struct SEstimateVectorMemoryUsage { static std::size_t value(std::size_t) { return 0; } }; template -struct SEstimateMemoryUsage> { +struct SEstimateVectorMemoryUsage> { static std::size_t value(std::size_t dimension) { return dimension * sizeof(T); } }; template -struct SEstimateMemoryUsage> { +struct SEstimateVectorMemoryUsage> { static std::size_t value(std::size_t dimension) { // Ignore pad for alignment. return dimension * sizeof(SCALAR); } }; template -struct SEstimateMemoryUsage> { +struct SEstimateVectorMemoryUsage> { static std::size_t value(std::size_t dimension) { // Ignore any dynamic memory used by the annotation: we don't know how to // compute this here. It will be up to the calling code to estimate this // correctly. - return SEstimateMemoryUsage::value(dimension); + return SEstimateVectorMemoryUsage::value(dimension); + } +}; + +template +struct SEstimateMatrixMemoryUsage { + static std::size_t value(std::size_t, std::size_t) { return 0; } +}; +template +struct SEstimateMatrixMemoryUsage> { + static std::size_t value(std::size_t rows, std::size_t) { + return sizeof(T) * rows * (rows + 1) / 2; + } +}; +template +struct SEstimateMatrixMemoryUsage> { + static std::size_t value(std::size_t rows, std::size_t columns) { + // Ignore pad for alignment. + return sizeof(SCALAR) * rows * columns; } }; } -//! Estimate the amount of memory a point of type VECTOR and \p dimension will use. +//! Estimate the amount of memory a vector of type VECTOR and size \p dimension +//! will use. template std::size_t estimateMemoryUsage(std::size_t dimension) { - return las_detail::SEstimateMemoryUsage::value(dimension); + return las_detail::SEstimateVectorMemoryUsage::value(dimension); +} + +//! Estimate the amount of memory a matrix of type MATRIX and size \p rows by +//! \p columns will use. 
+template +std::size_t estimateMemoryUsage(std::size_t rows, std::size_t columns) { + return las_detail::SEstimateMatrixMemoryUsage::value(rows, columns); } } } diff --git a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc index c60a6bb4b3..600b99643a 100644 --- a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc @@ -136,7 +136,7 @@ struct SFixture { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::regression(), - "target", s_Rows, 5, 8000000, 0, 0, {"c1"}, s_Alpha, s_Lambda, + "target", s_Rows, 5, 26000000, 0, 0, {"c1"}, s_Alpha, s_Lambda, s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, s_Eta, s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), outputWriterFactory}; @@ -169,9 +169,9 @@ struct SFixture { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::classification(), - "target", s_Rows, 5, 8000000, 0, 0, {"target"}, s_Alpha, s_Lambda, - s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, s_Eta, - s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), + "target", s_Rows, 5, 26000000, 0, 0, {"target"}, s_Alpha, + s_Lambda, s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, + s_Eta, s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), outputWriterFactory}; TStrVec fieldNames{"target", "c1", "c2", "c3", "c4", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; @@ -197,7 +197,7 @@ struct SFixture { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::regression(), - "target", s_Rows, 5, 8000000, 0, 0, {}, s_Alpha, s_Lambda, + "target", s_Rows, 5, 26000000, 0, 0, {}, s_Alpha, s_Lambda, s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, s_Eta, s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), outputWriterFactory}; diff --git a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc index bb50d962d6..10989bd093 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc @@ -426,7 +426,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTraining) { BOOST_TEST_REQUIRE(core::CProgramCounters::counter( counter_t::E_DFTPMEstimatedPeakMemoryUsage) < 6000000); - BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1400000); + BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1500000); BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) > 0); BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) <= duration); } @@ -724,7 +724,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeClassifierTraining) { << "ms"); BOOST_TEST_REQUIRE(core::CProgramCounters::counter( counter_t::E_DFTPMEstimatedPeakMemoryUsage) < 6000000); - BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1400000); + BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1500000); BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) > 0); BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) <= duration); } @@ -752,7 
+752,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeClassifierImbalanced) { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::classification(), - "target", numberExamples, 4, 14000000, 0, 0, {"target"}), + "target", numberExamples, 4, 17000000, 0, 0, {"target"}), outputWriterFactory}; TStrVec actuals; @@ -800,7 +800,7 @@ BOOST_AUTO_TEST_CASE(testCategoricalFields) { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::regression(), - "x5", 1000, 5, 19000000, 0, 0, {"x1", "x2"}), + "x5", 1000, 5, 26000000, 0, 0, {"x1", "x2"}), outputWriterFactory}; TStrVec x[]{{"x11", "x12", "x13", "x14", "x15"}, @@ -906,7 +906,7 @@ BOOST_AUTO_TEST_CASE(testCategoricalFieldsEmptyAsMissing) { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::classification(), - "x5", 1000, 5, 19000000, 0, 0, {"x1", "x2", "x5"}), + "x5", 1000, 5, 26000000, 0, 0, {"x1", "x2", "x5"}), outputWriterFactory}; TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."}; diff --git a/lib/maths/CBoostedTree.cc b/lib/maths/CBoostedTree.cc index e6ccdabd69..a967808e3c 100644 --- a/lib/maths/CBoostedTree.cc +++ b/lib/maths/CBoostedTree.cc @@ -67,8 +67,8 @@ bool CArgMinMseImpl::nextPass() { return false; } -void CArgMinMseImpl::add(TDouble1Vec prediction, double actual, double weight) { - m_MeanError.add(actual - prediction[0], weight); +void CArgMinMseImpl::add(const TMemoryMappedFloatVector& prediction, double actual, double weight) { + m_MeanError.add(actual - prediction(0), weight); } void CArgMinMseImpl::merge(const CArgMinLossImpl& other) { @@ -78,7 +78,7 @@ void CArgMinMseImpl::merge(const CArgMinLossImpl& other) { } } -CArgMinMseImpl::TDouble1Vec CArgMinMseImpl::value() const { +CArgMinMseImpl::TDoubleVector CArgMinMseImpl::value() const { // We searching for the value x which minimises // @@ -89,12 +89,16 @@ CArgMinMseImpl::TDouble1Vec CArgMinMseImpl::value() const { // error m = 1/n sum_i{ a_i - p_i } we have x^* = n / (n + lambda) m. double count{CBasicStatistics::count(m_MeanError)}; - return {count == 0.0 ? 0.0 : count / (count + this->lambda()) * CBasicStatistics::mean(m_MeanError)}; + double meanError{CBasicStatistics::mean(m_MeanError)}; + + TDoubleVector result(1); + result(0) = count == 0.0 ? 
0.0 : count / (count + this->lambda()) * meanError; + return result; } CArgMinLogisticImpl::CArgMinLogisticImpl(double lambda) : CArgMinLossImpl{lambda}, m_CategoryCounts{0}, - m_BucketCategoryCounts(128, TVector{0.0}) { + m_BucketCategoryCounts(128, TDoubleVector2x1{0.0}) { } std::unique_ptr CArgMinLogisticImpl::clone() const { @@ -106,15 +110,17 @@ bool CArgMinLogisticImpl::nextPass() { return m_CurrentPass < 2; } -void CArgMinLogisticImpl::add(TDouble1Vec prediction, double actual, double weight) { +void CArgMinLogisticImpl::add(const TMemoryMappedFloatVector& prediction, + double actual, + double weight) { switch (m_CurrentPass) { case 0: { - m_PredictionMinMax.add(prediction[0]); + m_PredictionMinMax.add(prediction(0)); m_CategoryCounts(static_cast(actual)) += weight; break; } case 1: { - auto& count = m_BucketCategoryCounts[this->bucket(prediction[0])]; + auto& count = m_BucketCategoryCounts[this->bucket(prediction(0))]; count(static_cast(actual)) += weight; break; } @@ -142,7 +148,7 @@ void CArgMinLogisticImpl::merge(const CArgMinLossImpl& other) { } } -CArgMinLogisticImpl::TDouble1Vec CArgMinLogisticImpl::value() const { +CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const { std::function objective; double minWeight; @@ -190,8 +196,11 @@ CArgMinLogisticImpl::TDouble1Vec CArgMinLogisticImpl::value() const { maxWeight = -m_PredictionMinMax.min() + 5.0; } + TDoubleVector result(1); + if (minWeight == maxWeight) { - return {minWeight}; + result(0) = minWeight; + return result; } double minimum; @@ -201,7 +210,8 @@ CArgMinLogisticImpl::TDouble1Vec CArgMinLogisticImpl::value() const { objective, 1e-3, maxIterations, minimum, objectiveAtMinimum); LOG_TRACE(<< "minimum = " << minimum << " objective(minimum) = " << objectiveAtMinimum); - return {minimum}; + result(0) = minimum; + return result; } } @@ -223,7 +233,7 @@ bool CArgMinLoss::nextPass() const { return m_Impl->nextPass(); } -void CArgMinLoss::add(TDouble1Vec prediction, double actual, double weight) { +void CArgMinLoss::add(const TMemoryMappedFloatVector& prediction, double actual, double weight) { return m_Impl->add(prediction, actual, weight); } @@ -231,7 +241,7 @@ void CArgMinLoss::merge(CArgMinLoss& other) { return m_Impl->merge(*other.m_Impl); } -CArgMinLoss::TDouble1Vec CArgMinLoss::value() const { +CArgMinLoss::TDoubleVector CArgMinLoss::value() const { return m_Impl->value(); } @@ -250,25 +260,30 @@ std::size_t CMse::numberParameters() const { return 1; } -double CMse::value(TDouble1Vec prediction, double actual, double weight) const { - return weight * CTools::pow2(prediction[0] - actual); +double CMse::value(const TMemoryMappedFloatVector& prediction, double actual, double weight) const { + return weight * CTools::pow2(prediction(0) - actual); } -CMse::TDouble1Vec CMse::gradient(TDouble1Vec prediction, double actual, double weight) const { - return {2.0 * weight * (prediction[0] - actual)}; +void CMse::gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight) const { + writer(0, 2.0 * weight * (prediction(0) - actual)); } -CMse::TDouble1Vec -CMse::curvature(TDouble1Vec /*prediction*/, double /*actual*/, double weight) const { - return {2.0 * weight}; +void CMse::curvature(const TMemoryMappedFloatVector& /*prediction*/, + double /*actual*/, + TWriter writer, + double weight) const { + writer(0, 2.0 * weight); } bool CMse::isCurvatureConstant() const { return true; } -CMse::TDouble1Vec CMse::transform(TDouble1Vec prediction) const { - return prediction; 
+CMse::TDoubleVector CMse::transform(const TMemoryMappedFloatVector& prediction) const { + return TDoubleVector{prediction}; } CArgMinLoss CMse::minimizer(double lambda) const { @@ -289,35 +304,43 @@ std::size_t CBinomialLogistic::numberParameters() const { return 1; } -double CBinomialLogistic::value(TDouble1Vec prediction, double actual, double weight) const { - // Cross entropy - return -weight * ((1.0 - actual) * logOneMinusLogistic(prediction[0]) + - actual * logLogistic(prediction[0])); +double CBinomialLogistic::value(const TMemoryMappedFloatVector& prediction, + double actual, + double weight) const { + return -weight * ((1.0 - actual) * logOneMinusLogistic(prediction(0)) + + actual * logLogistic(prediction(0))); } -CBinomialLogistic::TDouble1Vec -CBinomialLogistic::gradient(TDouble1Vec prediction, double actual, double weight) const { - if (prediction[0] > -LOG_EPSILON && actual == 1.0) { - return {-weight * std::exp(-prediction[0])}; +void CBinomialLogistic::gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight) const { + if (prediction(0) > -LOG_EPSILON && actual == 1.0) { + writer(0, -weight * std::exp(-prediction(0))); } - return {weight * CTools::logisticFunction(prediction[0]) - actual}; + writer(0, weight * (CTools::logisticFunction(prediction(0)) - actual)); } -CBinomialLogistic::TDouble1Vec -CBinomialLogistic::curvature(TDouble1Vec prediction, double /*actual*/, double weight) const { - if (prediction[0] > -LOG_EPSILON) { - return {weight * std::exp(-prediction[0])}; +void CBinomialLogistic::curvature(const TMemoryMappedFloatVector& prediction, + double /*actual*/, + TWriter writer, + double weight) const { + if (prediction(0) > -LOG_EPSILON) { + writer(0, weight * std::exp(-prediction(0))); } - double probability{CTools::logisticFunction(prediction[0])}; - return {weight * probability * (1.0 - probability)}; + double probability{CTools::logisticFunction(prediction(0))}; + writer(0, weight * probability * (1.0 - probability)); } bool CBinomialLogistic::isCurvatureConstant() const { return false; } -CBinomialLogistic::TDouble1Vec CBinomialLogistic::transform(TDouble1Vec prediction) const { - return {CTools::logisticFunction(prediction[0])}; +CBinomialLogistic::TDoubleVector +CBinomialLogistic::transform(const TMemoryMappedFloatVector& prediction) const { + TDoubleVector result{prediction}; + result(0) = CTools::logisticFunction(result(0)); + return result; } CArgMinLoss CBinomialLogistic::minimizer(double lambda) const { diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 067eb63061..d8ed7b5ae6 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -40,11 +39,6 @@ namespace { // by only refreshing once every MINIMUM_SPLIT_REFRESH_INTERVAL trees we add. const double MINIMUM_SPLIT_REFRESH_INTERVAL{3.0}; -double lossAtNSigma(double n, const TMeanVarAccumulator& lossMoments) { - return CBasicStatistics::mean(lossMoments) + - n * std::sqrt(CBasicStatistics::variance(lossMoments)); -} - //! \brief Record the memory used by a supplied object using the RAII idiom. 
class CScopeRecordMemoryUsage { public: @@ -122,6 +116,21 @@ class CTrainForestStoppingCondition { TDoubleSizePrMinAccumulator m_BestTestLoss; }; +double lossAtNSigma(double n, const TMeanVarAccumulator& lossMoments) { + return CBasicStatistics::mean(lossMoments) + + n * std::sqrt(CBasicStatistics::variance(lossMoments)); +} + +double trace(std::size_t columns, const TMemoryMappedFloatVector& upperTriangle) { + // This assumes the upper triangle of the matrix is stored row major. + double result{0.0}; + for (int i = 0, j = static_cast(columns); + i < upperTriangle.size() && j > 0; i += j, --j) { + result += upperTriangle(i); + } + return result; +} + CDataFrameAnalysisInstrumentationStub INSTRUMENTATION_STUB; } @@ -254,9 +263,10 @@ void CBoostedTreeImpl::predict(core::CDataFrame& frame) const { bool successful; std::tie(std::ignore, successful) = frame.writeColumns( m_NumberThreads, 0, frame.numberRows(), [&](TRowItr beginRows, TRowItr endRows) { + std::size_t numberLossParameters{m_Loss->numberParameters()}; for (auto row = beginRows; row != endRows; ++row) { - writePrediction(*row, m_NumberInputColumns, - {predictRow(m_Encoder->encode(*row), m_BestForest)}); + auto prediction = readPrediction(*row, m_NumberInputColumns, numberLossParameters); + prediction(0) = predictRow(m_Encoder->encode(*row), m_BestForest); } }); if (successful == false) { @@ -271,17 +281,21 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, // A binary tree with n + 1 leaves has 2n + 1 nodes in total. std::size_t maximumNumberLeaves{this->maximumTreeSize(numberRows) + 1}; std::size_t maximumNumberNodes{2 * maximumNumberLeaves - 1}; + std::size_t maximumNumberFeatures{std::min(numberColumns - 1, numberRows / m_RowsPerFeature)}; std::size_t forestMemoryUsage{ m_MaximumNumberTrees * (sizeof(TNodeVec) + maximumNumberNodes * sizeof(CBoostedTreeNode))}; std::size_t extraColumnsMemoryUsage{numberExtraColumnsForTrain(m_Loss->numberParameters()) * numberRows * sizeof(CFloatStorage)}; + std::size_t foldRoundLossMemoryUsage{m_NumberFolds * m_NumberRounds * + sizeof(TOptionalDouble)}; std::size_t hyperparametersMemoryUsage{numberColumns * sizeof(double)}; std::size_t leafNodeStatisticsMemoryUsage{ maximumNumberLeaves * CBoostedTreeLeafNodeStatistics::estimateMemoryUsage( - numberRows, numberColumns, m_NumberSplitsPerFeature)}; - std::size_t dataTypeMemoryUsage{numberColumns * sizeof(CDataFrameUtils::SDataType)}; - std::size_t featureSampleProbabilities{numberColumns * sizeof(double)}; + numberRows, maximumNumberFeatures, m_NumberSplitsPerFeature, + m_Loss->numberParameters())}; + std::size_t dataTypeMemoryUsage{maximumNumberFeatures * sizeof(CDataFrameUtils::SDataType)}; + std::size_t featureSampleProbabilities{maximumNumberFeatures * sizeof(double)}; std::size_t missingFeatureMaskMemoryUsage{ numberColumns * numberRows / PACKED_BIT_VECTOR_MAXIMUM_ROWS_PER_BYTE}; std::size_t trainTestMaskMemoryUsage{2 * m_NumberFolds * numberRows / @@ -291,8 +305,8 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, std::size_t shapMemoryUsage{ m_TopShapValues > 0 ? 
numberRows * numberColumns * sizeof(CFloatStorage) : 0}; return sizeof(*this) + forestMemoryUsage + extraColumnsMemoryUsage + - hyperparametersMemoryUsage + - std::max(leafNodeStatisticsMemoryUsage, shapMemoryUsage) + // not current + foldRoundLossMemoryUsage + hyperparametersMemoryUsage + + std::max(leafNodeStatisticsMemoryUsage, shapMemoryUsage) + // not concurrent dataTypeMemoryUsage + featureSampleProbabilities + missingFeatureMaskMemoryUsage + trainTestMaskMemoryUsage + bayesianOptimisationMemoryUsage; } @@ -422,12 +436,10 @@ CBoostedTreeImpl::TNodeVec CBoostedTreeImpl::initializePredictionsAndLossDerivat m_NumberThreads, 0, frame.numberRows(), [this](TRowItr beginRows, TRowItr endRows) { std::size_t numberLossParameters{m_Loss->numberParameters()}; - TDouble1Vec zero(numberLossParameters, 0.0); - TDouble1Vec zeroHessian(lossHessianStoredSize(numberLossParameters), 0.0); for (auto row = beginRows; row != endRows; ++row) { - writePrediction(*row, m_NumberInputColumns, zero); - writeLossGradient(*row, m_NumberInputColumns, zero); - writeLossCurvature(*row, m_NumberInputColumns, zeroHessian); + zeroPrediction(*row, m_NumberInputColumns, numberLossParameters); + zeroLossGradient(*row, m_NumberInputColumns, numberLossParameters); + zeroLossCurvature(*row, m_NumberInputColumns, numberLossParameters); } }, &updateRowMask); @@ -576,8 +588,9 @@ CBoostedTreeImpl::candidateSplits(const core::CDataFrame& frame, m_Encoder.get(), [this](const TRowRef& row) { // TODO Think about what scalar measure of the Hessian to use here? - return readLossCurvature(row, m_NumberInputColumns, - m_Loss->numberParameters())[0]; + std::size_t numberLossParameters{m_Loss->numberParameters()}; + return trace(numberLossParameters, + readLossCurvature(row, m_NumberInputColumns, numberLossParameters)); }) .first; @@ -690,7 +703,6 @@ CBoostedTreeImpl::trainTree(core::CDataFrame& frame, double splitValue; std::tie(splitFeature, splitValue) = leaf->bestSplit(); - bool leftChildHasFewerRows{leaf->leftChildHasFewerRows()}; bool assignMissingToLeft{leaf->assignMissingToLeft()}; std::size_t leftChildId, rightChildId; @@ -700,10 +712,9 @@ CBoostedTreeImpl::trainTree(core::CDataFrame& frame, TLeafNodeStatisticsPtr leftChild; TLeafNodeStatisticsPtr rightChild; - std::tie(leftChild, rightChild) = - leaf->split(leftChildId, rightChildId, m_NumberThreads, frame, *m_Encoder, - m_Regularization, candidateSplits, this->featureBag(), - tree[leaf->id()], leftChildHasFewerRows); + std::tie(leftChild, rightChild) = leaf->split( + leftChildId, rightChildId, m_NumberThreads, frame, *m_Encoder, + m_Regularization, candidateSplits, this->featureBag(), tree[leaf->id()]); scopeMemoryUsage.add(leftChild); scopeMemoryUsage.add(rightChild); @@ -931,14 +942,13 @@ void CBoostedTreeImpl::refreshPredictionsAndLossDerivatives(core::CDataFrame& fr std::size_t numberLossParameters{m_Loss->numberParameters()}; for (auto row = beginRows; row != endRows; ++row) { auto prediction = readPrediction(*row, m_NumberInputColumns, numberLossParameters); - prediction[0] += root(tree).value(m_Encoder->encode(*row), tree); double actual{readActual(*row, m_DependentVariable)}; double weight{readExampleWeight(*row, m_NumberInputColumns, numberLossParameters)}; - writePrediction(*row, m_NumberInputColumns, prediction); - writeLossGradient(*row, m_NumberInputColumns, - m_Loss->gradient(prediction, actual, weight)); - writeLossCurvature(*row, m_NumberInputColumns, - m_Loss->curvature(prediction, actual, weight)); + prediction(0) += root(tree).value(m_Encoder->encode(*row), 
tree); + writeLossGradient(*row, m_NumberInputColumns, *m_Loss, + prediction, actual, weight); + writeLossCurvature(*row, m_NumberInputColumns, *m_Loss, + prediction, actual, weight); } }, &updateRowMask); diff --git a/lib/maths/CBoostedTreeLeafNodeStatistics.cc b/lib/maths/CBoostedTreeLeafNodeStatistics.cc index 9bab4402ca..caf3e02e29 100644 --- a/lib/maths/CBoostedTreeLeafNodeStatistics.cc +++ b/lib/maths/CBoostedTreeLeafNodeStatistics.cc @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -23,7 +22,6 @@ using TRowItr = core::CDataFrame::TRowItr; namespace { const std::size_t ASSIGN_MISSING_TO_LEFT{0}; const std::size_t ASSIGN_MISSING_TO_RIGHT{1}; -const double SMALLEST_RELATIVE_CURVATURE{1e-20}; } CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( @@ -69,58 +67,17 @@ CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( std::size_t id, - const CBoostedTreeLeafNodeStatistics& parent, + CBoostedTreeLeafNodeStatistics&& parent, const CBoostedTreeLeafNodeStatistics& sibling, const TRegularization& regularization, const TSizeVec& featureBag, core::CPackedBitVector rowMask) : m_Id{id}, m_Depth{sibling.m_Depth}, m_NumberInputColumns{sibling.m_NumberInputColumns}, m_NumberLossParameters{sibling.m_NumberLossParameters}, - m_CandidateSplits{sibling.m_CandidateSplits}, m_RowMask{std::move(rowMask)} { - - m_Derivatives.resize(m_CandidateSplits.size()); - m_MissingDerivatives.resize(m_CandidateSplits.size()); - - for (std::size_t i = 0; i < m_CandidateSplits.size(); ++i) { - std::size_t numberSplits{m_CandidateSplits[i].size() + 1}; - m_Derivatives[i].resize(numberSplits); - for (std::size_t j = 0; j < numberSplits; ++j) { - // Numeric errors mean that it's possible the sum curvature for a candidate - // split is identically zero while the gradient is epsilon. This can cause - // the node gain to appear infinite (when there is no weight regularisation) - // which in turns causes problems initialising the region we search for optimal - // hyperparameter values. We can safely force the gradient and curvature to - // be zero if we detect that the count is zero. Also, none of our loss functions - // have negative curvature therefore we shouldn't allow the cumulative curvature - // to be negative either. In this case we force it to be a v.small multiple - // of the magnitude of the gradient since this is the closest feasible estimate. 
- std::size_t count{parent.m_Derivatives[i][j].s_Count - - sibling.m_Derivatives[i][j].s_Count}; - if (count > 0) { - double gradient{parent.m_Derivatives[i][j].s_Gradient - - sibling.m_Derivatives[i][j].s_Gradient}; - double curvature{parent.m_Derivatives[i][j].s_Curvature - - sibling.m_Derivatives[i][j].s_Curvature}; - curvature = std::max(curvature, SMALLEST_RELATIVE_CURVATURE * - std::fabs(gradient)); - m_Derivatives[i][j] = SAggregateDerivatives{count, gradient, curvature}; - } - } - std::size_t count{parent.m_MissingDerivatives[i].s_Count - - sibling.m_MissingDerivatives[i].s_Count}; - if (count > 0) { - double gradient{parent.m_MissingDerivatives[i].s_Gradient - - sibling.m_MissingDerivatives[i].s_Gradient}; - double curvature{parent.m_MissingDerivatives[i].s_Curvature - - sibling.m_MissingDerivatives[i].s_Curvature}; - curvature = std::max(curvature, SMALLEST_RELATIVE_CURVATURE * std::fabs(gradient)); - m_MissingDerivatives[i] = SAggregateDerivatives{count, gradient, curvature}; - } - } - LOG_TRACE(<< "derivatives = " << core::CContainerPrinter::print(m_Derivatives)); - LOG_TRACE(<< "missing derivatives = " - << core::CContainerPrinter::print(m_MissingDerivatives)); + m_CandidateSplits{sibling.m_CandidateSplits}, m_RowMask{std::move(rowMask)}, + m_Derivatives{std::move(parent.m_Derivatives)} { + m_Derivatives.subtract(sibling.m_Derivatives); m_BestSplit = this->computeBestSplitStatistics(regularization, featureBag); } @@ -133,18 +90,20 @@ CBoostedTreeLeafNodeStatistics::split(std::size_t leftChildId, const TRegularization& regularization, const TImmutableRadixSetVec& candidateSplits, const TSizeVec& featureBag, - const CBoostedTreeNode& split, - bool leftChildHasFewerRows) { - if (leftChildHasFewerRows) { + const CBoostedTreeNode& split) { + + if (this->leftChildHasFewerRows()) { auto leftChild = std::make_shared( leftChildId, m_NumberInputColumns, m_NumberLossParameters, numberThreads, frame, encoder, regularization, candidateSplits, featureBag, true /*is left child*/, m_Depth + 1, split, m_RowMask); - core::CPackedBitVector rightChildRowMask{m_RowMask}; + core::CPackedBitVector rightChildRowMask{std::move(m_RowMask)}; rightChildRowMask ^= leftChild->rowMask(); auto rightChild = std::make_shared( - rightChildId, *this, *leftChild, regularization, featureBag, - std::move(rightChildRowMask)); + rightChildId, std::move(*this), *leftChild, regularization, + featureBag, std::move(rightChildRowMask)); + leftChild->maybeRecoverMemory(); + rightChild->maybeRecoverMemory(); return std::make_pair(leftChild, rightChild); } @@ -153,11 +112,13 @@ CBoostedTreeLeafNodeStatistics::split(std::size_t leftChildId, rightChildId, m_NumberInputColumns, m_NumberLossParameters, numberThreads, frame, encoder, regularization, candidateSplits, featureBag, false /*is left child*/, m_Depth + 1, split, m_RowMask); - core::CPackedBitVector leftChildRowMask{m_RowMask}; + core::CPackedBitVector leftChildRowMask{std::move(m_RowMask)}; leftChildRowMask ^= rightChild->rowMask(); auto leftChild = std::make_shared( - leftChildId, *this, *rightChild, regularization, featureBag, + leftChildId, std::move(*this), *rightChild, regularization, featureBag, std::move(leftChildRowMask)); + leftChild->maybeRecoverMemory(); + rightChild->maybeRecoverMemory(); return std::make_pair(leftChild, rightChild); } @@ -195,25 +156,29 @@ core::CPackedBitVector& CBoostedTreeLeafNodeStatistics::rowMask() { } std::size_t CBoostedTreeLeafNodeStatistics::memoryUsage() const { - std::size_t mem{core::CMemory::dynamicSize(m_RowMask)}; - mem += 
core::CMemory::dynamicSize(m_Derivatives); - mem += core::CMemory::dynamicSize(m_MissingDerivatives); - return mem; + return core::CMemory::dynamicSize(m_RowMask) + core::CMemory::dynamicSize(m_Derivatives); } -std::size_t CBoostedTreeLeafNodeStatistics::estimateMemoryUsage(std::size_t numberRows, - std::size_t numberCols, - std::size_t numberSplitsPerFeature) { +std::size_t +CBoostedTreeLeafNodeStatistics::estimateMemoryUsage(std::size_t numberRows, + std::size_t numberFeatures, + std::size_t numberSplitsPerFeature, + std::size_t numberLossParameters) { // We will typically get the close to the best compression for most of the // leaves when the set of splits becomes large, corresponding to the worst // case for memory usage. This is because the rows will be spread over many // rows so the masks will mainly contain 0 bits in this case. std::size_t rowMaskSize{numberRows / PACKED_BIT_VECTOR_MAXIMUM_ROWS_PER_BYTE}; - std::size_t derivativesSize{(numberCols - 1) * numberSplitsPerFeature * - sizeof(SAggregateDerivatives)}; - std::size_t missingDerivativesSize{(numberCols - 1) * sizeof(SAggregateDerivatives)}; - return sizeof(CBoostedTreeLeafNodeStatistics) + rowMaskSize + - derivativesSize + missingDerivativesSize; + std::size_t perSplitDerivativesSize{CPerSplitDerivatives::estimateMemoryUsage( + numberFeatures, numberSplitsPerFeature, numberLossParameters)}; + return sizeof(CBoostedTreeLeafNodeStatistics) + rowMaskSize + perSplitDerivativesSize; +} + +void CBoostedTreeLeafNodeStatistics::maybeRecoverMemory() { + if (this->gain() <= 0.0) { + m_RowMask = core::CPackedBitVector{}; + m_Derivatives = CPerSplitDerivatives{}; + } } void CBoostedTreeLeafNodeStatistics::computeAggregateLossDerivatives( @@ -224,26 +189,19 @@ void CBoostedTreeLeafNodeStatistics::computeAggregateLossDerivatives( auto result = frame.readRows( numberThreads, 0, frame.numberRows(), core::bindRetrievableState( - [&](SSplitAggregateDerivatives& splitAggregateDerivatives, - TRowItr beginRows, TRowItr endRows) { + [&](CPerSplitDerivatives& perSplitDerivatives, TRowItr beginRows, TRowItr endRows) { for (auto row = beginRows; row != endRows; ++row) { - this->addRowDerivatives(encoder.encode(*row), splitAggregateDerivatives); + this->addRowDerivatives(encoder.encode(*row), perSplitDerivatives); } }, - SSplitAggregateDerivatives{m_CandidateSplits}), + CPerSplitDerivatives{m_CandidateSplits, m_NumberLossParameters}), &m_RowMask); - auto& state = result.first; - SSplitAggregateDerivatives derivatives{std::move(state[0].s_FunctionState)}; - for (std::size_t i = 1; i < state.size(); ++i) { - derivatives.merge(state[i].s_FunctionState); + m_Derivatives = std::move(result.first[0].s_FunctionState); + for (std::size_t i = 1; i < result.first.size(); ++i) { + m_Derivatives.add(result.first[i].s_FunctionState); } - - std::tie(m_Derivatives, m_MissingDerivatives) = derivatives.move(); - - LOG_TRACE(<< "derivatives = " << core::CContainerPrinter::print(m_Derivatives)); - LOG_TRACE(<< "missing derivatives = " - << core::CContainerPrinter::print(m_MissingDerivatives)); + m_Derivatives.remapCurvature(); } void CBoostedTreeLeafNodeStatistics::computeRowMaskAndAggregateLossDerivatives( @@ -257,59 +215,51 @@ void CBoostedTreeLeafNodeStatistics::computeRowMaskAndAggregateLossDerivatives( auto result = frame.readRows( numberThreads, 0, frame.numberRows(), core::bindRetrievableState( - [&](std::pair& state, + [&](std::pair& state, TRowItr beginRows, TRowItr endRows) { auto& mask = state.first; - auto& splitAggregateDerivatives = state.second; + auto& 
perSplitDerivatives = state.second; for (auto row = beginRows; row != endRows; ++row) { auto encodedRow = encoder.encode(*row); if (split.assignToLeft(encodedRow) == isLeftChild) { std::size_t index{row->index()}; mask.extend(false, index - mask.size()); mask.extend(true); - this->addRowDerivatives(encodedRow, splitAggregateDerivatives); + this->addRowDerivatives(encodedRow, perSplitDerivatives); } } }, - std::make_pair(core::CPackedBitVector{}, SSplitAggregateDerivatives{m_CandidateSplits})), + std::make_pair(core::CPackedBitVector{}, + CPerSplitDerivatives{m_CandidateSplits, m_NumberLossParameters})), &parentRowMask); - auto& state = result.first; - for (auto& mask_ : state) { + for (auto& mask_ : result.first) { auto& mask = mask_.s_FunctionState.first; mask.extend(false, parentRowMask.size() - mask.size()); } - m_RowMask = std::move(state[0].s_FunctionState.first); - SSplitAggregateDerivatives derivatives{std::move(state[0].s_FunctionState.second)}; - for (std::size_t i = 1; i < state.size(); ++i) { - m_RowMask |= state[i].s_FunctionState.first; - derivatives.merge(state[i].s_FunctionState.second); + m_RowMask = std::move(result.first[0].s_FunctionState.first); + m_Derivatives = std::move(result.first[0].s_FunctionState.second); + for (std::size_t i = 1; i < result.first.size(); ++i) { + m_RowMask |= result.first[i].s_FunctionState.first; + m_Derivatives.add(result.first[i].s_FunctionState.second); } - - std::tie(m_Derivatives, m_MissingDerivatives) = derivatives.move(); - - LOG_TRACE(<< "row mask = " << m_RowMask); - LOG_TRACE(<< "derivatives = " << core::CContainerPrinter::print(m_Derivatives)); - LOG_TRACE(<< "missing derivatives = " - << core::CContainerPrinter::print(m_MissingDerivatives)); + m_Derivatives.remapCurvature(); } -void CBoostedTreeLeafNodeStatistics::addRowDerivatives( - const CEncodedDataFrameRowRef& row, - SSplitAggregateDerivatives& splitAggregateDerivatives) const { +void CBoostedTreeLeafNodeStatistics::addRowDerivatives(const CEncodedDataFrameRowRef& row, + CPerSplitDerivatives& perSplitDerivatives) const { const TRowRef& unencodedRow{row.unencodedRow()}; auto gradient = readLossGradient(unencodedRow, m_NumberInputColumns, m_NumberLossParameters); auto curvature = readLossCurvature(unencodedRow, m_NumberInputColumns, m_NumberLossParameters); - - for (std::size_t i = 0; i < m_CandidateSplits.size(); ++i) { - double featureValue{row[i]}; + for (std::size_t feature = 0; feature < m_CandidateSplits.size(); ++feature) { + double featureValue{row[feature]}; if (CDataFrameUtils::isMissing(featureValue)) { - splitAggregateDerivatives.s_MissingDerivatives[i].add(1, gradient, curvature); + perSplitDerivatives.addMissingDerivatives(feature, gradient, curvature); } else { - std::ptrdiff_t j{m_CandidateSplits[i].upperBound(featureValue)}; - splitAggregateDerivatives.s_Derivatives[i][j].add(1, gradient, curvature); + std::ptrdiff_t split{m_CandidateSplits[feature].upperBound(featureValue)}; + perSplitDerivatives.addDerivatives(feature, split, gradient, curvature); } } } @@ -325,54 +275,116 @@ CBoostedTreeLeafNodeStatistics::computeBestSplitStatistics(const TRegularization SSplitStatistics result; - for (auto i : featureBag) { - std::size_t c{m_MissingDerivatives[i].s_Count}; - double g{m_MissingDerivatives[i].s_Gradient}; - double h{m_MissingDerivatives[i].s_Curvature}; - for (const auto& derivatives : m_Derivatives[i]) { - c += derivatives.s_Count; - g += derivatives.s_Gradient; - h += derivatives.s_Curvature; + // We seek to find the value at the minimum of the quadratic 
expansion of the + // regularized loss function. For a given leaf this expansion is + // + // L(w) = 1/2 w^t H(\lambda) w + g^t w + // + // where H(\lambda) = \sum_i H_i + \lambda I, g = \sum_i g_i and w is the leaf's + // weight. Here, g_i and H_i denote an example's loss gradient and Hessian and i + // ranges over the examples in the leaf. Writing this as the sum of a quadratic + // form and constant, i.e. x(w)^t H(\lambda) x(w) + constant, and noting that H + // is positive definite, we see that we'll minimise loss by choosing w such that + // x is zero, i.e. w^* = arg\min_w(L(w)) satisfies x(w) = 0. This gives + // + // L(w^*) = -1/2 g^t H(\lambda)^{-1} g + + using TDoubleVector = CDenseVector; + using TDoubleMatrix = CDenseMatrix; + using TMinimumLoss = std::function; + + int d{static_cast(m_NumberLossParameters)}; + + TMinimumLoss minimumLoss; + + double lambda{regularization.leafWeightPenaltyMultiplier()}; + Eigen::MatrixXd placeholder{d, d}; + if (m_NumberLossParameters == 1) { + // There is a large fixed overhead for using ldl^t even when g and h are + // scalar so we have special case handling. + minimumLoss = [&](const TDoubleVector& g, const TDoubleMatrix& h) -> double { + return CTools::pow2(g(0)) / (h(0, 0) + lambda); + }; + } else { + // TODO use Cholesky (but need to handle positive semi-definite case). + minimumLoss = [&](const TDoubleVector& g, const TDoubleMatrix& h) -> double { + placeholder = + (h + lambda * TDoubleMatrix::Identity(d, d)).selfadjointView(); + Eigen::LDLT> ldlt{placeholder}; + return g.transpose() * ldlt.solve(g); + }; + } + + TDoubleVector g{d}; + TDoubleMatrix h{d, d}; + TDoubleVector gl[]{TDoubleVector{d}, TDoubleVector{d}}; + TDoubleVector gr[]{TDoubleVector{d}, TDoubleVector{d}}; + TDoubleMatrix hl[]{TDoubleMatrix{d, d}, TDoubleMatrix{d, d}}; + TDoubleMatrix hr[]{TDoubleMatrix{d, d}, TDoubleMatrix{d, d}}; + + for (auto feature : featureBag) { + std::size_t c{m_Derivatives.missingCount(feature)}; + g = m_Derivatives.missingGradient(feature); + h = m_Derivatives.missingCurvature(feature); + for (const auto& derivatives : m_Derivatives.derivatives(feature)) { + c += derivatives.count(); + g += derivatives.gradient(); + h += derivatives.curvature(); } - std::size_t cl[]{m_MissingDerivatives[i].s_Count, 0}; - double gl[]{m_MissingDerivatives[i].s_Gradient, 0.0}; - double hl[]{m_MissingDerivatives[i].s_Curvature, 0.0}; + std::size_t cl[]{m_Derivatives.missingCount(feature), 0}; + gl[ASSIGN_MISSING_TO_LEFT] = m_Derivatives.missingGradient(feature); + gl[ASSIGN_MISSING_TO_RIGHT] = TDoubleVector::Zero(g.rows()); + gr[ASSIGN_MISSING_TO_LEFT] = g - m_Derivatives.missingGradient(feature); + gr[ASSIGN_MISSING_TO_RIGHT] = g; + hl[ASSIGN_MISSING_TO_LEFT] = m_Derivatives.missingCurvature(feature); + hl[ASSIGN_MISSING_TO_RIGHT] = TDoubleMatrix::Zero(h.rows(), h.cols()); + hr[ASSIGN_MISSING_TO_LEFT] = h - m_Derivatives.missingCurvature(feature); + hr[ASSIGN_MISSING_TO_RIGHT] = h; double maximumGain{-INF}; double splitAt{-INF}; bool leftChildHasFewerRows{true}; bool assignMissingToLeft{true}; + std::size_t size{m_Derivatives.derivatives(feature).size()}; + + for (std::size_t split = 0; split + 1 < size; ++split) { - for (std::size_t j = 0; j + 1 < m_Derivatives[i].size(); ++j) { - cl[ASSIGN_MISSING_TO_LEFT] += m_Derivatives[i][j].s_Count; - gl[ASSIGN_MISSING_TO_LEFT] += m_Derivatives[i][j].s_Gradient; - hl[ASSIGN_MISSING_TO_LEFT] += m_Derivatives[i][j].s_Curvature; - cl[ASSIGN_MISSING_TO_RIGHT] += m_Derivatives[i][j].s_Count; - gl[ASSIGN_MISSING_TO_RIGHT] += 
m_Derivatives[i][j].s_Gradient; - hl[ASSIGN_MISSING_TO_RIGHT] += m_Derivatives[i][j].s_Curvature; - - double gain[]{CTools::pow2(gl[ASSIGN_MISSING_TO_LEFT]) / - (hl[ASSIGN_MISSING_TO_LEFT] + - regularization.leafWeightPenaltyMultiplier()) + - CTools::pow2(g - gl[ASSIGN_MISSING_TO_LEFT]) / - (h - hl[ASSIGN_MISSING_TO_LEFT] + - regularization.leafWeightPenaltyMultiplier()), - CTools::pow2(gl[ASSIGN_MISSING_TO_RIGHT]) / - (hl[ASSIGN_MISSING_TO_RIGHT] + - regularization.leafWeightPenaltyMultiplier()) + - CTools::pow2(g - gl[ASSIGN_MISSING_TO_RIGHT]) / - (h - hl[ASSIGN_MISSING_TO_RIGHT] + - regularization.leafWeightPenaltyMultiplier())}; + std::size_t count{m_Derivatives.count(feature, split)}; + if (count == 0) { + continue; + } + + const TMemoryMappedDoubleVector& gradient{m_Derivatives.gradient(feature, split)}; + const TMemoryMappedDoubleMatrix& curvature{m_Derivatives.curvature(feature, split)}; + + cl[ASSIGN_MISSING_TO_LEFT] += count; + cl[ASSIGN_MISSING_TO_RIGHT] += count; + gl[ASSIGN_MISSING_TO_LEFT] += gradient; + gl[ASSIGN_MISSING_TO_RIGHT] += gradient; + gr[ASSIGN_MISSING_TO_LEFT] -= gradient; + gr[ASSIGN_MISSING_TO_RIGHT] -= gradient; + hl[ASSIGN_MISSING_TO_LEFT] += curvature; + hl[ASSIGN_MISSING_TO_RIGHT] += curvature; + hr[ASSIGN_MISSING_TO_LEFT] -= curvature; + hr[ASSIGN_MISSING_TO_RIGHT] -= curvature; + + double gain[2]; + gain[ASSIGN_MISSING_TO_LEFT] = + minimumLoss(gl[ASSIGN_MISSING_TO_LEFT], hl[ASSIGN_MISSING_TO_LEFT]) + + minimumLoss(gr[ASSIGN_MISSING_TO_LEFT], hr[ASSIGN_MISSING_TO_LEFT]); + gain[ASSIGN_MISSING_TO_RIGHT] = + minimumLoss(gl[ASSIGN_MISSING_TO_RIGHT], hl[ASSIGN_MISSING_TO_RIGHT]) + + minimumLoss(gr[ASSIGN_MISSING_TO_RIGHT], hr[ASSIGN_MISSING_TO_RIGHT]); if (gain[ASSIGN_MISSING_TO_LEFT] > maximumGain) { maximumGain = gain[ASSIGN_MISSING_TO_LEFT]; - splitAt = m_CandidateSplits[i][j]; + splitAt = m_CandidateSplits[feature][split]; leftChildHasFewerRows = (2 * cl[ASSIGN_MISSING_TO_LEFT] < c); assignMissingToLeft = true; } if (gain[ASSIGN_MISSING_TO_RIGHT] > maximumGain) { maximumGain = gain[ASSIGN_MISSING_TO_RIGHT]; - splitAt = m_CandidateSplits[i][j]; + splitAt = m_CandidateSplits[feature][split]; leftChildHasFewerRows = (2 * cl[ASSIGN_MISSING_TO_RIGHT] < c); assignMissingToLeft = false; } @@ -380,13 +392,20 @@ CBoostedTreeLeafNodeStatistics::computeBestSplitStatistics(const TRegularization double penaltyForDepth{regularization.penaltyForDepth(m_Depth)}; double penaltyForDepthPlusOne{regularization.penaltyForDepth(m_Depth + 1)}; - double gain{0.5 * (maximumGain - CTools::pow2(g) / (h + regularization.leafWeightPenaltyMultiplier())) - + + // The gain is the difference between the quadratic minimum for loss with + // no split and the loss with the minimum loss split we found. 
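To make the derivation and the gain computation above concrete, here is a minimal standalone sketch, editorial rather than part of this patch, assuming only Eigen is available; the names minimumLoss, g, h, gl, gr, hl, hr and lambda mirror those used in computeBestSplitStatistics, the numbers are made up, and it evaluates a single candidate split while omitting the tree size and depth penalties.

#include <Eigen/Dense>
#include <iostream>

namespace {
// Returns g^t H(lambda)^{-1} g, i.e. minus twice the minimum of the regularized
// quadratic expansion of the loss for one child or for the unsplit leaf.
double minimumLoss(const Eigen::VectorXd& g, const Eigen::MatrixXd& h, double lambda) {
    // LDL^T copes with a summed Hessian that is only positive semi-definite
    // once the ridge term lambda * I has been added.
    Eigen::MatrixXd regularized =
        h + lambda * Eigen::MatrixXd::Identity(h.rows(), h.cols());
    Eigen::VectorXd solution = regularized.ldlt().solve(g);
    return g.dot(solution);
}
}

int main() {
    double lambda{0.1};
    Eigen::VectorXd g(2), gl(2), gr(2);
    Eigen::MatrixXd h(2, 2), hl(2, 2), hr(2, 2);
    g << 3.0, -1.0;           // summed gradient over the leaf's rows
    h << 4.0, 0.5, 0.5, 2.0;  // summed curvature over the leaf's rows
    gl << 2.5, 0.5;           // totals landing in the left child for one candidate split
    hl << 2.0, 0.2, 0.2, 1.0;
    gr = g - gl;              // the right child gets the remainder
    hr = h - hl;
    // Gain before the tree size and depth penalties: the reduction in the
    // minimized quadratic approximation of the regularized loss achieved by
    // splitting. The factor 0.5 converts the g^t H^{-1} g terms into loss values.
    double gain{0.5 * (minimumLoss(gl, hl, lambda) + minimumLoss(gr, hr, lambda) -
                       minimumLoss(g, h, lambda))};
    std::cout << "split gain = " << gain << "\n";
    return 0;
}

Adding the ridge term lambda * I before the LDL^T factorization is what lets the multi-parameter case tolerate a merely positive semi-definite summed Hessian, while the patch short-circuits to the scalar formula for a single loss parameter because the factorization overhead dominates there.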
+ double gain{0.5 * (maximumGain - minimumLoss(g, h)) - regularization.treeSizePenaltyMultiplier() - regularization.depthPenaltyMultiplier() * (2.0 * penaltyForDepthPlusOne - penaltyForDepth)}; - SSplitStatistics candidate{ - gain, h, i, splitAt, leftChildHasFewerRows, assignMissingToLeft}; + SSplitStatistics candidate{gain, + h.trace() / static_cast(m_NumberLossParameters), + feature, + splitAt, + leftChildHasFewerRows, + assignMissingToLeft}; LOG_TRACE(<< "candidate split: " << candidate.print()); if (candidate > result) { diff --git a/lib/maths/CBoostedTreeUtils.cc b/lib/maths/CBoostedTreeUtils.cc index f204231e65..e138ca6b3d 100644 --- a/lib/maths/CBoostedTreeUtils.cc +++ b/lib/maths/CBoostedTreeUtils.cc @@ -6,57 +6,88 @@ #include +#include + namespace ml { namespace maths { namespace boosted_tree_detail { +using namespace boosted_tree; -TDouble1Vec readPrediction(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParamaters) { - const auto* start = row.data() + predictionColumn(numberInputColumns); - return TDouble1Vec(start, start + numberLossParamaters); +TMemoryMappedFloatVector readPrediction(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParamaters) { + return {row.data() + predictionColumn(numberInputColumns), + static_cast(numberLossParamaters)}; } -void writePrediction(const TRowRef& row, std::size_t numberInputColumns, const TDouble1Vec& prediction) { +void zeroPrediction(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParamaters) { std::size_t offset{predictionColumn(numberInputColumns)}; - for (std::size_t i = 0; i < prediction.size(); ++i) { - row.writeColumn(offset + i, prediction[i]); + for (std::size_t i = 0; i < numberLossParamaters; ++i) { + row.writeColumn(offset + i, 0.0); } } -TDouble1Vec readLossGradient(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParameters) { - const auto* start = row.data() + lossGradientColumn(numberInputColumns, numberLossParameters); - return TDouble1Vec(start, start + numberLossParameters); +TMemoryMappedFloatVector readLossGradient(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParameters) { + return {row.data() + lossGradientColumn(numberInputColumns, numberLossParameters), + static_cast(numberLossParameters)}; } -void writeLossGradient(const TRowRef& row, std::size_t numberInputColumns, const TDouble1Vec& gradient) { - std::size_t offset{lossGradientColumn(numberInputColumns, gradient.size())}; - for (std::size_t i = 0; i < gradient.size(); ++i) { - row.writeColumn(offset + i, gradient[i]); +void zeroLossGradient(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters) { + std::size_t offset{lossGradientColumn(numberInputColumns, numberLossParameters)}; + for (std::size_t i = 0; i < numberLossParameters; ++i) { + row.writeColumn(offset + i, 0.0); } } -TDouble1Vec readLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParameters) { - const auto* start = row.data() + lossCurvatureColumn(numberInputColumns, numberLossParameters); - return TDouble1Vec(start, start + lossHessianStoredSize(numberLossParameters)); +void writeLossGradient(const TRowRef& row, + std::size_t numberInputColumns, + const CLoss& loss, + const TMemoryMappedFloatVector& prediction, + double actual, + double weight) { + std::size_t offset{lossGradientColumn(numberInputColumns, prediction.size())}; + auto writer = [&row, offset](std::size_t i, double value) 
{ + row.writeColumn(offset + i, value); + }; + // We wrap the writer in another lambda which we know takes advantage + // of std::function small size optimization to avoid heap allocations. + loss.gradient(prediction, actual, + [&writer](std::size_t i, double value) { writer(i, value); }, weight); } -void writeLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, - const TDouble1Vec& curvature) { - // This comes from solving x(x+1)/2 = n. - std::size_t numberLossParameters{ - numberLossParametersForHessianStoredSize(curvature.size())}; +TMemoryMappedFloatVector readLossCurvature(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParameters) { + return {row.data() + lossCurvatureColumn(numberInputColumns, numberLossParameters), + static_cast(lossHessianStoredSize(numberLossParameters))}; +} + +void zeroLossCurvature(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters) { std::size_t offset{lossCurvatureColumn(numberInputColumns, numberLossParameters)}; - for (std::size_t i = 0; i < curvature.size(); ++i) { - row.writeColumn(offset + i, curvature[i]); + for (std::size_t i = 0, size = lossHessianStoredSize(numberLossParameters); + i < size; ++i) { + row.writeColumn(offset + i, 0.0); } } +void writeLossCurvature(const TRowRef& row, + std::size_t numberInputColumns, + const CLoss& loss, + const TMemoryMappedFloatVector& prediction, + double actual, + double weight) { + std::size_t offset{lossCurvatureColumn(numberInputColumns, prediction.size())}; + auto writer = [&row, offset](std::size_t i, double value) { + row.writeColumn(offset + i, value); + }; + // We wrap the writer in another lambda which we know takes advantage + // of std::function small size optimization to avoid heap allocations. + loss.curvature(prediction, actual, + [&writer](std::size_t i, double value) { writer(i, value); }, weight); +} + double readExampleWeight(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters) { diff --git a/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc b/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc new file mode 100644 index 0000000000..a0a2c28584 --- /dev/null +++ b/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc @@ -0,0 +1,302 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +#include + +#include + +#include +#include + +#include + +#include + +BOOST_AUTO_TEST_SUITE(CBoostedTreeLeafNodeStatisticsTest) + +using namespace ml; +using TDoubleVec = std::vector; +using TDoubleVecVec = std::vector; +using TSizeVec = std::vector; +using TSizeVecVec = std::vector; +using TFloatVec = std::vector; +using TVector = maths::CDenseVector; +using TVectorVec = std::vector; +using TVectorVecVec = std::vector; +using TMatrix = maths::CDenseMatrix; +using TMatrixVec = std::vector; +using TMatrixVecVec = std::vector; +using TImmutableRadixSet = maths::CBoostedTreeLeafNodeStatistics::TImmutableRadixSet; +using TImmutableRadixSetVec = maths::CBoostedTreeLeafNodeStatistics::TImmutableRadixSetVec; +using TDerivatives = maths::CBoostedTreeLeafNodeStatistics::CDerivatives; +using TPerSplitDerivatives = maths::CBoostedTreeLeafNodeStatistics::CPerSplitDerivatives; + +namespace { + +template +maths::CMemoryMappedDenseVector makeGradient(T* storage, std::size_t n) { + return maths::CMemoryMappedDenseVector{storage, static_cast(n)}; +} + +template +maths::CMemoryMappedDenseVector makeCurvature(T* storage, std::size_t n) { + return maths::CMemoryMappedDenseVector(storage, static_cast(n)); +} + +template +TMatrix rowMajorHessian(std::size_t n, const maths::CMemoryMappedDenseVector& curvatures) { + TMatrix result{n, n}; + for (std::size_t i = 0, k = 0; i < n; ++i) { + for (std::size_t j = i; j < n; ++j, ++k) { + result(i, j) = result(j, i) = curvatures(k); + } + } + return result; +} + +void testDerivativesFor(std::size_t numberParameters) { + + LOG_DEBUG(<< "Testing " << numberParameters << " parameters"); + + test::CRandomNumbers rng; + + std::size_t numberGradients{numberParameters}; + std::size_t numberCurvatures{numberParameters * (numberParameters + 1) / 2}; + + TDoubleVecVec gradients(numberGradients); + TDoubleVecVec curvatures(numberCurvatures); + for (std::size_t i = 0; i < numberGradients; ++i) { + rng.generateUniformSamples(-1.0, 1.5, 20, gradients[i]); + } + for (std::size_t i = 0; i < numberCurvatures; ++i) { + rng.generateUniformSamples(0.1, 0.5, 20, curvatures[i]); + } + + LOG_DEBUG(<< "Accumulate"); + + TDoubleVec storage1(numberGradients * (numberGradients + 1), 0.0); + TDerivatives derivatives1{numberParameters, &storage1[0]}; + + for (std::size_t j = 0; j < 10; ++j) { + TFloatVec storage; + for (std::size_t i = 0; i < numberGradients; ++i) { + storage.push_back(gradients[i][j]); + } + for (std::size_t i = 0; i < numberCurvatures; ++i) { + storage.push_back(curvatures[i][j]); + } + auto gradient = makeGradient(&storage[0], numberGradients); + auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); + derivatives1.add(1, gradient, curvature); + } + derivatives1.remapCurvature(); + + BOOST_REQUIRE_EQUAL(10, derivatives1.count()); + for (std::size_t i = 0; i < numberGradients; ++i) { + BOOST_REQUIRE_CLOSE( + std::accumulate(gradients[i].begin(), gradients[i].begin() + 10, 0.0), + derivatives1.gradient()(i), 1e-4); + } + for (std::size_t j = 0, k = 0; j < numberGradients; ++j) { + for (std::size_t i = j; i < numberGradients; ++i, ++k) { + BOOST_REQUIRE_CLOSE(std::accumulate(curvatures[k].begin(), + curvatures[k].begin() + 10, 0.0), + derivatives1.curvature()(i, j), 1e-4); + } + } + + LOG_DEBUG(<< "Merge"); + + TDoubleVec storage2(numberGradients * (numberGradients + 1), 0.0); + TDerivatives derivatives2{numberParameters, &storage2[0]}; + + for (std::size_t j = 10; j < 20; ++j) { + TFloatVec storage; + for (std::size_t i = 0; i < numberGradients; ++i) { 
+ storage.push_back(gradients[i][j]); + } + for (std::size_t i = 0; i < numberCurvatures; ++i) { + storage.push_back(curvatures[i][j]); + } + auto gradient = makeGradient(&storage[0], numberGradients); + auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); + derivatives2.add(1, gradient, curvature); + } + derivatives2.remapCurvature(); + + derivatives1.add(derivatives2); + + BOOST_REQUIRE_EQUAL(20, derivatives1.count()); + for (std::size_t i = 0; i < numberGradients; ++i) { + BOOST_REQUIRE_CLOSE(std::accumulate(gradients[i].begin(), gradients[i].end(), 0.0), + derivatives1.gradient()(i), 1e-4); + } + for (std::size_t j = 0, k = 0; j < numberGradients; ++j) { + for (std::size_t i = j; i < numberGradients; ++i, ++k) { + BOOST_REQUIRE_CLOSE( + std::accumulate(curvatures[k].begin(), curvatures[k].end(), 0.0), + derivatives1.curvature()(i, j), 1e-4); + } + } + + LOG_DEBUG(<< "Difference"); + + derivatives1.subtract(derivatives2); + + BOOST_REQUIRE_EQUAL(10, derivatives1.count()); + for (std::size_t i = 0; i < numberGradients; ++i) { + BOOST_REQUIRE_CLOSE( + std::accumulate(gradients[i].begin(), gradients[i].begin() + 10, 0.0), + derivatives1.gradient()(i), 1e-4); + } + for (std::size_t j = 0, k = 0; j < numberGradients; ++j) { + for (std::size_t i = j; i < numberGradients; ++i, ++k) { + BOOST_REQUIRE_CLOSE(std::accumulate(curvatures[k].begin(), + curvatures[k].begin() + 10, 0.0), + derivatives1.curvature()(i, j), 1e-4); + } + } +} + +void testPerSplitDerivativesFor(std::size_t numberParameters) { + + LOG_DEBUG(<< "Testing " << numberParameters << " parameters"); + + TImmutableRadixSetVec featureSplits; + featureSplits.push_back(TImmutableRadixSet{1.0, 2.0, 3.0}); + featureSplits.push_back(TImmutableRadixSet{0.1, 0.7, 1.1, 1.4}); + + test::CRandomNumbers rng; + + std::size_t numberSamples{20}; + std::size_t numberGradients{numberParameters}; + std::size_t numberCurvatures{numberParameters * (numberParameters + 1) / 2}; + + for (std::size_t t = 0; t < 100; ++t) { + TSizeVec features; + TSizeVec splits[2]; + TDoubleVec uniform01; + TDoubleVec gradients; + TDoubleVec curvatures; + rng.generateUniformSamples(0, 2, numberSamples, features); + rng.generateUniformSamples(0, featureSplits[0].size() + 1, + numberSamples, splits[0]); + rng.generateUniformSamples(0, featureSplits[1].size() + 1, + numberSamples, splits[1]); + rng.generateUniformSamples(0.0, 1.0, numberSamples, uniform01); + rng.generateUniformSamples(-1.5, 1.0, numberSamples * numberGradients, gradients); + rng.generateUniformSamples(0.1, 0.5, numberSamples * numberCurvatures, curvatures); + + TSizeVecVec expectedCounts(2); + TVectorVecVec expectedGradients(2); + TMatrixVecVec expectedCurvatures(2); + TSizeVec expectedMissingCounts(2, 0); + TVectorVec expectedMissingGradients(2, TVector::Zero(numberParameters)); + TMatrixVec expectedMissingCurvatures(2, TMatrix::Zero(numberParameters, numberParameters)); + for (std::size_t i = 0; i < 2; ++i) { + expectedCounts[i].resize(featureSplits[i].size() + 1, 0); + expectedGradients[i].resize(featureSplits[i].size() + 1, + TVector::Zero(numberParameters)); + expectedCurvatures[i].resize(featureSplits[i].size() + 1, + TMatrix::Zero(numberParameters, numberParameters)); + } + + auto addDerivatives = [&](TPerSplitDerivatives& derivatives) { + for (std::size_t i = 0, j = 0, k = 0; i < numberSamples; + ++i, j += numberGradients, k += numberCurvatures) { + + TFloatVec storage; + storage.insert(storage.end(), &gradients[j], &gradients[j + numberGradients]); + 
storage.insert(storage.end(), &curvatures[j], + &curvatures[k + numberCurvatures]); + auto gradient = makeGradient(&storage[0], numberGradients); + auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); + + if (uniform01[i] < 0.1) { + derivatives.addMissingDerivatives(features[i], gradient, curvature); + ++expectedMissingCounts[features[i]]; + expectedMissingGradients[features[i]] += gradient; + expectedMissingCurvatures[features[i]] += + rowMajorHessian(numberParameters, curvature); + } else { + derivatives.addDerivatives(features[i], splits[features[i]][i], + gradient, curvature); + ++expectedCounts[features[i]][splits[features[i]][i]]; + expectedGradients[features[i]][splits[features[i]][i]] += gradient; + expectedCurvatures[features[i]][splits[features[i]][i]] += + rowMajorHessian(numberParameters, curvature); + } + } + derivatives.remapCurvature(); + }; + + auto validate = [&](const TPerSplitDerivatives& derivatives) { + for (std::size_t i = 0; i < expectedCounts.size(); ++i) { + for (std::size_t j = 0; j < expectedGradients[i].size(); ++j) { + TMatrix curvature{ + derivatives.curvature(i, j).selfadjointView()}; + BOOST_REQUIRE_EQUAL(expectedCounts[i][j], derivatives.count(i, j)); + BOOST_REQUIRE_EQUAL(expectedGradients[i][j], + derivatives.gradient(i, j)); + BOOST_REQUIRE_EQUAL(expectedCurvatures[i][j], curvature); + } + } + for (std::size_t i = 0; i < expectedMissingCounts.size(); ++i) { + TMatrix curvature{ + derivatives.missingCurvature(i).selfadjointView()}; + BOOST_REQUIRE_EQUAL(expectedMissingCounts[i], derivatives.missingCount(i)); + BOOST_REQUIRE_EQUAL(expectedMissingGradients[i], + derivatives.missingGradient(i)); + BOOST_REQUIRE_EQUAL(expectedMissingCurvatures[i], curvature); + } + }; + + LOG_TRACE(<< "Test accumulation"); + + TPerSplitDerivatives derivatives1{featureSplits, numberParameters}; + + addDerivatives(derivatives1); + validate(derivatives1); + + LOG_TRACE(<< "Test merge"); + + rng.generateUniformSamples(0.0, 1.0, numberSamples, uniform01); + rng.generateUniformSamples(-1.5, 1.0, numberSamples * numberGradients, gradients); + rng.generateUniformSamples(0.1, 0.5, numberSamples * numberCurvatures, curvatures); + + TPerSplitDerivatives derivatives2{featureSplits, numberParameters}; + + addDerivatives(derivatives2); + derivatives1.add(derivatives2); + validate(derivatives1); + + LOG_TRACE(<< "Test copy"); + + TPerSplitDerivatives derivatives3{derivatives1}; + BOOST_REQUIRE_EQUAL(derivatives1.checksum(), derivatives3.checksum()); + } +} +} + +BOOST_AUTO_TEST_CASE(testDerivatives) { + + // Test individual derivatives accumulation for single and multi parameter + // loss functions. + + testDerivativesFor(1 /*loss function parameter*/); + testDerivativesFor(3 /*loss function parameters*/); +} + +BOOST_AUTO_TEST_CASE(testPerSplitDerivatives) { + + // Test per split derivatives accumulation for single and multi parameter + // loss functions. 
+ + testPerSplitDerivativesFor(1 /*loss function parameter*/); + testPerSplitDerivativesFor(3 /*loss function parameters*/); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index 1f7a37b572..eafe61f1ff 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -42,6 +42,7 @@ using TRowRef = core::CDataFrame::TRowRef; using TRowItr = core::CDataFrame::TRowItr; using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; +using TMemoryMappedFloatVector = maths::boosted_tree::CLoss::TMemoryMappedFloatVector; namespace { @@ -875,10 +876,12 @@ BOOST_AUTO_TEST_CASE(testLogisticMinimizerEdgeCases) { // All predictions equal and zero. { CArgMinLogisticImpl argmin{0.0}; - argmin.add({0.0}, 0.0); - argmin.add({0.0}, 1.0); - argmin.add({0.0}, 1.0); - argmin.add({0.0}, 0.0); + maths::CFloatStorage storage[]{0.0}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, 0.0); + argmin.add(prediction, 1.0); + argmin.add(prediction, 1.0); + argmin.add(prediction, 0.0); argmin.nextPass(); BOOST_REQUIRE_EQUAL(0.0, argmin.value()[0]); } @@ -902,7 +905,9 @@ BOOST_AUTO_TEST_CASE(testLogisticMinimizerEdgeCases) { do { ++numberPasses; for (std::size_t i = 0; i < labels.size(); ++i) { - argmin.add({weights[i]}, labels[i]); + maths::CFloatStorage storage[]{weights[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); ++counts[static_cast(labels[i])]; } } while (argmin.nextPass()); @@ -923,7 +928,9 @@ BOOST_AUTO_TEST_CASE(testLogisticMinimizerEdgeCases) { TDoubleVec actuals{1.0, 1.0, 0.0, 1.0}; do { for (std::size_t i = 0; i < predictions.size(); ++i) { - argmin.add({predictions[i]}, actuals[i]); + maths::CFloatStorage storage[]{predictions[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, actuals[i]); } } while (argmin.nextPass()); @@ -935,7 +942,9 @@ BOOST_AUTO_TEST_CASE(testLogisticMinimizerEdgeCases) { for (double eps : {-10.0, 0.0, 10.0}) { double lossAtEps{0.0}; for (std::size_t i = 0; i < predictions.size(); ++i) { - lossAtEps += loss.value({predictions[i] + minimizer + eps}, actuals[i]); + maths::CFloatStorage storage[]{predictions[i] + minimizer + eps}; + TMemoryMappedFloatVector probe{storage, 1}; + lossAtEps += loss.value(probe, actuals[i]); } losses.push_back(lossAtEps); } @@ -1011,19 +1020,23 @@ BOOST_AUTO_TEST_CASE(testLogisticMinimizerRandom) { do { for (std::size_t i = 0; i < labels.size() / 2; ++i) { - argmin.add({weights[i]}, labels[i]); - argminPartition[0].add({weights[i]}, labels[i]); + maths::CFloatStorage storage[]{weights[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); + argminPartition[0].add(prediction, labels[i]); } for (std::size_t i = labels.size() / 2; i < labels.size(); ++i) { - argmin.add({weights[i]}, labels[i]); - argminPartition[1].add({weights[i]}, labels[i]); + maths::CFloatStorage storage[]{weights[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); + argminPartition[1].add(prediction, labels[i]); } argminPartition[0].merge(argminPartition[1]); argminPartition[1] = argminPartition[0]; } while (nextPass()); - double actual{argmin.value()[0]}; - double actualPartition{argminPartition[0].value()[0]}; + double actual{argmin.value()(0)}; + double actualPartition{argminPartition[0].value()(0)}; 
LOG_DEBUG(<< "actual = " << actual << " objective at actual = " << objective(actual)); @@ -1049,13 +1062,17 @@ BOOST_AUTO_TEST_CASE(testLogisticLossForUnderflow) { // Losses should be very nearly linear function of log-odds when they're large. { - TDoubleVec lastLoss{loss.value({1.0 - std::log(eps)}, 0.0), - loss.value({1.0 + std::log(eps)}, 1.0)}; + maths::CFloatStorage predictions[]{1.0 - std::log(eps), 1.0 + std::log(eps)}; + TMemoryMappedFloatVector prediction0{&predictions[0], 1}; + TMemoryMappedFloatVector prediction1{&predictions[1], 1}; + TDoubleVec lastLoss{loss.value(prediction0, 0.0), loss.value(prediction1, 1.0)}; for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { - TDoubleVec currentLoss{loss.value({scale - std::log(eps)}, 0.0), - loss.value({scale + std::log(eps)}, 1.0)}; - BOOST_REQUIRE_CLOSE_ABSOLUTE(0.25, lastLoss[0] - currentLoss[0], 5e-3); - BOOST_REQUIRE_CLOSE_ABSOLUTE(-0.25, lastLoss[1] - currentLoss[1], 5e-3); + predictions[0] = scale - std::log(eps); + predictions[1] = scale + std::log(eps); + TDoubleVec currentLoss{loss.value(prediction0, 0.0), + loss.value(prediction1, 1.0)}; + BOOST_REQUIRE_CLOSE_ABSOLUTE(0.25, lastLoss[0] - currentLoss[0], 0.005); + BOOST_REQUIRE_CLOSE_ABSOLUTE(-0.25, lastLoss[1] - currentLoss[1], 0.005); lastLoss = currentLoss; } } @@ -1063,24 +1080,42 @@ BOOST_AUTO_TEST_CASE(testLogisticLossForUnderflow) { // The gradient and curvature should be proportional to the exponential of the // log-odds when they're small. { - TDoubleVec lastGradient{loss.gradient({1.0 + std::log(eps)}, 0.0)[0], - loss.gradient({1.0 - std::log(eps)}, 1.0)[0]}; - TDoubleVec lastCurvature{loss.curvature({1.0 + std::log(eps)}, 0.0)[0], - loss.curvature({1.0 - std::log(eps)}, 1.0)[0]}; + auto readDerivatives = [&](double prediction, TDoubleVec& gradients, + TDoubleVec& curvatures) { + maths::CFloatStorage predictions[]{prediction + std::log(eps), + prediction - std::log(eps)}; + TMemoryMappedFloatVector prediction0{&predictions[0], 1}; + TMemoryMappedFloatVector prediction1{&predictions[1], 1}; + loss.gradient(prediction0, 0.0, [&](std::size_t, double value) { + gradients[0] = value; + }); + loss.gradient(prediction1, 1.0, [&](std::size_t, double value) { + gradients[1] = value; + }); + loss.curvature(prediction0, 0.0, [&](std::size_t, double value) { + curvatures[0] = value; + }); + loss.curvature(prediction1, 1.0, [&](std::size_t, double value) { + curvatures[1] = value; + }); + }; + + TDoubleVec lastGradient(2); + TDoubleVec lastCurvature(2); + readDerivatives(1.0, lastGradient, lastCurvature); + for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { - TDoubleVec currentGradient{loss.gradient({scale + std::log(eps)}, 0.0)[0], - loss.gradient({scale - std::log(eps)}, 1.0)[0]}; - TDoubleVec currentCurvature{ - loss.curvature({scale + std::log(eps)}, 0.0)[0], - loss.curvature({scale - std::log(eps)}, 1.0)[0]}; + TDoubleVec currentGradient(2); + TDoubleVec currentCurvature(2); + readDerivatives(scale, currentGradient, currentCurvature); BOOST_REQUIRE_CLOSE_ABSOLUTE(std::exp(0.25), - lastGradient[0] / currentGradient[0], 5e-3); + lastGradient[0] / currentGradient[0], 0.01); BOOST_REQUIRE_CLOSE_ABSOLUTE(std::exp(-0.25), - lastGradient[1] / currentGradient[1], 5e-3); + lastGradient[1] / currentGradient[1], 0.01); BOOST_REQUIRE_CLOSE_ABSOLUTE( - std::exp(0.25), lastCurvature[0] / currentCurvature[0], 5e-3); + std::exp(0.25), lastCurvature[0] / currentCurvature[0], 0.01); BOOST_REQUIRE_CLOSE_ABSOLUTE( - std::exp(-0.25), lastCurvature[1] / 
currentCurvature[1], 5e-3); + std::exp(-0.25), lastCurvature[1] / currentCurvature[1], 0.01); lastGradient = currentGradient; lastCurvature = currentCurvature; } diff --git a/lib/maths/unittest/Makefile b/lib/maths/unittest/Makefile index dcab0474aa..a6dfab62a6 100644 --- a/lib/maths/unittest/Makefile +++ b/lib/maths/unittest/Makefile @@ -24,6 +24,7 @@ SRCS=\ CBasicStatisticsTest.cc \ CBayesianOptimisationTest.cc \ CBjkstUniqueValuesTest.cc \ + CBoostedTreeLeafNodeStatisticsTest.cc \ CBoostedTreeTest.cc \ CBootstrapClustererTest.cc \ CBoundingBoxTest.cc \
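The pattern that recurs throughout these changes, reading predictions and derivatives as memory mapped vectors over a row's float storage and writing gradients back through a lightweight callback instead of returning freshly allocated vectors, can be illustrated with plain Eigen types. The sketch below is editorial and uses stand-ins: Eigen::Map<const Eigen::VectorXf> in place of CMemoryMappedDenseVector, a toy single-parameter squared-error gradient in place of CLoss, and made-up column offsets.

#include <Eigen/Core>
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

using TWriter = std::function<void(std::size_t, double)>;
using TMappedVector = Eigen::Map<const Eigen::VectorXf>;

// Toy single parameter squared error loss: the gradient of
// 0.5 * (prediction - actual)^2 is written through a callback so the caller
// decides where the value lands, for example straight into the data frame row.
void toyGradient(const TMappedVector& prediction, double actual, const TWriter& writer) {
    writer(0, prediction(0) - actual);
}

int main() {
    // Pretend this is one data frame row: two feature columns followed by the
    // prediction, gradient and curvature columns. For n loss parameters the
    // flattened Hessian needs n * (n + 1) / 2 columns; here n = 1.
    std::vector<float> row{0.3F, 1.7F, 0.5F, 0.0F, 0.0F};
    std::size_t predictionColumn{2};
    std::size_t gradientColumn{3};

    // Map the prediction in place: no copy and no allocation.
    TMappedVector prediction{row.data() + predictionColumn, 1};

    double actual{1.0};
    toyGradient(prediction, actual,
                [&row, gradientColumn](std::size_t i, double value) {
                    row[gradientColumn + i] = static_cast<float>(value);
                });

    std::cout << "stored gradient = " << row[gradientColumn] << "\n";
    return 0;
}

Keeping the callback's capture to a reference and an offset is the same trick the writeLossGradient and writeLossCurvature comments rely on: the std::function typically stays within its small buffer, so no per-row heap allocation is needed.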