diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
index dcfa231ac2..d99c21de20 100644
--- a/docs/CHANGELOG.asciidoc
+++ b/docs/CHANGELOG.asciidoc
@@ -39,6 +39,13 @@
 * The Linux build platform for the {ml} C++ code is now CentOS 7 running gcc 9.3.
   (See {ml-pull}1170[#1170].)
 
+== {es} version 7.11.0
+
+=== Enhancements
+
+* During regression and classification training prefer smaller models if performance is
+  similar. (See {ml-pull}1516[#1516].)
+
 == {es} version 7.10.0
 
 === Enhancements
diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h
index 953eac33ea..dff3e7f6fd 100644
--- a/include/maths/CBoostedTreeImpl.h
+++ b/include/maths/CBoostedTreeImpl.h
@@ -54,7 +54,8 @@ class MATHS_EXPORT CBoostedTreeImpl final {
     using TVector = CDenseVector<double>;
     using TMeanAccumulator = CBasicStatistics::SSampleMean<double>::TAccumulator;
     using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
-    using TMeanVarAccumulatorSizePr = std::pair<TMeanVarAccumulator, std::size_t>;
+    using TMeanVarAccumulatorSizeDoubleTuple =
+        std::tuple<TMeanVarAccumulator, std::size_t, double>;
     using TMeanVarAccumulatorVec = std::vector<TMeanVarAccumulator>;
     using TBayesinOptimizationUPtr = std::unique_ptr<CBayesianOptimisation>;
     using TNodeVec = CBoostedTree::TNodeVec;
@@ -211,7 +212,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
     void initializeTreeShap(const core::CDataFrame& frame);
 
     //! Train the forest and compute loss moments on each fold.
-    TMeanVarAccumulatorSizePr crossValidateForest(core::CDataFrame& frame);
+    TMeanVarAccumulatorSizeDoubleTuple crossValidateForest(core::CDataFrame& frame);
 
     //! Initialize the predictions and loss function derivatives for the masked
     //! rows in \p frame.
@@ -289,7 +290,8 @@ class MATHS_EXPORT CBoostedTreeImpl final {
     //! Capture the current hyperparameter values.
     void captureBestHyperparameters(const TMeanVarAccumulator& lossMoments,
-                                    std::size_t maximumNumberTrees);
+                                    std::size_t maximumNumberTrees,
+                                    double numberNodes);
 
     //! Set the hyperparameters from the best recorded.
     void restoreBestHyperparameters();
@@ -370,6 +372,8 @@ class MATHS_EXPORT CBoostedTreeImpl final {
     std::size_t m_NumberTopShapValues = 0;
     TTreeShapFeatureImportanceUPtr m_TreeShap;
     TAnalysisInstrumentationPtr m_Instrumentation;
+    mutable TMeanAccumulator m_ForestSizeAccumulator;
+    mutable TMeanAccumulator m_MeanLossAccumulator;
 
 private:
     friend class CBoostedTreeFactory;
diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc
index da76523073..d126689bc7 100644
--- a/lib/maths/CBoostedTreeImpl.cc
+++ b/lib/maths/CBoostedTreeImpl.cc
@@ -146,6 +146,14 @@ double trace(std::size_t columns, const TMemoryMappedFloatVector& upperTriangle)
 }
 
 CDataFrameTrainBoostedTreeInstrumentationStub INSTRUMENTATION_STUB;
+
+double numberForestNodes(const CBoostedTreeImpl::TNodeVecVec& forest) {
+    double numberNodes{0.0};
+    for (const auto& tree : forest) {
+        numberNodes += static_cast<double>(tree.size());
+    }
+    return numberNodes;
+}
 }
 
 CBoostedTreeImpl::CBoostedTreeImpl(std::size_t numberThreads,
@@ -223,9 +231,13 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame,
 
         TMeanVarAccumulator lossMoments;
         std::size_t maximumNumberTrees;
-        std::tie(lossMoments, maximumNumberTrees) = this->crossValidateForest(frame);
+        double numberNodes;
+        std::tie(lossMoments, maximumNumberTrees, numberNodes) =
+            this->crossValidateForest(frame);
 
-        this->captureBestHyperparameters(lossMoments, maximumNumberTrees);
+        m_MeanLossAccumulator.add(CBasicStatistics::mean(lossMoments));
+
+        this->captureBestHyperparameters(lossMoments, maximumNumberTrees, numberNodes);
 
         if (this->selectNextHyperparameters(lossMoments, *m_BayesianOptimization) == false) {
             LOG_WARN(<< "Hyperparameter selection failed: exiting loop early");
@@ -481,7 +493,7 @@ void CBoostedTreeImpl::initializeTreeShap(const core::CDataFrame& frame) {
     }
 }
 
-CBoostedTreeImpl::TMeanVarAccumulatorSizePr
+CBoostedTreeImpl::TMeanVarAccumulatorSizeDoubleTuple
 CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) {
 
     // We want to ensure we evaluate on equal proportions for each fold.
@@ -510,6 +522,7 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) {
     TMeanVarAccumulator lossMoments;
     TDoubleVec numberTrees;
     numberTrees.reserve(m_NumberFolds);
+    TMeanAccumulator forestSizeAccumulator;
 
     while (folds.size() > 0 && stopCrossValidationEarly(lossMoments) == false) {
         std::size_t fold{folds.back()};
@@ -524,6 +537,7 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) {
         lossMoments.add(loss);
         m_FoldRoundTestLosses[fold][m_CurrentRound] = loss;
         numberTrees.push_back(static_cast<double>(forest.size()));
+        forestSizeAccumulator.add(numberForestNodes(forest));
         m_Instrumentation->lossValues(fold, std::move(lossValues));
     }
     m_TrainingProgress.increment(m_MaximumNumberTrees * folds.size());
@@ -532,11 +546,13 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) {
     std::sort(numberTrees.begin(), numberTrees.end());
     std::size_t medianNumberTrees{
         static_cast<std::size_t>(CBasicStatistics::median(numberTrees))};
+    double meanForestSize{CBasicStatistics::mean(forestSizeAccumulator)};
    lossMoments = this->correctTestLossMoments(std::move(folds), lossMoments);
     LOG_TRACE(<< "test mean loss = " << CBasicStatistics::mean(lossMoments)
-              << ", sigma = " << std::sqrt(CBasicStatistics::mean(lossMoments)));
+              << ", sigma = " << std::sqrt(CBasicStatistics::mean(lossMoments))
+              << ", mean number nodes in forest = " << meanForestSize);
 
-    return {lossMoments, medianNumberTrees};
+    return {lossMoments, medianNumberTrees, meanForestSize};
 }
 
 CBoostedTreeImpl::TNodeVec CBoostedTreeImpl::initializePredictionsAndLossDerivatives(
@@ -658,6 +674,9 @@ CBoostedTreeImpl::trainForest(core::CDataFrame& frame,
 
     forest.resize(stoppingCondition.bestSize());
 
+    // record forest size as the number of nodes
+    m_ForestSizeAccumulator.add(numberForestNodes(forest));
+
     LOG_TRACE(<< "Trained one forest");
 
     return {forest, stoppingCondition.bestLoss(), std::move(losses)};
@@ -1274,11 +1293,17 @@ bool CBoostedTreeImpl::selectNextHyperparameters(const TMeanVarAccumulator& loss
 }
 
 void CBoostedTreeImpl::captureBestHyperparameters(const TMeanVarAccumulator& lossMoments,
-                                                  std::size_t maximumNumberTrees) {
+                                                  std::size_t maximumNumberTrees,
+                                                  double numberNodes) {
     // We capture the parameters with the lowest error at one standard
     // deviation above the mean. If the mean error improvement is marginal
     // we prefer the solution with the least variation across the folds.
-    double loss{lossAtNSigma(1.0, lossMoments)};
+
+    // Add 0.01 * "forest number nodes" * E[GP] / "average forest number nodes" to meanLoss.
+    double modelSizeDifferentiator{0.01 * numberNodes /
+                                   CBasicStatistics::mean(m_ForestSizeAccumulator) *
+                                   CBasicStatistics::mean(m_MeanLossAccumulator)};
+    double loss{lossAtNSigma(1.0, lossMoments) + modelSizeDifferentiator};
     if (loss < m_BestForestTestLoss) {
         m_BestForestTestLoss = loss;
         m_BestHyperparameters = CBoostedTreeHyperparameters{
diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc
index 4aa33dc3ac..9c45afbf77 100644
--- a/lib/maths/unittest/CBoostedTreeTest.cc
+++ b/lib/maths/unittest/CBoostedTreeTest.cc
@@ -361,7 +361,7 @@ BOOST_AUTO_TEST_CASE(testPiecewiseConstant) {
                 9.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
         }
 
         // Good R^2...
-        BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.95);
+        BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.94);
         meanModelRSquared.add(modelRSquared[i][0]);
     }
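Note on the new selection criterion: with this change, captureBestHyperparameters ranks each candidate hyperparameter setting on a size-adjusted loss, lossAtNSigma(1.0, lossMoments) + 0.01 * numberNodes / E[numberNodes] * E[testLoss], where the expectations are running means maintained across training rounds (m_ForestSizeAccumulator and m_MeanLossAccumulator above). Two candidates with similar test error are therefore separated by the relative sizes of the forests they produce. The following standalone C++ sketch illustrates just that arithmetic; sizeAdjustedLoss and all of its inputs are illustrative stand-ins, not the ml-cpp API:

#include <iostream>

// Hypothetical stand-in for the penalty arithmetic in captureBestHyperparameters:
// candidates are ranked on their one-sigma test loss plus
// 0.01 * (forest nodes / mean forest nodes) * (mean test loss).
double sizeAdjustedLoss(double lossAtOneSigma,  // mean + one std dev of test loss
                        double numberNodes,     // nodes in this candidate's forest
                        double meanNumberNodes, // running mean of forest sizes
                        double meanTestLoss) {  // running mean of test losses
    double modelSizeDifferentiator{0.01 * numberNodes / meanNumberNodes * meanTestLoss};
    return lossAtOneSigma + modelSizeDifferentiator;
}

int main() {
    double meanNumberNodes{1000.0};
    double meanTestLoss{0.5};
    // A large forest and a slightly less accurate forest half its size.
    double large{sizeAdjustedLoss(0.500, 1500.0, meanNumberNodes, meanTestLoss)};
    double small{sizeAdjustedLoss(0.503, 750.0, meanNumberNodes, meanTestLoss)};
    std::cout << "large = " << large << ", small = " << small << '\n';
    // Prints large = 0.5075, small = 0.50675: the penalty gap (0.00375)
    // outweighs the 0.003 accuracy gap, so the smaller forest is preferred.
    return 0;
}

Because the penalty is scaled by the running mean loss, it stays proportionate whatever the loss function's units: for a forest of average size it perturbs the comparison by about 1% of the typical loss, which is why only near-ties in accuracy are decided by model size. This also explains the relaxed R^2 threshold in testPiecewiseConstant, since the preferred model may now be slightly smaller and marginally less accurate.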