[7.x][ML] Prefer smaller models with similar performance (#1516) (#1533)
For regression and classification, hyperparameter optimization now prefers smaller models when the measured losses are otherwise comparable.

To this end, we add 0.01 * "forest number nodes" / "average forest number nodes" * E[GP] as an additional penalty to the test loss, where E[GP] is the mean test loss observed across optimization rounds.

Backport of #1516.
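
To make the penalty concrete, here is a minimal standalone sketch of the calculation; the function and parameter names are hypothetical, not the actual implementation (that lives in CBoostedTreeImpl::captureBestHyperparameters, shown in the diff below):

#include <iostream>

// Hypothetical sketch of the model size penalty described above.
double penalizedLoss(double lossAtOneSigma,   // loss one standard deviation above the mean
                     double numberNodes,      // nodes in the candidate forest
                     double meanForestNodes,  // average forest size seen so far
                     double meanLoss) {       // E[GP]: mean test loss seen so far
    // The penalty grows linearly with the forest's relative size and is
    // scaled by the typical loss, so it only matters when losses are close.
    double penalty{0.01 * numberNodes / meanForestNodes * meanLoss};
    return lossAtOneSigma + penalty;
}

int main() {
    // Two candidates with near-identical loss: the smaller forest wins.
    std::cout << penalizedLoss(0.500, 300.0, 200.0, 0.5) << '\n'; // 0.5075
    std::cout << penalizedLoss(0.501, 150.0, 200.0, 0.5) << '\n'; // 0.50475
}

Because the penalty is proportional to the mean loss, it stays a roughly one-percent effect regardless of the loss function's scale.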
valeriy42 authored Oct 14, 2020
1 parent c6200ea commit aff5f62
Showing 4 changed files with 47 additions and 11 deletions.
7 changes: 7 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -28,6 +28,13 @@

//=== Regressions

== {es} version 7.11.0

=== Enhancements

* During regression and classification training, prefer smaller models if performance is
similar. (See {ml-pull}1516[#1516].)

== {es} version 7.10.0

=== Enhancements
10 changes: 7 additions & 3 deletions include/maths/CBoostedTreeImpl.h
@@ -54,7 +54,8 @@ class MATHS_EXPORT CBoostedTreeImpl final {
using TVector = CDenseVector<double>;
using TMeanAccumulator = CBasicStatistics::SSampleMean<double>::TAccumulator;
using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
using TMeanVarAccumulatorSizePr = std::pair<TMeanVarAccumulator, std::size_t>;
using TMeanVarAccumulatorSizeDoubleTuple =
std::tuple<TMeanVarAccumulator, std::size_t, double>;
using TMeanVarAccumulatorVec = std::vector<TMeanVarAccumulator>;
using TBayesinOptimizationUPtr = std::unique_ptr<maths::CBayesianOptimisation>;
using TNodeVec = CBoostedTree::TNodeVec;
@@ -211,7 +212,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
void initializeTreeShap(const core::CDataFrame& frame);

//! Train the forest and compute loss moments on each fold.
TMeanVarAccumulatorSizePr crossValidateForest(core::CDataFrame& frame);
TMeanVarAccumulatorSizeDoubleTuple crossValidateForest(core::CDataFrame& frame);

//! Initialize the predictions and loss function derivatives for the masked
//! rows in \p frame.
@@ -289,7 +290,8 @@ class MATHS_EXPORT CBoostedTreeImpl final {

//! Capture the current hyperparameter values.
void captureBestHyperparameters(const TMeanVarAccumulator& lossMoments,
std::size_t maximumNumberTrees);
std::size_t maximumNumberTrees,
double numberNodes);

//! Set the hyperparameters from the best recorded.
void restoreBestHyperparameters();
@@ -370,6 +372,8 @@ class MATHS_EXPORT CBoostedTreeImpl final {
std::size_t m_NumberTopShapValues = 0;
TTreeShapFeatureImportanceUPtr m_TreeShap;
TAnalysisInstrumentationPtr m_Instrumentation;
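//! Note: the two accumulators below feed the model size penalty; the first
//! tracks the mean forest size in nodes, the second the mean test loss
//! across hyperparameter optimization rounds.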
mutable TMeanAccumulator m_ForestSizeAccumulator;
mutable TMeanAccumulator m_MeanLossAccumulator;

private:
friend class CBoostedTreeFactory;
39 changes: 32 additions & 7 deletions lib/maths/CBoostedTreeImpl.cc
@@ -146,6 +146,14 @@ double trace(std::size_t columns, const TMemoryMappedFloatVector& upperTriangle)
}

CDataFrameTrainBoostedTreeInstrumentationStub INSTRUMENTATION_STUB;

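// Total number of nodes across all trees in the forest.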
double numberForestNodes(const CBoostedTreeImpl::TNodeVecVec& forest) {
double numberNodes{0.0};
for (const auto& tree : forest) {
numberNodes += static_cast<double>(tree.size());
}
return numberNodes;
}
}

CBoostedTreeImpl::CBoostedTreeImpl(std::size_t numberThreads,
@@ -223,9 +231,13 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame,

TMeanVarAccumulator lossMoments;
std::size_t maximumNumberTrees;
std::tie(lossMoments, maximumNumberTrees) = this->crossValidateForest(frame);
double numberNodes;
std::tie(lossMoments, maximumNumberTrees, numberNodes) =
this->crossValidateForest(frame);

this->captureBestHyperparameters(lossMoments, maximumNumberTrees);
m_MeanLossAccumulator.add(CBasicStatistics::mean(lossMoments));

this->captureBestHyperparameters(lossMoments, maximumNumberTrees, numberNodes);

if (this->selectNextHyperparameters(lossMoments, *m_BayesianOptimization) == false) {
LOG_WARN(<< "Hyperparameter selection failed: exiting loop early");
@@ -481,7 +493,7 @@ void CBoostedTreeImpl::initializeTreeShap(const core::CDataFrame& frame) {
}
}

CBoostedTreeImpl::TMeanVarAccumulatorSizePr
CBoostedTreeImpl::TMeanVarAccumulatorSizeDoubleTuple
CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) {

// We want to ensure we evaluate on equal proportions for each fold.
@@ -510,6 +522,7 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) {
TMeanVarAccumulator lossMoments;
TDoubleVec numberTrees;
numberTrees.reserve(m_NumberFolds);
TMeanAccumulator forestSizeAccumulator;

while (folds.size() > 0 && stopCrossValidationEarly(lossMoments) == false) {
std::size_t fold{folds.back()};
@@ -524,6 +537,7 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) {
lossMoments.add(loss);
m_FoldRoundTestLosses[fold][m_CurrentRound] = loss;
numberTrees.push_back(static_cast<double>(forest.size()));
forestSizeAccumulator.add(numberForestNodes(forest));
m_Instrumentation->lossValues(fold, std::move(lossValues));
}
m_TrainingProgress.increment(m_MaximumNumberTrees * folds.size());
@@ -532,11 +546,13 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) {
std::sort(numberTrees.begin(), numberTrees.end());
std::size_t medianNumberTrees{
static_cast<std::size_t>(CBasicStatistics::median(numberTrees))};
double meanForestSize{CBasicStatistics::mean(forestSizeAccumulator)};
lossMoments = this->correctTestLossMoments(std::move(folds), lossMoments);
LOG_TRACE(<< "test mean loss = " << CBasicStatistics::mean(lossMoments)
<< ", sigma = " << std::sqrt(CBasicStatistics::mean(lossMoments)));
<< ", sigma = " << std::sqrt(CBasicStatistics::mean(lossMoments))
<< ", mean number nodes in forest = " << meanForestSize);

return {lossMoments, medianNumberTrees};
return {lossMoments, medianNumberTrees, meanForestSize};
}

CBoostedTreeImpl::TNodeVec CBoostedTreeImpl::initializePredictionsAndLossDerivatives(
@@ -658,6 +674,9 @@ CBoostedTreeImpl::trainForest(core::CDataFrame& frame,

forest.resize(stoppingCondition.bestSize());

// Record the forest size as the total number of nodes.
m_ForestSizeAccumulator.add(numberForestNodes(forest));

LOG_TRACE(<< "Trained one forest");

return {forest, stoppingCondition.bestLoss(), std::move(losses)};
@@ -1274,11 +1293,17 @@ bool CBoostedTreeImpl::selectNextHyperparameters(const TMeanVarAccumulator& loss
}

void CBoostedTreeImpl::captureBestHyperparameters(const TMeanVarAccumulator& lossMoments,
std::size_t maximumNumberTrees) {
std::size_t maximumNumberTrees,
double numberNodes) {
// We capture the parameters with the lowest error at one standard
// deviation above the mean. If the mean error improvement is marginal
// we prefer the solution with the least variation across the folds.
double loss{lossAtNSigma(1.0, lossMoments)};

// Add 0.01 * "forest number nodes" * E[GP] / "average forest number nodes" to the loss.
double modelSizeDifferentiator{0.01 * numberNodes /
CBasicStatistics::mean(m_ForestSizeAccumulator) *
CBasicStatistics::mean(m_MeanLossAccumulator)};
double loss{lossAtNSigma(1.0, lossMoments) + modelSizeDifferentiator};
if (loss < m_BestForestTestLoss) {
m_BestForestTestLoss = loss;
m_BestHyperparameters = CBoostedTreeHyperparameters{
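To get a feel for the penalty's magnitude (illustrative numbers, not from the source): with a mean test loss of 1.0 and an average forest size of 500 nodes, a 600-node candidate is charged 0.01 * 600 / 500 * 1.0 = 0.012 of extra loss while a 400-node candidate is charged 0.008, so the larger forest only wins if its measured loss is more than 0.004 better.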
2 changes: 1 addition & 1 deletion lib/maths/unittest/CBoostedTreeTest.cc
@@ -361,7 +361,7 @@ BOOST_AUTO_TEST_CASE(testPiecewiseConstant) {
8.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
}
// Good R^2...
BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.95);
BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.94);

meanModelRSquared.add(modelRSquared[i][0]);
}
