Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Prefer smaller models with similar performance #1516

Merged
merged 9 commits into from
Oct 13, 2020
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,13 @@
* The Linux build platform for the {ml} C++ code is now CentOS 7 running gcc 9.3. (See
{ml-pull}1170[#1170].)

== {es} version 7.11.0

=== Enhancements

* During regression and classification training prefer smaller models if performance is
similar (See {ml-pull}1516[#1516].)

== {es} version 7.10.0

=== Enhancements
Expand Down
10 changes: 7 additions & 3 deletions include/maths/CBoostedTreeImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ class MATHS_EXPORT CBoostedTreeImpl final {
using TVector = CDenseVector<double>;
using TMeanAccumulator = CBasicStatistics::SSampleMean<double>::TAccumulator;
using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
using TMeanVarAccumulatorSizePr = std::pair<TMeanVarAccumulator, std::size_t>;
using TMeanVarAccumulatorSizeDoubleTuple =
std::tuple<TMeanVarAccumulator, std::size_t, double>;
using TMeanVarAccumulatorVec = std::vector<TMeanVarAccumulator>;
using TBayesinOptimizationUPtr = std::unique_ptr<maths::CBayesianOptimisation>;
using TNodeVec = CBoostedTree::TNodeVec;
Expand Down Expand Up @@ -211,7 +212,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
void initializeTreeShap(const core::CDataFrame& frame);

//! Train the forest and compute loss moments on each fold.
TMeanVarAccumulatorSizePr crossValidateForest(core::CDataFrame& frame);
TMeanVarAccumulatorSizeDoubleTuple crossValidateForest(core::CDataFrame& frame);

//! Initialize the predictions and loss function derivatives for the masked
//! rows in \p frame.
Expand Down Expand Up @@ -289,7 +290,8 @@ class MATHS_EXPORT CBoostedTreeImpl final {

//! Capture the current hyperparameter values.
void captureBestHyperparameters(const TMeanVarAccumulator& lossMoments,
std::size_t maximumNumberTrees);
std::size_t maximumNumberTrees,
double numberNodes);

//! Set the hyperparameters from the best recorded.
void restoreBestHyperparameters();
Expand Down Expand Up @@ -370,6 +372,8 @@ class MATHS_EXPORT CBoostedTreeImpl final {
std::size_t m_NumberTopShapValues = 0;
TTreeShapFeatureImportanceUPtr m_TreeShap;
TAnalysisInstrumentationPtr m_Instrumentation;
mutable TMeanAccumulator m_ForestSizeAccumulator;
mutable TMeanAccumulator m_MeanLossAccumulator;

private:
friend class CBoostedTreeFactory;
Expand Down
39 changes: 32 additions & 7 deletions lib/maths/CBoostedTreeImpl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,14 @@ double trace(std::size_t columns, const TMemoryMappedFloatVector& upperTriangle)
}

CDataFrameTrainBoostedTreeInstrumentationStub INSTRUMENTATION_STUB;

//! Compute the total number of nodes summed over every tree in \p forest,
//! returned as a double for use in floating point penalty arithmetic.
double numberForestNodes(const CBoostedTreeImpl::TNodeVecVec& forest) {
    double total{0.0};
    for (std::size_t i = 0; i < forest.size(); ++i) {
        // Each tree contributes one count per node it contains.
        total += static_cast<double>(forest[i].size());
    }
    return total;
}
}

CBoostedTreeImpl::CBoostedTreeImpl(std::size_t numberThreads,
Expand Down Expand Up @@ -223,9 +231,13 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame,

TMeanVarAccumulator lossMoments;
std::size_t maximumNumberTrees;
std::tie(lossMoments, maximumNumberTrees) = this->crossValidateForest(frame);
double numberNodes;
std::tie(lossMoments, maximumNumberTrees, numberNodes) =
this->crossValidateForest(frame);

this->captureBestHyperparameters(lossMoments, maximumNumberTrees);
m_MeanLossAccumulator.add(CBasicStatistics::mean(lossMoments));

this->captureBestHyperparameters(lossMoments, maximumNumberTrees, numberNodes);

if (this->selectNextHyperparameters(lossMoments, *m_BayesianOptimization) == false) {
LOG_WARN(<< "Hyperparameter selection failed: exiting loop early");
Expand Down Expand Up @@ -481,7 +493,7 @@ void CBoostedTreeImpl::initializeTreeShap(const core::CDataFrame& frame) {
}
}

CBoostedTreeImpl::TMeanVarAccumulatorSizePr
CBoostedTreeImpl::TMeanVarAccumulatorSizeDoubleTuple
CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) {

// We want to ensure we evaluate on equal proportions for each fold.
Expand Down Expand Up @@ -510,6 +522,7 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) {
TMeanVarAccumulator lossMoments;
TDoubleVec numberTrees;
numberTrees.reserve(m_NumberFolds);
TMeanAccumulator forestSizeAccumulator;

while (folds.size() > 0 && stopCrossValidationEarly(lossMoments) == false) {
std::size_t fold{folds.back()};
Expand All @@ -524,6 +537,7 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) {
lossMoments.add(loss);
m_FoldRoundTestLosses[fold][m_CurrentRound] = loss;
numberTrees.push_back(static_cast<double>(forest.size()));
forestSizeAccumulator.add(numberForestNodes(forest));
m_Instrumentation->lossValues(fold, std::move(lossValues));
}
m_TrainingProgress.increment(m_MaximumNumberTrees * folds.size());
Expand All @@ -532,11 +546,13 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) {
std::sort(numberTrees.begin(), numberTrees.end());
std::size_t medianNumberTrees{
static_cast<std::size_t>(CBasicStatistics::median(numberTrees))};
double meanForestSize{CBasicStatistics::mean(forestSizeAccumulator)};
lossMoments = this->correctTestLossMoments(std::move(folds), lossMoments);
LOG_TRACE(<< "test mean loss = " << CBasicStatistics::mean(lossMoments)
<< ", sigma = " << std::sqrt(CBasicStatistics::mean(lossMoments)));
<< ", sigma = " << std::sqrt(CBasicStatistics::mean(lossMoments))
<< ", mean number nodes in forest = " << meanForestSize);

return {lossMoments, medianNumberTrees};
return {lossMoments, medianNumberTrees, meanForestSize};
}

CBoostedTreeImpl::TNodeVec CBoostedTreeImpl::initializePredictionsAndLossDerivatives(
Expand Down Expand Up @@ -658,6 +674,9 @@ CBoostedTreeImpl::trainForest(core::CDataFrame& frame,

forest.resize(stoppingCondition.bestSize());

// Record the size of this forest, measured as its total number of nodes.
m_ForestSizeAccumulator.add(numberForestNodes(forest));

LOG_TRACE(<< "Trained one forest");

return {forest, stoppingCondition.bestLoss(), std::move(losses)};
Expand Down Expand Up @@ -1274,11 +1293,17 @@ bool CBoostedTreeImpl::selectNextHyperparameters(const TMeanVarAccumulator& loss
}

void CBoostedTreeImpl::captureBestHyperparameters(const TMeanVarAccumulator& lossMoments,
std::size_t maximumNumberTrees) {
std::size_t maximumNumberTrees,
double numberNodes) {
// We capture the parameters with the lowest error at one standard
// deviation above the mean. If the mean error improvement is marginal
// we prefer the solution with the least variation across the folds.
double loss{lossAtNSigma(1.0, lossMoments)};

// Penalize the loss by 0.01 * "forest number nodes" / E["forest number nodes"] * E[mean loss],
// so that among hyperparameter choices with similar loss we prefer smaller models.
double modelSizeDifferentiator{0.01 * numberNodes /
CBasicStatistics::mean(m_ForestSizeAccumulator) *
CBasicStatistics::mean(m_MeanLossAccumulator)};
double loss{lossAtNSigma(1.0, lossMoments) + modelSizeDifferentiator};
if (loss < m_BestForestTestLoss) {
m_BestForestTestLoss = loss;
m_BestHyperparameters = CBoostedTreeHyperparameters{
Expand Down