Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[7.x][ML] Hyperparameter importance #1649

Merged
merged 1 commit into from
Jan 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@

* Fix edge case which could cause spurious anomalies early in the learning process
if the time series has non-diurnal seasonality. (See {ml-pull}1634[#1634].)
* Compute importance of hyperparameters optimized in the fine parameter tuning step.
(See {ml-pull}1627[#1627].)

== {es} version 7.11.0

Expand Down
1 change: 1 addition & 0 deletions include/api/CDataFrameTrainBoostedTreeRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
static const std::string LAMBDA;
static const std::string GAMMA;
static const std::string ETA;
static const std::string ETA_GROWTH_RATE_PER_TREE;
static const std::string SOFT_TREE_DEPTH_LIMIT;
static const std::string SOFT_TREE_DEPTH_TOLERANCE;
static const std::string MAX_TREES;
Expand Down
30 changes: 29 additions & 1 deletion include/api/CInferenceModelMetadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
#define INCLUDED_ml_api_CInferenceModelMetadata_h

#include <maths/CBasicStatistics.h>
#include <maths/CBoostedTree.h>
#include <maths/CLinearAlgebraEigen.h>

#include <api/CInferenceModelDefinition.h>
#include <api/ImportExport.h>

#include <string>
#include <tuple>
#include <utility>

namespace ml {
namespace api {
Expand All @@ -21,16 +23,22 @@ namespace api {
//! (such as total feature importance) into JSON format.
class API_EXPORT CInferenceModelMetadata {
public:
static const std::string JSON_ABSOLUTE_IMPORTANCE_TAG;
static const std::string JSON_BASELINE_TAG;
static const std::string JSON_FEATURE_IMPORTANCE_BASELINE_TAG;
static const std::string JSON_CLASS_NAME_TAG;
static const std::string JSON_CLASSES_TAG;
static const std::string JSON_FEATURE_IMPORTANCE_BASELINE_TAG;
static const std::string JSON_FEATURE_NAME_TAG;
static const std::string JSON_HYPERPARAMETERS_TAG;
static const std::string JSON_HYPERPARAMETER_NAME_TAG;
static const std::string JSON_HYPERPARAMETER_VALUE_TAG;
static const std::string JSON_HYPERPARAMETER_SUPPLIED_TAG;
static const std::string JSON_IMPORTANCE_TAG;
static const std::string JSON_MAX_TAG;
static const std::string JSON_MEAN_MAGNITUDE_TAG;
static const std::string JSON_MIN_TAG;
static const std::string JSON_MODEL_METADATA_TAG;
static const std::string JSON_RELATIVE_IMPORTANCE_TAG;
static const std::string JSON_TOTAL_FEATURE_IMPORTANCE_TAG;

public:
Expand All @@ -53,17 +61,36 @@ class API_EXPORT CInferenceModelMetadata {
//! Set the feature importance baseline (the individual feature importances are additive corrections
//! to the baseline value).
void featureImportanceBaseline(TVector&& baseline);
void hyperparameterImportance(const maths::CBoostedTree::THyperparameterImportanceVec& hyperparameterImportance);

private:
//! \brief Holds the importance results for a single hyperparameter,
//! keyed by the hyperparameter's name, for writing into the model metadata.
struct SHyperparameterImportance {
    //! \param hyperparameterName The hyperparameter's name.
    //! \param value The value chosen for the hyperparameter.
    //! \param absoluteImportance The hyperparameter's absolute importance.
    //! \param relativeImportance The importance as a fraction of the total.
    //! \param supplied True if the value was supplied rather than tuned.
    SHyperparameterImportance(std::string hyperparameterName,
                              double value,
                              double absoluteImportance,
                              double relativeImportance,
                              bool supplied)
        // The name is taken by value and moved to avoid an extra copy.
        : s_HyperparameterName(std::move(hyperparameterName)), s_Value(value),
          s_AbsoluteImportance(absoluteImportance),
          s_RelativeImportance(relativeImportance), s_Supplied(supplied) {}
    std::string s_HyperparameterName;
    double s_Value;
    double s_AbsoluteImportance;
    double s_RelativeImportance;
    bool s_Supplied;
};

using TMeanAccumulator =
std::vector<maths::CBasicStatistics::SSampleMean<double>::TAccumulator>;
using TMinMaxAccumulator = std::vector<maths::CBasicStatistics::CMinMax<double>>;
using TSizeMeanAccumulatorUMap = std::unordered_map<std::size_t, TMeanAccumulator>;
using TSizeMinMaxAccumulatorUMap = std::unordered_map<std::size_t, TMinMaxAccumulator>;
using TOptionalVector = boost::optional<TVector>;
using THyperparametersVec = std::vector<SHyperparameterImportance>;

private:
void writeTotalFeatureImportance(TRapidJsonWriter& writer) const;
void writeHyperparameterImportance(TRapidJsonWriter& writer) const;
void writeFeatureImportanceBaseline(TRapidJsonWriter& writer) const;

private:
Expand All @@ -76,6 +103,7 @@ class API_EXPORT CInferenceModelMetadata {
[](const std::string& value, TRapidJsonWriter& writer) {
writer.String(value);
};
THyperparametersVec m_HyperparameterImportance;
};
}
}
Expand Down
33 changes: 33 additions & 0 deletions include/maths/CBayesianOptimisation.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,31 @@ class MATHS_EXPORT CBayesianOptimisation {
static std::size_t estimateMemoryUsage(std::size_t numberParameters,
std::size_t numberRounds);

//! Evaluate the Gaussian process at the point \p input.
double evaluate(const TVector& input) const;

//! Compute the marginalized value of the Gaussian process in the dimension
//! \p dimension for the value \p input.
double evaluate1D(double input, int dimension) const;

//! Get the constant factor of the ANOVA decomposition of the Gaussian process.
double anovaConstantFactor() const;

//! Get the total variance of the hyperparameters in the Gaussian process
//! using ANOVA decomposition.
double anovaTotalVariance() const;

//! Get the main effect of the parameter \p dimension in the Gaussian process
//! using ANOVA decomposition.
double anovaMainEffect(int dimension) const;

//! Get the vector of main effects as an absolute value and as a fraction
//! of the total variance.
TDoubleDoublePrVec anovaMainEffects() const;

//! Set kernel \p parameters explicitly.
void kernelParameters(const TVector& parameters);

//! \name Test Interface
//@{
//! Get minus the data likelihood and its gradient as a function of the kernel
Expand Down Expand Up @@ -132,6 +157,14 @@ class MATHS_EXPORT CBayesianOptimisation {
TMatrix kernel(const TVector& a, double v) const;
TVectorDoublePr kernelCovariates(const TVector& a, const TVector& x, double vx) const;
double kernel(const TVector& a, const TVector& x, const TVector& y) const;
double evaluate(const TVector& Kinvf, const TVector& input) const;
double evaluate1D(const TVector& Kinvf, double input, int dimension) const;
double anovaConstantFactor(const TVector& Kinvf) const;
double anovaTotalVariance(const TVector& Kinvf) const;
double anovaMainEffect(const TVector& Kinvf, int dimension) const;
TVector kinvf() const;
TVector transformTo01(const TVector& x) const;
TVector scaledKernelParameters() const;

private:
CPRNG::CXorOShiro128Plus m_Rng;
Expand Down
6 changes: 6 additions & 0 deletions include/maths/CBoostedTree.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <core/CStateRestoreTraverser.h>

#include <maths/CBoostedTreeHyperparameters.h>
#include <maths/CBoostedTreeUtils.h>
#include <maths/CDataFrameCategoryEncoder.h>
#include <maths/CDataFramePredictiveModel.h>
#include <maths/CLinearAlgebraEigen.h>
Expand Down Expand Up @@ -201,6 +202,8 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel {
using TDataFramePtr = core::CDataFrame*;
using TNodeVec = std::vector<CBoostedTreeNode>;
using TNodeVecVec = std::vector<TNodeVec>;
using THyperparameterImportanceVec =
std::vector<boosted_tree_detail::SHyperparameterImportance>;

class MATHS_EXPORT CVisitor : public CDataFrameCategoryEncoder::CVisitor,
public CBoostedTreeNode::CVisitor {
Expand Down Expand Up @@ -230,6 +233,9 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel {
//! \warning Will return a nullptr if a trained model isn't available.
CTreeShapFeatureImportance* shap() const override;

//! Get the vector of hyperparameter importances.
THyperparameterImportanceVec hyperparameterImportance() const;

//! Get the column containing the dependent variable.
std::size_t columnHoldingDependentVariable() const override;

Expand Down
12 changes: 10 additions & 2 deletions include/maths/CBoostedTreeImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
#include <maths/ImportExport.h>

#include <boost/optional.hpp>
#include <boost/range/irange.hpp>

#include <limits>
#include <memory>
Expand Down Expand Up @@ -66,8 +65,9 @@ class MATHS_EXPORT CBoostedTreeImpl final {
using TOptionalDouble = boost::optional<double>;
using TRegularization = CBoostedTreeRegularization<double>;
using TSizeVec = std::vector<std::size_t>;
using TSizeRange = boost::integer_range<std::size_t>;
using TAnalysisInstrumentationPtr = CDataFrameTrainBoostedTreeInstrumentationInterface*;
using THyperparameterImportanceVec =
std::vector<boosted_tree_detail::SHyperparameterImportance>;

public:
static const double MINIMUM_RELATIVE_GAIN_PER_SPLIT;
Expand Down Expand Up @@ -95,6 +95,9 @@ class MATHS_EXPORT CBoostedTreeImpl final {
//! \warning Will return a nullptr if a trained model isn't available.
CTreeShapFeatureImportance* shap();

//! Get the vector of hyperparameter importances.
THyperparameterImportanceVec hyperparameterImportance() const;

//! Get the model produced by training if it has been run.
const TNodeVecVec& trainedModel() const;

Expand Down Expand Up @@ -174,6 +177,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
using TRegularizationOverride = CBoostedTreeRegularization<TOptionalDouble>;
using TTreeShapFeatureImportanceUPtr = std::unique_ptr<CTreeShapFeatureImportance>;
using TWorkspace = CBoostedTreeLeafNodeStatistics::CWorkspace;
using THyperparametersVec = std::vector<boosted_tree_detail::EHyperparameters>;

//! Tag progress through initialization.
enum EInitializationStage {
Expand Down Expand Up @@ -326,6 +330,9 @@ class MATHS_EXPORT CBoostedTreeImpl final {
//! Record hyperparameters for instrumentation.
void recordHyperparameters();

//! Populate the list of tunable hyperparameters
void initializeTunableHyperparameters();

private:
mutable CPRNG::CXorOShiro128Plus m_Rng;
EInitializationStage m_InitializationStage = E_NotInitialized;
Expand Down Expand Up @@ -374,6 +381,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
TAnalysisInstrumentationPtr m_Instrumentation;
mutable TMeanAccumulator m_ForestSizeAccumulator;
mutable TMeanAccumulator m_MeanLossAccumulator;
THyperparametersVec m_TunableHyperparameters;

private:
friend class CBoostedTreeFactory;
Expand Down
30 changes: 30 additions & 0 deletions include/maths/CBoostedTreeUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,36 @@ using TAlignedMemoryMappedFloatVector =

enum EExtraColumn { E_Prediction = 0, E_Gradient, E_Curvature, E_Weight };

//! Enumerates the hyperparameters which can be optimized in fine tuning.
enum EHyperparameters {
    E_DownsampleFactor = 0,
    E_Alpha,
    E_Lambda,
    E_Gamma,
    E_SoftTreeDepthLimit,
    E_SoftTreeDepthTolerance,
    E_Eta,
    E_EtaGrowthRatePerTree,
    E_FeatureBagFraction // Keep last: NUMBER_HYPERPARAMETERS is derived from it.
};

//! The number of distinct hyperparameters (one past the last enumerator).
constexpr std::size_t NUMBER_HYPERPARAMETERS = E_FeatureBagFraction + 1;

//! \brief Records the value and importance of a single hyperparameter.
struct SHyperparameterImportance {
    SHyperparameterImportance(EHyperparameters parameter,
                              double parameterValue,
                              double absoluteImportanceValue,
                              double relativeImportanceValue,
                              bool wasSupplied)
        : s_Hyperparameter{parameter}, s_Value{parameterValue},
          s_AbsoluteImportance{absoluteImportanceValue},
          s_RelativeImportance{relativeImportanceValue}, s_Supplied{wasSupplied} {}
    //! The hyperparameter this entry describes.
    EHyperparameters s_Hyperparameter;
    //! The value chosen for the hyperparameter.
    double s_Value;
    //! The hyperparameter's absolute importance.
    double s_AbsoluteImportance;
    //! The importance as a fraction of the total.
    double s_RelativeImportance;
    //! True if the value was supplied rather than tuned.
    bool s_Supplied;
};

//! Get the size of the upper triangle of the loss Hessian.
inline std::size_t lossHessianUpperTriangleSize(std::size_t numberLossParameters) {
return numberLossParameters * (numberLossParameters + 1) / 2;
Expand Down
4 changes: 4 additions & 0 deletions include/maths/CSampling.h
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,10 @@ class MATHS_EXPORT CSampling : private core::CNonInstantiatable {
//! and \p rate on the \p n quantile intervals.
static void gammaSampleQuantiles(double shape, double rate, std::size_t n, TDoubleVec& result);

//! Generate \p n elements of a Sobol sequence on the hypercube
//! [0, 1] in \p dim dimensions and write them to \p samples.
static void sobolSequenceSample(std::size_t dim, std::size_t n, TDoubleVecVec& samples);

private:
//! \brief A uniform generator on the interval [0, n).
template<typename RNG>
Expand Down
2 changes: 2 additions & 0 deletions lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,8 @@ CDataFrameTrainBoostedTreeClassifierRunner::inferenceModelMetadata() const {
if (featureImportance) {
m_InferenceModelMetadata.featureImportanceBaseline(featureImportance->baseline());
}
m_InferenceModelMetadata.hyperparameterImportance(
this->boostedTree().hyperparameterImportance());
return m_InferenceModelMetadata;
}

Expand Down
2 changes: 2 additions & 0 deletions lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ CDataFrameTrainBoostedTreeRegressionRunner::inferenceModelMetadata() const {
if (featureImportance) {
m_InferenceModelMetadata.featureImportanceBaseline(featureImportance->baseline());
}
m_InferenceModelMetadata.hyperparameterImportance(
this->boostedTree().hyperparameterImportance());
return m_InferenceModelMetadata;
}

Expand Down
1 change: 1 addition & 0 deletions lib/api/CDataFrameTrainBoostedTreeRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@ const std::string CDataFrameTrainBoostedTreeRunner::ALPHA{"alpha"};
const std::string CDataFrameTrainBoostedTreeRunner::LAMBDA{"lambda"};
const std::string CDataFrameTrainBoostedTreeRunner::GAMMA{"gamma"};
const std::string CDataFrameTrainBoostedTreeRunner::ETA{"eta"};
const std::string CDataFrameTrainBoostedTreeRunner::ETA_GROWTH_RATE_PER_TREE{"eta_growth_rate_per_tree"};
const std::string CDataFrameTrainBoostedTreeRunner::SOFT_TREE_DEPTH_LIMIT{"soft_tree_depth_limit"};
const std::string CDataFrameTrainBoostedTreeRunner::SOFT_TREE_DEPTH_TOLERANCE{"soft_tree_depth_tolerance"};
const std::string CDataFrameTrainBoostedTreeRunner::MAX_TREES{"max_trees"};
Expand Down
Loading