Skip to content

Commit

Permalink
[7.x][ML] Hyperparameter importance (elastic#1649)
Browse files Browse the repository at this point in the history
Backport of  elastic#1627
  • Loading branch information
valeriy42 authored Jan 8, 2021
1 parent e8da7bc commit 5e2e2ff
Show file tree
Hide file tree
Showing 22 changed files with 932 additions and 9 deletions.
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@

* Fix edge case which could cause spurious anomalies early in the learning process
if the time series has non-diurnal seasonality. (See {ml-pull}1634[#1634].)
* Compute importance of hyperparameters optimized in the fine parameter tuning step.
(See {ml-pull}1627[#1627].)

== {es} version 7.11.0

Expand Down
1 change: 1 addition & 0 deletions include/api/CDataFrameTrainBoostedTreeRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
static const std::string LAMBDA;
static const std::string GAMMA;
static const std::string ETA;
static const std::string ETA_GROWTH_RATE_PER_TREE;
static const std::string SOFT_TREE_DEPTH_LIMIT;
static const std::string SOFT_TREE_DEPTH_TOLERANCE;
static const std::string MAX_TREES;
Expand Down
30 changes: 29 additions & 1 deletion include/api/CInferenceModelMetadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
#define INCLUDED_ml_api_CInferenceModelMetadata_h

#include <maths/CBasicStatistics.h>
#include <maths/CBoostedTree.h>
#include <maths/CLinearAlgebraEigen.h>

#include <api/CInferenceModelDefinition.h>
#include <api/ImportExport.h>

#include <string>
#include <tuple>

namespace ml {
namespace api {
Expand All @@ -21,16 +23,22 @@ namespace api {
//! (such as total feature importance) into JSON format.
class API_EXPORT CInferenceModelMetadata {
public:
static const std::string JSON_ABSOLUTE_IMPORTANCE_TAG;
static const std::string JSON_BASELINE_TAG;
static const std::string JSON_FEATURE_IMPORTANCE_BASELINE_TAG;
static const std::string JSON_CLASS_NAME_TAG;
static const std::string JSON_CLASSES_TAG;
static const std::string JSON_FEATURE_IMPORTANCE_BASELINE_TAG;
static const std::string JSON_FEATURE_NAME_TAG;
static const std::string JSON_HYPERPARAMETERS_TAG;
static const std::string JSON_HYPERPARAMETER_NAME_TAG;
static const std::string JSON_HYPERPARAMETER_VALUE_TAG;
static const std::string JSON_HYPERPARAMETER_SUPPLIED_TAG;
static const std::string JSON_IMPORTANCE_TAG;
static const std::string JSON_MAX_TAG;
static const std::string JSON_MEAN_MAGNITUDE_TAG;
static const std::string JSON_MIN_TAG;
static const std::string JSON_MODEL_METADATA_TAG;
static const std::string JSON_RELATIVE_IMPORTANCE_TAG;
static const std::string JSON_TOTAL_FEATURE_IMPORTANCE_TAG;

public:
Expand All @@ -53,17 +61,36 @@ class API_EXPORT CInferenceModelMetadata {
//! Set the feature importance baseline (the individual feature importances are additive corrections
//! to the baseline value).
void featureImportanceBaseline(TVector&& baseline);
void hyperparameterImportance(const maths::CBoostedTree::THyperparameterImportanceVec& hyperparameterImportance);

private:
//! \brief Importance information for a single hyperparameter, ready for
//! serialization into the model metadata JSON.
struct SHyperparameterImportance {
    //! \param[in] hyperparameterName The hyperparameter's JSON field name.
    //! \param[in] value The value the hyperparameter was trained with.
    //! \param[in] absoluteImportance The absolute importance (variance explained).
    //! \param[in] relativeImportance The importance as a fraction of total variance.
    //! \param[in] supplied True if the user supplied the value (it was not tuned).
    SHyperparameterImportance(std::string hyperparameterName,
                              double value,
                              double absoluteImportance,
                              double relativeImportance,
                              bool supplied)
        // Move the name: the parameter is a by-value sink, so copying it
        // again here would be a needless allocation.
        : s_HyperparameterName(std::move(hyperparameterName)), s_Value(value),
          s_AbsoluteImportance(absoluteImportance),
          s_RelativeImportance(relativeImportance), s_Supplied(supplied) {}
    std::string s_HyperparameterName;
    double s_Value;
    double s_AbsoluteImportance;
    double s_RelativeImportance;
    bool s_Supplied;
};

using TMeanAccumulator =
std::vector<maths::CBasicStatistics::SSampleMean<double>::TAccumulator>;
using TMinMaxAccumulator = std::vector<maths::CBasicStatistics::CMinMax<double>>;
using TSizeMeanAccumulatorUMap = std::unordered_map<std::size_t, TMeanAccumulator>;
using TSizeMinMaxAccumulatorUMap = std::unordered_map<std::size_t, TMinMaxAccumulator>;
using TOptionalVector = boost::optional<TVector>;
using THyperparametersVec = std::vector<SHyperparameterImportance>;

private:
void writeTotalFeatureImportance(TRapidJsonWriter& writer) const;
void writeHyperparameterImportance(TRapidJsonWriter& writer) const;
void writeFeatureImportanceBaseline(TRapidJsonWriter& writer) const;

private:
Expand All @@ -76,6 +103,7 @@ class API_EXPORT CInferenceModelMetadata {
[](const std::string& value, TRapidJsonWriter& writer) {
writer.String(value);
};
THyperparametersVec m_HyperparameterImportance;
};
}
}
Expand Down
33 changes: 33 additions & 0 deletions include/maths/CBayesianOptimisation.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,31 @@ class MATHS_EXPORT CBayesianOptimisation {
static std::size_t estimateMemoryUsage(std::size_t numberParameters,
std::size_t numberRounds);

//! Evaluate the Gaussian process at the point \p input.
double evaluate(const TVector& input) const;

//! Compute the marginalized value of the Gaussian process in the dimension
//! \p dimension for the values \p input.
double evaluate1D(double input, int dimension) const;

//! Get the constant factor of the ANOVA decomposition of the Gaussian process.
double anovaConstantFactor() const;

//! Get the total variance of the hyperparameters in the Gaussian process
//! using ANOVA decomposition.
double anovaTotalVariance() const;

//! Get the main effect of the parameter \p dimension in the Gaussian process
//! using ANOVA decomposition.
double anovaMainEffect(int dimension) const;

//! Get the vector of main effects as an absolute value and as a fraction
//! of the total variance.
TDoubleDoublePrVec anovaMainEffects() const;

//! Set kernel \p parameters explicitly.
void kernelParameters(const TVector& parameters);

//! \name Test Interface
//@{
//! Get minus the data likelihood and its gradient as a function of the kernel
Expand Down Expand Up @@ -132,6 +157,14 @@ class MATHS_EXPORT CBayesianOptimisation {
TMatrix kernel(const TVector& a, double v) const;
TVectorDoublePr kernelCovariates(const TVector& a, const TVector& x, double vx) const;
double kernel(const TVector& a, const TVector& x, const TVector& y) const;
double evaluate(const TVector& Kinvf, const TVector& input) const;
double evaluate1D(const TVector& Kinvf, double input, int dimension) const;
double anovaConstantFactor(const TVector& Kinvf) const;
double anovaTotalVariance(const TVector& Kinvf) const;
double anovaMainEffect(const TVector& Kinvf, int dimension) const;
TVector kinvf() const;
TVector transformTo01(const TVector& x) const;
TVector scaledKernelParameters() const;

private:
CPRNG::CXorOShiro128Plus m_Rng;
Expand Down
6 changes: 6 additions & 0 deletions include/maths/CBoostedTree.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <core/CStateRestoreTraverser.h>

#include <maths/CBoostedTreeHyperparameters.h>
#include <maths/CBoostedTreeUtils.h>
#include <maths/CDataFrameCategoryEncoder.h>
#include <maths/CDataFramePredictiveModel.h>
#include <maths/CLinearAlgebraEigen.h>
Expand Down Expand Up @@ -201,6 +202,8 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel {
using TDataFramePtr = core::CDataFrame*;
using TNodeVec = std::vector<CBoostedTreeNode>;
using TNodeVecVec = std::vector<TNodeVec>;
using THyperparameterImportanceVec =
std::vector<boosted_tree_detail::SHyperparameterImportance>;

class MATHS_EXPORT CVisitor : public CDataFrameCategoryEncoder::CVisitor,
public CBoostedTreeNode::CVisitor {
Expand Down Expand Up @@ -230,6 +233,9 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel {
//! \warning Will return a nullptr if a trained model isn't available.
CTreeShapFeatureImportance* shap() const override;

//! Get the vector of hyperparameter importances.
THyperparameterImportanceVec hyperparameterImportance() const;

//! Get the column containing the dependent variable.
std::size_t columnHoldingDependentVariable() const override;

Expand Down
12 changes: 10 additions & 2 deletions include/maths/CBoostedTreeImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
#include <maths/ImportExport.h>

#include <boost/optional.hpp>
#include <boost/range/irange.hpp>

#include <limits>
#include <memory>
Expand Down Expand Up @@ -66,8 +65,9 @@ class MATHS_EXPORT CBoostedTreeImpl final {
using TOptionalDouble = boost::optional<double>;
using TRegularization = CBoostedTreeRegularization<double>;
using TSizeVec = std::vector<std::size_t>;
using TSizeRange = boost::integer_range<std::size_t>;
using TAnalysisInstrumentationPtr = CDataFrameTrainBoostedTreeInstrumentationInterface*;
using THyperparameterImportanceVec =
std::vector<boosted_tree_detail::SHyperparameterImportance>;

public:
static const double MINIMUM_RELATIVE_GAIN_PER_SPLIT;
Expand Down Expand Up @@ -95,6 +95,9 @@ class MATHS_EXPORT CBoostedTreeImpl final {
//! \warning Will return a nullptr if a trained model isn't available.
CTreeShapFeatureImportance* shap();

//! Get the vector of hyperparameter importances.
THyperparameterImportanceVec hyperparameterImportance() const;

//! Get the model produced by training if it has been run.
const TNodeVecVec& trainedModel() const;

Expand Down Expand Up @@ -174,6 +177,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
using TRegularizationOverride = CBoostedTreeRegularization<TOptionalDouble>;
using TTreeShapFeatureImportanceUPtr = std::unique_ptr<CTreeShapFeatureImportance>;
using TWorkspace = CBoostedTreeLeafNodeStatistics::CWorkspace;
using THyperparametersVec = std::vector<boosted_tree_detail::EHyperparameters>;

//! Tag progress through initialization.
enum EInitializationStage {
Expand Down Expand Up @@ -326,6 +330,9 @@ class MATHS_EXPORT CBoostedTreeImpl final {
//! Record hyperparameters for instrumentation.
void recordHyperparameters();

//! Populate the list of tunable hyperparameters
void initializeTunableHyperparameters();

private:
mutable CPRNG::CXorOShiro128Plus m_Rng;
EInitializationStage m_InitializationStage = E_NotInitialized;
Expand Down Expand Up @@ -374,6 +381,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
TAnalysisInstrumentationPtr m_Instrumentation;
mutable TMeanAccumulator m_ForestSizeAccumulator;
mutable TMeanAccumulator m_MeanLossAccumulator;
THyperparametersVec m_TunableHyperparameters;

private:
friend class CBoostedTreeFactory;
Expand Down
30 changes: 30 additions & 0 deletions include/maths/CBoostedTreeUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,36 @@ using TAlignedMemoryMappedFloatVector =

enum EExtraColumn { E_Prediction = 0, E_Gradient, E_Curvature, E_Weight };

//! The hyperparameters which may be optimized in fine parameter tuning.
enum EHyperparameters {
    E_DownsampleFactor = 0,
    E_Alpha,
    E_Lambda,
    E_Gamma,
    E_SoftTreeDepthLimit,
    E_SoftTreeDepthTolerance,
    E_Eta,
    E_EtaGrowthRatePerTree,
    E_FeatureBagFraction
};

// E_FeatureBagFraction must remain the last enumerator for this count to be correct.
constexpr std::size_t NUMBER_HYPERPARAMETERS{E_FeatureBagFraction + 1};

//! \brief Importance information for a single tuned hyperparameter.
struct SHyperparameterImportance {
    //! \param[in] parameter Which hyperparameter this record describes.
    //! \param[in] parameterValue The value used for training.
    //! \param[in] absolute The absolute importance of the hyperparameter.
    //! \param[in] relative The importance as a fraction of the total variance.
    //! \param[in] wasSupplied True if the value was supplied by the user.
    SHyperparameterImportance(EHyperparameters parameter,
                              double parameterValue,
                              double absolute,
                              double relative,
                              bool wasSupplied)
        : s_Hyperparameter{parameter}, s_Value{parameterValue},
          s_AbsoluteImportance{absolute}, s_RelativeImportance{relative},
          s_Supplied{wasSupplied} {}
    EHyperparameters s_Hyperparameter;
    double s_Value;
    double s_AbsoluteImportance;
    double s_RelativeImportance;
    bool s_Supplied;
};

//! Get the size of the upper triangle of the loss Hessian.
inline std::size_t lossHessianUpperTriangleSize(std::size_t numberLossParameters) {
return numberLossParameters * (numberLossParameters + 1) / 2;
Expand Down
4 changes: 4 additions & 0 deletions include/maths/CSampling.h
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,10 @@ class MATHS_EXPORT CSampling : private core::CNonInstantiatable {
//! and \p rate on the \p n quantile intervals.
static void gammaSampleQuantiles(double shape, double rate, std::size_t n, TDoubleVec& result);

//! Generate a Sobol sequence of \p n points on the hypercube [0, 1]
//! in \p dim dimensions and write them to \p samples.
static void sobolSequenceSample(std::size_t dim, std::size_t n, TDoubleVecVec& samples);

private:
//! \brief A uniform generator on the interval [0, n).
template<typename RNG>
Expand Down
2 changes: 2 additions & 0 deletions lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,8 @@ CDataFrameTrainBoostedTreeClassifierRunner::inferenceModelMetadata() const {
if (featureImportance) {
m_InferenceModelMetadata.featureImportanceBaseline(featureImportance->baseline());
}
m_InferenceModelMetadata.hyperparameterImportance(
this->boostedTree().hyperparameterImportance());
return m_InferenceModelMetadata;
}

Expand Down
2 changes: 2 additions & 0 deletions lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ CDataFrameTrainBoostedTreeRegressionRunner::inferenceModelMetadata() const {
if (featureImportance) {
m_InferenceModelMetadata.featureImportanceBaseline(featureImportance->baseline());
}
m_InferenceModelMetadata.hyperparameterImportance(
this->boostedTree().hyperparameterImportance());
return m_InferenceModelMetadata;
}

Expand Down
1 change: 1 addition & 0 deletions lib/api/CDataFrameTrainBoostedTreeRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@ const std::string CDataFrameTrainBoostedTreeRunner::ALPHA{"alpha"};
const std::string CDataFrameTrainBoostedTreeRunner::LAMBDA{"lambda"};
const std::string CDataFrameTrainBoostedTreeRunner::GAMMA{"gamma"};
const std::string CDataFrameTrainBoostedTreeRunner::ETA{"eta"};
const std::string CDataFrameTrainBoostedTreeRunner::ETA_GROWTH_RATE_PER_TREE{"eta_growth_rate_per_tree"};
const std::string CDataFrameTrainBoostedTreeRunner::SOFT_TREE_DEPTH_LIMIT{"soft_tree_depth_limit"};
const std::string CDataFrameTrainBoostedTreeRunner::SOFT_TREE_DEPTH_TOLERANCE{"soft_tree_depth_tolerance"};
const std::string CDataFrameTrainBoostedTreeRunner::MAX_TREES{"max_trees"};
Expand Down
Loading

0 comments on commit 5e2e2ff

Please sign in to comment.