Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[7.x][ML] Hyperparameter importance #1649

Merged
merged 1 commit into from
Jan 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@

* Fix edge case which could cause spurious anomalies early in the learning process
if the time series has non-diurnal seasonality. (See {ml-pull}1634[#1634].)
* Compute importance of hyperparameters optimized in the fine parameter tuning step.
(See {ml-pull}1627[#1627].)

== {es} version 7.11.0

Expand Down
1 change: 1 addition & 0 deletions include/api/CDataFrameTrainBoostedTreeRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
static const std::string LAMBDA;
static const std::string GAMMA;
static const std::string ETA;
static const std::string ETA_GROWTH_RATE_PER_TREE;
static const std::string SOFT_TREE_DEPTH_LIMIT;
static const std::string SOFT_TREE_DEPTH_TOLERANCE;
static const std::string MAX_TREES;
Expand Down
30 changes: 29 additions & 1 deletion include/api/CInferenceModelMetadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
#define INCLUDED_ml_api_CInferenceModelMetadata_h

#include <maths/CBasicStatistics.h>
#include <maths/CBoostedTree.h>
#include <maths/CLinearAlgebraEigen.h>

#include <api/CInferenceModelDefinition.h>
#include <api/ImportExport.h>

#include <string>
#include <tuple>
#include <utility>

namespace ml {
namespace api {
Expand All @@ -21,16 +23,22 @@ namespace api {
//! (such as total feature importance) into JSON format.
class API_EXPORT CInferenceModelMetadata {
public:
static const std::string JSON_ABSOLUTE_IMPORTANCE_TAG;
static const std::string JSON_BASELINE_TAG;
static const std::string JSON_FEATURE_IMPORTANCE_BASELINE_TAG;
static const std::string JSON_CLASS_NAME_TAG;
static const std::string JSON_CLASSES_TAG;
static const std::string JSON_FEATURE_IMPORTANCE_BASELINE_TAG;
static const std::string JSON_FEATURE_NAME_TAG;
static const std::string JSON_HYPERPARAMETERS_TAG;
static const std::string JSON_HYPERPARAMETER_NAME_TAG;
static const std::string JSON_HYPERPARAMETER_VALUE_TAG;
static const std::string JSON_HYPERPARAMETER_SUPPLIED_TAG;
static const std::string JSON_IMPORTANCE_TAG;
static const std::string JSON_MAX_TAG;
static const std::string JSON_MEAN_MAGNITUDE_TAG;
static const std::string JSON_MIN_TAG;
static const std::string JSON_MODEL_METADATA_TAG;
static const std::string JSON_RELATIVE_IMPORTANCE_TAG;
static const std::string JSON_TOTAL_FEATURE_IMPORTANCE_TAG;

public:
Expand All @@ -53,17 +61,36 @@ class API_EXPORT CInferenceModelMetadata {
//! Set the feature importance baseline (the individual feature importances are additive corrections
//! to the baseline value).
void featureImportanceBaseline(TVector&& baseline);
void hyperparameterImportance(const maths::CBoostedTree::THyperparameterImportanceVec& hyperparameterImportance);

private:
//! \brief Holds the importance results for a single hyperparameter,
//! keyed by the hyperparameter's name, for writing into the model metadata.
struct SHyperparameterImportance {
    //! \param hyperparameterName The hyperparameter's name.
    //! \param value The value chosen for the hyperparameter.
    //! \param absoluteImportance The hyperparameter's absolute importance.
    //! \param relativeImportance The importance as a fraction of the total.
    //! \param supplied True if the value was supplied rather than tuned.
    SHyperparameterImportance(std::string hyperparameterName,
                              double value,
                              double absoluteImportance,
                              double relativeImportance,
                              bool supplied)
        // The name is taken by value and moved to avoid an extra copy.
        : s_HyperparameterName(std::move(hyperparameterName)), s_Value(value),
          s_AbsoluteImportance(absoluteImportance),
          s_RelativeImportance(relativeImportance), s_Supplied(supplied) {}
    std::string s_HyperparameterName;
    double s_Value;
    double s_AbsoluteImportance;
    double s_RelativeImportance;
    bool s_Supplied;
};

using TMeanAccumulator =
std::vector<maths::CBasicStatistics::SSampleMean<double>::TAccumulator>;
using TMinMaxAccumulator = std::vector<maths::CBasicStatistics::CMinMax<double>>;
using TSizeMeanAccumulatorUMap = std::unordered_map<std::size_t, TMeanAccumulator>;
using TSizeMinMaxAccumulatorUMap = std::unordered_map<std::size_t, TMinMaxAccumulator>;
using TOptionalVector = boost::optional<TVector>;
using THyperparametersVec = std::vector<SHyperparameterImportance>;

private:
void writeTotalFeatureImportance(TRapidJsonWriter& writer) const;
void writeHyperparameterImportance(TRapidJsonWriter& writer) const;
void writeFeatureImportanceBaseline(TRapidJsonWriter& writer) const;

private:
Expand All @@ -76,6 +103,7 @@ class API_EXPORT CInferenceModelMetadata {
[](const std::string& value, TRapidJsonWriter& writer) {
writer.String(value);
};
THyperparametersVec m_HyperparameterImportance;
};
}
}
Expand Down
33 changes: 33 additions & 0 deletions include/maths/CBayesianOptimisation.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,31 @@ class MATHS_EXPORT CBayesianOptimisation {
static std::size_t estimateMemoryUsage(std::size_t numberParameters,
std::size_t numberRounds);

//! Evaluate the Gaussian process at the point \p input.
double evaluate(const TVector& input) const;

//! Compute the marginalized value of the Gaussian process in the dimension
//! \p dimension for the value \p input.
double evaluate1D(double input, int dimension) const;

//! Get the constant factor of the ANOVA decomposition of the Gaussian process.
double anovaConstantFactor() const;

//! Get the total variance of the hyperparameters in the Gaussian process
//! using ANOVA decomposition.
double anovaTotalVariance() const;

//! Get the main effect of the parameter \p dimension in the Gaussian process
//! using ANOVA decomposition.
double anovaMainEffect(int dimension) const;

//! Get the vector of main effects as an absolute value and as a fraction
//! of the total variance.
TDoubleDoublePrVec anovaMainEffects() const;

//! Set kernel \p parameters explicitly.
void kernelParameters(const TVector& parameters);

//! \name Test Interface
//@{
//! Get minus the data likelihood and its gradient as a function of the kernel
Expand Down Expand Up @@ -132,6 +157,14 @@ class MATHS_EXPORT CBayesianOptimisation {
TMatrix kernel(const TVector& a, double v) const;
TVectorDoublePr kernelCovariates(const TVector& a, const TVector& x, double vx) const;
double kernel(const TVector& a, const TVector& x, const TVector& y) const;
double evaluate(const TVector& Kinvf, const TVector& input) const;
double evaluate1D(const TVector& Kinvf, double input, int dimension) const;
double anovaConstantFactor(const TVector& Kinvf) const;
double anovaTotalVariance(const TVector& Kinvf) const;
double anovaMainEffect(const TVector& Kinvf, int dimension) const;
TVector kinvf() const;
TVector transformTo01(const TVector& x) const;
TVector scaledKernelParameters() const;

private:
CPRNG::CXorOShiro128Plus m_Rng;
Expand Down
6 changes: 6 additions & 0 deletions include/maths/CBoostedTree.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <core/CStateRestoreTraverser.h>

#include <maths/CBoostedTreeHyperparameters.h>
#include <maths/CBoostedTreeUtils.h>
#include <maths/CDataFrameCategoryEncoder.h>
#include <maths/CDataFramePredictiveModel.h>
#include <maths/CLinearAlgebraEigen.h>
Expand Down Expand Up @@ -201,6 +202,8 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel {
using TDataFramePtr = core::CDataFrame*;
using TNodeVec = std::vector<CBoostedTreeNode>;
using TNodeVecVec = std::vector<TNodeVec>;
using THyperparameterImportanceVec =
std::vector<boosted_tree_detail::SHyperparameterImportance>;

class MATHS_EXPORT CVisitor : public CDataFrameCategoryEncoder::CVisitor,
public CBoostedTreeNode::CVisitor {
Expand Down Expand Up @@ -230,6 +233,9 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel {
//! \warning Will return a nullptr if a trained model isn't available.
CTreeShapFeatureImportance* shap() const override;

//! Get the vector of hyperparameter importances.
THyperparameterImportanceVec hyperparameterImportance() const;

//! Get the column containing the dependent variable.
std::size_t columnHoldingDependentVariable() const override;

Expand Down
12 changes: 10 additions & 2 deletions include/maths/CBoostedTreeImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
#include <maths/ImportExport.h>

#include <boost/optional.hpp>
#include <boost/range/irange.hpp>

#include <limits>
#include <memory>
Expand Down Expand Up @@ -66,8 +65,9 @@ class MATHS_EXPORT CBoostedTreeImpl final {
using TOptionalDouble = boost::optional<double>;
using TRegularization = CBoostedTreeRegularization<double>;
using TSizeVec = std::vector<std::size_t>;
using TSizeRange = boost::integer_range<std::size_t>;
using TAnalysisInstrumentationPtr = CDataFrameTrainBoostedTreeInstrumentationInterface*;
using THyperparameterImportanceVec =
std::vector<boosted_tree_detail::SHyperparameterImportance>;

public:
static const double MINIMUM_RELATIVE_GAIN_PER_SPLIT;
Expand Down Expand Up @@ -95,6 +95,9 @@ class MATHS_EXPORT CBoostedTreeImpl final {
//! \warning Will return a nullptr if a trained model isn't available.
CTreeShapFeatureImportance* shap();

//! Get the vector of hyperparameter importances.
THyperparameterImportanceVec hyperparameterImportance() const;

//! Get the model produced by training if it has been run.
const TNodeVecVec& trainedModel() const;

Expand Down Expand Up @@ -174,6 +177,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
using TRegularizationOverride = CBoostedTreeRegularization<TOptionalDouble>;
using TTreeShapFeatureImportanceUPtr = std::unique_ptr<CTreeShapFeatureImportance>;
using TWorkspace = CBoostedTreeLeafNodeStatistics::CWorkspace;
using THyperparametersVec = std::vector<boosted_tree_detail::EHyperparameters>;

//! Tag progress through initialization.
enum EInitializationStage {
Expand Down Expand Up @@ -326,6 +330,9 @@ class MATHS_EXPORT CBoostedTreeImpl final {
//! Record hyperparameters for instrumentation.
void recordHyperparameters();

//! Populate the list of tunable hyperparameters
void initializeTunableHyperparameters();

private:
mutable CPRNG::CXorOShiro128Plus m_Rng;
EInitializationStage m_InitializationStage = E_NotInitialized;
Expand Down Expand Up @@ -374,6 +381,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
TAnalysisInstrumentationPtr m_Instrumentation;
mutable TMeanAccumulator m_ForestSizeAccumulator;
mutable TMeanAccumulator m_MeanLossAccumulator;
THyperparametersVec m_TunableHyperparameters;

private:
friend class CBoostedTreeFactory;
Expand Down
30 changes: 30 additions & 0 deletions include/maths/CBoostedTreeUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,36 @@ using TAlignedMemoryMappedFloatVector =

enum EExtraColumn { E_Prediction = 0, E_Gradient, E_Curvature, E_Weight };

//! Enumerates the hyperparameters which can be optimized in fine tuning.
enum EHyperparameters {
    E_DownsampleFactor = 0,
    E_Alpha,
    E_Lambda,
    E_Gamma,
    E_SoftTreeDepthLimit,
    E_SoftTreeDepthTolerance,
    E_Eta,
    E_EtaGrowthRatePerTree,
    E_FeatureBagFraction // Keep last: NUMBER_HYPERPARAMETERS is derived from it.
};

//! The number of distinct hyperparameters (one past the last enumerator).
constexpr std::size_t NUMBER_HYPERPARAMETERS = E_FeatureBagFraction + 1;

//! \brief Records the value and importance of a single hyperparameter.
struct SHyperparameterImportance {
    SHyperparameterImportance(EHyperparameters parameter,
                              double parameterValue,
                              double absoluteImportanceValue,
                              double relativeImportanceValue,
                              bool wasSupplied)
        : s_Hyperparameter{parameter}, s_Value{parameterValue},
          s_AbsoluteImportance{absoluteImportanceValue},
          s_RelativeImportance{relativeImportanceValue}, s_Supplied{wasSupplied} {}
    //! The hyperparameter this entry describes.
    EHyperparameters s_Hyperparameter;
    //! The value chosen for the hyperparameter.
    double s_Value;
    //! The hyperparameter's absolute importance.
    double s_AbsoluteImportance;
    //! The importance as a fraction of the total.
    double s_RelativeImportance;
    //! True if the value was supplied rather than tuned.
    bool s_Supplied;
};

//! Get the size of the upper triangle of the loss Hessian.
inline std::size_t lossHessianUpperTriangleSize(std::size_t numberLossParameters) {
return numberLossParameters * (numberLossParameters + 1) / 2;
Expand Down
4 changes: 4 additions & 0 deletions include/maths/CSampling.h
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,10 @@ class MATHS_EXPORT CSampling : private core::CNonInstantiatable {
//! and \p rate on the \p n quantile intervals.
static void gammaSampleQuantiles(double shape, double rate, std::size_t n, TDoubleVec& result);

//! Generate \p n elements of a Sobol sequence on the hypercube
//! [0, 1] in \p dim dimensions and write them to \p samples.
static void sobolSequenceSample(std::size_t dim, std::size_t n, TDoubleVecVec& samples);

private:
//! \brief A uniform generator on the interval [0, n).
template<typename RNG>
Expand Down
2 changes: 2 additions & 0 deletions lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,8 @@ CDataFrameTrainBoostedTreeClassifierRunner::inferenceModelMetadata() const {
if (featureImportance) {
m_InferenceModelMetadata.featureImportanceBaseline(featureImportance->baseline());
}
m_InferenceModelMetadata.hyperparameterImportance(
this->boostedTree().hyperparameterImportance());
return m_InferenceModelMetadata;
}

Expand Down
2 changes: 2 additions & 0 deletions lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ CDataFrameTrainBoostedTreeRegressionRunner::inferenceModelMetadata() const {
if (featureImportance) {
m_InferenceModelMetadata.featureImportanceBaseline(featureImportance->baseline());
}
m_InferenceModelMetadata.hyperparameterImportance(
this->boostedTree().hyperparameterImportance());
return m_InferenceModelMetadata;
}

Expand Down
1 change: 1 addition & 0 deletions lib/api/CDataFrameTrainBoostedTreeRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@ const std::string CDataFrameTrainBoostedTreeRunner::ALPHA{"alpha"};
const std::string CDataFrameTrainBoostedTreeRunner::LAMBDA{"lambda"};
const std::string CDataFrameTrainBoostedTreeRunner::GAMMA{"gamma"};
const std::string CDataFrameTrainBoostedTreeRunner::ETA{"eta"};
const std::string CDataFrameTrainBoostedTreeRunner::ETA_GROWTH_RATE_PER_TREE{"eta_growth_rate_per_tree"};
const std::string CDataFrameTrainBoostedTreeRunner::SOFT_TREE_DEPTH_LIMIT{"soft_tree_depth_limit"};
const std::string CDataFrameTrainBoostedTreeRunner::SOFT_TREE_DEPTH_TOLERANCE{"soft_tree_depth_tolerance"};
const std::string CDataFrameTrainBoostedTreeRunner::MAX_TREES{"max_trees"};
Expand Down
Loading