diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 67dd1f7d03..ba96df8ba2 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -34,6 +34,8 @@ * Fix edge case which could cause spurious anomalies early in the learning process if the time series has non-diurnal seasonality. (See {ml-pull}1634[#1634].) +* Compute importance of hyperparameters optimized in the fine parameter tuning step. + (See {ml-pull}1627[#1627].) == {es} version 7.11.0 diff --git a/include/api/CDataFrameTrainBoostedTreeRunner.h b/include/api/CDataFrameTrainBoostedTreeRunner.h index c36adac096..b1ccc0c79e 100644 --- a/include/api/CDataFrameTrainBoostedTreeRunner.h +++ b/include/api/CDataFrameTrainBoostedTreeRunner.h @@ -42,6 +42,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun static const std::string LAMBDA; static const std::string GAMMA; static const std::string ETA; + static const std::string ETA_GROWTH_RATE_PER_TREE; static const std::string SOFT_TREE_DEPTH_LIMIT; static const std::string SOFT_TREE_DEPTH_TOLERANCE; static const std::string MAX_TREES; diff --git a/include/api/CInferenceModelMetadata.h b/include/api/CInferenceModelMetadata.h index 8e3256a778..423e8ada69 100644 --- a/include/api/CInferenceModelMetadata.h +++ b/include/api/CInferenceModelMetadata.h @@ -7,12 +7,14 @@ #define INCLUDED_ml_api_CInferenceModelMetadata_h #include +#include #include #include #include #include +#include namespace ml { namespace api { @@ -21,16 +23,22 @@ namespace api { //! (such as total feature importance) into JSON format. class API_EXPORT CInferenceModelMetadata { public: + static const std::string JSON_ABSOLUTE_IMPORTANCE_TAG; static const std::string JSON_BASELINE_TAG; - static const std::string JSON_FEATURE_IMPORTANCE_BASELINE_TAG; static const std::string JSON_CLASS_NAME_TAG; static const std::string JSON_CLASSES_TAG; + static const std::string JSON_FEATURE_IMPORTANCE_BASELINE_TAG; static const std::string JSON_FEATURE_NAME_TAG; + static const std::string JSON_HYPERPARAMETERS_TAG; + static const std::string JSON_HYPERPARAMETER_NAME_TAG; + static const std::string JSON_HYPERPARAMETER_VALUE_TAG; + static const std::string JSON_HYPERPARAMETER_SUPPLIED_TAG; static const std::string JSON_IMPORTANCE_TAG; static const std::string JSON_MAX_TAG; static const std::string JSON_MEAN_MAGNITUDE_TAG; static const std::string JSON_MIN_TAG; static const std::string JSON_MODEL_METADATA_TAG; + static const std::string JSON_RELATIVE_IMPORTANCE_TAG; static const std::string JSON_TOTAL_FEATURE_IMPORTANCE_TAG; public: @@ -53,17 +61,36 @@ class API_EXPORT CInferenceModelMetadata { //! Set the feature importance baseline (the individual feature importances are additive corrections //! to the baseline value).
void featureImportanceBaseline(TVector&& baseline); + void hyperparameterImportance(const maths::CBoostedTree::THyperparameterImportanceVec& hyperparameterImportance); private: + struct SHyperparameterImportance { + SHyperparameterImportance(std::string hyperparameterName, + double value, + double absoluteImportance, + double relativeImportance, + bool supplied) + : s_HyperparameterName(hyperparameterName), s_Value(value), + s_AbsoluteImportance(absoluteImportance), + s_RelativeImportance(relativeImportance), s_Supplied(supplied) {} + std::string s_HyperparameterName; + double s_Value; + double s_AbsoluteImportance; + double s_RelativeImportance; + bool s_Supplied; + }; + using TMeanAccumulator = std::vector<maths::CBasicStatistics::SSampleMean<double>::TAccumulator>; using TMinMaxAccumulator = std::vector<maths::CBasicStatistics::CMinMax<double>>; using TSizeMeanAccumulatorUMap = std::unordered_map<std::size_t, TMeanAccumulator>; using TSizeMinMaxAccumulatorUMap = std::unordered_map<std::size_t, TMinMaxAccumulator>; using TOptionalVector = boost::optional<TVector>; + using THyperparametersVec = std::vector<SHyperparameterImportance>; private: void writeTotalFeatureImportance(TRapidJsonWriter& writer) const; + void writeHyperparameterImportance(TRapidJsonWriter& writer) const; void writeFeatureImportanceBaseline(TRapidJsonWriter& writer) const; private: @@ -76,6 +103,7 @@ class API_EXPORT CInferenceModelMetadata { [](const std::string& value, TRapidJsonWriter& writer) { writer.String(value); }; + THyperparametersVec m_HyperparameterImportance; }; } } diff --git a/include/maths/CBayesianOptimisation.h b/include/maths/CBayesianOptimisation.h index 6b2fe2f859..91872b2521 100644 --- a/include/maths/CBayesianOptimisation.h +++ b/include/maths/CBayesianOptimisation.h @@ -93,6 +93,31 @@ class MATHS_EXPORT CBayesianOptimisation { static std::size_t estimateMemoryUsage(std::size_t numberParameters, std::size_t numberRounds); + //! Evaluate the Gaussian process at the point \p input. + double evaluate(const TVector& input) const; + + //! Compute the marginalized value of the Gaussian process in dimension + //! \p dimension at the value \p input. + double evaluate1D(double input, int dimension) const; + + //! Get the constant factor of the ANOVA decomposition of the Gaussian process. + double anovaConstantFactor() const; + + //! Get the total variance of the Gaussian process over the search box + //! using the ANOVA decomposition. + double anovaTotalVariance() const; + + //! Get the main effect of the parameter \p dimension in the Gaussian process + //! using the ANOVA decomposition. + double anovaMainEffect(int dimension) const; + + //! Get the vector of main effects, each as an absolute value and as a fraction + //! of the total variance. + TDoubleDoublePrVec anovaMainEffects() const; + + //! Set kernel \p parameters explicitly. + void kernelParameters(const TVector& parameters); + //! \name Test Interface //@{ //!
Get minus the data likelihood and its gradient as a function of the kernel @@ -132,6 +157,14 @@ class MATHS_EXPORT CBayesianOptimisation { TMatrix kernel(const TVector& a, double v) const; TVectorDoublePr kernelCovariates(const TVector& a, const TVector& x, double vx) const; double kernel(const TVector& a, const TVector& x, const TVector& y) const; + double evaluate(const TVector& Kinvf, const TVector& input) const; + double evaluate1D(const TVector& Kinvf, double input, int dimension) const; + double anovaConstantFactor(const TVector& Kinvf) const; + double anovaTotalVariance(const TVector& Kinvf) const; + double anovaMainEffect(const TVector& Kinvf, int dimension) const; + TVector kinvf() const; + TVector transformTo01(const TVector& x) const; + TVector scaledKernelParameters() const; private: CPRNG::CXorOShiro128Plus m_Rng; diff --git a/include/maths/CBoostedTree.h b/include/maths/CBoostedTree.h index 12339263c0..32d20aec1b 100644 --- a/include/maths/CBoostedTree.h +++ b/include/maths/CBoostedTree.h @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -201,6 +202,8 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel { using TDataFramePtr = core::CDataFrame*; using TNodeVec = std::vector<CBoostedTreeNode>; using TNodeVecVec = std::vector<TNodeVec>; + using THyperparameterImportanceVec = + std::vector<boosted_tree_detail::SHyperparameterImportance>; class MATHS_EXPORT CVisitor : public CDataFrameCategoryEncoder::CVisitor, public CBoostedTreeNode::CVisitor { @@ -230,6 +233,9 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel { //! \warning Will return a nullptr if a trained model isn't available. CTreeShapFeatureImportance* shap() const override; + //! Get the vector of hyperparameter importances. + THyperparameterImportanceVec hyperparameterImportance() const; + //! Get the column containing the dependent variable. std::size_t columnHoldingDependentVariable() const override; diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index dff3e7f6fd..4630423fd4 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -28,7 +28,6 @@ #include #include -#include #include #include @@ -66,8 +65,9 @@ class MATHS_EXPORT CBoostedTreeImpl final { using TOptionalDouble = boost::optional<double>; using TRegularization = CBoostedTreeRegularization<double>; using TSizeVec = std::vector<std::size_t>; - using TSizeRange = boost::integer_range<std::size_t>; using TAnalysisInstrumentationPtr = CDataFrameTrainBoostedTreeInstrumentationInterface*; + using THyperparameterImportanceVec = + std::vector<boosted_tree_detail::SHyperparameterImportance>; public: static const double MINIMUM_RELATIVE_GAIN_PER_SPLIT; @@ -95,6 +95,9 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! \warning Will return a nullptr if a trained model isn't available. CTreeShapFeatureImportance* shap(); + //! Get the vector of hyperparameter importances. + THyperparameterImportanceVec hyperparameterImportance() const; + //! Get the model produced by training if it has been run. const TNodeVecVec& trainedModel() const; @@ -174,6 +177,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { using TRegularizationOverride = CBoostedTreeRegularization<TOptionalDouble>; using TTreeShapFeatureImportanceUPtr = std::unique_ptr<CTreeShapFeatureImportance>; using TWorkspace = CBoostedTreeLeafNodeStatistics::CWorkspace; + using THyperparametersVec = std::vector<boosted_tree_detail::EHyperparameters>; //! Tag progress through initialization. enum EInitializationStage { @@ -326,6 +330,9 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! Record hyperparameters for instrumentation. void recordHyperparameters(); + //!
Populate the list of tunable hyperparameters. + void initializeTunableHyperparameters(); + private: mutable CPRNG::CXorOShiro128Plus m_Rng; EInitializationStage m_InitializationStage = E_NotInitialized; @@ -374,6 +381,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { TAnalysisInstrumentationPtr m_Instrumentation; mutable TMeanAccumulator m_ForestSizeAccumulator; mutable TMeanAccumulator m_MeanLossAccumulator; + THyperparametersVec m_TunableHyperparameters; private: friend class CBoostedTreeFactory; diff --git a/include/maths/CBoostedTreeUtils.h b/include/maths/CBoostedTreeUtils.h index fe6d076b8e..b403181c17 100644 --- a/include/maths/CBoostedTreeUtils.h +++ b/include/maths/CBoostedTreeUtils.h @@ -31,6 +31,36 @@ using TAlignedMemoryMappedFloatVector = enum EExtraColumn { E_Prediction = 0, E_Gradient, E_Curvature, E_Weight }; +enum EHyperparameters { + E_DownsampleFactor = 0, + E_Alpha, + E_Lambda, + E_Gamma, + E_SoftTreeDepthLimit, + E_SoftTreeDepthTolerance, + E_Eta, + E_EtaGrowthRatePerTree, + E_FeatureBagFraction +}; + +constexpr std::size_t NUMBER_HYPERPARAMETERS = E_FeatureBagFraction + 1; // E_FeatureBagFraction must remain the last hyperparameter + +struct SHyperparameterImportance { + SHyperparameterImportance(EHyperparameters hyperparameter, + double value, + double absoluteImportance, + double relativeImportance, + bool supplied) + : s_Hyperparameter(hyperparameter), s_Value(value), + s_AbsoluteImportance(absoluteImportance), + s_RelativeImportance(relativeImportance), s_Supplied(supplied) {} + EHyperparameters s_Hyperparameter; + double s_Value; + double s_AbsoluteImportance; + double s_RelativeImportance; + bool s_Supplied; +}; + //! Get the size of the upper triangle of the loss Hessian. inline std::size_t lossHessianUpperTriangleSize(std::size_t numberLossParameters) { return numberLossParameters * (numberLossParameters + 1) / 2; diff --git a/include/maths/CSampling.h b/include/maths/CSampling.h index b7fb84edba..ff80067c0b 100644 --- a/include/maths/CSampling.h +++ b/include/maths/CSampling.h @@ -660,6 +660,10 @@ class MATHS_EXPORT CSampling : private core::CNonInstantiatable { //! and \p rate on the \p n quantile intervals. static void gammaSampleQuantiles(double shape, double rate, std::size_t n, TDoubleVec& result); + //! Generate the first \p n elements of a Sobol sequence on the unit + //! hypercube [0, 1] in \p dim dimensions and store them in \p samples. + static void sobolSequenceSample(std::size_t dim, std::size_t n, TDoubleVecVec& samples); + private: //! \brief A uniform generator on the interval [0, n).
template diff --git a/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc b/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc index ce61c06abc..eb01019f0a 100644 --- a/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc @@ -301,6 +301,8 @@ CDataFrameTrainBoostedTreeClassifierRunner::inferenceModelMetadata() const { if (featureImportance) { m_InferenceModelMetadata.featureImportanceBaseline(featureImportance->baseline()); } + m_InferenceModelMetadata.hyperparameterImportance( + this->boostedTree().hyperparameterImportance()); return m_InferenceModelMetadata; } diff --git a/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc b/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc index da38052361..a44639c967 100644 --- a/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc @@ -159,6 +159,8 @@ CDataFrameTrainBoostedTreeRegressionRunner::inferenceModelMetadata() const { if (featureImportance) { m_InferenceModelMetadata.featureImportanceBaseline(featureImportance->baseline()); } + m_InferenceModelMetadata.hyperparameterImportance( + this->boostedTree().hyperparameterImportance()); return m_InferenceModelMetadata; } diff --git a/lib/api/CDataFrameTrainBoostedTreeRunner.cc b/lib/api/CDataFrameTrainBoostedTreeRunner.cc index 01a179ef92..9dfc112b0f 100644 --- a/lib/api/CDataFrameTrainBoostedTreeRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeRunner.cc @@ -367,6 +367,7 @@ const std::string CDataFrameTrainBoostedTreeRunner::ALPHA{"alpha"}; const std::string CDataFrameTrainBoostedTreeRunner::LAMBDA{"lambda"}; const std::string CDataFrameTrainBoostedTreeRunner::GAMMA{"gamma"}; const std::string CDataFrameTrainBoostedTreeRunner::ETA{"eta"}; +const std::string CDataFrameTrainBoostedTreeRunner::ETA_GROWTH_RATE_PER_TREE{"eta_growth_rate_per_tree"}; const std::string CDataFrameTrainBoostedTreeRunner::SOFT_TREE_DEPTH_LIMIT{"soft_tree_depth_limit"}; const std::string CDataFrameTrainBoostedTreeRunner::SOFT_TREE_DEPTH_TOLERANCE{"soft_tree_depth_tolerance"}; const std::string CDataFrameTrainBoostedTreeRunner::MAX_TREES{"max_trees"}; diff --git a/lib/api/CInferenceModelMetadata.cc b/lib/api/CInferenceModelMetadata.cc index bbc4605533..12d9cfb808 100644 --- a/lib/api/CInferenceModelMetadata.cc +++ b/lib/api/CInferenceModelMetadata.cc @@ -5,6 +5,10 @@ */ #include +#include + +#include + #include namespace ml { namespace api { @@ -13,6 +17,7 @@ namespace api { void CInferenceModelMetadata::write(TRapidJsonWriter& writer) const { this->writeTotalFeatureImportance(writer); this->writeFeatureImportanceBaseline(writer); + this->writeHyperparameterImportance(writer); } void CInferenceModelMetadata::writeTotalFeatureImportance(TRapidJsonWriter& writer) const { @@ -136,6 +141,29 @@ void CInferenceModelMetadata::writeFeatureImportanceBaseline(TRapidJsonWriter& w } } +void CInferenceModelMetadata::writeHyperparameterImportance(TRapidJsonWriter& writer) const { + writer.Key(JSON_HYPERPARAMETERS_TAG); + writer.StartArray(); + for (const auto& item : m_HyperparameterImportance) { + writer.StartObject(); + writer.Key(JSON_HYPERPARAMETER_NAME_TAG); + writer.String(item.s_HyperparameterName); + writer.Key(JSON_HYPERPARAMETER_VALUE_TAG); + writer.Double(item.s_Value); + if (item.s_Supplied == false) { + writer.Key(JSON_ABSOLUTE_IMPORTANCE_TAG); + writer.Double(item.s_AbsoluteImportance); + writer.Key(JSON_RELATIVE_IMPORTANCE_TAG); + writer.Double(item.s_RelativeImportance); + }
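+        // The "supplied" flag below records whether the user fixed this hyperparameter;
+        // absolute and relative importance were only written above for tuned ones.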
+ writer.Key(JSON_HYPERPARAMETER_SUPPLIED_TAG); + writer.Bool(item.s_Supplied); + writer.EndObject(); + } + writer.EndArray(); +} + const std::string& CInferenceModelMetadata::typeString() const { return JSON_MODEL_METADATA_TAG; } @@ -171,17 +199,68 @@ void CInferenceModelMetadata::featureImportanceBaseline(TVector&& baseline) { m_ShapBaseline = baseline; } +void CInferenceModelMetadata::hyperparameterImportance( + const maths::CBoostedTree::THyperparameterImportanceVec& hyperparameterImportance) { + m_HyperparameterImportance.clear(); + m_HyperparameterImportance.reserve(hyperparameterImportance.size()); + for (const auto& item : hyperparameterImportance) { + std::string hyperparameterName; + switch (item.s_Hyperparameter) { + case maths::boosted_tree_detail::E_Alpha: + hyperparameterName = CDataFrameTrainBoostedTreeRunner::ALPHA; + break; + case maths::boosted_tree_detail::E_DownsampleFactor: + hyperparameterName = CDataFrameTrainBoostedTreeRunner::DOWNSAMPLE_FACTOR; + break; + case maths::boosted_tree_detail::E_Eta: + hyperparameterName = CDataFrameTrainBoostedTreeRunner::ETA; + break; + case maths::boosted_tree_detail::E_EtaGrowthRatePerTree: + hyperparameterName = CDataFrameTrainBoostedTreeRunner::ETA_GROWTH_RATE_PER_TREE; + break; + case maths::boosted_tree_detail::E_FeatureBagFraction: + hyperparameterName = CDataFrameTrainBoostedTreeRunner::FEATURE_BAG_FRACTION; + break; + case maths::boosted_tree_detail::E_Gamma: + hyperparameterName = CDataFrameTrainBoostedTreeRunner::GAMMA; + break; + case maths::boosted_tree_detail::E_Lambda: + hyperparameterName = CDataFrameTrainBoostedTreeRunner::LAMBDA; + break; + case maths::boosted_tree_detail::E_SoftTreeDepthLimit: + hyperparameterName = CDataFrameTrainBoostedTreeRunner::SOFT_TREE_DEPTH_LIMIT; + break; + case maths::boosted_tree_detail::E_SoftTreeDepthTolerance: + hyperparameterName = CDataFrameTrainBoostedTreeRunner::SOFT_TREE_DEPTH_TOLERANCE; + break; + } + m_HyperparameterImportance.emplace_back( + hyperparameterName, item.s_Value, item.s_AbsoluteImportance, + item.s_RelativeImportance, item.s_Supplied); + } + std::sort(m_HyperparameterImportance.begin(), + m_HyperparameterImportance.end(), [](const auto& a, const auto& b) { + return a.s_AbsoluteImportance > b.s_AbsoluteImportance; + }); +} + // clang-format off +const std::string CInferenceModelMetadata::JSON_ABSOLUTE_IMPORTANCE_TAG{"absolute_importance"}; const std::string CInferenceModelMetadata::JSON_BASELINE_TAG{"baseline"}; -const std::string CInferenceModelMetadata::JSON_FEATURE_IMPORTANCE_BASELINE_TAG{"feature_importance_baseline"}; const std::string CInferenceModelMetadata::JSON_CLASS_NAME_TAG{"class_name"}; const std::string CInferenceModelMetadata::JSON_CLASSES_TAG{"classes"}; +const std::string CInferenceModelMetadata::JSON_FEATURE_IMPORTANCE_BASELINE_TAG{"feature_importance_baseline"}; const std::string CInferenceModelMetadata::JSON_FEATURE_NAME_TAG{"feature_name"}; +const std::string CInferenceModelMetadata::JSON_HYPERPARAMETERS_TAG{"hyperparameters"}; +const std::string CInferenceModelMetadata::JSON_HYPERPARAMETER_NAME_TAG{"name"}; +const std::string CInferenceModelMetadata::JSON_HYPERPARAMETER_VALUE_TAG{"value"}; +const std::string CInferenceModelMetadata::JSON_HYPERPARAMETER_SUPPLIED_TAG{"supplied"}; const std::string CInferenceModelMetadata::JSON_IMPORTANCE_TAG{"importance"}; const std::string CInferenceModelMetadata::JSON_MAX_TAG{"max"}; const std::string CInferenceModelMetadata::JSON_MEAN_MAGNITUDE_TAG{"mean_magnitude"}; const std::string
CInferenceModelMetadata::JSON_MIN_TAG{"min"}; const std::string CInferenceModelMetadata::JSON_MODEL_METADATA_TAG{"model_metadata"}; +const std::string CInferenceModelMetadata::JSON_RELATIVE_IMPORTANCE_TAG{"relative_importance"}; const std::string CInferenceModelMetadata::JSON_TOTAL_FEATURE_IMPORTANCE_TAG{"total_feature_importance"}; // clang-format on } } diff --git a/lib/api/unittest/CInferenceModelMetadataTest.cc b/lib/api/unittest/CInferenceModelMetadataTest.cc new file mode 100644 index 0000000000..c91ffa5f40 --- /dev/null +++ b/lib/api/unittest/CInferenceModelMetadataTest.cc @@ -0,0 +1,98 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ +#include +#include + +#include + +#include + +#include +#include + +#include +#include + +#include + +#include +#include + +BOOST_AUTO_TEST_SUITE(CInferenceModelMetadataTest) + +using namespace ml; + +namespace { +using TDoubleVec = std::vector<double>; +using TStrVec = std::vector<std::string>; +using TLossFunctionType = maths::boosted_tree::ELossType; +} + +BOOST_AUTO_TEST_CASE(testJsonSchema) { + // Test that the model metadata the analyzer produces conforms to its JSON schema. + + std::stringstream output; + auto outputWriterFactory = [&output]() { + return std::make_unique<core::CJsonOutputStreamWrapper>(output); + }; + + TDoubleVec expectedPredictions; + + TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; + TStrVec fieldValues{"", "", "", "", "", "0", ""}; + api::CDataFrameAnalyzer analyzer{ + test::CDataFrameAnalysisSpecificationFactory{} + .predictionLambda(0.5) + .predictionEta(0.5) + .predictionGamma(0.5) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), + outputWriterFactory}; + test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( + TLossFunctionType::E_MseRegression, fieldNames, fieldValues, analyzer, + expectedPredictions); + + analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + + rapidjson::Document results; + rapidjson::ParseResult ok(results.Parse(output.str())); + LOG_DEBUG(<< output.str()); + BOOST_TEST_REQUIRE(static_cast<bool>(ok) == true); + + std::ifstream modelMetaDataSchemaFileStream("testfiles/model_meta_data/model_meta_data.schema.json"); + BOOST_REQUIRE_MESSAGE(modelMetaDataSchemaFileStream.is_open(), "Cannot open test file!"); + std::string modelMetaDataSchemaJson( + (std::istreambuf_iterator<char>(modelMetaDataSchemaFileStream)), + std::istreambuf_iterator<char>()); + rapidjson::Document modelMetaDataSchemaDocument; + BOOST_REQUIRE_MESSAGE( + modelMetaDataSchemaDocument.Parse(modelMetaDataSchemaJson).HasParseError() == false, + "Cannot parse JSON schema!"); + rapidjson::SchemaDocument modelMetaDataSchema(modelMetaDataSchemaDocument); + rapidjson::SchemaValidator modelMetaDataValidator(modelMetaDataSchema); + + bool hasModelMetadata{false}; + for (const auto& result : results.GetArray()) { + if (result.HasMember("model_metadata")) { + hasModelMetadata = true; + BOOST_TEST_REQUIRE(result["model_metadata"].IsObject() == true); + if (result["model_metadata"].Accept(modelMetaDataValidator) == false) { + rapidjson::StringBuffer sb; + modelMetaDataValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid schema: " << sb.GetString()); + LOG_ERROR(<< "Invalid keyword: " + << modelMetaDataValidator.GetInvalidSchemaKeyword()); + sb.Clear(); +
modelMetaDataValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid document: " << sb.GetString()); + BOOST_FAIL("Schema validation failed"); + } + } + } + + BOOST_TEST_REQUIRE(hasModelMetadata); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/api/unittest/Makefile b/lib/api/unittest/Makefile index 37e2e5df05..0acc272967 100644 --- a/lib/api/unittest/Makefile +++ b/lib/api/unittest/Makefile @@ -41,6 +41,7 @@ SRCS=\ CFieldDataCategorizerTest.cc \ CForecastRunnerTest.cc \ CGlobalCategoryIdTest.cc \ + CInferenceModelMetadataTest.cc \ CIoManagerTest.cc \ CJsonOutputWriterTest.cc \ CLengthEncodedInputParserTest.cc \ diff --git a/lib/api/unittest/testfiles/model_meta_data/model_meta_data.schema.json b/lib/api/unittest/testfiles/model_meta_data/model_meta_data.schema.json new file mode 100644 index 0000000000..fc33a88d8b --- /dev/null +++ b/lib/api/unittest/testfiles/model_meta_data/model_meta_data.schema.json @@ -0,0 +1,97 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "$id": "https://raw.githubusercontent.com/elastic/ml-json-schemas-private/master/schemas/model_meta_data/model_meta_data.schema.json", + "description": "Optional model meta information", + "title": "model_meta_data", + "type": "object", + "properties": { + "total_feature_importance": { + "description": "Average feature importance for all features used by the model", + "type": "array", + "items": { + "type": "object", + "properties": { + "field_name": { + "type": "string" + }, + "importance": { + "type": "number" + } + }, + "required": [ + "field_name", + "importance" + ], + "additionalProperties": false + }, + "additionalItems": false + }, + "hyperparameters": { + "description": "List of hyperparameters together with their absolute and relative importances from Bayesian optimization.", + "type": "array", + "items": { + "type": "object", + "oneOf": [ + { + "properties": { + "name": { + "type": "string" + }, + "value": { + "description": "The best hyperparameter value or the value supplied by the user.", + "type": "number" + }, + "supplied": { + "description": "Whether or not the value was supplied by the user.", + "type": "boolean", + "enum": [ + false + ] + }, + "absolute_importance": { + "type": "number" + }, + "relative_importance": { + "type": "number" + } + }, + "required": [ + "name", + "value", + "supplied", + "absolute_importance", + "relative_importance" + ], + "additionalProperties": false + }, + { + "properties": { + "name": { + "type": "string" + }, + "value": { + "description": "The best hyperparameter value or the value supplied by the user.", + "type": "number" + }, + "supplied": { + "description": "Whether or not the value was supplied by the user.", + "type": "boolean", + "enum": [ + true + ] + } + }, + "required": [ + "name", + "value", + "supplied" + ], + "additionalProperties": false + } + ] + }, + "additionalItems": false + } + }, + "additionalProperties": false +} \ No newline at end of file diff --git a/lib/maths/CBayesianOptimisation.cc b/lib/maths/CBayesianOptimisation.cc index 31fcfcc736..241da2f9dd 100644 --- a/lib/maths/CBayesianOptimisation.cc +++ b/lib/maths/CBayesianOptimisation.cc @@ -6,6 +6,7 @@ #include +#include #include #include #include @@ -58,6 +59,21 @@ double stableNormPdf(double z) { // altogether because it is possible that the function we're interpolating has a // narrow deep valley that the Gaussian Process hasn't sampled.
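+// The ANOVA decomposition below only needs one dimensional integrals of the
+// RBF kernel over [0, 1]. For k(u, v) = exp(-c^2 (u - v)^2) these have the
+// closed forms
+//   int_0^1 k(u, x) du = sqrt(pi) / (2 c) * (erf(c (1 - x)) + erf(c x))
+//   int_0^1 k(u, xi) k(u, xj) du = sqrt(pi / 2) / (2 c)
+//       * exp(-c^2 (xi - xj)^2 / 2)
+//       * (erf(c (xi + xj) / sqrt(2)) - erf(c (xi + xj - 2) / sqrt(2)))
+// which integrate1dKernel and integrate1dKernelProduct below implement.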
const double MINIMUM_KERNEL_SCALE_FOR_EXPECTATION_MAXIMISATION{1e-8}; + +double integrate1dKernel(double theta1, double x) { + double c{std::sqrt(CTools::pow2(theta1) + MINIMUM_KERNEL_SCALE_FOR_EXPECTATION_MAXIMISATION)}; + return CTools::stable(boost::math::constants::root_pi<double>() * + (std::erf(c * (1 - x)) + std::erf(c * x)) / (2 * c)); +} + +double integrate1dKernelProduct(double theta1, double xit, double xjt) { + double c{std::sqrt(CTools::pow2(theta1) + MINIMUM_KERNEL_SCALE_FOR_EXPECTATION_MAXIMISATION)}; + return CTools::stable( + boost::math::constants::root_half_pi<double>() / (2 * c) * + std::exp(-0.5 * CTools::pow2(c) * CTools::pow2(xit - xjt)) * + (std::erf(c / boost::math::constants::root_two<double>() * (xit + xjt)) - + std::erf(c / boost::math::constants::root_two<double>() * (xit + xjt - 2)))); +} } CBayesianOptimisation::CBayesianOptimisation(TDoubleDoublePrVec parameterBounds, @@ -189,6 +205,184 @@ CBayesianOptimisation::maximumExpectedImprovement() { return {std::move(xmax), expectedImprovement}; } +CBayesianOptimisation::TVector CBayesianOptimisation::kinvf() const { + TVector Kinvf; + TMatrix K{this->kernel(m_KernelParameters, this->meanErrorVariance())}; + Kinvf = K.ldlt().solve(this->function()); + return Kinvf; +} + +CBayesianOptimisation::TVector CBayesianOptimisation::transformTo01(const TVector& x) const { + return (x - m_MinBoundary).cwiseQuotient(m_MaxBoundary - m_MinBoundary); +} + +CBayesianOptimisation::TVector CBayesianOptimisation::scaledKernelParameters() const { + TVector scaledKernelParameters{m_KernelParameters.tail(m_MinBoundary.size())}; + return scaledKernelParameters.cwiseProduct(m_MaxBoundary - m_MinBoundary); +} + +double CBayesianOptimisation::evaluate(const TVector& input) const { + return this->evaluate(this->kinvf(), input); +} + +double CBayesianOptimisation::evaluate(const TVector& Kinvf, const TVector& input) const { + TVector Kxn; + std::tie(Kxn, std::ignore) = this->kernelCovariates(m_KernelParameters, input, + this->meanErrorVariance()); + return Kxn.transpose() * Kinvf; +} + +double CBayesianOptimisation::evaluate1D(const TVector& Kinvf, double input, int dimension) const { + TVector scaledKernelParameters{this->scaledKernelParameters()}; + auto prodXt = [&scaledKernelParameters, this](const TVector& x, int t) { + double prod{1.0}; + for (int d = 0; d < m_MinBoundary.size(); ++d) { + if (d != t) { + prod *= integrate1dKernel(scaledKernelParameters(d), x(d)); + } + } + return prod; + }; + + double sum{0.0}; + input = (input - m_MinBoundary(dimension)) / + (m_MaxBoundary(dimension) - m_MinBoundary(dimension)); + for (std::size_t i = 0; i < m_FunctionMeanValues.size(); ++i) { + TVector x{this->transformTo01(m_FunctionMeanValues[i].first)}; + sum += Kinvf(static_cast<int>(i)) * + std::exp(-(CTools::pow2(scaledKernelParameters[dimension]) + + MINIMUM_KERNEL_COORDINATE_DISTANCE_SCALE) * + CTools::pow2(input - x(dimension))) * + prodXt(x, dimension); + } + + double theta02{CTools::pow2(m_KernelParameters(0))}; + double scale{std::max(1.0, theta02)}; // prevent cancellation errors + double f0{this->anovaConstantFactor(Kinvf) / scale}; + return scale * (theta02 / scale * sum - f0); +} + +double CBayesianOptimisation::evaluate1D(double input, int dimension) const { + if (dimension < 0 || dimension >= m_MinBoundary.size()) { + LOG_ERROR(<< "Input error: dimension " << dimension << " is out of bounds.
It must be in the range [0, " + << m_MinBoundary.size() << ")."); + return 0.0; + } + return this->evaluate1D(this->kinvf(), input, dimension); +} + +double CBayesianOptimisation::anovaConstantFactor(const TVector& Kinvf) const { + TVector scaledKernelParameters{this->scaledKernelParameters()}; + double sum{0.0}; + for (std::size_t i = 0; i < m_FunctionMeanValues.size(); ++i) { + double prod{1.0}; + TVector x{this->transformTo01(m_FunctionMeanValues[i].first)}; + for (int d = 0; d < m_MinBoundary.size(); ++d) { + prod *= integrate1dKernel(scaledKernelParameters(d), x(d)); + } + sum += Kinvf(i) * prod; + } + return CTools::pow2(m_KernelParameters(0)) * sum; +} + +double CBayesianOptimisation::anovaConstantFactor() const { + return this->anovaConstantFactor(this->kinvf()); +} + +double CBayesianOptimisation::anovaTotalVariance(const TVector& Kinvf) const { + TVector scaledKernelParameters{this->scaledKernelParameters()}; + auto prodIj = [this, &scaledKernelParameters, &Kinvf](std::size_t i, std::size_t j) { + TVector xi{this->transformTo01(m_FunctionMeanValues[i].first)}; + TVector xj{this->transformTo01(m_FunctionMeanValues[j].first)}; + double prod{1.0}; + for (int t = 0; t < m_MinBoundary.size(); ++t) { + prod *= integrate1dKernelProduct(scaledKernelParameters[t], xi(t), xj(t)); + } + return Kinvf(static_cast<int>(i)) * Kinvf(static_cast<int>(j)) * prod; + }; + + double sum{0.0}; + for (std::size_t i = 0; i < m_FunctionMeanValues.size(); ++i) { + for (std::size_t j = 0; j < m_FunctionMeanValues.size(); ++j) { + sum += prodIj(i, j); + } + } + double theta04{std::pow(m_KernelParameters(0), 4)}; + double scale{std::max(1.0, theta04)}; // prevent cancellation errors + double f02{CTools::pow2(this->anovaConstantFactor(Kinvf)) / scale}; + double variance{scale * (theta04 / scale * sum - f02)}; + return std::max(0.0, variance); +} + +double CBayesianOptimisation::anovaTotalVariance() const { + return this->anovaTotalVariance(this->kinvf()); +} + +double CBayesianOptimisation::anovaMainEffect(const TVector& Kinvf, int dimension) const { + TVector scaledKernelParameters{this->scaledKernelParameters()}; + auto prodXt = [this, &scaledKernelParameters](const TVector& x, int t) { + double prod{1.0}; + for (int d = 0; d < m_MinBoundary.size(); ++d) { + if (d != t) { + prod *= integrate1dKernel(scaledKernelParameters(d), x(d)); + } + } + return prod; + }; + double sum1{0.0}; + double sum2{0.0}; + for (std::size_t i = 0; i < m_FunctionMeanValues.size(); ++i) { + TVector xi{this->transformTo01(m_FunctionMeanValues[i].first)}; + for (std::size_t j = 0; j < m_FunctionMeanValues.size(); ++j) { + TVector xj{this->transformTo01(m_FunctionMeanValues[j].first)}; + sum1 += Kinvf(static_cast<int>(i)) * Kinvf(static_cast<int>(j)) * + prodXt(xi, dimension) * prodXt(xj, dimension) * + integrate1dKernelProduct(scaledKernelParameters(dimension), + xi(dimension), xj(dimension)); + } + sum2 += Kinvf(static_cast<int>(i)) * + integrate1dKernel(scaledKernelParameters(dimension), xi(dimension)) * + prodXt(xi, dimension); + } + double theta02{CTools::pow2(m_KernelParameters(0))}; + double theta04{std::pow(m_KernelParameters(0), 4)}; + double f0{this->anovaConstantFactor(Kinvf)}; + double f02{CTools::pow2(f0)}; + double scale{std::max(1.0, theta04)}; // prevent cancellation errors + return scale * (theta04 * sum1 / scale - 2 * theta02 * sum2 * f0 / scale + f02 / scale); +} + +double CBayesianOptimisation::anovaMainEffect(int dimension) const { + if (dimension < 0 || dimension >= m_MinBoundary.size()) { + LOG_ERROR(<< "Input error: dimension " << dimension << " is
out of bounds. It must be in the range [0, " << m_MinBoundary.size() << ")."); + return 0.0; + } + return this->anovaMainEffect(this->kinvf(), dimension); +} + +CBayesianOptimisation::TDoubleDoublePrVec CBayesianOptimisation::anovaMainEffects() const { + TDoubleDoublePrVec mainEffects; + mainEffects.reserve(static_cast<std::size_t>(m_MinBoundary.size())); + TVector Kinvf{this->kinvf()}; + double f0{this->anovaConstantFactor(Kinvf)}; + double totalVariance{this->anovaTotalVariance(Kinvf)}; + for (int i = 0; i < m_MinBoundary.size(); ++i) { + double effect{this->anovaMainEffect(Kinvf, i)}; + mainEffects.emplace_back(effect, effect / totalVariance); + } + LOG_TRACE(<< "GP ANOVA constant " << f0 << " variance " << totalVariance + << "\nmain effects " << core::CContainerPrinter::print(mainEffects) + << "\nkernel parameters " << m_KernelParameters.transpose()); + return mainEffects; +} + +void CBayesianOptimisation::kernelParameters(const TVector& parameters) { + if (m_KernelParameters.size() == parameters.size()) { + m_KernelParameters = parameters; + } +} + std::pair CBayesianOptimisation::minusLikelihoodAndGradient() const { diff --git a/lib/maths/CBoostedTree.cc b/lib/maths/CBoostedTree.cc index 9f35e93a09..1510533007 100644 --- a/lib/maths/CBoostedTree.cc +++ b/lib/maths/CBoostedTree.cc @@ -163,6 +163,10 @@ CTreeShapFeatureImportance* CBoostedTree::shap() const { return m_Impl->shap(); } +CBoostedTree::THyperparameterImportanceVec CBoostedTree::hyperparameterImportance() const { + return m_Impl->hyperparameterImportance(); +} + std::size_t CBoostedTree::columnHoldingDependentVariable() const { return m_Impl->columnHoldingDependentVariable(); } diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index ef34a05781..8acaac45ce 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -220,6 +221,7 @@ void CBoostedTreeFactory::initializeHyperparameterOptimisation() const { LOG_TRACE(<< "hyperparameter search bounding box = " << core::CContainerPrinter::print(boundingBox)); + m_TreeImpl->initializeTunableHyperparameters(); m_TreeImpl->m_BayesianOptimization = std::make_unique<CBayesianOptimisation>( std::move(boundingBox), m_BayesianOptimisationRestarts.value_or(CBayesianOptimisation::RESTARTS)); diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 079bf6bcd8..a396dbdd36 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -339,6 +339,8 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, std::size_t foldRoundLossMemoryUsage{m_NumberFolds * m_NumberRounds * sizeof(TOptionalDouble)}; std::size_t hyperparametersMemoryUsage{numberColumns * sizeof(double)}; + std::size_t tunableHyperparametersMemoryUsage{m_TunableHyperparameters.size() * + sizeof(int)}; // The leaves' row masks memory is accounted for here because it's proportional // to the log2(number of nodes). The compressed bit vector representation uses // roughly log2(E[run length]) / E[run length] bytes per bit.
As we grow the @@ -371,8 +373,9 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, this->numberHyperparametersToTune(), m_NumberRounds)}; std::size_t worstCaseMemoryUsage{ sizeof(*this) + forestMemoryUsage + foldRoundLossMemoryUsage + - hyperparametersMemoryUsage + leafNodeStatisticsMemoryUsage + - dataTypeMemoryUsage + featureSampleProbabilities + missingFeatureMaskMemoryUsage + + hyperparametersMemoryUsage + tunableHyperparametersMemoryUsage + + leafNodeStatisticsMemoryUsage + dataTypeMemoryUsage + + featureSampleProbabilities + missingFeatureMaskMemoryUsage + trainTestMaskMemoryUsage + bayesianOptimisationMemoryUsage}; return CBoostedTreeImpl::correctedMemoryUsage(static_cast<double>(worstCaseMemoryUsage)); @@ -1265,8 +1268,8 @@ bool CBoostedTreeImpl::selectNextHyperparameters(const TMeanVarAccumulator& loss double meanLoss{CBasicStatistics::mean(lossMoments)}; double lossVariance{CBasicStatistics::variance(lossMoments)}; - LOG_TRACE(<< "round = " << m_CurrentRound << " loss = " << meanLoss - << ": regularization = " << m_Regularization.print() + LOG_TRACE(<< "round = " << m_CurrentRound << " loss = " << meanLoss << " variance = " + << lossVariance << ": regularization = " << m_Regularization.print() << ", downsample factor = " << m_DownsampleFactor << ", eta = " << m_Eta << ", eta growth rate per tree = " << m_EtaGrowthRatePerTree << ", feature bag fraction = " << m_FeatureBagFraction); @@ -1283,6 +1286,16 @@ bool CBoostedTreeImpl::selectNextHyperparameters(const TMeanVarAccumulator& loss } // Write parameters for next round. + if (m_DownsampleFactorOverride == boost::none) { + auto i = std::distance(m_TunableHyperparameters.begin(), + std::find(m_TunableHyperparameters.begin(), + m_TunableHyperparameters.end(), E_DownsampleFactor)); + if (static_cast<std::size_t>(i) < m_TunableHyperparameters.size()) { + scale = std::min(1.0, 2.0 * parameters(i) / + (CTools::stableExp(minBoundary(0)) + + CTools::stableExp(maxBoundary(0)))); + } + } i = 0; if (m_DownsampleFactorOverride == boost::none) { m_DownsampleFactor = CTools::stableExp(parameters(i++)); @@ -1390,6 +1403,59 @@ void CBoostedTreeImpl::recordHyperparameters() { m_Regularization.leafWeightPenaltyMultiplier()}; } +void CBoostedTreeImpl::initializeTunableHyperparameters() { + m_TunableHyperparameters.clear(); + for (int i = 0; i < static_cast<int>(NUMBER_HYPERPARAMETERS); ++i) { + switch (i) { + case E_DownsampleFactor: + if (m_DownsampleFactorOverride == boost::none) { + m_TunableHyperparameters.emplace_back(E_DownsampleFactor); + } + break; + case E_Alpha: + if (m_RegularizationOverride.depthPenaltyMultiplier() == boost::none) { + m_TunableHyperparameters.emplace_back(E_Alpha); + } + break; + case E_Lambda: + if (m_RegularizationOverride.leafWeightPenaltyMultiplier() == boost::none) { + m_TunableHyperparameters.emplace_back(E_Lambda); + } + break; + case E_Gamma: + if (m_RegularizationOverride.treeSizePenaltyMultiplier() == boost::none) { + m_TunableHyperparameters.emplace_back(E_Gamma); + } + break; + case E_SoftTreeDepthLimit: + if (m_RegularizationOverride.softTreeDepthLimit() == boost::none) { + m_TunableHyperparameters.emplace_back(E_SoftTreeDepthLimit); + } + break; + case E_SoftTreeDepthTolerance: + if (m_RegularizationOverride.softTreeDepthTolerance() == boost::none) { + m_TunableHyperparameters.emplace_back(E_SoftTreeDepthTolerance); + } + break; + case E_Eta: + if (m_EtaOverride == boost::none) { + m_TunableHyperparameters.emplace_back(E_Eta); + } + break; + case E_EtaGrowthRatePerTree: + if (m_EtaOverride == boost::none) {
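+                // The growth rate of eta is tied to the eta override: if the user
+                // supplied eta then the growth rate per tree is not tuned either.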
+ m_TunableHyperparameters.emplace_back(E_EtaGrowthRatePerTree); + } + break; + case E_FeatureBagFraction: + if (m_FeatureBagFractionOverride == boost::none) { + m_TunableHyperparameters.emplace_back(E_FeatureBagFraction); + } + break; + } + } +} + void CBoostedTreeImpl::startProgressMonitoringFineTuneHyperparameters() { // This costs "number folds" * "maximum number trees per forest" units @@ -1540,6 +1606,7 @@ void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& insert m_StopCrossValidationEarly, inserter); core::CPersistUtils::persist(TESTING_ROW_MASKS_TAG, m_TestingRowMasks, inserter); core::CPersistUtils::persist(TRAINING_ROW_MASKS_TAG, m_TrainingRowMasks, inserter); + // m_TunableHyperparameters is not persisted explicitly; it is re-derived from the overridden hyperparameters on restore. } bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { @@ -1653,8 +1720,9 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav core::CPersistUtils::restore(TESTING_ROW_MASKS_TAG, m_TestingRowMasks, traverser)) RESTORE(TRAINING_ROW_MASKS_TAG, core::CPersistUtils::restore(TRAINING_ROW_MASKS_TAG, m_TrainingRowMasks, traverser)) + // m_TunableHyperparameters is not restored explicitly; it is re-derived from the overridden hyperparameters below. } while (traverser.next()); - + this->initializeTunableHyperparameters(); m_InitializationStage = static_cast<EInitializationStage>(initializationStage); return true; @@ -1695,6 +1763,60 @@ CTreeShapFeatureImportance* CBoostedTreeImpl::shap() { return m_TreeShap.get(); } +CBoostedTreeImpl::THyperparameterImportanceVec +CBoostedTreeImpl::hyperparameterImportance() const { + THyperparameterImportanceVec hyperparameterImportances; + hyperparameterImportances.reserve(NUMBER_HYPERPARAMETERS); + CBayesianOptimisation::TDoubleDoublePrVec anovaMainEffects{ + m_BayesianOptimization->anovaMainEffects()}; + for (std::size_t i = 0; i < NUMBER_HYPERPARAMETERS; ++i) { + double absoluteImportance{0.0}; + double relativeImportance{0.0}; + bool supplied{true}; + auto tunableIndex = std::distance(m_TunableHyperparameters.begin(), + std::find(m_TunableHyperparameters.begin(), + m_TunableHyperparameters.end(), i)); + if (static_cast<std::size_t>(tunableIndex) < m_TunableHyperparameters.size()) { + supplied = false; + std::tie(absoluteImportance, relativeImportance) = anovaMainEffects[tunableIndex]; + } + double hyperparameterValue{0.0}; + switch (i) { + case E_Alpha: + hyperparameterValue = m_Regularization.depthPenaltyMultiplier(); + break; + case E_DownsampleFactor: + hyperparameterValue = m_DownsampleFactor; + break; + case E_Eta: + hyperparameterValue = m_Eta; + break; + case E_EtaGrowthRatePerTree: + hyperparameterValue = m_EtaGrowthRatePerTree; + break; + case E_FeatureBagFraction: + hyperparameterValue = m_FeatureBagFraction; + break; + case E_Gamma: + hyperparameterValue = m_Regularization.treeSizePenaltyMultiplier(); + break; + case E_Lambda: + hyperparameterValue = m_Regularization.leafWeightPenaltyMultiplier(); + break; + case E_SoftTreeDepthLimit: + hyperparameterValue = m_Regularization.softTreeDepthLimit(); + break; + case E_SoftTreeDepthTolerance: + hyperparameterValue = m_Regularization.softTreeDepthTolerance(); + break; + } + hyperparameterImportances.emplace_back(static_cast<EHyperparameters>(i), + hyperparameterValue, absoluteImportance, + relativeImportance, supplied); + } + return hyperparameterImportances; +} + const CBoostedTreeImpl::TDoubleVec& CBoostedTreeImpl::featureSampleProbabilities() const { return m_FeatureSampleProbabilities; } diff
--git a/lib/maths/CSampling.cc b/lib/maths/CSampling.cc index 8a7e0e44b5..2e8c3ad3f6 100644 --- a/lib/maths/CSampling.cc +++ b/lib/maths/CSampling.cc @@ -23,7 +23,10 @@ #include #include #include +#include +#include #include +#include #include #include @@ -820,6 +823,30 @@ void CSampling::gammaSampleQuantiles(double shape, double rate, std::size_t n, T } } +void CSampling::sobolSequenceSample(std::size_t dim, std::size_t n, TDoubleVecVec& samples) { + samples.clear(); + if (n == 0 || dim == 0) { + return; + } + + try { + using TSobolGenerator = + boost::variate_generator<boost::random::sobol&, boost::uniform_01<double>>; + boost::random::sobol engine(dim); + TSobolGenerator gen(engine, boost::uniform_01<double>()); + samples.resize(n, TDoubleVec(dim)); + for (std::size_t i = 0u; i < n; ++i) { + for (std::size_t j = 0u; j < dim; ++j) { + samples[i][j] = gen(); + } + } + } catch (const std::exception& e) { + LOG_ERROR(<< "Failed to sample Sobol sequence: " << e.what() + << ", dim = " << dim << ", n = " << n); + samples.clear(); + } +} + core::CFastMutex CSampling::ms_Lock; CSampling::CRandomNumberGenerator CSampling::ms_Rng; diff --git a/lib/maths/unittest/CBayesianOptimisationTest.cc b/lib/maths/unittest/CBayesianOptimisationTest.cc index 698b7d282f..22cc63a1d4 100644 --- a/lib/maths/unittest/CBayesianOptimisationTest.cc +++ b/lib/maths/unittest/CBayesianOptimisationTest.cc @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include @@ -24,6 +26,7 @@ using namespace ml; namespace { using TDoubleVec = std::vector<double>; +using TDoubleVecVec = std::vector<TDoubleVec>; using TVector = maths::CDenseVector<double>; TVector vector(TDoubleVec components) { @@ -83,6 +86,27 @@ void testPersistRestoreIsIdempotent(const TDoubleVec& minBoundary, LOG_DEBUG(<< "Second string " << persistTwiceSStream.str()); BOOST_REQUIRE_EQUAL(persistOnceSStream.str(), persistTwiceSStream.str()); } + +maths::CBayesianOptimisation +initBayesianOptimization(std::size_t dim, std::size_t numSamples, double min, double max) { + test::CRandomNumbers rng; + TDoubleVec trainSamples(numSamples * dim); + rng.generateUniformSamples(min, max, trainSamples.size(), trainSamples); + maths::CBayesianOptimisation::TDoubleDoublePrVec boundaries; + boundaries.reserve(dim); + for (std::size_t d = 0; d < dim; ++d) { + boundaries.emplace_back(min, max); + } + maths::CBayesianOptimisation bopt{boundaries}; + for (std::size_t i = 0; i < numSamples; i += 2) { + TVector x{vector({trainSamples[i], trainSamples[i + 1]})}; + bopt.add(x, x.squaredNorm(), 1.0); + } + TDoubleVec kernelParameters(dim + 1, 0.5); + kernelParameters[0] = 0.7; + bopt.kernelParameters(vector(kernelParameters)); + return bopt; +} } BOOST_AUTO_TEST_CASE(testLikelihoodGradient) { @@ -368,4 +392,130 @@ BOOST_AUTO_TEST_CASE(testPersistRestore) { } } +BOOST_AUTO_TEST_CASE(testEvaluate) { + TDoubleVec coordinates{0.25, 0.5, 0.75}; + maths::CBayesianOptimisation bopt{{{0.0, 1.0}, {0.0, 1.0}}}; + for (std::size_t i = 0; i < 3; ++i) { + for (std::size_t j = 0; j < 3; ++j) { + TVector x{vector({coordinates[i], coordinates[j]})}; + bopt.add(x, x.squaredNorm(), 0.0); + } + } + + TVector kernelParameters(vector({1.0, 0.5, 0.5})); + bopt.kernelParameters(kernelParameters); + + TDoubleVecVec testPoints{{0.3, 0.3}, {0.3, 0.6}, {0.6, 0.3}}; + TDoubleVec testTargets{0.17823499, 0.45056931, 0.45056931}; + + for (std::size_t i = 0u; i < testPoints.size(); ++i) { + TVector x{vector(testPoints[i])}; + double actualTarget{bopt.evaluate(x)}; + BOOST_REQUIRE_CLOSE_ABSOLUTE(actualTarget, testTargets[i], 1e-5); + } +} + +BOOST_AUTO_TEST_CASE(testEvaluate1D) { +
using TMeanAccumulator = maths::CBasicStatistics::SSampleMean<double>::TAccumulator; + test::CRandomNumbers rng; + std::size_t dim{2}; + std::size_t mcSamples{100}; + maths::CBayesianOptimisation bopt{initBayesianOptimization(dim, 20, 0.0, 1.0)}; + double f0{bopt.anovaConstantFactor()}; + + TDoubleVecVec testSamples; + maths::CSampling::sobolSequenceSample(dim, mcSamples, testSamples); + + TDoubleVec testInput(1); + rng.generateUniformSamples(0, 1, 1, testInput); + + for (int d = 0; d < static_cast<int>(dim); ++d) { + TMeanAccumulator meanAccumulator; + double ftActual{bopt.evaluate1D(testInput[0], d)}; + for (std::size_t i = 0u; i < mcSamples; ++i) { + TVector input{vector(testSamples[i])}; + input(d) = testInput[0]; + meanAccumulator.add(bopt.evaluate(input) - f0); + } + double ftExpected{maths::CBasicStatistics::mean(meanAccumulator)}; + BOOST_REQUIRE_CLOSE_ABSOLUTE(ftActual, ftExpected, 5e-4); + } +} + +BOOST_AUTO_TEST_CASE(testAnovaConstantFactor) { + using TMeanAccumulator = maths::CBasicStatistics::SSampleMean<double>::TAccumulator; + + std::size_t dim{2}; + std::size_t mcSamples{1000}; + TDoubleVecVec testSamples; + maths::CSampling::sobolSequenceSample(dim, mcSamples, testSamples); + + auto verify = [&](double min, double max) { + TMeanAccumulator meanAccumulator; + maths::CBayesianOptimisation bopt{initBayesianOptimization(dim, 20, min, max)}; + double f0Actual{bopt.anovaConstantFactor()}; + for (std::size_t i = 0u; i < mcSamples; ++i) { + TVector input{(vector(testSamples[i]) * (max - min)).array() + min}; + meanAccumulator.add(bopt.evaluate(input)); + } + double f0Expected{maths::CBasicStatistics::mean(meanAccumulator)}; + BOOST_REQUIRE_CLOSE_ABSOLUTE(f0Actual, f0Expected, 5e-3); + }; + verify(0.0, 1.0); + verify(-3.0, 3.0); + verify(0.2, 0.8); +} + +BOOST_AUTO_TEST_CASE(testAnovaTotalVariance) { + using TMeanAccumulator = maths::CBasicStatistics::SSampleMean<double>::TAccumulator; + std::size_t dim{2}; + std::size_t mcSamples{1000}; + TDoubleVecVec testSamples; + maths::CSampling::sobolSequenceSample(dim, mcSamples, testSamples); + + auto verify = [&](double min, double max) { + TMeanAccumulator meanAccumulator; + maths::CBayesianOptimisation bopt{initBayesianOptimization(dim, 20, min, max)}; + double f0{bopt.anovaConstantFactor()}; + double totalVarianceActual{bopt.anovaTotalVariance()}; + for (std::size_t i = 0u; i < mcSamples; ++i) { + TVector input{(vector(testSamples[i]) * (max - min)).array() + min}; + meanAccumulator.add(maths::CTools::pow2(bopt.evaluate(input) - f0)); + } + double totalVarianceExpected{maths::CBasicStatistics::mean(meanAccumulator)}; + BOOST_REQUIRE_CLOSE_ABSOLUTE(totalVarianceActual, totalVarianceExpected, 5e-3); + }; + verify(0.0, 1.0); + verify(-3.0, 3.0); + verify(0.2, 0.8); +} + +BOOST_AUTO_TEST_CASE(testAnovaMainEffect) { + using TMeanAccumulator = maths::CBasicStatistics::SSampleMean<double>::TAccumulator; + std::size_t dim{2}; + std::size_t mcSamples{1000}; + TDoubleVecVec testSamples; + maths::CSampling::sobolSequenceSample(1, mcSamples, testSamples); + + auto verify = [&](double min, double max) { + maths::CBayesianOptimisation bopt{initBayesianOptimization(dim, 20, min, max)}; + for (std::size_t d = 0; d < dim; ++d) { + TMeanAccumulator meanAccumulator; + for (std::size_t i = 0; i < mcSamples; ++i) { + TVector input{(vector(testSamples[i]) * (max - min)).array() + min}; + meanAccumulator.add(maths::CTools::pow2( + bopt.evaluate1D(input[0], static_cast<int>(d)))); + } + double
mainEffectExpected{maths::CBasicStatistics::mean(meanAccumulator)}; + double mainEffectActual{bopt.anovaMainEffect(static_cast<int>(d))}; + BOOST_REQUIRE_CLOSE_ABSOLUTE(mainEffectActual, mainEffectExpected, 5e-3); + } + }; + verify(0.0, 1.0); + verify(-3.0, 3.0); + verify(0.2, 0.8); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/maths/unittest/CSamplingTest.cc b/lib/maths/unittest/CSamplingTest.cc index a1e907bf66..3eb6a9e424 100644 --- a/lib/maths/unittest/CSamplingTest.cc +++ b/lib/maths/unittest/CSamplingTest.cc @@ -389,4 +389,36 @@ BOOST_AUTO_TEST_CASE(testVectorDissimilaritySampler) { BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(percentageSeparationIncrease) > 50.0); } +BOOST_AUTO_TEST_CASE(testSobolSequenceSampling) { + { + // 1-dimensional sequence + std::size_t dim{1}; + std::size_t n{10}; + TDoubleVecVec expected{{0.5}, {0.75}, {0.25}, {0.375}, {0.875}, + {0.625}, {0.125}, {0.1875}, {0.6875}, {0.9375}}; + TDoubleVecVec actual; + maths::CSampling::sobolSequenceSample(dim, n, actual); + for (std::size_t i = 0u; i < n; ++i) { + BOOST_REQUIRE_EQUAL_COLLECTIONS(expected[i].begin(), expected[i].end(), + actual[i].begin(), actual[i].end()); + } + } + + { + // 2-dimensional sequence + std::size_t dim{2}; + std::size_t n{10}; + TDoubleVecVec expected{ {0.5, 0.5}, {0.75, 0.25}, {0.25, 0.75}, {0.375, 0.375}, {0.875, 0.875}, {0.625, 0.125}, {0.125, 0.625}, {0.1875, 0.3125}, {0.6875, 0.8125}, {0.9375, 0.0625}}; + TDoubleVecVec actual; + maths::CSampling::sobolSequenceSample(dim, n, actual); + for (std::size_t i = 0u; i < n; ++i) { + BOOST_REQUIRE_EQUAL_COLLECTIONS(expected[i].begin(), expected[i].end(), + actual[i].begin(), actual[i].end()); + } + } +} + BOOST_AUTO_TEST_SUITE_END()
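Note on how the importances above are computed, as a sketch in the patch's own terms: write f for the Gaussian process posterior mean of the fine tune loss over the hyperparameter search box rescaled to [0, 1]^d. The new anova* methods compute the functional ANOVA quantities

    f_0 = int_{[0,1]^d} f(x) dx
    V   = int_{[0,1]^d} (f(x) - f_0)^2 dx
    V_t = int_0^1 (int f(x) dx_{-t} - f_0)^2 dx_t

and anovaMainEffects() returns one pair {V_t, V_t / V} per tuned hyperparameter, i.e. its absolute and relative importance. Because the posterior mean is a weighted sum of RBF kernels centred at the sampled points, every integral factorizes into the closed form erf terms of integrate1dKernel and integrate1dKernelProduct.

A minimal usage sketch of the new interface (the observations and kernel parameters below are made up; vector is the test helper from CBayesianOptimisationTest.cc):

    // Two hyperparameters, both searched on [0, 1].
    maths::CBayesianOptimisation bopt{{{0.0, 1.0}, {0.0, 1.0}}};
    // Add (parameters, observed loss, loss variance) triples from fine tuning.
    bopt.add(vector({0.25, 0.50}), 0.31, 0.01);
    bopt.add(vector({0.75, 0.50}), 0.62, 0.01);
    bopt.add(vector({0.50, 0.25}), 0.40, 0.01);
    bopt.add(vector({0.50, 0.75}), 0.45, 0.01);
    // Fix the kernel parameters as the unit tests do; in production these are
    // the parameters fitted during Bayesian optimisation.
    bopt.kernelParameters(vector({1.0, 0.5, 0.5}));
    // One {absolute effect, fraction of total variance} pair per dimension; a
    // larger pair means the loss was more sensitive to that hyperparameter.
    auto mainEffects = bopt.anovaMainEffects();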