diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index df60309dc9..5997690edc 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -57,6 +57,8 @@ necessary. This will improve the allocation of data frame analyses to cluster no * Upgrade the compiler used on Linux from gcc 7.3 to gcc 7.5, and the binutils used in the build from version 2.20 to 2.34. (See {ml-pull}1013[#1013].) * Remove all memory overheads for computing tree SHAP values. (See {ml-pull}1023[#1023].) +* Distinguish between empty and missing categorical fields in classification and regression +model training. (See {ml-pull}1034[#1034].) === Bug Fixes diff --git a/include/api/CDataFrameAnalysisSpecification.h b/include/api/CDataFrameAnalysisSpecification.h index 6fefb6dd8a..266e9136ce 100644 --- a/include/api/CDataFrameAnalysisSpecification.h +++ b/include/api/CDataFrameAnalysisSpecification.h @@ -66,6 +66,7 @@ class API_EXPORT CDataFrameAnalysisSpecification { static const std::string THREADS; static const std::string TEMPORARY_DIRECTORY; static const std::string RESULTS_FIELD; + static const std::string MISSING_FIELD_VALUE; static const std::string CATEGORICAL_FIELD_NAMES; static const std::string DISK_USAGE_ALLOWED; static const std::string ANALYSIS; @@ -203,6 +204,7 @@ class API_EXPORT CDataFrameAnalysisSpecification { std::string m_ResultsField; std::string m_JobId; std::string m_AnalysisName; + std::string m_MissingFieldValue; TStrVec m_CategoricalFieldNames; bool m_DiskUsageAllowed; // TODO Sparse table support diff --git a/include/api/CDataFrameAnalysisSpecificationJsonWriter.h b/include/api/CDataFrameAnalysisSpecificationJsonWriter.h index 4399cbb5bf..ba03cec583 100644 --- a/include/api/CDataFrameAnalysisSpecificationJsonWriter.h +++ b/include/api/CDataFrameAnalysisSpecificationJsonWriter.h @@ -34,6 +34,7 @@ class API_EXPORT CDataFrameAnalysisSpecificationJsonWriter : private core::CNonI std::size_t numberThreads, const std::string& temporaryDirectory, const 
std::string& resultsField, + const std::string& missingString, const TStrVec& categoricalFields, bool diskUsageAllowed, const std::string& analysisName, @@ -48,6 +49,7 @@ class API_EXPORT CDataFrameAnalysisSpecificationJsonWriter : private core::CNonI std::size_t numberThreads, const std::string& temporaryDirectory, const std::string& resultsField, + const std::string& missingString, const TStrVec& categoricalFields, bool diskUsageAllowed, const std::string& analysisName, @@ -56,10 +58,11 @@ class API_EXPORT CDataFrameAnalysisSpecificationJsonWriter : private core::CNonI //! Returns a string with the data frame analysis specification in JSON format. static std::string jsonString(const std::string& jobId, - size_t rows, - size_t cols, - size_t memoryLimit, - size_t numberThreads, + std::size_t rows, + std::size_t cols, + std::size_t memoryLimit, + std::size_t numberThreads, + const std::string& missingString, const TStrVec& categoricalFields, bool diskUsageAllowed, const std::string& tempDir, diff --git a/include/core/CDataFrame.h b/include/core/CDataFrame.h index 6134ff7dc6..f17fc59041 100644 --- a/include/core/CDataFrame.h +++ b/include/core/CDataFrame.h @@ -238,6 +238,9 @@ class CORE_EXPORT CDataFrame final { //! The maximum number of distinct categorical fields we can faithfully represent. static const std::size_t MAX_CATEGORICAL_CARDINALITY; + //! The default value indicating that a value is missing. + static const std::string DEFAULT_MISSING_STRING; + public: //! \param[in] inMainMemory True if the data frame is stored in main memory. //! \param[in] numberColumns The number of columns in the data frame. @@ -443,6 +446,9 @@ class CORE_EXPORT CDataFrame final { //! Write the column names. void columnNames(TStrVec columnNames); + //! Write the string which indicates that a value is missing. + void missingString(std::string missing); + //! Write for which columns an empty string implies the value is missing. 
void emptyIsMissing(TBoolVec emptyIsMissing); @@ -577,7 +583,12 @@ class CORE_EXPORT CDataFrame final { //! A lookup for the integer value of categories. TStrSizeUMapVec m_CategoricalColumnValueLookup; + //! The string which indicates that a category is missing. + std::string m_MissingString; + //! Indicator vector for treating empty strings as missing values. + // TODO Remove once Java passes the correct value for the missing target + // for classification. TBoolVec m_EmptyIsMissing; //! Indicator vector of the columns which contain categorical values. diff --git a/include/test/CDataFrameAnalysisSpecificationFactory.h b/include/test/CDataFrameAnalysisSpecificationFactory.h index 0fec4c3148..07bb626238 100644 --- a/include/test/CDataFrameAnalysisSpecificationFactory.h +++ b/include/test/CDataFrameAnalysisSpecificationFactory.h @@ -14,6 +14,8 @@ #include +#include + #include #include #include @@ -32,37 +34,77 @@ class TEST_EXPORT CDataFrameAnalysisSpecificationFactory { using TSpecificationUPtr = std::unique_ptr; public: + CDataFrameAnalysisSpecificationFactory(); + static const std::string& classification(); static const std::string& regression(); - static TSpecificationUPtr outlierSpec(std::size_t rows = 110, - std::size_t cols = 5, - std::size_t memoryLimit = 100000, - const std::string& method = "", - std::size_t numberNeighbours = 0, - bool computeFeatureInfluence = false, - bool diskUsageAllowed = true); - - static TSpecificationUPtr - predictionSpec(const std::string& analysis, - const std::string& dependentVariable, - std::size_t rows = 100, - std::size_t cols = 5, - std::size_t memoryLimit = 7000000, - std::size_t numberRoundsPerHyperparameter = 0, - std::size_t bayesianOptimisationRestarts = 0, - const TStrVec& categoricalFieldNames = TStrVec{}, - double alpha = -1.0, - double lambda = -1.0, - double gamma = -1.0, - double softTreeDepthLimit = -1.0, - double softTreeDepthTolerance = -1.0, - double eta = -1.0, - std::size_t maximumNumberTrees = 0, - double 
featureBagFraction = -1.0, - size_t topShapValues = 0, - TPersisterSupplier* persisterSupplier = nullptr, - TRestoreSearcherSupplier* restoreSearcherSupplier = nullptr); + // Shared + CDataFrameAnalysisSpecificationFactory& rows(std::size_t rows); + CDataFrameAnalysisSpecificationFactory& columns(std::size_t columns); + CDataFrameAnalysisSpecificationFactory& memoryLimit(std::size_t memoryLimit); + CDataFrameAnalysisSpecificationFactory& missingString(const std::string& missing); + CDataFrameAnalysisSpecificationFactory& diskUsageAllowed(bool disk); + + // Outliers + CDataFrameAnalysisSpecificationFactory& outlierMethod(std::string method); + CDataFrameAnalysisSpecificationFactory& outlierNumberNeighbours(std::size_t number); + CDataFrameAnalysisSpecificationFactory& outlierComputeInfluence(bool compute); + + // Prediction + CDataFrameAnalysisSpecificationFactory& + predicitionNumberRoundsPerHyperparameter(std::size_t rounds); + CDataFrameAnalysisSpecificationFactory& + predictionBayesianOptimisationRestarts(std::size_t restarts); + CDataFrameAnalysisSpecificationFactory& + predictionCategoricalFieldNames(const TStrVec& categorical); + CDataFrameAnalysisSpecificationFactory& predictionAlpha(double alpha); + CDataFrameAnalysisSpecificationFactory& predictionLambda(double lambda); + CDataFrameAnalysisSpecificationFactory& predictionGamma(double gamma); + CDataFrameAnalysisSpecificationFactory& predictionSoftTreeDepthLimit(double limit); + CDataFrameAnalysisSpecificationFactory& predictionSoftTreeDepthTolerance(double tolerance); + CDataFrameAnalysisSpecificationFactory& predictionEta(double eta); + CDataFrameAnalysisSpecificationFactory& predictionMaximumNumberTrees(std::size_t number); + CDataFrameAnalysisSpecificationFactory& predictionFeatureBagFraction(double fraction); + CDataFrameAnalysisSpecificationFactory& predictionNumberTopShapValues(std::size_t number); + CDataFrameAnalysisSpecificationFactory& + predictionPersisterSupplier(TPersisterSupplier* 
persisterSupplier); + CDataFrameAnalysisSpecificationFactory& + predictionRestoreSearcherSupplier(TRestoreSearcherSupplier* restoreSearcherSupplier); + + TSpecificationUPtr outlierSpec() const; + TSpecificationUPtr predictionSpec(const std::string& analysis, + const std::string& dependentVariable) const; + +private: + using TOptionalSize = boost::optional; + +private: + // Shared + TOptionalSize m_Rows; + TOptionalSize m_Columns; + TOptionalSize m_MemoryLimit; + std::string m_MissingString; + bool m_DiskUsageAllowed = true; + // Outliers + std::string m_Method; + std::size_t m_NumberNeighbours = 0; + bool m_ComputeFeatureInfluence = false; + // Prediction + std::size_t m_NumberRoundsPerHyperparameter = 0; + std::size_t m_BayesianOptimisationRestarts = 0; + TStrVec m_CategoricalFieldNames; + double m_Alpha = -1.0; + double m_Lambda = -1.0; + double m_Gamma = -1.0; + double m_SoftTreeDepthLimit = -1.0; + double m_SoftTreeDepthTolerance = -1.0; + double m_Eta = -1.0; + std::size_t m_MaximumNumberTrees = 0; + double m_FeatureBagFraction = -1.0; + std::size_t m_NumberTopShapValues = 0; + TPersisterSupplier* m_PersisterSupplier = nullptr; + TRestoreSearcherSupplier* m_RestoreSearcherSupplier = nullptr; }; } } diff --git a/lib/api/CDataFrameAnalysisSpecification.cc b/lib/api/CDataFrameAnalysisSpecification.cc index 14c5d9fdf6..1a1203fa82 100644 --- a/lib/api/CDataFrameAnalysisSpecification.cc +++ b/lib/api/CDataFrameAnalysisSpecification.cc @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -28,18 +29,19 @@ namespace ml { namespace api { // These must be consistent with Java names. 
-const std::string CDataFrameAnalysisSpecification::JOB_ID("job_id"); -const std::string CDataFrameAnalysisSpecification::ROWS("rows"); -const std::string CDataFrameAnalysisSpecification::COLS("cols"); -const std::string CDataFrameAnalysisSpecification::MEMORY_LIMIT("memory_limit"); -const std::string CDataFrameAnalysisSpecification::THREADS("threads"); -const std::string CDataFrameAnalysisSpecification::TEMPORARY_DIRECTORY("temp_dir"); -const std::string CDataFrameAnalysisSpecification::RESULTS_FIELD("results_field"); +const std::string CDataFrameAnalysisSpecification::JOB_ID{"job_id"}; +const std::string CDataFrameAnalysisSpecification::ROWS{"rows"}; +const std::string CDataFrameAnalysisSpecification::COLS{"cols"}; +const std::string CDataFrameAnalysisSpecification::MEMORY_LIMIT{"memory_limit"}; +const std::string CDataFrameAnalysisSpecification::THREADS{"threads"}; +const std::string CDataFrameAnalysisSpecification::TEMPORARY_DIRECTORY{"temp_dir"}; +const std::string CDataFrameAnalysisSpecification::RESULTS_FIELD{"results_field"}; +const std::string CDataFrameAnalysisSpecification::MISSING_FIELD_VALUE{"missing_field_value"}; const std::string CDataFrameAnalysisSpecification::CATEGORICAL_FIELD_NAMES{"categorical_fields"}; -const std::string CDataFrameAnalysisSpecification::DISK_USAGE_ALLOWED("disk_usage_allowed"); -const std::string CDataFrameAnalysisSpecification::ANALYSIS("analysis"); -const std::string CDataFrameAnalysisSpecification::NAME("name"); -const std::string CDataFrameAnalysisSpecification::PARAMETERS("parameters"); +const std::string CDataFrameAnalysisSpecification::DISK_USAGE_ALLOWED{"disk_usage_allowed"}; +const std::string CDataFrameAnalysisSpecification::ANALYSIS{"analysis"}; +const std::string CDataFrameAnalysisSpecification::NAME{"name"}; +const std::string CDataFrameAnalysisSpecification::PARAMETERS{"parameters"}; namespace { using TBoolVec = std::vector; @@ -75,6 +77,8 @@ const CDataFrameAnalysisConfigReader CONFIG_READER{[] { 
CDataFrameAnalysisConfigReader::E_OptionalParameter); theReader.addParameter(CDataFrameAnalysisSpecification::RESULTS_FIELD, CDataFrameAnalysisConfigReader::E_OptionalParameter); + theReader.addParameter(CDataFrameAnalysisSpecification::MISSING_FIELD_VALUE, + CDataFrameAnalysisConfigReader::E_OptionalParameter); theReader.addParameter(CDataFrameAnalysisSpecification::CATEGORICAL_FIELD_NAMES, CDataFrameAnalysisConfigReader::E_OptionalParameter); theReader.addParameter(CDataFrameAnalysisSpecification::DISK_USAGE_ALLOWED, @@ -131,12 +135,20 @@ CDataFrameAnalysisSpecification::CDataFrameAnalysisSpecification( m_TemporaryDirectory = parameters[TEMPORARY_DIRECTORY].fallback(std::string{}); m_JobId = parameters[JOB_ID].fallback(std::string{}); m_ResultsField = parameters[RESULTS_FIELD].fallback(DEFAULT_RESULT_FIELD); + m_MissingFieldValue = parameters[MISSING_FIELD_VALUE].fallback( + core::CDataFrame::DEFAULT_MISSING_STRING); m_CategoricalFieldNames = parameters[CATEGORICAL_FIELD_NAMES].fallback(TStrVec{}); m_DiskUsageAllowed = parameters[DISK_USAGE_ALLOWED].fallback(DEFAULT_DISK_USAGE_ALLOWED); + double missing; + if (m_MissingFieldValue != core::CDataFrame::DEFAULT_MISSING_STRING && + core::CStringUtils::stringToTypeSilent(m_MissingFieldValue, missing)) { + HANDLE_FATAL(<< "Input error: you can't use a number (" << missing + << ") to denote a missing field value.") + } if (m_DiskUsageAllowed && m_TemporaryDirectory.empty()) { HANDLE_FATAL(<< "Input error: temporary directory path should be explicitly set if disk" - " usage is allowed! Please report this problem."); + " usage is allowed! Please report this problem.") } auto jsonAnalysis = parameters[ANALYSIS].jsonObject(); @@ -189,6 +201,7 @@ CDataFrameAnalysisSpecification::makeDataFrame() { ? 
core::makeMainStorageDataFrame(m_NumberColumns) : core::makeDiskStorageDataFrame(m_TemporaryDirectory, m_NumberColumns, m_NumberRows); + result.first->missingString(m_MissingFieldValue); result.first->reserve(m_NumberThreads, m_NumberColumns + this->numberExtraColumns()); return result; diff --git a/lib/api/CDataFrameAnalysisSpecificationJsonWriter.cc b/lib/api/CDataFrameAnalysisSpecificationJsonWriter.cc index 38f5e0f124..b278781118 100644 --- a/lib/api/CDataFrameAnalysisSpecificationJsonWriter.cc +++ b/lib/api/CDataFrameAnalysisSpecificationJsonWriter.cc @@ -6,6 +6,8 @@ #include +#include + #include #include @@ -20,6 +22,7 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId, std::size_t numberThreads, const std::string& temporaryDirectory, const std::string& resultsField, + const std::string& missingFieldValue, const TStrVec& categoricalFields, bool diskUsageAllowed, const std::string& analysisName, @@ -34,8 +37,8 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId, } } write(jobId, rows, cols, memoryLimit, numberThreads, temporaryDirectory, - resultsField, categoricalFields, diskUsageAllowed, analysisName, - analysisParametersDoc, writer); + resultsField, missingFieldValue, categoricalFields, diskUsageAllowed, + analysisName, analysisParametersDoc, writer); } void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId, @@ -45,6 +48,7 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId, std::size_t numberThreads, const std::string& temporaryDirectory, const std::string& resultsField, + const std::string& missingFieldValue, const TStrVec& categoricalFields, bool diskUsageAllowed, const std::string& analysisName, @@ -73,6 +77,11 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId, writer.Key(CDataFrameAnalysisSpecification::RESULTS_FIELD); writer.String(resultsField); + if (missingFieldValue != 
core::CDataFrame::DEFAULT_MISSING_STRING) { + writer.Key(CDataFrameAnalysisSpecification::MISSING_FIELD_VALUE); + writer.String(missingFieldValue); + } + rapidjson::Value array(rapidjson::kArrayType); for (const auto& field : categoricalFields) { array.PushBack(rapidjson::Value(rapidjson::StringRef(field)), @@ -105,24 +114,26 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId, writer.Flush(); } -std::string -CDataFrameAnalysisSpecificationJsonWriter::jsonString(const std::string& jobId, - size_t rows, - size_t cols, - size_t memoryLimit, - size_t numberThreads, - const TStrVec& categoricalFields, - bool diskUsageAllowed, - const std::string& tempDir, - const std::string& resultField, - const std::string& analysisName, - const std::string& analysisParameters) { +std::string CDataFrameAnalysisSpecificationJsonWriter::jsonString( + const std::string& jobId, + std::size_t rows, + std::size_t cols, + std::size_t memoryLimit, + std::size_t numberThreads, + const std::string& missingFieldValue, + const TStrVec& categoricalFields, + bool diskUsageAllowed, + const std::string& tempDir, + const std::string& resultField, + const std::string& analysisName, + const std::string& analysisParameters) { rapidjson::StringBuffer stringBuffer; TRapidJsonLineWriter writer; writer.Reset(stringBuffer); write(jobId, rows, cols, memoryLimit, numberThreads, tempDir, resultField, - categoricalFields, diskUsageAllowed, analysisName, analysisParameters, writer); + missingFieldValue, categoricalFields, diskUsageAllowed, analysisName, + analysisParameters, writer); return stringBuffer.GetString(); } diff --git a/lib/api/unittest/CBoostedTreeInferenceModelBuilderTest.cc b/lib/api/unittest/CBoostedTreeInferenceModelBuilderTest.cc index 923b4e2aba..6e65e8cc2a 100644 --- a/lib/api/unittest/CBoostedTreeInferenceModelBuilderTest.cc +++ b/lib/api/unittest/CBoostedTreeInferenceModelBuilderTest.cc @@ -94,14 +94,17 @@ BOOST_AUTO_TEST_CASE(testIntegrationRegression) { 
values[2].push_back(values[0][i] * weights[0] + values[1][i] * weights[1]); } + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::regression(), "target_col", - numberExamples, cols, 30000000, 0, 0, {"categorical_col"}), + specFactory.rows(numberExamples) + .columns(cols) + .memoryLimit(30000000) + .predictionCategoricalFieldNames({"categorical_col"}) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::regression(), "target_col"), outputWriterFactory}; - TDataFrameUPtr frame = - core::makeMainStorageDataFrame(cols + 2, numberExamples).first; + TDataFrameUPtr frame{ + core::makeMainStorageDataFrame(cols + 2, numberExamples).first}; for (std::size_t i = 0; i < numberExamples; ++i) { for (std::size_t j = 0; j < cols; ++j) { fieldValues[j] = core::CStringUtils::typeToStringPrecise( @@ -178,14 +181,17 @@ BOOST_AUTO_TEST_CASE(testIntegrationClassification) { values[1] = generateCategoricalData(rng, numberExamples, {100., 5.0, 5.0}).second; values[2] = generateCategoricalData(rng, numberExamples, {5.0, 5.0}).second; + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::classification(), "target_col", - numberExamples, cols, 30000000, 0, 0, {"categorical_col", "target_col"}), + specFactory.rows(numberExamples) + .columns(cols) + .memoryLimit(30000000) + .predictionCategoricalFieldNames({"categorical_col", "target_col"}) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target_col"), outputWriterFactory}; - TDataFrameUPtr frame = - core::makeMainStorageDataFrame(cols + 2, numberExamples).first; + TDataFrameUPtr frame{ + core::makeMainStorageDataFrame(cols + 2, numberExamples).first}; for (std::size_t i = 0; i < numberExamples; 
++i) { for (std::size_t j = 0; j < cols; ++j) { fieldValues[j] = core::CStringUtils::typeToStringPrecise( @@ -270,14 +276,17 @@ BOOST_AUTO_TEST_CASE(testJsonSchema) { values[2].push_back(values[0][i] * weights[0] + values[1][i] * weights[1]); } + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::regression(), "target_col", - numberExamples, cols, 30000000, 0, 0, {"categorical_col"}), + specFactory.rows(numberExamples) + .columns(cols) + .memoryLimit(30000000) + .predictionCategoricalFieldNames({"categorical_col"}) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::regression(), "target_col"), outputWriterFactory}; - TDataFrameUPtr frame = - core::makeMainStorageDataFrame(cols + 2, numberExamples).first; + TDataFrameUPtr frame{ + core::makeMainStorageDataFrame(cols + 2, numberExamples).first}; for (std::size_t i = 0; i < numberExamples; ++i) { for (std::size_t j = 0; j < cols; ++j) { fieldValues[j] = core::CStringUtils::typeToStringPrecise( diff --git a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc index 4f74941bf9..a3e57fafd7 100644 --- a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc @@ -40,8 +40,12 @@ BOOST_AUTO_TEST_CASE(testComputeExecutionStrategyForOutliers) { for (auto numberCols : numbersCols) { LOG_DEBUG(<< "# rows = " << numberRows << ", # cols = " << numberCols); - auto spec{test::CDataFrameAnalysisSpecificationFactory::outlierSpec( - numberRows, numberCols, 100000000, "", 0, true)}; + test::CDataFrameAnalysisSpecificationFactory specFactory; + auto spec = specFactory.rows(numberRows) + .columns(numberCols) + .memoryLimit(100000000) + .outlierComputeInfluence(true) + .outlierSpec(); api::CDataFrameOutliersRunnerFactory factory; auto runner = factory.make(*spec); @@ -83,8 +87,13 @@ 
BOOST_AUTO_TEST_CASE(testComputeAndSaveExecutionStrategyDiskUsageFlag) { // Test large memory requirement without disk usage { errors.clear(); - auto spec = test::CDataFrameAnalysisSpecificationFactory::outlierSpec( - 1000, 100, 500000, "", 0, true, false); + test::CDataFrameAnalysisSpecificationFactory specFactory; + auto spec = specFactory.rows(1000) + .columns(100) + .memoryLimit(500000) + .outlierComputeInfluence(true) + .diskUsageAllowed(false) + .outlierSpec(); // single error is registered that the memory limit is to low LOG_DEBUG(<< "errors = " << core::CContainerPrinter::print(errors)); @@ -97,8 +106,13 @@ BOOST_AUTO_TEST_CASE(testComputeAndSaveExecutionStrategyDiskUsageFlag) { // Test large memory requirement with disk usage { errors.clear(); - auto spec = test::CDataFrameAnalysisSpecificationFactory::outlierSpec( - 1000, 100, 500000, "", 0, true, true); + test::CDataFrameAnalysisSpecificationFactory specFactory; + auto spec = specFactory.rows(1000) + .columns(100) + .memoryLimit(500000) + .outlierComputeInfluence(true) + .diskUsageAllowed(true) + .outlierSpec(); // no error should be registered BOOST_REQUIRE_EQUAL(0, static_cast(errors.size())); @@ -107,8 +121,13 @@ BOOST_AUTO_TEST_CASE(testComputeAndSaveExecutionStrategyDiskUsageFlag) { // Test low memory requirement without disk usage { errors.clear(); - auto spec = test::CDataFrameAnalysisSpecificationFactory::outlierSpec( - 10, 10, 500000, "", 0, true, false); + test::CDataFrameAnalysisSpecificationFactory specFactory; + auto spec = specFactory.rows(10) + .columns(10) + .memoryLimit(500000) + .outlierComputeInfluence(true) + .diskUsageAllowed(false) + .outlierSpec(); // no error should be registered BOOST_REQUIRE_EQUAL(0, static_cast(errors.size())); @@ -131,10 +150,13 @@ void testEstimateMemoryUsage(std::int64_t numberRows, core::CLogger::CScopeSetFatalErrorHandler scope{errorHandler}; - // The output writer won't close the JSON structures until is is destroyed + // The output writer won't close the 
JSON structures until it is destroyed. { + test::CDataFrameAnalysisSpecificationFactory specFactory; + auto spec = specFactory.rows(numberRows) + .memoryLimit(100000000) + .outlierComputeInfluence(true) + .outlierSpec(); core::CJsonOutputStreamWrapper wrappedOutStream(sstream); api::CMemoryUsageEstimationResultJsonWriter writer(wrappedOutStream); @@ -189,7 +211,7 @@ void testColumnsForWhichEmptyIsMissing(const std::string& analysis, const TBoolVec& expectedEmptyIsMissing) { std::string parameters{"{\"dependent_variable\": \"" + dependentVariableName + "\"}"}; std::string jsonSpec{api::CDataFrameAnalysisSpecificationJsonWriter::jsonString( - "testJob", 10000, 5, 100000000, 1, categoricalFields, true, + "testJob", 10000, 5, 100000000, 1, "", categoricalFields, true, test::CTestTmpDir::tmpDir(), "", analysis, parameters)}; api::CDataFrameAnalysisSpecification spec{jsonSpec}; auto emptyIsMissing = spec.columnsForWhichEmptyIsMissing(fieldNames); diff --git a/lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc b/lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc index 450764e5cb..bfca409ac2 100644 --- a/lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc @@ -39,7 +39,7 @@ using TRunnerFactoryUPtrVec = std::vector; std::string createSpecJsonForTempDirDiskUsageTest(bool tempDirPathSet, bool diskUsageAllowed) { std::string tempDir = tempDirPathSet ? 
test::CTestTmpDir::tmpDir() : ""; return api::CDataFrameAnalysisSpecificationJsonWriter::jsonString( - "testJob", 100, 3, 500000, 1, {}, diskUsageAllowed, tempDir, "", + "testJob", 100, 3, 500000, 1, "", {}, diskUsageAllowed, tempDir, "", "outlier_detection", ""); } } @@ -340,7 +340,7 @@ BOOST_AUTO_TEST_CASE(testCreate) { std::string parameters{"{\"dependent_variable\": \"class\"}"}; api::CDataFrameAnalysisSpecification spec{ api::CDataFrameAnalysisSpecificationJsonWriter::jsonString( - "testJob", 10000, 5, 100000000, 1, {}, true, + "testJob", 10000, 5, 100000000, 1, "", {}, true, test::CTestTmpDir::tmpDir(), "", "classification", parameters)}; LOG_DEBUG(<< core::CContainerPrinter::print(errors)); BOOST_TEST_REQUIRE(errors.size() > 0); @@ -352,7 +352,19 @@ BOOST_AUTO_TEST_CASE(testCreate) { std::string parameters{"{\"dependent_variable\": \"value\"}"}; api::CDataFrameAnalysisSpecification spec{ api::CDataFrameAnalysisSpecificationJsonWriter::jsonString( - "testJob", 10000, 5, 100000000, 1, {"value"}, true, + "testJob", 10000, 5, 100000000, 1, "", {"value"}, true, + test::CTestTmpDir::tmpDir(), "", "regression", parameters)}; + LOG_DEBUG(<< core::CContainerPrinter::print(errors)); + BOOST_TEST_REQUIRE(errors.size() > 0); + } + + LOG_DEBUG(<< "Missing field value is numeric"); + { + errors.clear(); + std::string parameters{"{\"dependent_variable\": \"value\"}"}; + api::CDataFrameAnalysisSpecification spec{ + api::CDataFrameAnalysisSpecificationJsonWriter::jsonString( + "testJob", 10000, 5, 100000000, 1, "42", {}, true, test::CTestTmpDir::tmpDir(), "", "regression", parameters)}; LOG_DEBUG(<< core::CContainerPrinter::print(errors)); BOOST_TEST_REQUIRE(errors.size() > 0); @@ -372,7 +384,8 @@ BOOST_AUTO_TEST_CASE(testRunAnalysis) { }; std::string jsonSpec = api::CDataFrameAnalysisSpecificationJsonWriter::jsonString( - "testJob", 100, 10, 1000, 1, {}, true, test::CTestTmpDir::tmpDir(), "", "test", ""); + "testJob", 100, 10, 1000, 1, "", {}, true, 
test::CTestTmpDir::tmpDir(), + "", "test", ""); for (std::size_t i = 0; i < 10; ++i) { api::CDataFrameAnalysisSpecification spec{testFactory(), jsonSpec}; diff --git a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc index ab3a713961..eae3c71aea 100644 --- a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc @@ -134,12 +134,21 @@ struct SFixture { auto outputWriterFactory = [&]() { return std::make_unique(s_Output); }; + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::regression(), - "target", s_Rows, 5, 26000000, 0, 0, {"c1"}, s_Alpha, s_Lambda, - s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, s_Eta, - s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), + specFactory.rows(s_Rows) + .memoryLimit(26000000) + .predictionCategoricalFieldNames({"c1"}) + .predictionAlpha(s_Alpha) + .predictionLambda(s_Lambda) + .predictionGamma(s_Gamma) + .predictionSoftTreeDepthLimit(s_SoftTreeDepthLimit) + .predictionSoftTreeDepthTolerance(s_SoftTreeDepthTolerance) + .predictionEta(s_Eta) + .predictionMaximumNumberTrees(s_MaximumNumberTrees) + .predictionFeatureBagFraction(s_FeatureBagFraction) + .predictionNumberTopShapValues(shapValues) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), outputWriterFactory}; TStrVec fieldNames{"target", "c1", "c2", "c3", "c4", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; @@ -178,12 +187,21 @@ struct SFixture { auto outputWriterFactory = [&]() { return std::make_unique(s_Output); }; + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - 
test::CDataFrameAnalysisSpecificationFactory::classification(), - "target", s_Rows, 5, 26000000, 0, 0, {"target"}, s_Alpha, - s_Lambda, s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, - s_Eta, s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), + specFactory.rows(s_Rows) + .memoryLimit(26000000) + .predictionCategoricalFieldNames({"target"}) + .predictionAlpha(s_Alpha) + .predictionLambda(s_Lambda) + .predictionGamma(s_Gamma) + .predictionSoftTreeDepthLimit(s_SoftTreeDepthLimit) + .predictionSoftTreeDepthTolerance(s_SoftTreeDepthTolerance) + .predictionEta(s_Eta) + .predictionMaximumNumberTrees(s_MaximumNumberTrees) + .predictionFeatureBagFraction(s_FeatureBagFraction) + .predictionNumberTopShapValues(shapValues) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target"), outputWriterFactory}; TStrVec fieldNames{"target", "c1", "c2", "c3", "c4", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; @@ -217,12 +235,20 @@ struct SFixture { auto outputWriterFactory = [&]() { return std::make_unique(s_Output); }; + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::regression(), - "target", s_Rows, 5, 26000000, 0, 0, {}, s_Alpha, s_Lambda, - s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, s_Eta, - s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), + specFactory.rows(s_Rows) + .memoryLimit(26000000) + .predictionAlpha(s_Alpha) + .predictionLambda(s_Lambda) + .predictionGamma(s_Gamma) + .predictionSoftTreeDepthLimit(s_SoftTreeDepthLimit) + .predictionSoftTreeDepthTolerance(s_SoftTreeDepthTolerance) + .predictionEta(s_Eta) + .predictionMaximumNumberTrees(s_MaximumNumberTrees) + .predictionFeatureBagFraction(s_FeatureBagFraction) + .predictionNumberTopShapValues(shapValues) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::regression(), 
"target"), outputWriterFactory}; TStrVec fieldNames{"target", "c1", "c2", "c3", "c4", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; diff --git a/lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc b/lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc index 7bfaa8a4cc..55437c0857 100644 --- a/lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc @@ -115,7 +115,7 @@ BOOST_AUTO_TEST_CASE(testWithoutControlMessages) { std::stringstream persistStream; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::outlierSpec(), outputWriterFactory}; + test::CDataFrameAnalysisSpecificationFactory{}.outlierSpec(), outputWriterFactory}; TDoubleVec expectedScores; TDoubleVecVec expectedFeatureInfluences; @@ -162,7 +162,7 @@ BOOST_AUTO_TEST_CASE(testRunOutlierDetection) { }; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::outlierSpec(), outputWriterFactory}; + test::CDataFrameAnalysisSpecificationFactory{}.outlierSpec(), outputWriterFactory}; TDoubleVec expectedScores; TDoubleVecVec expectedFeatureInfluences; @@ -223,8 +223,8 @@ BOOST_AUTO_TEST_CASE(testRunOutlierDetectionPartitioned) { return std::make_unique(output); }; - api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::outlierSpec(1000), outputWriterFactory}; + test::CDataFrameAnalysisSpecificationFactory specFactory; + api::CDataFrameAnalyzer analyzer{specFactory.rows(1000).outlierSpec(), outputWriterFactory}; TDoubleVec expectedScores; TDoubleVecVec expectedFeatureInfluences; @@ -276,9 +276,9 @@ BOOST_AUTO_TEST_CASE(testRunOutlierFeatureInfluences) { return std::make_unique(output); }; - api::CDataFrameAnalyzer analyzer{test::CDataFrameAnalysisSpecificationFactory::outlierSpec( - 110, 5, 100000, "", 0, true), - outputWriterFactory}; + test::CDataFrameAnalysisSpecificationFactory specFactory; + api::CDataFrameAnalyzer analyzer{ + 
specFactory.outlierComputeInfluence(true).outlierSpec(), outputWriterFactory}; TDoubleVec expectedScores; TDoubleVecVec expectedFeatureInfluences; @@ -331,9 +331,12 @@ BOOST_AUTO_TEST_CASE(testRunOutlierDetectionWithParams) { return std::make_unique(output); }; + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::outlierSpec( - 110, 5, 1000000, methods[method], k, false), + specFactory.outlierMethod(methods[method]) + .outlierNumberNeighbours(k) + .outlierComputeInfluence(false) + .outlierSpec(), outputWriterFactory}; TDoubleVec expectedScores; @@ -376,7 +379,7 @@ BOOST_AUTO_TEST_CASE(testOutlierDetectionStateReport) { }; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::outlierSpec(), outputWriterFactory}; + test::CDataFrameAnalysisSpecificationFactory{}.outlierSpec(), outputWriterFactory}; TDoubleVec expectedScores; TDoubleVecVec expectedFeatureInfluences; @@ -411,7 +414,7 @@ BOOST_AUTO_TEST_CASE(testFlushMessage) { }; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::outlierSpec(), outputWriterFactory}; + test::CDataFrameAnalysisSpecificationFactory{}.outlierSpec(), outputWriterFactory}; BOOST_REQUIRE_EQUAL( true, analyzer.handleRecord({"c1", "c2", "c3", "c4", "c5", ".", "."}, {"", "", "", "", "", "", " "})); @@ -449,7 +452,7 @@ BOOST_AUTO_TEST_CASE(testErrors) { { errors.clear(); api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::outlierSpec(), outputWriterFactory}; + test::CDataFrameAnalysisSpecificationFactory{}.outlierSpec(), outputWriterFactory}; BOOST_REQUIRE_EQUAL( false, analyzer.handleRecord({"c1", "c2", "c3", ".", "c4", "c5", "."}, {"10", "10", "10", "", "10", "10", ""})); @@ -460,7 +463,7 @@ BOOST_AUTO_TEST_CASE(testErrors) { // Test missing special field { api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::outlierSpec(), 
outputWriterFactory}; + test::CDataFrameAnalysisSpecificationFactory{}.outlierSpec(), outputWriterFactory}; errors.clear(); BOOST_REQUIRE_EQUAL( false, analyzer.handleRecord({"c1", "c2", "c3", "c4", "c5", "."}, @@ -472,7 +475,7 @@ BOOST_AUTO_TEST_CASE(testErrors) { // Test bad control message { api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::outlierSpec(), outputWriterFactory}; + test::CDataFrameAnalysisSpecificationFactory{}.outlierSpec(), outputWriterFactory}; errors.clear(); BOOST_REQUIRE_EQUAL( false, analyzer.handleRecord({"c1", "c2", "c3", "c4", "c5", ".", "."}, @@ -484,7 +487,7 @@ BOOST_AUTO_TEST_CASE(testErrors) { // Test bad input { api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::outlierSpec(), outputWriterFactory}; + test::CDataFrameAnalysisSpecificationFactory{}.outlierSpec(), outputWriterFactory}; errors.clear(); BOOST_REQUIRE_EQUAL( false, analyzer.handleRecord({"c1", "c2", "c3", "c4", "c5", ".", "."}, @@ -496,8 +499,8 @@ BOOST_AUTO_TEST_CASE(testErrors) { // Test inconsistent number of rows { // Fewer rows than expected is ignored. 
- api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::outlierSpec(2), outputWriterFactory}; + test::CDataFrameAnalysisSpecificationFactory specFactory; + api::CDataFrameAnalyzer analyzer{specFactory.rows(2).outlierSpec(), outputWriterFactory}; errors.clear(); BOOST_REQUIRE_EQUAL( true, analyzer.handleRecord({"c1", "c2", "c3", "c4", "c5", ".", "."}, @@ -509,8 +512,8 @@ BOOST_AUTO_TEST_CASE(testErrors) { BOOST_TEST_REQUIRE(errors.empty()); } { - api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::outlierSpec(2), outputWriterFactory}; + test::CDataFrameAnalysisSpecificationFactory specFactory; + api::CDataFrameAnalyzer analyzer{specFactory.rows(2).outlierSpec(), outputWriterFactory}; errors.clear(); BOOST_REQUIRE_EQUAL( true, analyzer.handleRecord({"c1", "c2", "c3", "c4", "c5", ".", "."}, @@ -530,8 +533,8 @@ BOOST_AUTO_TEST_CASE(testErrors) { // No data. { - api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::outlierSpec(2), outputWriterFactory}; + test::CDataFrameAnalysisSpecificationFactory specFactory; + api::CDataFrameAnalyzer analyzer{specFactory.rows(2).outlierSpec(), outputWriterFactory}; errors.clear(); BOOST_REQUIRE_EQUAL( true, analyzer.handleRecord({"c1", "c2", "c3", "c4", "c5", ".", "."}, @@ -549,8 +552,8 @@ BOOST_AUTO_TEST_CASE(testRoundTripDocHashes) { return std::make_unique(output); }; - api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::outlierSpec(9), outputWriterFactory}; + test::CDataFrameAnalysisSpecificationFactory specFactory; + api::CDataFrameAnalyzer analyzer{specFactory.rows(9).outlierSpec(), outputWriterFactory}; for (auto i : {"1", "2", "3", "4", "5", "6", "7", "8", "9"}) { analyzer.handleRecord({"c1", "c2", "c3", "c4", "c5", ".", "."}, {i, i, i, i, i, i, ""}); diff --git a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc index e8056a9e82..43e378b0f0 100644 --- 
a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -369,6 +370,79 @@ void testOneRunOfBoostedTreeTrainingWithStateRecovery(F makeSpec, std::size_t it } } +BOOST_AUTO_TEST_CASE(testMissingString) { + + // Test that the special missing value string is respected. + + std::stringstream output; + auto outputWriterFactory = [&output]() { + return std::make_unique(output); + }; + + TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; + TStrVec fieldValues{"a", "2.0", "3.0", "4.0", "5.0", "0", ""}; + + // Test default value. + { + std::string a{"a"}; + std::string b{"b"}; + std::string missing{core::CDataFrame::DEFAULT_MISSING_STRING}; + + test::CDataFrameAnalysisSpecificationFactory specFactory; + api::CDataFrameAnalyzer analyzer{ + specFactory.rows(5).predictionCategoricalFieldNames({"f1"}).predictionSpec( + test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), + outputWriterFactory}; + + TBoolVec isMissing; + for (const auto& category : {a, missing, b, a, missing}) { + fieldValues[0] = category; + analyzer.handleRecord(fieldNames, fieldValues); + isMissing.push_back(category == missing); + } + analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + + analyzer.dataFrame().readRows(1, [&](TRowItr beginRows, TRowItr endRows) { + std::size_t i{0}; + for (auto row = beginRows; row != endRows; ++row, ++i) { + BOOST_REQUIRE_EQUAL(isMissing[row->index()], + maths::CDataFrameUtils::isMissing((*row)[0])); + } + }); + } + + // Test custom value. 
+ { + std::string a{"a"}; + std::string b{"b"}; + std::string missing{"foo"}; + + test::CDataFrameAnalysisSpecificationFactory specFactory; + api::CDataFrameAnalyzer analyzer{ + specFactory.rows(5) + .predictionCategoricalFieldNames({"f1"}) + .missingString("foo") + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), + outputWriterFactory}; + + TBoolVec isMissing; + for (const auto& category : {a, missing, b, a, missing}) { + fieldValues[0] = category; + analyzer.handleRecord(fieldNames, fieldValues); + isMissing.push_back(category == missing); + } + analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + + analyzer.dataFrame().readRows(1, [&](TRowItr beginRows, TRowItr endRows) { + std::size_t i{0}; + for (auto row = beginRows; row != endRows; ++row, ++i) { + BOOST_REQUIRE_EQUAL(isMissing[row->index()], + maths::CDataFrameUtils::isMissing((*row)[0])); + } + }); + } +} + BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTraining) { // Test the results the analyzer produces match running the regression directly. 
@@ -383,7 +457,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTraining) { TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::predictionSpec( + test::CDataFrameAnalysisSpecificationFactory{}.predictionSpec( test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), outputWriterFactory}; addPredictionTestData(E_Regression, fieldNames, fieldValues, analyzer, expectedPredictions); @@ -448,7 +522,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTrainingStateReport) { TStrVec fieldNames{"c1", "c2", "c3", "c4", "c5", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::predictionSpec("regression", "c5"), + test::CDataFrameAnalysisSpecificationFactory{}.predictionSpec("regression", "c5"), outputWriterFactory}; addPredictionTestData(E_Regression, fieldNames, fieldValues, analyzer, expectedPredictions); analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); @@ -477,11 +551,17 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTrainingWithParams) { return std::make_unique(output); }; + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::regression(), "target", - 100, 5, 4000000, 0, 0, {}, alpha, lambda, gamma, softTreeDepthLimit, - softTreeDepthTolerance, eta, maximumNumberTrees, featureBagFraction), + specFactory.predictionAlpha(alpha) + .predictionLambda(lambda) + .predictionGamma(gamma) + .predictionSoftTreeDepthLimit(softTreeDepthLimit) + .predictionSoftTreeDepthTolerance(softTreeDepthTolerance) + .predictionEta(eta) + .predictionMaximumNumberTrees(maximumNumberTrees) + .predictionFeatureBagFraction(featureBagFraction) + 
.predictionSpec(test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), outputWriterFactory}; TDoubleVec expectedPredictions; @@ -553,9 +633,10 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTrainingWithRowsMissingTargetVa auto target = [](double feature) { return 10.0 * feature; }; + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::regression(), "target", 50, 2, 4000000), + specFactory.rows(50).columns(2).memoryLimit(4000000).predictionSpec( + test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), outputWriterFactory}; TDoubleVec feature; @@ -636,13 +717,21 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTrainingWithStateRecovery) { auto makeSpec = [&](const std::string& dependentVariable, std::size_t numberExamples, TPersisterSupplier persisterSupplier) { - return test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::regression(), dependentVariable, - numberExamples, 5, 15000000, numberRoundsPerHyperparameter, 12, - {}, params.s_Alpha, params.s_Lambda, params.s_Gamma, - params.s_SoftTreeDepthLimit, params.s_SoftTreeDepthTolerance, - params.s_Eta, params.s_MaximumNumberTrees, params.s_FeatureBagFraction, - 0 /*numTopFeatureImportanceValues*/, &persisterSupplier); + test::CDataFrameAnalysisSpecificationFactory specFactory; + return specFactory.rows(numberExamples) + .memoryLimit(15000000) + .predicitionNumberRoundsPerHyperparameter(numberRoundsPerHyperparameter) + .predictionAlpha(params.s_Alpha) + .predictionLambda(params.s_Lambda) + .predictionGamma(params.s_Gamma) + .predictionSoftTreeDepthLimit(params.s_SoftTreeDepthLimit) + .predictionSoftTreeDepthTolerance(params.s_SoftTreeDepthTolerance) + .predictionEta(params.s_Eta) + .predictionMaximumNumberTrees(params.s_MaximumNumberTrees) + 
.predictionFeatureBagFraction(params.s_FeatureBagFraction) + .predictionPersisterSupplier(&persisterSupplier) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::regression(), + dependentVariable); }; finalIteration = params.numberUnset() * numberRoundsPerHyperparameter; @@ -670,10 +759,11 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeClassifierTraining) { TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::classification(), - "target", 100, 5, 6000000, 0, 0, {"target"}), + specFactory.memoryLimit(6000000) + .predictionCategoricalFieldNames({"target"}) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target"), outputWriterFactory}; addPredictionTestData(E_BinaryClassification, fieldNames, fieldValues, analyzer, expectedPredictions); @@ -756,10 +846,13 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeClassifierImbalanced) { TDoubleVec regressors; rng.generateUniformSamples(-5.0, 10.0, numberExamples * weights.size(), regressors); + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::classification(), - "target", numberExamples, 4, 18000000, 0, 0, {"target"}), + specFactory.rows(numberExamples) + .columns(4) + .memoryLimit(18000000) + .predictionCategoricalFieldNames({"target"}) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target"), outputWriterFactory}; TStrVec actuals; @@ -804,10 +897,12 @@ BOOST_AUTO_TEST_CASE(testCategoricalFields) { }; { + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ - 
test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::regression(), - "x5", 1000, 5, 27000000, 0, 0, {"x1", "x2"}), + specFactory.rows(1000) + .memoryLimit(27000000) + .predictionCategoricalFieldNames({"x1", "x2"}) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::regression(), "x5"), outputWriterFactory}; TStrVec x[]{{"x11", "x12", "x13", "x14", "x15"}, @@ -846,10 +941,12 @@ BOOST_AUTO_TEST_CASE(testCategoricalFields) { { std::size_t rows{core::CDataFrame::MAX_CATEGORICAL_CARDINALITY + 3}; + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::regression(), - "x5", rows, 5, 8000000000, 0, 0, {"x1"}), + specFactory.rows(rows) + .memoryLimit(8000000000) + .predictionCategoricalFieldNames({"x1"}) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::regression(), "x5"), outputWriterFactory}; TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."}; @@ -910,10 +1007,12 @@ BOOST_AUTO_TEST_CASE(testCategoricalFieldsEmptyAsMissing) { return std::make_unique(output); }; + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::classification(), - "x5", 1000, 5, 27000000, 0, 0, {"x1", "x2", "x5"}), + specFactory.rows(1000) + .memoryLimit(27000000) + .predictionCategoricalFieldNames({"x1", "x2", "x5"}) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "x5"), outputWriterFactory}; TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."}; diff --git a/lib/api/unittest/CDataFrameTrainBoostedTreeClassifierRunnerTest.cc b/lib/api/unittest/CDataFrameTrainBoostedTreeClassifierRunnerTest.cc index c68fc1cf15..598fd044d3 100644 --- 
a/lib/api/unittest/CDataFrameTrainBoostedTreeClassifierRunnerTest.cc +++ b/lib/api/unittest/CDataFrameTrainBoostedTreeClassifierRunnerTest.cc @@ -30,16 +30,20 @@ BOOST_AUTO_TEST_CASE(testPredictionFieldNameClash) { auto errorHandler = [&errors](std::string error) { errors.push_back(error); }; core::CLogger::CScopeSetFatalErrorHandler scope{errorHandler}; - const auto spec{test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::classification(), - "dep_var", 5, 6, 13000000, 0, 0, {"dep_var"})}; + test::CDataFrameAnalysisSpecificationFactory specFactory; + auto spec = specFactory.rows(5) + .columns(6) + .memoryLimit(13000000) + .predictionCategoricalFieldNames({"dep_var"}) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), + "dep_var"); rapidjson::Document jsonParameters; jsonParameters.Parse("{" " \"dependent_variable\": \"dep_var\"," " \"prediction_field_name\": \"is_training\"" "}"); - const auto parameters{ - api::CDataFrameTrainBoostedTreeClassifierRunner::parameterReader().read(jsonParameters)}; + auto parameters = + api::CDataFrameTrainBoostedTreeClassifierRunner::parameterReader().read(jsonParameters); api::CDataFrameTrainBoostedTreeClassifierRunner runner(*spec, parameters); BOOST_TEST_REQUIRE(errors.size() == 1); @@ -76,9 +80,13 @@ void testWriteOneRow(const std::string& dependentVariableField, BOOST_TEST_REQUIRE(frame->numberRows() == rows.size()); // Create classification analysis runner object - const auto spec{test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::classification(), dependentVariableField, - rows.size(), columnNames.size(), 13000000, 0, 0, categoricalColumns)}; + test::CDataFrameAnalysisSpecificationFactory specFactory; + auto spec = specFactory.rows(rows.size()) + .columns(columnNames.size()) + .memoryLimit(13000000) + .predictionCategoricalFieldNames(categoricalColumns) + 
.predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), + dependentVariableField); rapidjson::Document jsonParameters; if (predictionFieldType.empty()) { jsonParameters.Parse("{\"dependent_variable\": \"" + dependentVariableField + "\"}"); @@ -92,8 +100,8 @@ void testWriteOneRow(const std::string& dependentVariableField, "\"" "}"); } - const auto parameters{ - api::CDataFrameTrainBoostedTreeClassifierRunner::parameterReader().read(jsonParameters)}; + auto parameters = + api::CDataFrameTrainBoostedTreeClassifierRunner::parameterReader().read(jsonParameters); api::CDataFrameTrainBoostedTreeClassifierRunner runner{*spec, parameters}; // Write results to the output stream diff --git a/lib/api/unittest/CDataFrameTrainBoostedTreeRegressionRunnerTest.cc b/lib/api/unittest/CDataFrameTrainBoostedTreeRegressionRunnerTest.cc index 9e5701e09e..258d41b36c 100644 --- a/lib/api/unittest/CDataFrameTrainBoostedTreeRegressionRunnerTest.cc +++ b/lib/api/unittest/CDataFrameTrainBoostedTreeRegressionRunnerTest.cc @@ -28,16 +28,16 @@ BOOST_AUTO_TEST_CASE(testPredictionFieldNameClash) { auto errorHandler = [&errors](std::string error) { errors.push_back(error); }; core::CLogger::CScopeSetFatalErrorHandler scope{errorHandler}; - const auto spec{test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::regression(), "dep_var", - 5, 6, 13000000, 0, 0)}; + test::CDataFrameAnalysisSpecificationFactory specFactory; + auto spec = specFactory.rows(5).columns(6).memoryLimit(13000000).predictionSpec( + test::CDataFrameAnalysisSpecificationFactory::regression(), "dep_var"); rapidjson::Document jsonParameters; jsonParameters.Parse("{" " \"dependent_variable\": \"dep_var\"," " \"prediction_field_name\": \"is_training\"" "}"); - const auto parameters{ - api::CDataFrameTrainBoostedTreeRegressionRunner::parameterReader().read(jsonParameters)}; + auto parameters = + 
api::CDataFrameTrainBoostedTreeRegressionRunner::parameterReader().read(jsonParameters); api::CDataFrameTrainBoostedTreeRegressionRunner runner(*spec, parameters); BOOST_TEST_REQUIRE(errors.size() == 1); diff --git a/lib/core/CDataFrame.cc b/lib/core/CDataFrame.cc index 915d9f61fc..0e064d0d48 100644 --- a/lib/core/CDataFrame.cc +++ b/lib/core/CDataFrame.cc @@ -124,8 +124,9 @@ CDataFrame::CDataFrame(bool inMainMemory, const TWriteSliceToStoreFunc& writeSliceToStore) : m_InMainMemory{inMainMemory}, m_NumberColumns{numberColumns}, m_RowCapacity{numberColumns}, m_SliceCapacityInRows{sliceCapacityInRows}, - m_ReadAndWriteToStoreSyncStrategy{readAndWriteToStoreSyncStrategy}, m_WriteSliceToStore{writeSliceToStore}, - m_ColumnNames(numberColumns), m_CategoricalColumnValues(numberColumns), + m_ReadAndWriteToStoreSyncStrategy{readAndWriteToStoreSyncStrategy}, + m_WriteSliceToStore{writeSliceToStore}, m_ColumnNames(numberColumns), + m_CategoricalColumnValues(numberColumns), m_MissingString{DEFAULT_MISSING_STRING}, m_EmptyIsMissing(numberColumns, false), m_ColumnIsCategorical(numberColumns, false) { } @@ -216,7 +217,13 @@ void CDataFrame::parseAndWriteRow(const TStrCRng& columnValues, const std::strin auto stringToValue = [this](bool isCategorical, TStrSizeUMap& categoryLookup, TStrVec& categories, bool emptyIsMissing, const std::string& columnValue) { + if (columnValue == m_MissingString) { + ++m_MissingValueCount; + return core::CFloatStorage{valueOfMissing()}; + } + if (isCategorical) { + // TODO Remove when Java passes special missing value string. if (columnValue.empty() && emptyIsMissing) { return core::CFloatStorage{valueOfMissing()}; } @@ -251,6 +258,7 @@ void CDataFrame::parseAndWriteRow(const TStrCRng& columnValues, const std::strin double value; if (columnValue.empty()) { + // TODO Remove when Java passes special missing value string. 
++m_MissingValueCount; return core::CFloatStorage{valueOfMissing()}; } else if (core::CStringUtils::stringToTypeSilent(columnValue, value) == false) { @@ -300,6 +308,10 @@ void CDataFrame::columnNames(TStrVec columnNames) { } } +void CDataFrame::missingString(std::string missing) { + m_MissingString = std::move(missing); +} + void CDataFrame::emptyIsMissing(TBoolVec emptyIsMissing) { if (emptyIsMissing.size() != m_NumberColumns) { HANDLE_FATAL(<< "Internal error: expected '" << m_NumberColumns @@ -374,6 +386,7 @@ std::size_t CDataFrame::memoryUsage() const { std::size_t memory{CMemory::dynamicSize(m_ColumnNames)}; memory += CMemory::dynamicSize(m_CategoricalColumnValues); memory += CMemory::dynamicSize(m_CategoricalColumnValueLookup); + memory += CMemory::dynamicSize(m_MissingString); memory += CMemory::dynamicSize(m_EmptyIsMissing); memory += CMemory::dynamicSize(m_ColumnIsCategorical); memory += CMemory::dynamicSize(m_Slices); @@ -630,6 +643,7 @@ bool CDataFrame::maskedRowsInSlice(ITR& maskedRow, const std::size_t CDataFrame::MAX_CATEGORICAL_CARDINALITY{ 1 << (std::numeric_limits::digits)}; +const std::string CDataFrame::DEFAULT_MISSING_STRING{"\0", 1}; CDataFrame::CDataFrameRowSliceWriter::CDataFrameRowSliceWriter( std::size_t numberRows, diff --git a/lib/core/CStringUtils.cc b/lib/core/CStringUtils.cc index b0401e2382..74a4b4147f 100644 --- a/lib/core/CStringUtils.cc +++ b/lib/core/CStringUtils.cc @@ -452,9 +452,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, unsigned l if (ret == 0 && errno == EINVAL) { if (!silent) { LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to unsigned long long: " - << ::strerror(errno)); + << "' to unsigned long long: " << ::strerror(errno)); } return false; } @@ -463,9 +461,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, unsigned l { if (!silent) { LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to unsigned long long: " - << ::strerror(errno)); + << "' 
to unsigned long long: " << ::strerror(errno)); } return false; } @@ -473,9 +469,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, unsigned l if (endPtr != nullptr && *endPtr != '\0') { if (!silent) { LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to unsigned long long: first invalid character " - << endPtr); + << "' to unsigned long long: first invalid character " << endPtr); } return false; } @@ -500,9 +494,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, unsigned l if (ret == 0 && errno == EINVAL) { if (!silent) { LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to unsigned long: " - << ::strerror(errno)); + << "' to unsigned long: " << ::strerror(errno)); } return false; } @@ -511,9 +503,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, unsigned l { if (!silent) { LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to unsigned long: " - << ::strerror(errno)); + << "' to unsigned long: " << ::strerror(errno)); } return false; } @@ -521,9 +511,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, unsigned l if (endPtr != nullptr && *endPtr != '\0') { if (!silent) { LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to unsigned long: first invalid character " - << endPtr); + << "' to unsigned long: first invalid character " << endPtr); } return false; } @@ -544,9 +532,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, unsigned i // Now check if the result is in range for unsigned int if (ret > std::numeric_limits::max()) { if (!silent) { - LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to unsigned int - out of range"); + LOG_ERROR(<< "Unable to convert string '" << str << "' to unsigned int - out of range"); } return false; } @@ -568,8 +554,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, unsigned s if (ret > std::numeric_limits::max()) { if (!silent) { 
LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to unsigned short - out of range"); + << "' to unsigned short - out of range"); } return false; } @@ -594,9 +579,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, long long& if (ret == 0 && errno == EINVAL) { if (!silent) { LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to long long: " - << ::strerror(errno)); + << "' to long long: " << ::strerror(errno)); } return false; } @@ -605,9 +588,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, long long& { if (!silent) { LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to long long: " - << ::strerror(errno)); + << "' to long long: " << ::strerror(errno)); } return false; } @@ -615,9 +596,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, long long& if (endPtr != nullptr && *endPtr != '\0') { if (!silent) { LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to long long: first invalid character " - << endPtr); + << "' to long long: first invalid character " << endPtr); } return false; } @@ -642,9 +621,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, long& i) { if (ret == 0 && errno == EINVAL) { if (!silent) { LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to long: " - << ::strerror(errno)); + << "' to long: " << ::strerror(errno)); } return false; } @@ -653,9 +630,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, long& i) { { if (!silent) { LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to long: " - << ::strerror(errno)); + << "' to long: " << ::strerror(errno)); } return false; } @@ -663,9 +638,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, long& i) { if (endPtr != nullptr && *endPtr != '\0') { if (!silent) { LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to long: first invalid character " - << endPtr); + << "' to long: first 
invalid character " << endPtr); } return false; } @@ -708,9 +681,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, short& i) if (ret < std::numeric_limits::min() || ret > std::numeric_limits::max()) { if (!silent) { - LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to short - out of range"); + LOG_ERROR(<< "Unable to convert string '" << str << "' to short - out of range"); } return false; } @@ -805,9 +776,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, double& d) if (ret == 0 && errno == EINVAL) { if (!silent) { LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to double: " - << ::strerror(errno)); + << "' to double: " << ::strerror(errno)); } return false; } @@ -815,9 +784,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, double& d) if ((ret == HUGE_VAL || ret == -HUGE_VAL) && errno == ERANGE) { if (!silent) { LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to double: " - << ::strerror(errno)); + << "' to double: " << ::strerror(errno)); } return false; } @@ -825,9 +792,7 @@ bool CStringUtils::_stringToType(bool silent, const std::string& str, double& d) if (endPtr != nullptr && *endPtr != '\0') { if (!silent) { LOG_ERROR(<< "Unable to convert string '" << str - << "'" - " to double: first invalid character " - << endPtr); + << "' to double: first invalid character " << endPtr); } return false; } diff --git a/lib/maths/CDataFrameUtils.cc b/lib/maths/CDataFrameUtils.cc index 3409d3e1b4..518bedbc28 100644 --- a/lib/maths/CDataFrameUtils.cc +++ b/lib/maths/CDataFrameUtils.cc @@ -563,9 +563,11 @@ CDataFrameUtils::categoryFrequencies(std::size_t numberThreads, [&](TDoubleVecVec& counts, TRowItr beginRows, TRowItr endRows) { for (auto row = beginRows; row != endRows; ++row) { for (std::size_t i : columnMask) { - std::size_t category{static_cast((*row)[i])}; - counts[i].resize(std::max(counts[i].size(), category + 1), 0.0); - counts[i][category] += 1.0; + 
if (isMissing((*row)[i]) == false) { + std::size_t category{static_cast((*row)[i])}; + counts[i].resize(std::max(counts[i].size(), category + 1), 0.0); + counts[i][category] += 1.0; + } } } }, @@ -588,12 +590,12 @@ CDataFrameUtils::categoryFrequencies(std::size_t numberThreads, readCategoryCounts, &rowMask), copyCategoryCounts, reduceCategoryCounts, result) == false) { HANDLE_FATAL(<< "Internal error: failed to calculate category" - << " frequencies. Please report this problem."); + << " frequencies. Please report this problem.") return result; } } catch (const std::exception& e) { HANDLE_FATAL(<< "Internal error: '" << e.what() << "' exception calculating" - << " category frequencies. Please report this problem."); + << " category frequencies. Please report this problem.") } double Z{rowMask.manhattan()}; @@ -628,7 +630,7 @@ CDataFrameUtils::meanValueOfTargetForCategories(const CColumnValue& target, [&](TMeanAccumulatorVecVec& means_, TRowItr beginRows, TRowItr endRows) { for (auto row = beginRows; row != endRows; ++row) { for (std::size_t i : columnMask) { - if (isMissing(target(*row)) == false) { + if (isMissing((*row)[i]) == false && isMissing(target(*row)) == false) { std::size_t category{static_cast((*row)[i])}; means_[i].resize(std::max(means_[i].size(), category + 1)); means_[i][category].add(target(*row)); @@ -654,12 +656,12 @@ CDataFrameUtils::meanValueOfTargetForCategories(const CColumnValue& target, if (doReduce(frame.readRows(numberThreads, 0, frame.numberRows(), readColumnMeans, &rowMask), copyColumnMeans, reduceColumnMeans, means) == false) { HANDLE_FATAL(<< "Internal error: failed to calculate mean target values" - << " for categories. Please report this problem."); + << " for categories. Please report this problem.") return result; } } catch (const std::exception& e) { HANDLE_FATAL(<< "Internal error: '" << e.what() << "' exception calculating" - << " mean target values for categories. 
Please report this problem."); + << " mean target values for categories. Please report this problem.") return result; } for (std::size_t i = 0; i < result.size(); ++i) { @@ -760,7 +762,7 @@ CDataFrameUtils::maximumMinimumRecallDecisionThreshold(std::size_t numberThreads TQuantileSketchVec classProbabilityClassOneQuantiles; if (doReduce(frame.readRows(numberThreads, 0, frame.numberRows(), readQuantiles, &rowMask), copyQuantiles, reduceQuantiles, classProbabilityClassOneQuantiles) == false) { - HANDLE_FATAL(<< "Failed to compute category quantiles"); + HANDLE_FATAL(<< "Failed to compute category quantiles") return 0.5; } diff --git a/lib/test/CDataFrameAnalysisSpecificationFactory.cc b/lib/test/CDataFrameAnalysisSpecificationFactory.cc index f548cf6bb2..43ccbd0fea 100644 --- a/lib/test/CDataFrameAnalysisSpecificationFactory.cc +++ b/lib/test/CDataFrameAnalysisSpecificationFactory.cc @@ -6,6 +6,7 @@ #include +#include #include #include @@ -23,6 +24,10 @@ namespace ml { namespace test { using TRapidJsonLineWriter = core::CRapidJsonLineWriter; +CDataFrameAnalysisSpecificationFactory::CDataFrameAnalysisSpecificationFactory() + : m_MissingString{core::CDataFrame::DEFAULT_MISSING_STRING} { +} + const std::string& CDataFrameAnalysisSpecificationFactory::classification() { return api::CDataFrameTrainBoostedTreeClassifierRunnerFactory::NAME; } @@ -31,31 +36,162 @@ const std::string& CDataFrameAnalysisSpecificationFactory::regression() { return api::CDataFrameTrainBoostedTreeRegressionRunnerFactory::NAME; } +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::rows(std::size_t rows) { + m_Rows = rows; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::columns(std::size_t columns) { + m_Columns = columns; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::memoryLimit(std::size_t memoryLimit) { + m_MemoryLimit = memoryLimit; + return *this; +} + 
+CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::missingString(const std::string& missing) { + m_MissingString = missing; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::diskUsageAllowed(bool disk) { + m_DiskUsageAllowed = disk; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::outlierMethod(std::string method) { + m_Method = method; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::outlierNumberNeighbours(std::size_t number) { + m_NumberNeighbours = number; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::outlierComputeInfluence(bool compute) { + m_ComputeFeatureInfluence = compute; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::predicitionNumberRoundsPerHyperparameter(std::size_t rounds) { + m_NumberRoundsPerHyperparameter = rounds; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::predictionBayesianOptimisationRestarts(std::size_t restarts) { + m_BayesianOptimisationRestarts = restarts; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::predictionCategoricalFieldNames(const TStrVec& categorical) { + m_CategoricalFieldNames = categorical; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::predictionAlpha(double alpha) { + m_Alpha = alpha; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::predictionLambda(double lambda) { + m_Lambda = lambda; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::predictionGamma(double gamma) { + m_Gamma = gamma; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& 
+CDataFrameAnalysisSpecificationFactory::predictionSoftTreeDepthLimit(double limit) { + m_SoftTreeDepthLimit = limit; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::predictionSoftTreeDepthTolerance(double tolerance) { + m_SoftTreeDepthTolerance = tolerance; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::predictionEta(double eta) { + m_Eta = eta; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::predictionMaximumNumberTrees(std::size_t number) { + m_MaximumNumberTrees = number; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::predictionFeatureBagFraction(double fraction) { + m_FeatureBagFraction = fraction; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::predictionNumberTopShapValues(std::size_t number) { + m_NumberTopShapValues = number; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::predictionPersisterSupplier(TPersisterSupplier* persisterSupplier) { + m_PersisterSupplier = persisterSupplier; + return *this; +} + +CDataFrameAnalysisSpecificationFactory& +CDataFrameAnalysisSpecificationFactory::predictionRestoreSearcherSupplier( + TRestoreSearcherSupplier* restoreSearcherSupplier) { + m_RestoreSearcherSupplier = restoreSearcherSupplier; + return *this; +} + CDataFrameAnalysisSpecificationFactory::TSpecificationUPtr -CDataFrameAnalysisSpecificationFactory::outlierSpec(std::size_t rows, - std::size_t cols, - std::size_t memoryLimit, - const std::string& method, - std::size_t numberNeighbours, - bool computeFeatureInfluence, - bool diskUsageAllowed) { +CDataFrameAnalysisSpecificationFactory::outlierSpec() const { + + std::size_t rows{m_Rows ? *m_Rows : 110}; + std::size_t columns{m_Columns ? 
*m_Columns : 5}; + std::size_t memoryLimit{m_MemoryLimit ? *m_MemoryLimit : 100000}; rapidjson::StringBuffer parameters; TRapidJsonLineWriter writer; writer.Reset(parameters); writer.StartObject(); - if (method != "") { + if (m_Method != "") { writer.Key(api::CDataFrameOutliersRunner::METHOD); - writer.String(method); + writer.String(m_Method); } - if (numberNeighbours > 0) { + if (m_NumberNeighbours > 0) { writer.Key(api::CDataFrameOutliersRunner::N_NEIGHBORS); - writer.Uint64(numberNeighbours); + writer.Uint64(m_NumberNeighbours); } - if (computeFeatureInfluence == false) { + if (m_ComputeFeatureInfluence == false) { writer.Key(api::CDataFrameOutliersRunner::COMPUTE_FEATURE_INFLUENCE); - writer.Bool(computeFeatureInfluence); + writer.Bool(m_ComputeFeatureInfluence); } else { writer.Key(api::CDataFrameOutliersRunner::FEATURE_INFLUENCE_THRESHOLD); writer.Double(0.0); @@ -64,8 +200,9 @@ CDataFrameAnalysisSpecificationFactory::outlierSpec(std::size_t rows, writer.Flush(); std::string spec{api::CDataFrameAnalysisSpecificationJsonWriter::jsonString( - "testJob", rows, cols, memoryLimit, 1, {}, diskUsageAllowed, CTestTmpDir::tmpDir(), - "ml", api::CDataFrameOutliersRunnerFactory::NAME, parameters.GetString())}; + "testJob", rows, columns, memoryLimit, 1, m_MissingString, {}, + m_DiskUsageAllowed, CTestTmpDir::tmpDir(), "ml", + api::CDataFrameOutliersRunnerFactory::NAME, parameters.GetString())}; LOG_TRACE(<< "spec =\n" << spec); @@ -73,26 +210,12 @@ CDataFrameAnalysisSpecificationFactory::outlierSpec(std::size_t rows, } CDataFrameAnalysisSpecificationFactory::TSpecificationUPtr -CDataFrameAnalysisSpecificationFactory::predictionSpec( - const std::string& analysis, - const std::string& dependentVariable, - std::size_t rows, - std::size_t cols, - std::size_t memoryLimit, - std::size_t numberRoundsPerHyperparameter, - std::size_t bayesianOptimisationRestarts, - const TStrVec& categoricalFieldNames, - double alpha, - double lambda, - double gamma, - double 
softTreeDepthLimit, - double softTreeDepthTolerance, - double eta, - std::size_t maximumNumberTrees, - double featureBagFraction, - size_t numTopFeatureImportanceValues, - TPersisterSupplier* persisterSupplier, - TRestoreSearcherSupplier* restoreSearcherSupplier) { +CDataFrameAnalysisSpecificationFactory::predictionSpec(const std::string& analysis, + const std::string& dependentVariable) const { + + std::size_t rows{m_Rows ? *m_Rows : 100}; + std::size_t columns{m_Columns ? *m_Columns : 5}; + std::size_t memoryLimit{m_MemoryLimit ? *m_MemoryLimit : 7000000}; rapidjson::StringBuffer parameters; TRapidJsonLineWriter writer; @@ -101,49 +224,49 @@ CDataFrameAnalysisSpecificationFactory::predictionSpec( writer.StartObject(); writer.Key(api::CDataFrameTrainBoostedTreeRunner::DEPENDENT_VARIABLE_NAME); writer.String(dependentVariable); - if (alpha >= 0.0) { + if (m_Alpha >= 0.0) { writer.Key(api::CDataFrameTrainBoostedTreeRunner::ALPHA); - writer.Double(alpha); + writer.Double(m_Alpha); } - if (lambda >= 0.0) { + if (m_Lambda >= 0.0) { writer.Key(api::CDataFrameTrainBoostedTreeRunner::LAMBDA); - writer.Double(lambda); + writer.Double(m_Lambda); } - if (gamma >= 0.0) { + if (m_Gamma >= 0.0) { writer.Key(api::CDataFrameTrainBoostedTreeRunner::GAMMA); - writer.Double(gamma); + writer.Double(m_Gamma); } - if (softTreeDepthLimit >= 0.0) { + if (m_SoftTreeDepthLimit >= 0.0) { writer.Key(api::CDataFrameTrainBoostedTreeRunner::SOFT_TREE_DEPTH_LIMIT); - writer.Double(softTreeDepthLimit); + writer.Double(m_SoftTreeDepthLimit); } - if (softTreeDepthTolerance >= 0.0) { + if (m_SoftTreeDepthTolerance >= 0.0) { writer.Key(api::CDataFrameTrainBoostedTreeRunner::SOFT_TREE_DEPTH_TOLERANCE); - writer.Double(softTreeDepthTolerance); + writer.Double(m_SoftTreeDepthTolerance); } - if (eta > 0.0) { + if (m_Eta > 0.0) { writer.Key(api::CDataFrameTrainBoostedTreeRunner::ETA); - writer.Double(eta); + writer.Double(m_Eta); } - if (maximumNumberTrees > 0) { + if (m_MaximumNumberTrees > 0) { 
writer.Key(api::CDataFrameTrainBoostedTreeRunner::MAXIMUM_NUMBER_TREES); - writer.Uint64(maximumNumberTrees); + writer.Uint64(m_MaximumNumberTrees); } - if (featureBagFraction > 0.0) { + if (m_FeatureBagFraction > 0.0) { writer.Key(api::CDataFrameTrainBoostedTreeRunner::FEATURE_BAG_FRACTION); - writer.Double(featureBagFraction); + writer.Double(m_FeatureBagFraction); } - if (numberRoundsPerHyperparameter > 0) { + if (m_NumberRoundsPerHyperparameter > 0) { writer.Key(api::CDataFrameTrainBoostedTreeRunner::NUMBER_ROUNDS_PER_HYPERPARAMETER); - writer.Uint64(numberRoundsPerHyperparameter); + writer.Uint64(m_NumberRoundsPerHyperparameter); } - if (bayesianOptimisationRestarts > 0) { + if (m_BayesianOptimisationRestarts > 0) { writer.Key(api::CDataFrameTrainBoostedTreeRunner::BAYESIAN_OPTIMISATION_RESTARTS); - writer.Uint64(bayesianOptimisationRestarts); + writer.Uint64(m_BayesianOptimisationRestarts); } - if (numTopFeatureImportanceValues > 0) { + if (m_NumberTopShapValues > 0) { writer.Key(api::CDataFrameTrainBoostedTreeRunner::NUM_TOP_FEATURE_IMPORTANCE_VALUES); - writer.Uint64(numTopFeatureImportanceValues); + writer.Uint64(m_NumberTopShapValues); } if (analysis == classification()) { writer.Key(api::CDataFrameTrainBoostedTreeClassifierRunner::NUM_TOP_CLASSES); @@ -152,16 +275,16 @@ CDataFrameAnalysisSpecificationFactory::predictionSpec( writer.EndObject(); std::string spec{api::CDataFrameAnalysisSpecificationJsonWriter::jsonString( - "testJob", rows, cols, memoryLimit, 1, categoricalFieldNames, true, - CTestTmpDir::tmpDir(), "ml", analysis, parameters.GetString())}; + "testJob", rows, columns, memoryLimit, 1, m_MissingString, m_CategoricalFieldNames, + true, CTestTmpDir::tmpDir(), "ml", analysis, parameters.GetString())}; LOG_TRACE(<< "spec =\n" << spec); - if (restoreSearcherSupplier != nullptr && persisterSupplier != nullptr) { + if (m_RestoreSearcherSupplier != nullptr && m_PersisterSupplier != nullptr) { return std::make_unique( - spec, *persisterSupplier, 
*restoreSearcherSupplier); - } else if (restoreSearcherSupplier == nullptr && persisterSupplier != nullptr) { - return std::make_unique(spec, *persisterSupplier); + spec, *m_PersisterSupplier, *m_RestoreSearcherSupplier); + } else if (m_RestoreSearcherSupplier == nullptr && m_PersisterSupplier != nullptr) { + return std::make_unique(spec, *m_PersisterSupplier); } else { return std::make_unique(spec); }