diff --git a/include/api/CDataFrameAnalysisRunner.h b/include/api/CDataFrameAnalysisRunner.h index ed89a2bc68..63d1a07337 100644 --- a/include/api/CDataFrameAnalysisRunner.h +++ b/include/api/CDataFrameAnalysisRunner.h @@ -103,9 +103,6 @@ class API_EXPORT CDataFrameAnalysisRunner { //! \return The number of columns this analysis appends. virtual std::size_t numberExtraColumns() const = 0; - //! \return Indicator of columns for which empty value should be treated as missing. - virtual TBoolVec columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const; - //! Write the extra columns of \p row added by the analysis to \p writer. //! //! This should create a new object of the form: diff --git a/include/api/CDataFrameAnalysisSpecification.h b/include/api/CDataFrameAnalysisSpecification.h index 266e9136ce..339478a550 100644 --- a/include/api/CDataFrameAnalysisSpecification.h +++ b/include/api/CDataFrameAnalysisSpecification.h @@ -172,22 +172,22 @@ class API_EXPORT CDataFrameAnalysisSpecification { //! \note The commit of the results of the analysis is atomic per partition. //! \warning This assumes that there is no access to the data frame in the //! calling thread until the runner has finished. - CDataFrameAnalysisRunner* run(core::CDataFrame& frame) const; + CDataFrameAnalysisRunner* + run(core::CDataFrame& frame, + core::CRapidJsonConcurrentLineWriter* outputWriter = nullptr) const; //! Estimates memory usage in two cases: //! 1. disk is not used (the whole data frame fits in main memory) //! 2. disk is used (only one partition needs to be loaded to main memory) void estimateMemoryUsage(CMemoryUsageEstimationResultJsonWriter& writer) const; - //! \return Indicator of columns for which empty value should be treated as missing. - TBoolVec columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const; - - //! \return shared pointer to the persistence stream. + //! \return The stream to which to persist state if there is one. TDataAdderUPtr persister() const; + //! \return The stream from which to retore state if there is one. TDataSearcherUPtr restoreSearcher() const; - //! Get pointer to the analysis runner. + //! \return The analysis runner. CDataFrameAnalysisRunner* runner(); private: diff --git a/include/api/CDataFrameTrainBoostedTreeClassifierRunner.h b/include/api/CDataFrameTrainBoostedTreeClassifierRunner.h index 1568c49fc3..6bc7ea8981 100644 --- a/include/api/CDataFrameTrainBoostedTreeClassifierRunner.h +++ b/include/api/CDataFrameTrainBoostedTreeClassifierRunner.h @@ -40,9 +40,6 @@ class API_EXPORT CDataFrameTrainBoostedTreeClassifierRunner final CDataFrameTrainBoostedTreeClassifierRunner(const CDataFrameAnalysisSpecification& spec, const CDataFrameAnalysisParameters& parameters); - //! \return Indicator of columns for which empty value should be treated as missing. - TBoolVec columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const override; - //! Write the prediction for \p row to \p writer. void writeOneRow(const core::CDataFrame& frame, const TRowRef& row, diff --git a/include/core/CDataFrame.h b/include/core/CDataFrame.h index d64c5c7bb1..44f54aafac 100644 --- a/include/core/CDataFrame.h +++ b/include/core/CDataFrame.h @@ -450,9 +450,6 @@ class CORE_EXPORT CDataFrame final { //! Write the string which indicates that a value is missing. void missingString(std::string missing); - //! Write for which columns an empty string implies the value is missing. - void emptyIsMissing(TBoolVec emptyIsMissing); - //! Write which columns contain categorical data. void categoricalColumns(TStrVec categoricalColumnNames); @@ -589,11 +586,6 @@ class CORE_EXPORT CDataFrame final { //! The string which indicates that a category is missing. std::string m_MissingString; - //! Indicator vector for treating empty strings as missing values. - // TODO Remove once Java passes the correct value for the missing target - // for classification. - TBoolVec m_EmptyIsMissing; - //! Indicator vector of the columns which contain categorical values. TBoolVec m_ColumnIsCategorical; diff --git a/lib/api/CDataFrameAnalysisRunner.cc b/lib/api/CDataFrameAnalysisRunner.cc index 893742d422..b81ff58e7f 100644 --- a/lib/api/CDataFrameAnalysisRunner.cc +++ b/lib/api/CDataFrameAnalysisRunner.cc @@ -42,10 +42,6 @@ CDataFrameAnalysisRunner::~CDataFrameAnalysisRunner() { this->waitToFinish(); } -TBoolVec CDataFrameAnalysisRunner::columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const { - return TBoolVec(fieldNames.size(), false); -} - void CDataFrameAnalysisRunner::estimateMemoryUsage(CMemoryUsageEstimationResultJsonWriter& writer) const { std::size_t numberRows{m_Spec.numberRows()}; std::size_t numberColumns{m_Spec.numberColumns()}; diff --git a/lib/api/CDataFrameAnalysisSpecification.cc b/lib/api/CDataFrameAnalysisSpecification.cc index 1a1203fa82..725ba40a39 100644 --- a/lib/api/CDataFrameAnalysisSpecification.cc +++ b/lib/api/CDataFrameAnalysisSpecification.cc @@ -118,14 +118,14 @@ CDataFrameAnalysisSpecification::CDataFrameAnalysisSpecification( rapidjson::Document specification; if (specification.Parse(jsonSpecification.c_str()) == false) { HANDLE_FATAL(<< "Input error: failed to parse analysis specification '" - << jsonSpecification << "'. Please report this problem."); + << jsonSpecification << "'. Please report this problem.") } else { auto parameters = CONFIG_READER.read(specification); for (auto name : {ROWS, COLS, MEMORY_LIMIT, THREADS}) { if (parameters[name].as() == 0) { - HANDLE_FATAL(<< "Input error: '" << name << "' must be non-zero"); + HANDLE_FATAL(<< "Input error: '" << name << "' must be non-zero") } } m_NumberRows = parameters[ROWS].as(); @@ -207,8 +207,11 @@ CDataFrameAnalysisSpecification::makeDataFrame() { return result; } -CDataFrameAnalysisRunner* CDataFrameAnalysisSpecification::run(core::CDataFrame& frame) const { +CDataFrameAnalysisRunner* +CDataFrameAnalysisSpecification::run(core::CDataFrame& frame, + core::CRapidJsonConcurrentLineWriter* writer) const { if (m_Runner != nullptr) { + m_Runner->instrumentation().writer(writer); m_Runner->run(frame); return m_Runner.get(); } @@ -218,20 +221,12 @@ CDataFrameAnalysisRunner* CDataFrameAnalysisSpecification::run(core::CDataFrame& void CDataFrameAnalysisSpecification::estimateMemoryUsage(CMemoryUsageEstimationResultJsonWriter& writer) const { if (m_Runner == nullptr) { HANDLE_FATAL(<< "Internal error: no runner available so can't estimate memory." - << " Please report this problem."); + << " Please report this problem.") return; } m_Runner->estimateMemoryUsage(writer); } -TBoolVec CDataFrameAnalysisSpecification::columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const { - if (m_Runner == nullptr) { - HANDLE_FATAL(<< "Internal error: no runner available. Please report this problem."); - return TBoolVec(fieldNames.size(), false); - } - return m_Runner->columnsForWhichEmptyIsMissing(fieldNames); -} - void CDataFrameAnalysisSpecification::initializeRunner(const rapidjson::Value& jsonAnalysis) { // We pass of the interpretation of the parameters object to the appropriate // analysis runner. @@ -251,7 +246,7 @@ void CDataFrameAnalysisSpecification::initializeRunner(const rapidjson::Value& j } HANDLE_FATAL(<< "Input error: unexpected analysis name '" << m_AnalysisName - << "'. Please report this problem."); + << "'. Please report this problem.") } CDataFrameAnalysisSpecification::TDataAdderUPtr diff --git a/lib/api/CDataFrameAnalyzer.cc b/lib/api/CDataFrameAnalyzer.cc index 921c5b0853..fc90c8b34c 100644 --- a/lib/api/CDataFrameAnalyzer.cc +++ b/lib/api/CDataFrameAnalyzer.cc @@ -132,15 +132,13 @@ void CDataFrameAnalyzer::run() { auto outStream = m_ResultsStreamSupplier(); core::CRapidJsonConcurrentLineWriter outputWriter{*outStream}; - CDataFrameAnalysisRunner* analysis{m_AnalysisSpecification->runner()}; - if (analysis == nullptr) { - return; + auto analysisRunner = m_AnalysisSpecification->run(*m_DataFrame, &outputWriter); + + if (analysisRunner != nullptr) { + this->monitorProgress(*analysisRunner, outputWriter); + analysisRunner->waitToFinish(); + this->writeResultsOf(*analysisRunner, outputWriter); } - analysis->instrumentation().writer(&outputWriter); - m_AnalysisSpecification->run(*m_DataFrame); - this->monitorProgress(*analysis, outputWriter); - analysis->waitToFinish(); - this->writeResultsOf(*analysis, outputWriter); } const CDataFrameAnalyzer::TTemporaryDirectoryPtr& CDataFrameAnalyzer::dataFrameDirectory() const { @@ -248,8 +246,6 @@ void CDataFrameAnalyzer::captureFieldNames(const TStrVec& fieldNames) { TStrVec columnNames{fieldNames.begin() + m_BeginDataFieldValues, fieldNames.begin() + m_EndDataFieldValues}; m_DataFrame->columnNames(columnNames); - m_DataFrame->emptyIsMissing( - m_AnalysisSpecification->columnsForWhichEmptyIsMissing(columnNames)); m_DataFrame->categoricalColumns(m_AnalysisSpecification->categoricalFieldNames()); m_CapturedFieldNames = true; } diff --git a/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc b/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc index 3f76fd5ac7..59892c311e 100644 --- a/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc @@ -96,19 +96,6 @@ CDataFrameTrainBoostedTreeClassifierRunner::CDataFrameTrainBoostedTreeClassifier } } -TBoolVec CDataFrameTrainBoostedTreeClassifierRunner::columnsForWhichEmptyIsMissing( - const TStrVec& fieldNames) const { - // The only field for which empty value should be treated as missing is dependent - // variable which has empty value for non-training rows. - TBoolVec emptyAsMissing(fieldNames.size(), false); - auto pos = std::find(fieldNames.begin(), fieldNames.end(), - this->dependentVariableFieldName()); - if (pos != fieldNames.end()) { - emptyAsMissing[pos - fieldNames.begin()] = true; - } - return emptyAsMissing; -} - void CDataFrameTrainBoostedTreeClassifierRunner::writeOneRow( const core::CDataFrame& frame, const TRowRef& row, diff --git a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc index a3e57fafd7..1e221a1d58 100644 --- a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc @@ -204,31 +204,4 @@ BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor1000Rows) { testEstimateMemoryUsage(1000, "403kB", "142kB", 0); } -void testColumnsForWhichEmptyIsMissing(const std::string& analysis, - const std::string& dependentVariableName, - const TStrVec& fieldNames, - const TStrVec& categoricalFields, - const TBoolVec& expectedEmptyIsMissing) { - std::string parameters{"{\"dependent_variable\": \"" + dependentVariableName + "\"}"}; - std::string jsonSpec{api::CDataFrameAnalysisSpecificationJsonWriter::jsonString( - "testJob", 10000, 5, 100000000, 1, "", categoricalFields, true, - test::CTestTmpDir::tmpDir(), "", analysis, parameters)}; - api::CDataFrameAnalysisSpecification spec{jsonSpec}; - auto emptyIsMissing = spec.columnsForWhichEmptyIsMissing(fieldNames); - BOOST_REQUIRE_EQUAL(core::CContainerPrinter::print(expectedEmptyIsMissing), - core::CContainerPrinter::print(emptyIsMissing)); -} - -BOOST_AUTO_TEST_CASE(testColumnsForWhichEmptyIsMissingClassification) { - testColumnsForWhichEmptyIsMissing("classification", "class", - {"feature_1", "feature_2", "feature_3", "class"}, - {"class"}, {false, false, false, true}); -} - -BOOST_AUTO_TEST_CASE(testColumnsForWhichEmptyIsMissingRegression) { - testColumnsForWhichEmptyIsMissing("regression", "value", - {"feature_1", "feature_2", "feature_3", "value"}, - {}, {false, false, false, false}); -} - BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc b/lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc index bfca409ac2..ca5d2d18ac 100644 --- a/lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc @@ -405,7 +405,7 @@ BOOST_AUTO_TEST_CASE(testRunAnalysis) { BOOST_TEST_REQUIRE(runner->instrumentation().progress() <= 1.0); } - LOG_DEBUG(<< "final progress = " << lastProgress); + LOG_TRACE(<< "final progress = " << lastProgress); BOOST_REQUIRE_EQUAL(1.0, runner->instrumentation().progress()); } } diff --git a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc index 1af987a3da..72d0e28be2 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc @@ -653,7 +653,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTrainingWithRowsMissingTargetVa } for (std::size_t i = 40; i < 50; ++i) { fieldValues[0] = std::to_string(feature[i]); - fieldValues[1] = ""; + fieldValues[1] = core::CDataFrame::DEFAULT_MISSING_STRING; fieldValues[2] = std::to_string(i); analyzer.handleRecord(fieldNames, fieldValues); } @@ -985,10 +985,8 @@ BOOST_AUTO_TEST_CASE(testCategoricalFieldsEmptyAsMissing) { return [expected](double actual) { return expected == actual; }; }; - auto missing = []() { - return [](double actual) { - return maths::CDataFrameUtils::isMissing(actual); - }; + auto missing = [](double actual) { + return maths::CDataFrameUtils::isMissing(actual); }; auto assertRow = [&](const std::size_t row_i, @@ -1007,11 +1005,14 @@ BOOST_AUTO_TEST_CASE(testCategoricalFieldsEmptyAsMissing) { return std::make_unique(output); }; + std::string missingString{"foo"}; + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ specFactory.rows(1000) .memoryLimit(27000000) .predictionCategoricalFieldNames({"x1", "x2", "x5"}) + .missingString(missingString) .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "x5"), outputWriterFactory}; @@ -1019,11 +1020,11 @@ BOOST_AUTO_TEST_CASE(testCategoricalFieldsEmptyAsMissing) { analyzer.handleRecord(fieldNames, {"x11", "x21", "0", "0", "x51", "0", ""}); analyzer.handleRecord(fieldNames, {"x12", "x22", "1", "1", "x52", "1", ""}); analyzer.handleRecord(fieldNames, {"", "x23", "2", "2", "x51", "2", ""}); - analyzer.handleRecord(fieldNames, {"x14", "x24", "3", "3", "", "3", ""}); + analyzer.handleRecord(fieldNames, {"x14", "x24", "3", "3", missingString, "3", ""}); analyzer.handleRecord(fieldNames, {"x15", "x25", "4", "4", "x51", "4", ""}); analyzer.handleRecord(fieldNames, {"x11", "x26", "5", "5", "x52", "5", ""}); - analyzer.handleRecord(fieldNames, {"x12", "", "6", "6", "", "6", ""}); - analyzer.handleRecord(fieldNames, {"x13", "x21", "7", "7", "", "7", ""}); + analyzer.handleRecord(fieldNames, {"x12", "", "6", "6", missingString, "6", ""}); + analyzer.handleRecord(fieldNames, {"x13", "x21", "7", "7", missingString, "7", ""}); analyzer.handleRecord(fieldNames, {"x14", "x22", "8", "8", "x51", "8", ""}); analyzer.handleRecord(fieldNames, {"", "x23", "9", "9", "x52", "9", ""}); analyzer.receivedAllRows(); @@ -1036,11 +1037,11 @@ BOOST_AUTO_TEST_CASE(testCategoricalFieldsEmptyAsMissing) { assertRow(0, {eq(0.0), eq(0.0), eq(0.0), eq(0.0), eq(0.0)}, rows[0]); assertRow(1, {eq(1.0), eq(1.0), eq(1.0), eq(1.0), eq(1.0)}, rows[1]); assertRow(2, {eq(2.0), eq(2.0), eq(2.0), eq(2.0), eq(0.0)}, rows[2]); - assertRow(3, {eq(3.0), eq(3.0), eq(3.0), eq(3.0), missing()}, rows[3]); + assertRow(3, {eq(3.0), eq(3.0), eq(3.0), eq(3.0), missing}, rows[3]); assertRow(4, {eq(4.0), eq(4.0), eq(4.0), eq(4.0), eq(0.0)}, rows[4]); assertRow(5, {eq(0.0), eq(5.0), eq(5.0), eq(5.0), eq(1.0)}, rows[5]); - assertRow(6, {eq(1.0), eq(6.0), eq(6.0), eq(6.0), missing()}, rows[6]); - assertRow(7, {eq(5.0), eq(0.0), eq(7.0), eq(7.0), missing()}, rows[7]); + assertRow(6, {eq(1.0), eq(6.0), eq(6.0), eq(6.0), missing}, rows[6]); + assertRow(7, {eq(5.0), eq(0.0), eq(7.0), eq(7.0), missing}, rows[7]); assertRow(8, {eq(3.0), eq(1.0), eq(8.0), eq(8.0), eq(0.0)}, rows[8]); assertRow(9, {eq(2.0), eq(2.0), eq(9.0), eq(9.0), eq(1.0)}, rows[9]); }); diff --git a/lib/core/CDataFrame.cc b/lib/core/CDataFrame.cc index 3f05c330d2..c65baba8ab 100644 --- a/lib/core/CDataFrame.cc +++ b/lib/core/CDataFrame.cc @@ -127,7 +127,6 @@ CDataFrame::CDataFrame(bool inMainMemory, m_ReadAndWriteToStoreSyncStrategy{readAndWriteToStoreSyncStrategy}, m_WriteSliceToStore{writeSliceToStore}, m_ColumnNames(numberColumns), m_CategoricalColumnValues(numberColumns), m_MissingString{DEFAULT_MISSING_STRING}, - m_EmptyIsMissing(numberColumns, false), m_ColumnIsCategorical(numberColumns, false) { } @@ -169,7 +168,6 @@ void CDataFrame::resizeColumns(std::size_t numberThreads, std::size_t numberColu this->reserve(numberThreads, numberColumns); m_ColumnNames.resize(numberColumns); m_CategoricalColumnValues.resize(numberColumns); - m_EmptyIsMissing.resize(numberColumns, false); m_ColumnIsCategorical.resize(numberColumns, false); m_NumberColumns = numberColumns; } @@ -215,19 +213,13 @@ CDataFrame::TRowFuncVecBoolPr CDataFrame::writeColumns(std::size_t numberThreads void CDataFrame::parseAndWriteRow(const TStrCRng& columnValues, const std::string* hash) { auto stringToValue = [this](bool isCategorical, TStrSizeUMap& categoryLookup, - TStrVec& categories, bool emptyIsMissing, - const std::string& columnValue) { + TStrVec& categories, const std::string& columnValue) { if (columnValue == m_MissingString) { ++m_MissingValueCount; return core::CFloatStorage{valueOfMissing()}; } if (isCategorical) { - // TODO Remove when Java passes special missing value string. - if (columnValue.empty() && emptyIsMissing) { - return core::CFloatStorage{valueOfMissing()}; - } - // This encodes in a format suitable for efficient storage. The // actual encoding approach is chosen when the analysis runs. std::size_t id; @@ -257,11 +249,7 @@ void CDataFrame::parseAndWriteRow(const TStrCRng& columnValues, const std::strin // otherwise we must impute or exit with failure. double value; - if (columnValue.empty()) { - // TODO Remove when Java passes special missing value string. - ++m_MissingValueCount; - return core::CFloatStorage{valueOfMissing()}; - } else if (core::CStringUtils::stringToTypeSilent(columnValue, value) == false) { + if (core::CStringUtils::stringToTypeSilent(columnValue, value) == false) { ++m_BadValueCount; return core::CFloatStorage{valueOfMissing()}; } @@ -278,9 +266,9 @@ void CDataFrame::parseAndWriteRow(const TStrCRng& columnValues, const std::strin this->writeRow([&](TFloatVecItr columns, std::int32_t& docHash) { for (std::size_t i = 0; i < columnValues.size(); ++i, ++columns) { - *columns = stringToValue( - m_ColumnIsCategorical[i], m_CategoricalColumnValueLookup[i], - m_CategoricalColumnValues[i], m_EmptyIsMissing[i], columnValues[i]); + *columns = stringToValue(m_ColumnIsCategorical[i], + m_CategoricalColumnValueLookup[i], + m_CategoricalColumnValues[i], columnValues[i]); } docHash = 0; if (hash != nullptr && @@ -312,16 +300,6 @@ void CDataFrame::missingString(std::string missing) { m_MissingString = std::move(missing); } -void CDataFrame::emptyIsMissing(TBoolVec emptyIsMissing) { - if (emptyIsMissing.size() != m_NumberColumns) { - HANDLE_FATAL(<< "Internal error: expected '" << m_NumberColumns - << "' 'empty is missing' column indicator values but got " - << CContainerPrinter::print(emptyIsMissing)); - } else { - m_EmptyIsMissing = std::move(emptyIsMissing); - } -} - void CDataFrame::categoricalColumns(TStrVec categoricalColumnNames) { std::sort(categoricalColumnNames.begin(), categoricalColumnNames.end()); for (std::size_t i = 0; i < m_ColumnNames.size(); ++i) { @@ -387,7 +365,6 @@ std::size_t CDataFrame::memoryUsage() const { memory += CMemory::dynamicSize(m_CategoricalColumnValues); memory += CMemory::dynamicSize(m_CategoricalColumnValueLookup); memory += CMemory::dynamicSize(m_MissingString); - memory += CMemory::dynamicSize(m_EmptyIsMissing); memory += CMemory::dynamicSize(m_ColumnIsCategorical); memory += CMemory::dynamicSize(m_Slices); memory += CMemory::dynamicSize(m_Writer);