Skip to content

Commit

Permalink
[7.7][ML] Remove special case handling for missing target field (#1039)
Browse files Browse the repository at this point in the history
Backport #1036.
  • Loading branch information
tveasey authored Mar 9, 2020
1 parent fd1a140 commit 9140285
Show file tree
Hide file tree
Showing 12 changed files with 38 additions and 127 deletions.
3 changes: 0 additions & 3 deletions include/api/CDataFrameAnalysisRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,6 @@ class API_EXPORT CDataFrameAnalysisRunner {
//! \return The number of columns this analysis appends.
virtual std::size_t numberExtraColumns() const = 0;

//! \return Indicator of columns for which empty value should be treated as missing.
virtual TBoolVec columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const;

//! Write the extra columns of \p row added by the analysis to \p writer.
//!
//! This should create a new object of the form:
Expand Down
12 changes: 6 additions & 6 deletions include/api/CDataFrameAnalysisSpecification.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,22 +172,22 @@ class API_EXPORT CDataFrameAnalysisSpecification {
//! \note The commit of the results of the analysis is atomic per partition.
//! \warning This assumes that there is no access to the data frame in the
//! calling thread until the runner has finished.
CDataFrameAnalysisRunner* run(core::CDataFrame& frame) const;
CDataFrameAnalysisRunner*
run(core::CDataFrame& frame,
core::CRapidJsonConcurrentLineWriter* outputWriter = nullptr) const;

//! Estimates memory usage in two cases:
//! 1. disk is not used (the whole data frame fits in main memory)
//! 2. disk is used (only one partition needs to be loaded to main memory)
void estimateMemoryUsage(CMemoryUsageEstimationResultJsonWriter& writer) const;

//! \return Indicator of columns for which empty value should be treated as missing.
TBoolVec columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const;

//! \return shared pointer to the persistence stream.
//! \return The stream to which to persist state if there is one.
TDataAdderUPtr persister() const;

//! \return The stream from which to retore state if there is one.
TDataSearcherUPtr restoreSearcher() const;

//! Get pointer to the analysis runner.
//! \return The analysis runner.
CDataFrameAnalysisRunner* runner();

private:
Expand Down
3 changes: 0 additions & 3 deletions include/api/CDataFrameTrainBoostedTreeClassifierRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,6 @@ class API_EXPORT CDataFrameTrainBoostedTreeClassifierRunner final
CDataFrameTrainBoostedTreeClassifierRunner(const CDataFrameAnalysisSpecification& spec,
const CDataFrameAnalysisParameters& parameters);

//! \return Indicator of columns for which empty value should be treated as missing.
TBoolVec columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const override;

//! Write the prediction for \p row to \p writer.
void writeOneRow(const core::CDataFrame& frame,
const TRowRef& row,
Expand Down
8 changes: 0 additions & 8 deletions include/core/CDataFrame.h
Original file line number Diff line number Diff line change
Expand Up @@ -450,9 +450,6 @@ class CORE_EXPORT CDataFrame final {
//! Write the string which indicates that a value is missing.
void missingString(std::string missing);

//! Write for which columns an empty string implies the value is missing.
void emptyIsMissing(TBoolVec emptyIsMissing);

//! Write which columns contain categorical data.
void categoricalColumns(TStrVec categoricalColumnNames);

Expand Down Expand Up @@ -589,11 +586,6 @@ class CORE_EXPORT CDataFrame final {
//! The string which indicates that a category is missing.
std::string m_MissingString;

//! Indicator vector for treating empty strings as missing values.
// TODO Remove once Java passes the correct value for the missing target
// for classification.
TBoolVec m_EmptyIsMissing;

//! Indicator vector of the columns which contain categorical values.
TBoolVec m_ColumnIsCategorical;

Expand Down
4 changes: 0 additions & 4 deletions lib/api/CDataFrameAnalysisRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,6 @@ CDataFrameAnalysisRunner::~CDataFrameAnalysisRunner() {
this->waitToFinish();
}

TBoolVec CDataFrameAnalysisRunner::columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const {
return TBoolVec(fieldNames.size(), false);
}

void CDataFrameAnalysisRunner::estimateMemoryUsage(CMemoryUsageEstimationResultJsonWriter& writer) const {
std::size_t numberRows{m_Spec.numberRows()};
std::size_t numberColumns{m_Spec.numberColumns()};
Expand Down
21 changes: 8 additions & 13 deletions lib/api/CDataFrameAnalysisSpecification.cc
Original file line number Diff line number Diff line change
Expand Up @@ -118,14 +118,14 @@ CDataFrameAnalysisSpecification::CDataFrameAnalysisSpecification(
rapidjson::Document specification;
if (specification.Parse(jsonSpecification.c_str()) == false) {
HANDLE_FATAL(<< "Input error: failed to parse analysis specification '"
<< jsonSpecification << "'. Please report this problem.");
<< jsonSpecification << "'. Please report this problem.")
} else {

auto parameters = CONFIG_READER.read(specification);

for (auto name : {ROWS, COLS, MEMORY_LIMIT, THREADS}) {
if (parameters[name].as<std::size_t>() == 0) {
HANDLE_FATAL(<< "Input error: '" << name << "' must be non-zero");
HANDLE_FATAL(<< "Input error: '" << name << "' must be non-zero")
}
}
m_NumberRows = parameters[ROWS].as<std::size_t>();
Expand Down Expand Up @@ -207,8 +207,11 @@ CDataFrameAnalysisSpecification::makeDataFrame() {
return result;
}

CDataFrameAnalysisRunner* CDataFrameAnalysisSpecification::run(core::CDataFrame& frame) const {
CDataFrameAnalysisRunner*
CDataFrameAnalysisSpecification::run(core::CDataFrame& frame,
core::CRapidJsonConcurrentLineWriter* writer) const {
if (m_Runner != nullptr) {
m_Runner->instrumentation().writer(writer);
m_Runner->run(frame);
return m_Runner.get();
}
Expand All @@ -218,20 +221,12 @@ CDataFrameAnalysisRunner* CDataFrameAnalysisSpecification::run(core::CDataFrame&
void CDataFrameAnalysisSpecification::estimateMemoryUsage(CMemoryUsageEstimationResultJsonWriter& writer) const {
if (m_Runner == nullptr) {
HANDLE_FATAL(<< "Internal error: no runner available so can't estimate memory."
<< " Please report this problem.");
<< " Please report this problem.")
return;
}
m_Runner->estimateMemoryUsage(writer);
}

TBoolVec CDataFrameAnalysisSpecification::columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const {
if (m_Runner == nullptr) {
HANDLE_FATAL(<< "Internal error: no runner available. Please report this problem.");
return TBoolVec(fieldNames.size(), false);
}
return m_Runner->columnsForWhichEmptyIsMissing(fieldNames);
}

void CDataFrameAnalysisSpecification::initializeRunner(const rapidjson::Value& jsonAnalysis) {
// We pass of the interpretation of the parameters object to the appropriate
// analysis runner.
Expand All @@ -251,7 +246,7 @@ void CDataFrameAnalysisSpecification::initializeRunner(const rapidjson::Value& j
}

HANDLE_FATAL(<< "Input error: unexpected analysis name '" << m_AnalysisName
<< "'. Please report this problem.");
<< "'. Please report this problem.")
}

CDataFrameAnalysisSpecification::TDataAdderUPtr
Expand Down
16 changes: 6 additions & 10 deletions lib/api/CDataFrameAnalyzer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -132,15 +132,13 @@ void CDataFrameAnalyzer::run() {
auto outStream = m_ResultsStreamSupplier();
core::CRapidJsonConcurrentLineWriter outputWriter{*outStream};

CDataFrameAnalysisRunner* analysis{m_AnalysisSpecification->runner()};
if (analysis == nullptr) {
return;
auto analysisRunner = m_AnalysisSpecification->run(*m_DataFrame, &outputWriter);

if (analysisRunner != nullptr) {
this->monitorProgress(*analysisRunner, outputWriter);
analysisRunner->waitToFinish();
this->writeResultsOf(*analysisRunner, outputWriter);
}
analysis->instrumentation().writer(&outputWriter);
m_AnalysisSpecification->run(*m_DataFrame);
this->monitorProgress(*analysis, outputWriter);
analysis->waitToFinish();
this->writeResultsOf(*analysis, outputWriter);
}

const CDataFrameAnalyzer::TTemporaryDirectoryPtr& CDataFrameAnalyzer::dataFrameDirectory() const {
Expand Down Expand Up @@ -248,8 +246,6 @@ void CDataFrameAnalyzer::captureFieldNames(const TStrVec& fieldNames) {
TStrVec columnNames{fieldNames.begin() + m_BeginDataFieldValues,
fieldNames.begin() + m_EndDataFieldValues};
m_DataFrame->columnNames(columnNames);
m_DataFrame->emptyIsMissing(
m_AnalysisSpecification->columnsForWhichEmptyIsMissing(columnNames));
m_DataFrame->categoricalColumns(m_AnalysisSpecification->categoricalFieldNames());
m_CapturedFieldNames = true;
}
Expand Down
13 changes: 0 additions & 13 deletions lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,19 +96,6 @@ CDataFrameTrainBoostedTreeClassifierRunner::CDataFrameTrainBoostedTreeClassifier
}
}

TBoolVec CDataFrameTrainBoostedTreeClassifierRunner::columnsForWhichEmptyIsMissing(
const TStrVec& fieldNames) const {
// The only field for which empty value should be treated as missing is dependent
// variable which has empty value for non-training rows.
TBoolVec emptyAsMissing(fieldNames.size(), false);
auto pos = std::find(fieldNames.begin(), fieldNames.end(),
this->dependentVariableFieldName());
if (pos != fieldNames.end()) {
emptyAsMissing[pos - fieldNames.begin()] = true;
}
return emptyAsMissing;
}

void CDataFrameTrainBoostedTreeClassifierRunner::writeOneRow(
const core::CDataFrame& frame,
const TRowRef& row,
Expand Down
27 changes: 0 additions & 27 deletions lib/api/unittest/CDataFrameAnalysisRunnerTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -204,31 +204,4 @@ BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor1000Rows) {
testEstimateMemoryUsage(1000, "403kB", "142kB", 0);
}

void testColumnsForWhichEmptyIsMissing(const std::string& analysis,
const std::string& dependentVariableName,
const TStrVec& fieldNames,
const TStrVec& categoricalFields,
const TBoolVec& expectedEmptyIsMissing) {
std::string parameters{"{\"dependent_variable\": \"" + dependentVariableName + "\"}"};
std::string jsonSpec{api::CDataFrameAnalysisSpecificationJsonWriter::jsonString(
"testJob", 10000, 5, 100000000, 1, "", categoricalFields, true,
test::CTestTmpDir::tmpDir(), "", analysis, parameters)};
api::CDataFrameAnalysisSpecification spec{jsonSpec};
auto emptyIsMissing = spec.columnsForWhichEmptyIsMissing(fieldNames);
BOOST_REQUIRE_EQUAL(core::CContainerPrinter::print(expectedEmptyIsMissing),
core::CContainerPrinter::print(emptyIsMissing));
}

BOOST_AUTO_TEST_CASE(testColumnsForWhichEmptyIsMissingClassification) {
testColumnsForWhichEmptyIsMissing("classification", "class",
{"feature_1", "feature_2", "feature_3", "class"},
{"class"}, {false, false, false, true});
}

BOOST_AUTO_TEST_CASE(testColumnsForWhichEmptyIsMissingRegression) {
testColumnsForWhichEmptyIsMissing("regression", "value",
{"feature_1", "feature_2", "feature_3", "value"},
{}, {false, false, false, false});
}

BOOST_AUTO_TEST_SUITE_END()
2 changes: 1 addition & 1 deletion lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ BOOST_AUTO_TEST_CASE(testRunAnalysis) {
BOOST_TEST_REQUIRE(runner->instrumentation().progress() <= 1.0);
}

LOG_DEBUG(<< "final progress = " << lastProgress);
LOG_TRACE(<< "final progress = " << lastProgress);
BOOST_REQUIRE_EQUAL(1.0, runner->instrumentation().progress());
}
}
Expand Down
23 changes: 12 additions & 11 deletions lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -653,7 +653,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTrainingWithRowsMissingTargetVa
}
for (std::size_t i = 40; i < 50; ++i) {
fieldValues[0] = std::to_string(feature[i]);
fieldValues[1] = "";
fieldValues[1] = core::CDataFrame::DEFAULT_MISSING_STRING;
fieldValues[2] = std::to_string(i);
analyzer.handleRecord(fieldNames, fieldValues);
}
Expand Down Expand Up @@ -985,10 +985,8 @@ BOOST_AUTO_TEST_CASE(testCategoricalFieldsEmptyAsMissing) {
return [expected](double actual) { return expected == actual; };
};

auto missing = []() {
return [](double actual) {
return maths::CDataFrameUtils::isMissing(actual);
};
auto missing = [](double actual) {
return maths::CDataFrameUtils::isMissing(actual);
};

auto assertRow = [&](const std::size_t row_i,
Expand All @@ -1007,23 +1005,26 @@ BOOST_AUTO_TEST_CASE(testCategoricalFieldsEmptyAsMissing) {
return std::make_unique<core::CJsonOutputStreamWrapper>(output);
};

std::string missingString{"foo"};

test::CDataFrameAnalysisSpecificationFactory specFactory;
api::CDataFrameAnalyzer analyzer{
specFactory.rows(1000)
.memoryLimit(27000000)
.predictionCategoricalFieldNames({"x1", "x2", "x5"})
.missingString(missingString)
.predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "x5"),
outputWriterFactory};

TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."};
analyzer.handleRecord(fieldNames, {"x11", "x21", "0", "0", "x51", "0", ""});
analyzer.handleRecord(fieldNames, {"x12", "x22", "1", "1", "x52", "1", ""});
analyzer.handleRecord(fieldNames, {"", "x23", "2", "2", "x51", "2", ""});
analyzer.handleRecord(fieldNames, {"x14", "x24", "3", "3", "", "3", ""});
analyzer.handleRecord(fieldNames, {"x14", "x24", "3", "3", missingString, "3", ""});
analyzer.handleRecord(fieldNames, {"x15", "x25", "4", "4", "x51", "4", ""});
analyzer.handleRecord(fieldNames, {"x11", "x26", "5", "5", "x52", "5", ""});
analyzer.handleRecord(fieldNames, {"x12", "", "6", "6", "", "6", ""});
analyzer.handleRecord(fieldNames, {"x13", "x21", "7", "7", "", "7", ""});
analyzer.handleRecord(fieldNames, {"x12", "", "6", "6", missingString, "6", ""});
analyzer.handleRecord(fieldNames, {"x13", "x21", "7", "7", missingString, "7", ""});
analyzer.handleRecord(fieldNames, {"x14", "x22", "8", "8", "x51", "8", ""});
analyzer.handleRecord(fieldNames, {"", "x23", "9", "9", "x52", "9", ""});
analyzer.receivedAllRows();
Expand All @@ -1036,11 +1037,11 @@ BOOST_AUTO_TEST_CASE(testCategoricalFieldsEmptyAsMissing) {
assertRow(0, {eq(0.0), eq(0.0), eq(0.0), eq(0.0), eq(0.0)}, rows[0]);
assertRow(1, {eq(1.0), eq(1.0), eq(1.0), eq(1.0), eq(1.0)}, rows[1]);
assertRow(2, {eq(2.0), eq(2.0), eq(2.0), eq(2.0), eq(0.0)}, rows[2]);
assertRow(3, {eq(3.0), eq(3.0), eq(3.0), eq(3.0), missing()}, rows[3]);
assertRow(3, {eq(3.0), eq(3.0), eq(3.0), eq(3.0), missing}, rows[3]);
assertRow(4, {eq(4.0), eq(4.0), eq(4.0), eq(4.0), eq(0.0)}, rows[4]);
assertRow(5, {eq(0.0), eq(5.0), eq(5.0), eq(5.0), eq(1.0)}, rows[5]);
assertRow(6, {eq(1.0), eq(6.0), eq(6.0), eq(6.0), missing()}, rows[6]);
assertRow(7, {eq(5.0), eq(0.0), eq(7.0), eq(7.0), missing()}, rows[7]);
assertRow(6, {eq(1.0), eq(6.0), eq(6.0), eq(6.0), missing}, rows[6]);
assertRow(7, {eq(5.0), eq(0.0), eq(7.0), eq(7.0), missing}, rows[7]);
assertRow(8, {eq(3.0), eq(1.0), eq(8.0), eq(8.0), eq(0.0)}, rows[8]);
assertRow(9, {eq(2.0), eq(2.0), eq(9.0), eq(9.0), eq(1.0)}, rows[9]);
});
Expand Down
33 changes: 5 additions & 28 deletions lib/core/CDataFrame.cc
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ CDataFrame::CDataFrame(bool inMainMemory,
m_ReadAndWriteToStoreSyncStrategy{readAndWriteToStoreSyncStrategy},
m_WriteSliceToStore{writeSliceToStore}, m_ColumnNames(numberColumns),
m_CategoricalColumnValues(numberColumns), m_MissingString{DEFAULT_MISSING_STRING},
m_EmptyIsMissing(numberColumns, false),
m_ColumnIsCategorical(numberColumns, false) {
}

Expand Down Expand Up @@ -169,7 +168,6 @@ void CDataFrame::resizeColumns(std::size_t numberThreads, std::size_t numberColu
this->reserve(numberThreads, numberColumns);
m_ColumnNames.resize(numberColumns);
m_CategoricalColumnValues.resize(numberColumns);
m_EmptyIsMissing.resize(numberColumns, false);
m_ColumnIsCategorical.resize(numberColumns, false);
m_NumberColumns = numberColumns;
}
Expand Down Expand Up @@ -215,19 +213,13 @@ CDataFrame::TRowFuncVecBoolPr CDataFrame::writeColumns(std::size_t numberThreads
void CDataFrame::parseAndWriteRow(const TStrCRng& columnValues, const std::string* hash) {

auto stringToValue = [this](bool isCategorical, TStrSizeUMap& categoryLookup,
TStrVec& categories, bool emptyIsMissing,
const std::string& columnValue) {
TStrVec& categories, const std::string& columnValue) {
if (columnValue == m_MissingString) {
++m_MissingValueCount;
return core::CFloatStorage{valueOfMissing()};
}

if (isCategorical) {
// TODO Remove when Java passes special missing value string.
if (columnValue.empty() && emptyIsMissing) {
return core::CFloatStorage{valueOfMissing()};
}

// This encodes in a format suitable for efficient storage. The
// actual encoding approach is chosen when the analysis runs.
std::size_t id;
Expand Down Expand Up @@ -257,11 +249,7 @@ void CDataFrame::parseAndWriteRow(const TStrCRng& columnValues, const std::strin
// otherwise we must impute or exit with failure.

double value;
if (columnValue.empty()) {
// TODO Remove when Java passes special missing value string.
++m_MissingValueCount;
return core::CFloatStorage{valueOfMissing()};
} else if (core::CStringUtils::stringToTypeSilent(columnValue, value) == false) {
if (core::CStringUtils::stringToTypeSilent(columnValue, value) == false) {
++m_BadValueCount;
return core::CFloatStorage{valueOfMissing()};
}
Expand All @@ -278,9 +266,9 @@ void CDataFrame::parseAndWriteRow(const TStrCRng& columnValues, const std::strin

this->writeRow([&](TFloatVecItr columns, std::int32_t& docHash) {
for (std::size_t i = 0; i < columnValues.size(); ++i, ++columns) {
*columns = stringToValue(
m_ColumnIsCategorical[i], m_CategoricalColumnValueLookup[i],
m_CategoricalColumnValues[i], m_EmptyIsMissing[i], columnValues[i]);
*columns = stringToValue(m_ColumnIsCategorical[i],
m_CategoricalColumnValueLookup[i],
m_CategoricalColumnValues[i], columnValues[i]);
}
docHash = 0;
if (hash != nullptr &&
Expand Down Expand Up @@ -312,16 +300,6 @@ void CDataFrame::missingString(std::string missing) {
m_MissingString = std::move(missing);
}

void CDataFrame::emptyIsMissing(TBoolVec emptyIsMissing) {
if (emptyIsMissing.size() != m_NumberColumns) {
HANDLE_FATAL(<< "Internal error: expected '" << m_NumberColumns
<< "' 'empty is missing' column indicator values but got "
<< CContainerPrinter::print(emptyIsMissing));
} else {
m_EmptyIsMissing = std::move(emptyIsMissing);
}
}

void CDataFrame::categoricalColumns(TStrVec categoricalColumnNames) {
std::sort(categoricalColumnNames.begin(), categoricalColumnNames.end());
for (std::size_t i = 0; i < m_ColumnNames.size(); ++i) {
Expand Down Expand Up @@ -387,7 +365,6 @@ std::size_t CDataFrame::memoryUsage() const {
memory += CMemory::dynamicSize(m_CategoricalColumnValues);
memory += CMemory::dynamicSize(m_CategoricalColumnValueLookup);
memory += CMemory::dynamicSize(m_MissingString);
memory += CMemory::dynamicSize(m_EmptyIsMissing);
memory += CMemory::dynamicSize(m_ColumnIsCategorical);
memory += CMemory::dynamicSize(m_Slices);
memory += CMemory::dynamicSize(m_Writer);
Expand Down

0 comments on commit 9140285

Please sign in to comment.