Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Remove special case handling for missing target field for classification #1036

Merged
merged 1 commit into from
Mar 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions include/api/CDataFrameAnalysisRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,6 @@ class API_EXPORT CDataFrameAnalysisRunner {
//! \return The number of columns this analysis appends.
virtual std::size_t numberExtraColumns() const = 0;

//! \return Indicator of columns for which empty value should be treated as missing.
virtual TBoolVec columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const;

//! Write the extra columns of \p row added by the analysis to \p writer.
//!
//! This should create a new object of the form:
Expand Down
12 changes: 6 additions & 6 deletions include/api/CDataFrameAnalysisSpecification.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,22 +172,22 @@ class API_EXPORT CDataFrameAnalysisSpecification {
//! \note The commit of the results of the analysis is atomic per partition.
//! \warning This assumes that there is no access to the data frame in the
//! calling thread until the runner has finished.
CDataFrameAnalysisRunner* run(core::CDataFrame& frame) const;
CDataFrameAnalysisRunner*
run(core::CDataFrame& frame,
core::CRapidJsonConcurrentLineWriter* outputWriter = nullptr) const;

//! Estimates memory usage in two cases:
//! 1. disk is not used (the whole data frame fits in main memory)
//! 2. disk is used (only one partition needs to be loaded to main memory)
void estimateMemoryUsage(CMemoryUsageEstimationResultJsonWriter& writer) const;

//! \return Indicator of columns for which empty value should be treated as missing.
TBoolVec columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const;

//! \return shared pointer to the persistence stream.
//! \return The stream to which to persist state if there is one.
TDataAdderUPtr persister() const;

//! \return The stream from which to retore state if there is one.
TDataSearcherUPtr restoreSearcher() const;

//! Get pointer to the analysis runner.
//! \return The analysis runner.
CDataFrameAnalysisRunner* runner();

private:
Expand Down
3 changes: 0 additions & 3 deletions include/api/CDataFrameTrainBoostedTreeClassifierRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,6 @@ class API_EXPORT CDataFrameTrainBoostedTreeClassifierRunner final
CDataFrameTrainBoostedTreeClassifierRunner(const CDataFrameAnalysisSpecification& spec,
const CDataFrameAnalysisParameters& parameters);

//! \return Indicator of columns for which empty value should be treated as missing.
TBoolVec columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const override;

//! Write the prediction for \p row to \p writer.
void writeOneRow(const core::CDataFrame& frame,
const TRowRef& row,
Expand Down
8 changes: 0 additions & 8 deletions include/core/CDataFrame.h
Original file line number Diff line number Diff line change
Expand Up @@ -449,9 +449,6 @@ class CORE_EXPORT CDataFrame final {
//! Write the string which indicates that a value is missing.
void missingString(std::string missing);

//! Write for which columns an empty string implies the value is missing.
void emptyIsMissing(TBoolVec emptyIsMissing);

//! Write which columns contain categorical data.
void categoricalColumns(TStrVec categoricalColumnNames);

Expand Down Expand Up @@ -586,11 +583,6 @@ class CORE_EXPORT CDataFrame final {
//! The string which indicates that a category is missing.
std::string m_MissingString;

//! Indicator vector for treating empty strings as missing values.
// TODO Remove once Java passes the correct value for the missing target
// for classification.
TBoolVec m_EmptyIsMissing;

//! Indicator vector of the columns which contain categorical values.
TBoolVec m_ColumnIsCategorical;

Expand Down
4 changes: 0 additions & 4 deletions lib/api/CDataFrameAnalysisRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,6 @@ CDataFrameAnalysisRunner::~CDataFrameAnalysisRunner() {
this->waitToFinish();
}

TBoolVec CDataFrameAnalysisRunner::columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const {
return TBoolVec(fieldNames.size(), false);
}

void CDataFrameAnalysisRunner::estimateMemoryUsage(CMemoryUsageEstimationResultJsonWriter& writer) const {
std::size_t numberRows{m_Spec.numberRows()};
std::size_t numberColumns{m_Spec.numberColumns()};
Expand Down
21 changes: 8 additions & 13 deletions lib/api/CDataFrameAnalysisSpecification.cc
Original file line number Diff line number Diff line change
Expand Up @@ -118,14 +118,14 @@ CDataFrameAnalysisSpecification::CDataFrameAnalysisSpecification(
rapidjson::Document specification;
if (specification.Parse(jsonSpecification.c_str()) == false) {
HANDLE_FATAL(<< "Input error: failed to parse analysis specification '"
<< jsonSpecification << "'. Please report this problem.");
<< jsonSpecification << "'. Please report this problem.")
} else {

auto parameters = CONFIG_READER.read(specification);

for (auto name : {ROWS, COLS, MEMORY_LIMIT, THREADS}) {
if (parameters[name].as<std::size_t>() == 0) {
HANDLE_FATAL(<< "Input error: '" << name << "' must be non-zero");
HANDLE_FATAL(<< "Input error: '" << name << "' must be non-zero")
}
}
m_NumberRows = parameters[ROWS].as<std::size_t>();
Expand Down Expand Up @@ -207,8 +207,11 @@ CDataFrameAnalysisSpecification::makeDataFrame() {
return result;
}

CDataFrameAnalysisRunner* CDataFrameAnalysisSpecification::run(core::CDataFrame& frame) const {
CDataFrameAnalysisRunner*
CDataFrameAnalysisSpecification::run(core::CDataFrame& frame,
core::CRapidJsonConcurrentLineWriter* writer) const {
if (m_Runner != nullptr) {
m_Runner->instrumentation().writer(writer);
m_Runner->run(frame);
return m_Runner.get();
}
Expand All @@ -218,20 +221,12 @@ CDataFrameAnalysisRunner* CDataFrameAnalysisSpecification::run(core::CDataFrame&
void CDataFrameAnalysisSpecification::estimateMemoryUsage(CMemoryUsageEstimationResultJsonWriter& writer) const {
if (m_Runner == nullptr) {
HANDLE_FATAL(<< "Internal error: no runner available so can't estimate memory."
<< " Please report this problem.");
<< " Please report this problem.")
return;
}
m_Runner->estimateMemoryUsage(writer);
}

TBoolVec CDataFrameAnalysisSpecification::columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const {
if (m_Runner == nullptr) {
HANDLE_FATAL(<< "Internal error: no runner available. Please report this problem.");
return TBoolVec(fieldNames.size(), false);
}
return m_Runner->columnsForWhichEmptyIsMissing(fieldNames);
}

void CDataFrameAnalysisSpecification::initializeRunner(const rapidjson::Value& jsonAnalysis) {
// We pass of the interpretation of the parameters object to the appropriate
// analysis runner.
Expand All @@ -251,7 +246,7 @@ void CDataFrameAnalysisSpecification::initializeRunner(const rapidjson::Value& j
}

HANDLE_FATAL(<< "Input error: unexpected analysis name '" << m_AnalysisName
<< "'. Please report this problem.");
<< "'. Please report this problem.")
}

CDataFrameAnalysisSpecification::TDataAdderUPtr
Expand Down
16 changes: 6 additions & 10 deletions lib/api/CDataFrameAnalyzer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -132,15 +132,13 @@ void CDataFrameAnalyzer::run() {
auto outStream = m_ResultsStreamSupplier();
core::CRapidJsonConcurrentLineWriter outputWriter{*outStream};

CDataFrameAnalysisRunner* analysis{m_AnalysisSpecification->runner()};
if (analysis == nullptr) {
return;
auto analysisRunner = m_AnalysisSpecification->run(*m_DataFrame, &outputWriter);

if (analysisRunner != nullptr) {
this->monitorProgress(*analysisRunner, outputWriter);
analysisRunner->waitToFinish();
this->writeResultsOf(*analysisRunner, outputWriter);
}
analysis->instrumentation().writer(&outputWriter);
m_AnalysisSpecification->run(*m_DataFrame);
this->monitorProgress(*analysis, outputWriter);
analysis->waitToFinish();
this->writeResultsOf(*analysis, outputWriter);
}

const CDataFrameAnalyzer::TTemporaryDirectoryPtr& CDataFrameAnalyzer::dataFrameDirectory() const {
Expand Down Expand Up @@ -248,8 +246,6 @@ void CDataFrameAnalyzer::captureFieldNames(const TStrVec& fieldNames) {
TStrVec columnNames{fieldNames.begin() + m_BeginDataFieldValues,
fieldNames.begin() + m_EndDataFieldValues};
m_DataFrame->columnNames(columnNames);
m_DataFrame->emptyIsMissing(
m_AnalysisSpecification->columnsForWhichEmptyIsMissing(columnNames));
m_DataFrame->categoricalColumns(m_AnalysisSpecification->categoricalFieldNames());
m_CapturedFieldNames = true;
}
Expand Down
13 changes: 0 additions & 13 deletions lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,19 +96,6 @@ CDataFrameTrainBoostedTreeClassifierRunner::CDataFrameTrainBoostedTreeClassifier
}
}

TBoolVec CDataFrameTrainBoostedTreeClassifierRunner::columnsForWhichEmptyIsMissing(
const TStrVec& fieldNames) const {
// The only field for which empty value should be treated as missing is dependent
// variable which has empty value for non-training rows.
TBoolVec emptyAsMissing(fieldNames.size(), false);
auto pos = std::find(fieldNames.begin(), fieldNames.end(),
this->dependentVariableFieldName());
if (pos != fieldNames.end()) {
emptyAsMissing[pos - fieldNames.begin()] = true;
}
return emptyAsMissing;
}

void CDataFrameTrainBoostedTreeClassifierRunner::writeOneRow(
const core::CDataFrame& frame,
const TRowRef& row,
Expand Down
27 changes: 0 additions & 27 deletions lib/api/unittest/CDataFrameAnalysisRunnerTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -204,31 +204,4 @@ BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor1000Rows) {
testEstimateMemoryUsage(1000, "403kB", "142kB", 0);
}

void testColumnsForWhichEmptyIsMissing(const std::string& analysis,
const std::string& dependentVariableName,
const TStrVec& fieldNames,
const TStrVec& categoricalFields,
const TBoolVec& expectedEmptyIsMissing) {
std::string parameters{"{\"dependent_variable\": \"" + dependentVariableName + "\"}"};
std::string jsonSpec{api::CDataFrameAnalysisSpecificationJsonWriter::jsonString(
"testJob", 10000, 5, 100000000, 1, "", categoricalFields, true,
test::CTestTmpDir::tmpDir(), "", analysis, parameters)};
api::CDataFrameAnalysisSpecification spec{jsonSpec};
auto emptyIsMissing = spec.columnsForWhichEmptyIsMissing(fieldNames);
BOOST_REQUIRE_EQUAL(core::CContainerPrinter::print(expectedEmptyIsMissing),
core::CContainerPrinter::print(emptyIsMissing));
}

BOOST_AUTO_TEST_CASE(testColumnsForWhichEmptyIsMissingClassification) {
testColumnsForWhichEmptyIsMissing("classification", "class",
{"feature_1", "feature_2", "feature_3", "class"},
{"class"}, {false, false, false, true});
}

BOOST_AUTO_TEST_CASE(testColumnsForWhichEmptyIsMissingRegression) {
testColumnsForWhichEmptyIsMissing("regression", "value",
{"feature_1", "feature_2", "feature_3", "value"},
{}, {false, false, false, false});
}

BOOST_AUTO_TEST_SUITE_END()
2 changes: 1 addition & 1 deletion lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ BOOST_AUTO_TEST_CASE(testRunAnalysis) {
BOOST_TEST_REQUIRE(runner->instrumentation().progress() <= 1.0);
}

LOG_DEBUG(<< "final progress = " << lastProgress);
LOG_TRACE(<< "final progress = " << lastProgress);
BOOST_REQUIRE_EQUAL(1.0, runner->instrumentation().progress());
}
}
Expand Down
23 changes: 12 additions & 11 deletions lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -653,7 +653,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTrainingWithRowsMissingTargetVa
}
for (std::size_t i = 40; i < 50; ++i) {
fieldValues[0] = std::to_string(feature[i]);
fieldValues[1] = "";
fieldValues[1] = core::CDataFrame::DEFAULT_MISSING_STRING;
fieldValues[2] = std::to_string(i);
analyzer.handleRecord(fieldNames, fieldValues);
}
Expand Down Expand Up @@ -985,10 +985,8 @@ BOOST_AUTO_TEST_CASE(testCategoricalFieldsEmptyAsMissing) {
return [expected](double actual) { return expected == actual; };
};

auto missing = []() {
return [](double actual) {
return maths::CDataFrameUtils::isMissing(actual);
};
auto missing = [](double actual) {
return maths::CDataFrameUtils::isMissing(actual);
};

auto assertRow = [&](const std::size_t row_i,
Expand All @@ -1007,23 +1005,26 @@ BOOST_AUTO_TEST_CASE(testCategoricalFieldsEmptyAsMissing) {
return std::make_unique<core::CJsonOutputStreamWrapper>(output);
};

std::string missingString{"foo"};

test::CDataFrameAnalysisSpecificationFactory specFactory;
api::CDataFrameAnalyzer analyzer{
specFactory.rows(1000)
.memoryLimit(27000000)
.predictionCategoricalFieldNames({"x1", "x2", "x5"})
.missingString(missingString)
.predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "x5"),
outputWriterFactory};

TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."};
analyzer.handleRecord(fieldNames, {"x11", "x21", "0", "0", "x51", "0", ""});
analyzer.handleRecord(fieldNames, {"x12", "x22", "1", "1", "x52", "1", ""});
analyzer.handleRecord(fieldNames, {"", "x23", "2", "2", "x51", "2", ""});
analyzer.handleRecord(fieldNames, {"x14", "x24", "3", "3", "", "3", ""});
analyzer.handleRecord(fieldNames, {"x14", "x24", "3", "3", missingString, "3", ""});
analyzer.handleRecord(fieldNames, {"x15", "x25", "4", "4", "x51", "4", ""});
analyzer.handleRecord(fieldNames, {"x11", "x26", "5", "5", "x52", "5", ""});
analyzer.handleRecord(fieldNames, {"x12", "", "6", "6", "", "6", ""});
analyzer.handleRecord(fieldNames, {"x13", "x21", "7", "7", "", "7", ""});
analyzer.handleRecord(fieldNames, {"x12", "", "6", "6", missingString, "6", ""});
analyzer.handleRecord(fieldNames, {"x13", "x21", "7", "7", missingString, "7", ""});
analyzer.handleRecord(fieldNames, {"x14", "x22", "8", "8", "x51", "8", ""});
analyzer.handleRecord(fieldNames, {"", "x23", "9", "9", "x52", "9", ""});
analyzer.receivedAllRows();
Expand All @@ -1036,11 +1037,11 @@ BOOST_AUTO_TEST_CASE(testCategoricalFieldsEmptyAsMissing) {
assertRow(0, {eq(0.0), eq(0.0), eq(0.0), eq(0.0), eq(0.0)}, rows[0]);
assertRow(1, {eq(1.0), eq(1.0), eq(1.0), eq(1.0), eq(1.0)}, rows[1]);
assertRow(2, {eq(2.0), eq(2.0), eq(2.0), eq(2.0), eq(0.0)}, rows[2]);
assertRow(3, {eq(3.0), eq(3.0), eq(3.0), eq(3.0), missing()}, rows[3]);
assertRow(3, {eq(3.0), eq(3.0), eq(3.0), eq(3.0), missing}, rows[3]);
assertRow(4, {eq(4.0), eq(4.0), eq(4.0), eq(4.0), eq(0.0)}, rows[4]);
assertRow(5, {eq(0.0), eq(5.0), eq(5.0), eq(5.0), eq(1.0)}, rows[5]);
assertRow(6, {eq(1.0), eq(6.0), eq(6.0), eq(6.0), missing()}, rows[6]);
assertRow(7, {eq(5.0), eq(0.0), eq(7.0), eq(7.0), missing()}, rows[7]);
assertRow(6, {eq(1.0), eq(6.0), eq(6.0), eq(6.0), missing}, rows[6]);
assertRow(7, {eq(5.0), eq(0.0), eq(7.0), eq(7.0), missing}, rows[7]);
assertRow(8, {eq(3.0), eq(1.0), eq(8.0), eq(8.0), eq(0.0)}, rows[8]);
assertRow(9, {eq(2.0), eq(2.0), eq(9.0), eq(9.0), eq(1.0)}, rows[9]);
});
Expand Down
33 changes: 5 additions & 28 deletions lib/core/CDataFrame.cc
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ CDataFrame::CDataFrame(bool inMainMemory,
m_ReadAndWriteToStoreSyncStrategy{readAndWriteToStoreSyncStrategy},
m_WriteSliceToStore{writeSliceToStore}, m_ColumnNames(numberColumns),
m_CategoricalColumnValues(numberColumns), m_MissingString{DEFAULT_MISSING_STRING},
m_EmptyIsMissing(numberColumns, false),
m_ColumnIsCategorical(numberColumns, false) {
}

Expand Down Expand Up @@ -169,7 +168,6 @@ void CDataFrame::resizeColumns(std::size_t numberThreads, std::size_t numberColu
this->reserve(numberThreads, numberColumns);
m_ColumnNames.resize(numberColumns);
m_CategoricalColumnValues.resize(numberColumns);
m_EmptyIsMissing.resize(numberColumns, false);
m_ColumnIsCategorical.resize(numberColumns, false);
m_NumberColumns = numberColumns;
}
Expand Down Expand Up @@ -215,19 +213,13 @@ CDataFrame::TRowFuncVecBoolPr CDataFrame::writeColumns(std::size_t numberThreads
void CDataFrame::parseAndWriteRow(const TStrCRng& columnValues, const std::string* hash) {

auto stringToValue = [this](bool isCategorical, TStrSizeUMap& categoryLookup,
TStrVec& categories, bool emptyIsMissing,
const std::string& columnValue) {
TStrVec& categories, const std::string& columnValue) {
if (columnValue == m_MissingString) {
++m_MissingValueCount;
return core::CFloatStorage{valueOfMissing()};
}

if (isCategorical) {
// TODO Remove when Java passes special missing value string.
if (columnValue.empty() && emptyIsMissing) {
return core::CFloatStorage{valueOfMissing()};
}

// This encodes in a format suitable for efficient storage. The
// actual encoding approach is chosen when the analysis runs.
std::size_t id;
Expand Down Expand Up @@ -257,11 +249,7 @@ void CDataFrame::parseAndWriteRow(const TStrCRng& columnValues, const std::strin
// otherwise we must impute or exit with failure.

double value;
if (columnValue.empty()) {
// TODO Remove when Java passes special missing value string.
++m_MissingValueCount;
return core::CFloatStorage{valueOfMissing()};
} else if (core::CStringUtils::stringToTypeSilent(columnValue, value) == false) {
if (core::CStringUtils::stringToTypeSilent(columnValue, value) == false) {
++m_BadValueCount;
return core::CFloatStorage{valueOfMissing()};
}
Expand All @@ -278,9 +266,9 @@ void CDataFrame::parseAndWriteRow(const TStrCRng& columnValues, const std::strin

this->writeRow([&](TFloatVecItr columns, std::int32_t& docHash) {
for (std::size_t i = 0; i < columnValues.size(); ++i, ++columns) {
*columns = stringToValue(
m_ColumnIsCategorical[i], m_CategoricalColumnValueLookup[i],
m_CategoricalColumnValues[i], m_EmptyIsMissing[i], columnValues[i]);
*columns = stringToValue(m_ColumnIsCategorical[i],
m_CategoricalColumnValueLookup[i],
m_CategoricalColumnValues[i], columnValues[i]);
}
docHash = 0;
if (hash != nullptr &&
Expand Down Expand Up @@ -312,16 +300,6 @@ void CDataFrame::missingString(std::string missing) {
m_MissingString = std::move(missing);
}

void CDataFrame::emptyIsMissing(TBoolVec emptyIsMissing) {
if (emptyIsMissing.size() != m_NumberColumns) {
HANDLE_FATAL(<< "Internal error: expected '" << m_NumberColumns
<< "' 'empty is missing' column indicator values but got "
<< CContainerPrinter::print(emptyIsMissing));
} else {
m_EmptyIsMissing = std::move(emptyIsMissing);
}
}

void CDataFrame::categoricalColumns(TStrVec categoricalColumnNames) {
std::sort(categoricalColumnNames.begin(), categoricalColumnNames.end());
for (std::size_t i = 0; i < m_ColumnNames.size(); ++i) {
Expand Down Expand Up @@ -387,7 +365,6 @@ std::size_t CDataFrame::memoryUsage() const {
memory += CMemory::dynamicSize(m_CategoricalColumnValues);
memory += CMemory::dynamicSize(m_CategoricalColumnValueLookup);
memory += CMemory::dynamicSize(m_MissingString);
memory += CMemory::dynamicSize(m_EmptyIsMissing);
memory += CMemory::dynamicSize(m_ColumnIsCategorical);
memory += CMemory::dynamicSize(m_Slices);
memory += CMemory::dynamicSize(m_Writer);
Expand Down