Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Distinguish missing and empty categorical values #1034

Merged
merged 6 commits into from
Mar 4, 2020
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ necessary. This will improve the allocation of data frame analyses to cluster no
* Upgrade the compiler used on Linux from gcc 7.3 to gcc 7.5, and the binutils used in
the build from version 2.20 to 2.34. (See {ml-pull}1013[#1013].)
* Remove all memory overheads for computing tree SHAP values. (See {ml-pull}1023[#1023].)
* Distinguish between empty and missing categorical fields in classification and regression
model training. (See {ml-pull}1034[#1034].)

=== Bug Fixes

Expand Down
2 changes: 2 additions & 0 deletions include/api/CDataFrameAnalysisSpecification.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ class API_EXPORT CDataFrameAnalysisSpecification {
static const std::string THREADS;
static const std::string TEMPORARY_DIRECTORY;
static const std::string RESULTS_FIELD;
static const std::string MISSING_FIELD_VALUE;
static const std::string CATEGORICAL_FIELD_NAMES;
static const std::string DISK_USAGE_ALLOWED;
static const std::string ANALYSIS;
Expand Down Expand Up @@ -203,6 +204,7 @@ class API_EXPORT CDataFrameAnalysisSpecification {
std::string m_ResultsField;
std::string m_JobId;
std::string m_AnalysisName;
std::string m_MissingFieldValue;
TStrVec m_CategoricalFieldNames;
bool m_DiskUsageAllowed;
// TODO Sparse table support
Expand Down
11 changes: 7 additions & 4 deletions include/api/CDataFrameAnalysisSpecificationJsonWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class API_EXPORT CDataFrameAnalysisSpecificationJsonWriter : private core::CNonI
std::size_t numberThreads,
const std::string& temporaryDirectory,
const std::string& resultsField,
const std::string& missingString,
const TStrVec& categoricalFields,
bool diskUsageAllowed,
const std::string& analysisName,
Expand All @@ -48,6 +49,7 @@ class API_EXPORT CDataFrameAnalysisSpecificationJsonWriter : private core::CNonI
std::size_t numberThreads,
const std::string& temporaryDirectory,
const std::string& resultsField,
const std::string& missingString,
const TStrVec& categoricalFields,
bool diskUsageAllowed,
const std::string& analysisName,
Expand All @@ -56,10 +58,11 @@ class API_EXPORT CDataFrameAnalysisSpecificationJsonWriter : private core::CNonI

//! Returns a string with the data frame analysis specification in JSON format.
static std::string jsonString(const std::string& jobId,
size_t rows,
size_t cols,
size_t memoryLimit,
size_t numberThreads,
std::size_t rows,
std::size_t cols,
std::size_t memoryLimit,
std::size_t numberThreads,
const std::string& missingString,
const TStrVec& categoricalFields,
bool diskUsageAllowed,
const std::string& tempDir,
Expand Down
11 changes: 11 additions & 0 deletions include/core/CDataFrame.h
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,9 @@ class CORE_EXPORT CDataFrame final {
//! The maximum number of distinct categorical fields we can faithfully represent.
static const std::size_t MAX_CATEGORICAL_CARDINALITY;

//! The default string used to indicate that a value is missing.
static const std::string DEFAULT_MISSING_STRING;

public:
//! \param[in] inMainMemory True if the data frame is stored in main memory.
//! \param[in] numberColumns The number of columns in the data frame.
Expand Down Expand Up @@ -443,6 +446,9 @@ class CORE_EXPORT CDataFrame final {
//! Write the column names.
void columnNames(TStrVec columnNames);

//! Write the string which indicates that a value is missing.
void missingString(std::string missing);

//! Write for which columns an empty string implies the value is missing.
void emptyIsMissing(TBoolVec emptyIsMissing);

Expand Down Expand Up @@ -577,7 +583,12 @@ class CORE_EXPORT CDataFrame final {
//! A lookup for the integer value of categories.
TStrSizeUMapVec m_CategoricalColumnValueLookup;

//! The string which indicates that a category is missing.
std::string m_MissingString;

//! Indicator vector for treating empty strings as missing values.
// TODO Remove once Java passes the correct value for the missing target
// for classification.
TBoolVec m_EmptyIsMissing;

//! Indicator vector of the columns which contain categorical values.
Expand Down
98 changes: 70 additions & 28 deletions include/test/CDataFrameAnalysisSpecificationFactory.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

#include <test/ImportExport.h>

#include <boost/optional.hpp>

#include <cstddef>
#include <memory>
#include <string>
Expand All @@ -32,37 +34,77 @@ class TEST_EXPORT CDataFrameAnalysisSpecificationFactory {
using TSpecificationUPtr = std::unique_ptr<api::CDataFrameAnalysisSpecification>;

public:
CDataFrameAnalysisSpecificationFactory();

static const std::string& classification();
static const std::string& regression();

static TSpecificationUPtr outlierSpec(std::size_t rows = 110,
std::size_t cols = 5,
std::size_t memoryLimit = 100000,
const std::string& method = "",
std::size_t numberNeighbours = 0,
bool computeFeatureInfluence = false,
bool diskUsageAllowed = true);

static TSpecificationUPtr
predictionSpec(const std::string& analysis,
const std::string& dependentVariable,
std::size_t rows = 100,
std::size_t cols = 5,
std::size_t memoryLimit = 7000000,
std::size_t numberRoundsPerHyperparameter = 0,
std::size_t bayesianOptimisationRestarts = 0,
const TStrVec& categoricalFieldNames = TStrVec{},
double alpha = -1.0,
double lambda = -1.0,
double gamma = -1.0,
double softTreeDepthLimit = -1.0,
double softTreeDepthTolerance = -1.0,
double eta = -1.0,
std::size_t maximumNumberTrees = 0,
double featureBagFraction = -1.0,
size_t topShapValues = 0,
TPersisterSupplier* persisterSupplier = nullptr,
TRestoreSearcherSupplier* restoreSearcherSupplier = nullptr);
// Shared
CDataFrameAnalysisSpecificationFactory& rows(std::size_t rows);
CDataFrameAnalysisSpecificationFactory& columns(std::size_t columns);
CDataFrameAnalysisSpecificationFactory& memoryLimit(std::size_t memoryLimit);
CDataFrameAnalysisSpecificationFactory& missingString(const std::string& missing);
CDataFrameAnalysisSpecificationFactory& diskUsageAllowed(bool disk);

// Outliers
CDataFrameAnalysisSpecificationFactory& outlierMethod(std::string method);
CDataFrameAnalysisSpecificationFactory& outlierNumberNeighbours(std::size_t number);
CDataFrameAnalysisSpecificationFactory& outlierComputeInfluence(bool compute);

// Prediction
CDataFrameAnalysisSpecificationFactory&
predicitionNumberRoundsPerHyperparameter(std::size_t rounds);
CDataFrameAnalysisSpecificationFactory&
predictionBayesianOptimisationRestarts(std::size_t restarts);
CDataFrameAnalysisSpecificationFactory&
predictionCategoricalFieldNames(const TStrVec& categorical);
CDataFrameAnalysisSpecificationFactory& predictionAlpha(double alpha);
CDataFrameAnalysisSpecificationFactory& predictionLambda(double lambda);
CDataFrameAnalysisSpecificationFactory& predictionGamma(double gamma);
CDataFrameAnalysisSpecificationFactory& predictionSoftTreeDepthLimit(double limit);
CDataFrameAnalysisSpecificationFactory& predictionSoftTreeDepthTolerance(double tolerance);
CDataFrameAnalysisSpecificationFactory& predictionEta(double eta);
CDataFrameAnalysisSpecificationFactory& predictionMaximumNumberTrees(std::size_t number);
CDataFrameAnalysisSpecificationFactory& predictionFeatureBagFraction(double fraction);
CDataFrameAnalysisSpecificationFactory& predictionNumberTopShapValues(std::size_t number);
CDataFrameAnalysisSpecificationFactory&
predictionPersisterSupplier(TPersisterSupplier* persisterSupplier);
CDataFrameAnalysisSpecificationFactory&
predictionRestoreSearcherSupplier(TRestoreSearcherSupplier* restoreSearcherSupplier);

TSpecificationUPtr outlierSpec() const;
TSpecificationUPtr predictionSpec(const std::string& analysis,
const std::string& dependentVariable) const;

private:
using TOptionalSize = boost::optional<std::size_t>;

private:
// Shared
TOptionalSize m_Rows;
TOptionalSize m_Columns;
TOptionalSize m_MemoryLimit;
std::string m_MissingString;
bool m_DiskUsageAllowed = true;
// Outliers
std::string m_Method;
std::size_t m_NumberNeighbours = 0;
bool m_ComputeFeatureInfluence = false;
// Prediction
std::size_t m_NumberRoundsPerHyperparameter = 0;
std::size_t m_BayesianOptimisationRestarts = 0;
TStrVec m_CategoricalFieldNames;
double m_Alpha = -1.0;
double m_Lambda = -1.0;
double m_Gamma = -1.0;
double m_SoftTreeDepthLimit = -1.0;
double m_SoftTreeDepthTolerance = -1.0;
double m_Eta = -1.0;
std::size_t m_MaximumNumberTrees = 0;
double m_FeatureBagFraction = -1.0;
std::size_t m_NumberTopShapValues = 0;
TPersisterSupplier* m_PersisterSupplier = nullptr;
TRestoreSearcherSupplier* m_RestoreSearcherSupplier = nullptr;
};
}
}
Expand Down
36 changes: 24 additions & 12 deletions lib/api/CDataFrameAnalysisSpecification.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <core/CDataFrame.h>
#include <core/CLogger.h>
#include <core/CStringUtils.h>

#include <api/CDataFrameAnalysisConfigReader.h>
#include <api/CDataFrameOutliersRunner.h>
Expand All @@ -28,18 +29,19 @@ namespace ml {
namespace api {

// These must be consistent with Java names.
const std::string CDataFrameAnalysisSpecification::JOB_ID("job_id");
const std::string CDataFrameAnalysisSpecification::ROWS("rows");
const std::string CDataFrameAnalysisSpecification::COLS("cols");
const std::string CDataFrameAnalysisSpecification::MEMORY_LIMIT("memory_limit");
const std::string CDataFrameAnalysisSpecification::THREADS("threads");
const std::string CDataFrameAnalysisSpecification::TEMPORARY_DIRECTORY("temp_dir");
const std::string CDataFrameAnalysisSpecification::RESULTS_FIELD("results_field");
const std::string CDataFrameAnalysisSpecification::JOB_ID{"job_id"};
const std::string CDataFrameAnalysisSpecification::ROWS{"rows"};
const std::string CDataFrameAnalysisSpecification::COLS{"cols"};
const std::string CDataFrameAnalysisSpecification::MEMORY_LIMIT{"memory_limit"};
const std::string CDataFrameAnalysisSpecification::THREADS{"threads"};
const std::string CDataFrameAnalysisSpecification::TEMPORARY_DIRECTORY{"temp_dir"};
const std::string CDataFrameAnalysisSpecification::RESULTS_FIELD{"results_field"};
const std::string CDataFrameAnalysisSpecification::MISSING_FIELD_VALUE{"missing_field_value"};
const std::string CDataFrameAnalysisSpecification::CATEGORICAL_FIELD_NAMES{"categorical_fields"};
const std::string CDataFrameAnalysisSpecification::DISK_USAGE_ALLOWED("disk_usage_allowed");
const std::string CDataFrameAnalysisSpecification::ANALYSIS("analysis");
const std::string CDataFrameAnalysisSpecification::NAME("name");
const std::string CDataFrameAnalysisSpecification::PARAMETERS("parameters");
const std::string CDataFrameAnalysisSpecification::DISK_USAGE_ALLOWED{"disk_usage_allowed"};
const std::string CDataFrameAnalysisSpecification::ANALYSIS{"analysis"};
const std::string CDataFrameAnalysisSpecification::NAME{"name"};
const std::string CDataFrameAnalysisSpecification::PARAMETERS{"parameters"};

namespace {
using TBoolVec = std::vector<bool>;
Expand Down Expand Up @@ -75,6 +77,8 @@ const CDataFrameAnalysisConfigReader CONFIG_READER{[] {
CDataFrameAnalysisConfigReader::E_OptionalParameter);
theReader.addParameter(CDataFrameAnalysisSpecification::RESULTS_FIELD,
CDataFrameAnalysisConfigReader::E_OptionalParameter);
theReader.addParameter(CDataFrameAnalysisSpecification::MISSING_FIELD_VALUE,
CDataFrameAnalysisConfigReader::E_OptionalParameter);
theReader.addParameter(CDataFrameAnalysisSpecification::CATEGORICAL_FIELD_NAMES,
CDataFrameAnalysisConfigReader::E_OptionalParameter);
theReader.addParameter(CDataFrameAnalysisSpecification::DISK_USAGE_ALLOWED,
Expand Down Expand Up @@ -131,12 +135,19 @@ CDataFrameAnalysisSpecification::CDataFrameAnalysisSpecification(
m_TemporaryDirectory = parameters[TEMPORARY_DIRECTORY].fallback(std::string{});
m_JobId = parameters[JOB_ID].fallback(std::string{});
m_ResultsField = parameters[RESULTS_FIELD].fallback(DEFAULT_RESULT_FIELD);
m_MissingFieldValue = parameters[MISSING_FIELD_VALUE].fallback(
core::CDataFrame::DEFAULT_MISSING_STRING);
m_CategoricalFieldNames = parameters[CATEGORICAL_FIELD_NAMES].fallback(TStrVec{});
m_DiskUsageAllowed = parameters[DISK_USAGE_ALLOWED].fallback(DEFAULT_DISK_USAGE_ALLOWED);

double missing;
if (core::CStringUtils::stringToTypeSilent(m_MissingFieldValue, missing)) {
HANDLE_FATAL(<< "Input error: you can't use a number (" << missing
<< ") to denote a missing field value.")
}
if (m_DiskUsageAllowed && m_TemporaryDirectory.empty()) {
HANDLE_FATAL(<< "Input error: temporary directory path should be explicitly set if disk"
" usage is allowed! Please report this problem.");
" usage is allowed! Please report this problem.")
}

auto jsonAnalysis = parameters[ANALYSIS].jsonObject();
Expand Down Expand Up @@ -189,6 +200,7 @@ CDataFrameAnalysisSpecification::makeDataFrame() {
? core::makeMainStorageDataFrame(m_NumberColumns)
: core::makeDiskStorageDataFrame(m_TemporaryDirectory,
m_NumberColumns, m_NumberRows);
result.first->missingString(m_MissingFieldValue);
result.first->reserve(m_NumberThreads, m_NumberColumns + this->numberExtraColumns());

return result;
Expand Down
41 changes: 26 additions & 15 deletions lib/api/CDataFrameAnalysisSpecificationJsonWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

#include <api/CDataFrameAnalysisSpecificationJsonWriter.h>

#include <core/CDataFrame.h>

#include <api/CDataFrameAnalysisSpecification.h>

#include <iostream>
Expand All @@ -20,6 +22,7 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
std::size_t numberThreads,
const std::string& temporaryDirectory,
const std::string& resultsField,
const std::string& missingFieldValue,
const TStrVec& categoricalFields,
bool diskUsageAllowed,
const std::string& analysisName,
Expand All @@ -34,8 +37,8 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
}
}
write(jobId, rows, cols, memoryLimit, numberThreads, temporaryDirectory,
resultsField, categoricalFields, diskUsageAllowed, analysisName,
analysisParametersDoc, writer);
resultsField, missingFieldValue, categoricalFields, diskUsageAllowed,
analysisName, analysisParametersDoc, writer);
}

void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
Expand All @@ -45,6 +48,7 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
std::size_t numberThreads,
const std::string& temporaryDirectory,
const std::string& resultsField,
const std::string& missingFieldValue,
const TStrVec& categoricalFields,
bool diskUsageAllowed,
const std::string& analysisName,
Expand Down Expand Up @@ -73,6 +77,11 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
writer.Key(CDataFrameAnalysisSpecification::RESULTS_FIELD);
writer.String(resultsField);

if (missingFieldValue != core::CDataFrame::DEFAULT_MISSING_STRING) {
writer.Key(CDataFrameAnalysisSpecification::MISSING_FIELD_VALUE);
writer.String(missingFieldValue);
}

rapidjson::Value array(rapidjson::kArrayType);
for (const auto& field : categoricalFields) {
array.PushBack(rapidjson::Value(rapidjson::StringRef(field)),
Expand Down Expand Up @@ -105,24 +114,26 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
writer.Flush();
}

std::string
CDataFrameAnalysisSpecificationJsonWriter::jsonString(const std::string& jobId,
size_t rows,
size_t cols,
size_t memoryLimit,
size_t numberThreads,
const TStrVec& categoricalFields,
bool diskUsageAllowed,
const std::string& tempDir,
const std::string& resultField,
const std::string& analysisName,
const std::string& analysisParameters) {
std::string CDataFrameAnalysisSpecificationJsonWriter::jsonString(
const std::string& jobId,
std::size_t rows,
std::size_t cols,
std::size_t memoryLimit,
std::size_t numberThreads,
const std::string& missingFieldValue,
const TStrVec& categoricalFields,
bool diskUsageAllowed,
const std::string& tempDir,
const std::string& resultField,
const std::string& analysisName,
const std::string& analysisParameters) {
rapidjson::StringBuffer stringBuffer;
TRapidJsonLineWriter writer;
writer.Reset(stringBuffer);

write(jobId, rows, cols, memoryLimit, numberThreads, tempDir, resultField,
categoricalFields, diskUsageAllowed, analysisName, analysisParameters, writer);
missingFieldValue, categoricalFields, diskUsageAllowed, analysisName,
analysisParameters, writer);

return stringBuffer.GetString();
}
Expand Down
Loading