Skip to content

Commit

Permalink
[ML] Distinguish missing and empty categorical values (#1034)
Browse files Browse the repository at this point in the history
  • Loading branch information
tveasey authored Mar 4, 2020
1 parent 1cf9de8 commit 9abec24
Show file tree
Hide file tree
Showing 19 changed files with 668 additions and 300 deletions.
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ necessary. This will improve the allocation of data frame analyses to cluster no
* Upgrade the compiler used on Linux from gcc 7.3 to gcc 7.5, and the binutils used in
the build from version 2.20 to 2.34. (See {ml-pull}1013[#1013].)
* Remove all memory overheads for computing tree SHAP values. (See {ml-pull}1023[#1023].)
* Distinguish between empty and missing categorical fields in classification and regression
model training. (See {ml-pull}1034[#1034].)

=== Bug Fixes

Expand Down
2 changes: 2 additions & 0 deletions include/api/CDataFrameAnalysisSpecification.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ class API_EXPORT CDataFrameAnalysisSpecification {
static const std::string THREADS;
static const std::string TEMPORARY_DIRECTORY;
static const std::string RESULTS_FIELD;
static const std::string MISSING_FIELD_VALUE;
static const std::string CATEGORICAL_FIELD_NAMES;
static const std::string DISK_USAGE_ALLOWED;
static const std::string ANALYSIS;
Expand Down Expand Up @@ -203,6 +204,7 @@ class API_EXPORT CDataFrameAnalysisSpecification {
std::string m_ResultsField;
std::string m_JobId;
std::string m_AnalysisName;
std::string m_MissingFieldValue;
TStrVec m_CategoricalFieldNames;
bool m_DiskUsageAllowed;
// TODO Sparse table support
Expand Down
11 changes: 7 additions & 4 deletions include/api/CDataFrameAnalysisSpecificationJsonWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class API_EXPORT CDataFrameAnalysisSpecificationJsonWriter : private core::CNonI
std::size_t numberThreads,
const std::string& temporaryDirectory,
const std::string& resultsField,
const std::string& missingString,
const TStrVec& categoricalFields,
bool diskUsageAllowed,
const std::string& analysisName,
Expand All @@ -48,6 +49,7 @@ class API_EXPORT CDataFrameAnalysisSpecificationJsonWriter : private core::CNonI
std::size_t numberThreads,
const std::string& temporaryDirectory,
const std::string& resultsField,
const std::string& missingString,
const TStrVec& categoricalFields,
bool diskUsageAllowed,
const std::string& analysisName,
Expand All @@ -56,10 +58,11 @@ class API_EXPORT CDataFrameAnalysisSpecificationJsonWriter : private core::CNonI

//! Returns a string with the data frame analysis specification in JSON format.
static std::string jsonString(const std::string& jobId,
size_t rows,
size_t cols,
size_t memoryLimit,
size_t numberThreads,
std::size_t rows,
std::size_t cols,
std::size_t memoryLimit,
std::size_t numberThreads,
const std::string& missingString,
const TStrVec& categoricalFields,
bool diskUsageAllowed,
const std::string& tempDir,
Expand Down
11 changes: 11 additions & 0 deletions include/core/CDataFrame.h
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,9 @@ class CORE_EXPORT CDataFrame final {
//! The maximum number of distinct categories per categorical field we can faithfully represent.
static const std::size_t MAX_CATEGORICAL_CARDINALITY;

//! The default value indicating that a value is missing.
static const std::string DEFAULT_MISSING_STRING;

public:
//! \param[in] inMainMemory True if the data frame is stored in main memory.
//! \param[in] numberColumns The number of columns in the data frame.
Expand Down Expand Up @@ -443,6 +446,9 @@ class CORE_EXPORT CDataFrame final {
//! Write the column names.
void columnNames(TStrVec columnNames);

//! Write the string which indicates that a value is missing.
void missingString(std::string missing);

//! Write the columns for which an empty string implies the value is missing.
void emptyIsMissing(TBoolVec emptyIsMissing);

Expand Down Expand Up @@ -577,7 +583,12 @@ class CORE_EXPORT CDataFrame final {
//! A lookup for the integer value of categories.
TStrSizeUMapVec m_CategoricalColumnValueLookup;

//! The string which indicates that a category is missing.
std::string m_MissingString;

//! Indicator vector for treating empty strings as missing values.
// TODO Remove once Java passes the correct value for the missing target
// for classification.
TBoolVec m_EmptyIsMissing;

//! Indicator vector of the columns which contain categorical values.
Expand Down
98 changes: 70 additions & 28 deletions include/test/CDataFrameAnalysisSpecificationFactory.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

#include <test/ImportExport.h>

#include <boost/optional.hpp>

#include <cstddef>
#include <memory>
#include <string>
Expand All @@ -32,37 +34,77 @@ class TEST_EXPORT CDataFrameAnalysisSpecificationFactory {
using TSpecificationUPtr = std::unique_ptr<api::CDataFrameAnalysisSpecification>;

public:
CDataFrameAnalysisSpecificationFactory();

static const std::string& classification();
static const std::string& regression();

static TSpecificationUPtr outlierSpec(std::size_t rows = 110,
std::size_t cols = 5,
std::size_t memoryLimit = 100000,
const std::string& method = "",
std::size_t numberNeighbours = 0,
bool computeFeatureInfluence = false,
bool diskUsageAllowed = true);

static TSpecificationUPtr
predictionSpec(const std::string& analysis,
const std::string& dependentVariable,
std::size_t rows = 100,
std::size_t cols = 5,
std::size_t memoryLimit = 7000000,
std::size_t numberRoundsPerHyperparameter = 0,
std::size_t bayesianOptimisationRestarts = 0,
const TStrVec& categoricalFieldNames = TStrVec{},
double alpha = -1.0,
double lambda = -1.0,
double gamma = -1.0,
double softTreeDepthLimit = -1.0,
double softTreeDepthTolerance = -1.0,
double eta = -1.0,
std::size_t maximumNumberTrees = 0,
double featureBagFraction = -1.0,
size_t topShapValues = 0,
TPersisterSupplier* persisterSupplier = nullptr,
TRestoreSearcherSupplier* restoreSearcherSupplier = nullptr);
// Shared
CDataFrameAnalysisSpecificationFactory& rows(std::size_t rows);
CDataFrameAnalysisSpecificationFactory& columns(std::size_t columns);
CDataFrameAnalysisSpecificationFactory& memoryLimit(std::size_t memoryLimit);
CDataFrameAnalysisSpecificationFactory& missingString(const std::string& missing);
CDataFrameAnalysisSpecificationFactory& diskUsageAllowed(bool disk);

// Outliers
CDataFrameAnalysisSpecificationFactory& outlierMethod(std::string method);
CDataFrameAnalysisSpecificationFactory& outlierNumberNeighbours(std::size_t number);
CDataFrameAnalysisSpecificationFactory& outlierComputeInfluence(bool compute);

// Prediction
CDataFrameAnalysisSpecificationFactory&
predicitionNumberRoundsPerHyperparameter(std::size_t rounds);
CDataFrameAnalysisSpecificationFactory&
predictionBayesianOptimisationRestarts(std::size_t restarts);
CDataFrameAnalysisSpecificationFactory&
predictionCategoricalFieldNames(const TStrVec& categorical);
CDataFrameAnalysisSpecificationFactory& predictionAlpha(double alpha);
CDataFrameAnalysisSpecificationFactory& predictionLambda(double lambda);
CDataFrameAnalysisSpecificationFactory& predictionGamma(double gamma);
CDataFrameAnalysisSpecificationFactory& predictionSoftTreeDepthLimit(double limit);
CDataFrameAnalysisSpecificationFactory& predictionSoftTreeDepthTolerance(double tolerance);
CDataFrameAnalysisSpecificationFactory& predictionEta(double eta);
CDataFrameAnalysisSpecificationFactory& predictionMaximumNumberTrees(std::size_t number);
CDataFrameAnalysisSpecificationFactory& predictionFeatureBagFraction(double fraction);
CDataFrameAnalysisSpecificationFactory& predictionNumberTopShapValues(std::size_t number);
CDataFrameAnalysisSpecificationFactory&
predictionPersisterSupplier(TPersisterSupplier* persisterSupplier);
CDataFrameAnalysisSpecificationFactory&
predictionRestoreSearcherSupplier(TRestoreSearcherSupplier* restoreSearcherSupplier);

TSpecificationUPtr outlierSpec() const;
TSpecificationUPtr predictionSpec(const std::string& analysis,
const std::string& dependentVariable) const;

private:
using TOptionalSize = boost::optional<std::size_t>;

private:
// Shared
TOptionalSize m_Rows;
TOptionalSize m_Columns;
TOptionalSize m_MemoryLimit;
std::string m_MissingString;
bool m_DiskUsageAllowed = true;
// Outliers
std::string m_Method;
std::size_t m_NumberNeighbours = 0;
bool m_ComputeFeatureInfluence = false;
// Prediction
std::size_t m_NumberRoundsPerHyperparameter = 0;
std::size_t m_BayesianOptimisationRestarts = 0;
TStrVec m_CategoricalFieldNames;
double m_Alpha = -1.0;
double m_Lambda = -1.0;
double m_Gamma = -1.0;
double m_SoftTreeDepthLimit = -1.0;
double m_SoftTreeDepthTolerance = -1.0;
double m_Eta = -1.0;
std::size_t m_MaximumNumberTrees = 0;
double m_FeatureBagFraction = -1.0;
std::size_t m_NumberTopShapValues = 0;
TPersisterSupplier* m_PersisterSupplier = nullptr;
TRestoreSearcherSupplier* m_RestoreSearcherSupplier = nullptr;
};
}
}
Expand Down
37 changes: 25 additions & 12 deletions lib/api/CDataFrameAnalysisSpecification.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <core/CDataFrame.h>
#include <core/CLogger.h>
#include <core/CStringUtils.h>

#include <api/CDataFrameAnalysisConfigReader.h>
#include <api/CDataFrameOutliersRunner.h>
Expand All @@ -28,18 +29,19 @@ namespace ml {
namespace api {

// These must be consistent with Java names.
const std::string CDataFrameAnalysisSpecification::JOB_ID("job_id");
const std::string CDataFrameAnalysisSpecification::ROWS("rows");
const std::string CDataFrameAnalysisSpecification::COLS("cols");
const std::string CDataFrameAnalysisSpecification::MEMORY_LIMIT("memory_limit");
const std::string CDataFrameAnalysisSpecification::THREADS("threads");
const std::string CDataFrameAnalysisSpecification::TEMPORARY_DIRECTORY("temp_dir");
const std::string CDataFrameAnalysisSpecification::RESULTS_FIELD("results_field");
const std::string CDataFrameAnalysisSpecification::JOB_ID{"job_id"};
const std::string CDataFrameAnalysisSpecification::ROWS{"rows"};
const std::string CDataFrameAnalysisSpecification::COLS{"cols"};
const std::string CDataFrameAnalysisSpecification::MEMORY_LIMIT{"memory_limit"};
const std::string CDataFrameAnalysisSpecification::THREADS{"threads"};
const std::string CDataFrameAnalysisSpecification::TEMPORARY_DIRECTORY{"temp_dir"};
const std::string CDataFrameAnalysisSpecification::RESULTS_FIELD{"results_field"};
const std::string CDataFrameAnalysisSpecification::MISSING_FIELD_VALUE{"missing_field_value"};
const std::string CDataFrameAnalysisSpecification::CATEGORICAL_FIELD_NAMES{"categorical_fields"};
const std::string CDataFrameAnalysisSpecification::DISK_USAGE_ALLOWED("disk_usage_allowed");
const std::string CDataFrameAnalysisSpecification::ANALYSIS("analysis");
const std::string CDataFrameAnalysisSpecification::NAME("name");
const std::string CDataFrameAnalysisSpecification::PARAMETERS("parameters");
const std::string CDataFrameAnalysisSpecification::DISK_USAGE_ALLOWED{"disk_usage_allowed"};
const std::string CDataFrameAnalysisSpecification::ANALYSIS{"analysis"};
const std::string CDataFrameAnalysisSpecification::NAME{"name"};
const std::string CDataFrameAnalysisSpecification::PARAMETERS{"parameters"};

namespace {
using TBoolVec = std::vector<bool>;
Expand Down Expand Up @@ -75,6 +77,8 @@ const CDataFrameAnalysisConfigReader CONFIG_READER{[] {
CDataFrameAnalysisConfigReader::E_OptionalParameter);
theReader.addParameter(CDataFrameAnalysisSpecification::RESULTS_FIELD,
CDataFrameAnalysisConfigReader::E_OptionalParameter);
theReader.addParameter(CDataFrameAnalysisSpecification::MISSING_FIELD_VALUE,
CDataFrameAnalysisConfigReader::E_OptionalParameter);
theReader.addParameter(CDataFrameAnalysisSpecification::CATEGORICAL_FIELD_NAMES,
CDataFrameAnalysisConfigReader::E_OptionalParameter);
theReader.addParameter(CDataFrameAnalysisSpecification::DISK_USAGE_ALLOWED,
Expand Down Expand Up @@ -131,12 +135,20 @@ CDataFrameAnalysisSpecification::CDataFrameAnalysisSpecification(
m_TemporaryDirectory = parameters[TEMPORARY_DIRECTORY].fallback(std::string{});
m_JobId = parameters[JOB_ID].fallback(std::string{});
m_ResultsField = parameters[RESULTS_FIELD].fallback(DEFAULT_RESULT_FIELD);
m_MissingFieldValue = parameters[MISSING_FIELD_VALUE].fallback(
core::CDataFrame::DEFAULT_MISSING_STRING);
m_CategoricalFieldNames = parameters[CATEGORICAL_FIELD_NAMES].fallback(TStrVec{});
m_DiskUsageAllowed = parameters[DISK_USAGE_ALLOWED].fallback(DEFAULT_DISK_USAGE_ALLOWED);

double missing;
if (m_MissingFieldValue != core::CDataFrame::DEFAULT_MISSING_STRING &&
core::CStringUtils::stringToTypeSilent(m_MissingFieldValue, missing)) {
HANDLE_FATAL(<< "Input error: you can't use a number (" << missing
<< ") to denote a missing field value.")
}
if (m_DiskUsageAllowed && m_TemporaryDirectory.empty()) {
HANDLE_FATAL(<< "Input error: temporary directory path should be explicitly set if disk"
" usage is allowed! Please report this problem.");
" usage is allowed! Please report this problem.")
}

auto jsonAnalysis = parameters[ANALYSIS].jsonObject();
Expand Down Expand Up @@ -189,6 +201,7 @@ CDataFrameAnalysisSpecification::makeDataFrame() {
? core::makeMainStorageDataFrame(m_NumberColumns)
: core::makeDiskStorageDataFrame(m_TemporaryDirectory,
m_NumberColumns, m_NumberRows);
result.first->missingString(m_MissingFieldValue);
result.first->reserve(m_NumberThreads, m_NumberColumns + this->numberExtraColumns());

return result;
Expand Down
41 changes: 26 additions & 15 deletions lib/api/CDataFrameAnalysisSpecificationJsonWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

#include <api/CDataFrameAnalysisSpecificationJsonWriter.h>

#include <core/CDataFrame.h>

#include <api/CDataFrameAnalysisSpecification.h>

#include <iostream>
Expand All @@ -20,6 +22,7 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
std::size_t numberThreads,
const std::string& temporaryDirectory,
const std::string& resultsField,
const std::string& missingFieldValue,
const TStrVec& categoricalFields,
bool diskUsageAllowed,
const std::string& analysisName,
Expand All @@ -34,8 +37,8 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
}
}
write(jobId, rows, cols, memoryLimit, numberThreads, temporaryDirectory,
resultsField, categoricalFields, diskUsageAllowed, analysisName,
analysisParametersDoc, writer);
resultsField, missingFieldValue, categoricalFields, diskUsageAllowed,
analysisName, analysisParametersDoc, writer);
}

void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
Expand All @@ -45,6 +48,7 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
std::size_t numberThreads,
const std::string& temporaryDirectory,
const std::string& resultsField,
const std::string& missingFieldValue,
const TStrVec& categoricalFields,
bool diskUsageAllowed,
const std::string& analysisName,
Expand Down Expand Up @@ -73,6 +77,11 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
writer.Key(CDataFrameAnalysisSpecification::RESULTS_FIELD);
writer.String(resultsField);

if (missingFieldValue != core::CDataFrame::DEFAULT_MISSING_STRING) {
writer.Key(CDataFrameAnalysisSpecification::MISSING_FIELD_VALUE);
writer.String(missingFieldValue);
}

rapidjson::Value array(rapidjson::kArrayType);
for (const auto& field : categoricalFields) {
array.PushBack(rapidjson::Value(rapidjson::StringRef(field)),
Expand Down Expand Up @@ -105,24 +114,26 @@ void CDataFrameAnalysisSpecificationJsonWriter::write(const std::string& jobId,
writer.Flush();
}

std::string
CDataFrameAnalysisSpecificationJsonWriter::jsonString(const std::string& jobId,
size_t rows,
size_t cols,
size_t memoryLimit,
size_t numberThreads,
const TStrVec& categoricalFields,
bool diskUsageAllowed,
const std::string& tempDir,
const std::string& resultField,
const std::string& analysisName,
const std::string& analysisParameters) {
std::string CDataFrameAnalysisSpecificationJsonWriter::jsonString(
const std::string& jobId,
std::size_t rows,
std::size_t cols,
std::size_t memoryLimit,
std::size_t numberThreads,
const std::string& missingFieldValue,
const TStrVec& categoricalFields,
bool diskUsageAllowed,
const std::string& tempDir,
const std::string& resultField,
const std::string& analysisName,
const std::string& analysisParameters) {
rapidjson::StringBuffer stringBuffer;
TRapidJsonLineWriter writer;
writer.Reset(stringBuffer);

write(jobId, rows, cols, memoryLimit, numberThreads, tempDir, resultField,
categoricalFields, diskUsageAllowed, analysisName, analysisParameters, writer);
missingFieldValue, categoricalFields, diskUsageAllowed, analysisName,
analysisParameters, writer);

return stringBuffer.GetString();
}
Expand Down
Loading

0 comments on commit 9abec24

Please sign in to comment.