Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Add information about samples per node to the tree #991

Merged
merged 25 commits into from
Feb 17, 2020
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
feacc51
additions to definition
valeriy42 Feb 5, 2020
714633f
Merge branch 'master' into ml-cpp-850
valeriy42 Feb 5, 2020
4245b52
wire through number of samples
valeriy42 Feb 5, 2020
2cc8b0c
unit test adjusted
valeriy42 Feb 6, 2020
e7229e1
refactoring, formatting
valeriy42 Feb 6, 2020
69f6650
move setter for number of samples
valeriy42 Feb 6, 2020
1d23037
add enhancement note
valeriy42 Feb 6, 2020
a85304d
bump version for persist/restore
valeriy42 Feb 6, 2020
ee63036
adjust SHAP algorithm to use precomputed number samples
valeriy42 Feb 6, 2020
736416a
add comments
valeriy42 Feb 6, 2020
f9edc6b
updated test bounds
valeriy42 Feb 7, 2020
bcadd65
rename variables for consistency
valeriy42 Feb 7, 2020
868d49f
version bump fixed
valeriy42 Feb 7, 2020
db799ed
Merge branch 'master' into ml-cpp-850
valeriy42 Feb 7, 2020
763bc4e
formatting
valeriy42 Feb 7, 2020
aaf79b7
clang warning fixed.
valeriy42 Feb 10, 2020
097dc25
samples per node computation as a standalone method
valeriy42 Feb 13, 2020
29dd782
formatting
valeriy42 Feb 14, 2020
6a61fa9
Merge branch 'master' of https://github.com/elastic/ml-cpp into ml-cp…
valeriy42 Feb 17, 2020
21e714d
explicit numberSamples vector removed
valeriy42 Feb 17, 2020
ed15070
Formatting
valeriy42 Feb 17, 2020
c30908e
changes in CBoostedTreeLeafNodeStatistics reverted
valeriy42 Feb 5, 2020
3fbb75e
use root() and fix conversions
valeriy42 Feb 17, 2020
93a4432
move number samples computations
valeriy42 Feb 17, 2020
f1cc310
fix for root method
valeriy42 Feb 17, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ progress, memory usage, etc. (See {ml-pull}906[#906].)

* Improve initialization of learn rate for better and more stable results in regression
and classification. (See {ml-pull}948[#948].)
* Add number of processed training samples to the definition of decision tree nodes.
(See {ml-pull}991[#991].)
* Add new model_size_stats fields to instrument categorization. (See {ml-pull}948[#948]
and {pull}51879[#51879], issue: {issue}50794[#50749].)

Expand Down
1 change: 1 addition & 0 deletions include/api/CBoostedTreeInferenceModelBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class API_EXPORT CBoostedTreeInferenceModelBuilder : public maths::CBoostedTree:
bool assignMissingToLeft,
double nodeValue,
double gain,
std::size_t numberSamples,
maths::CBoostedTreeNode::TOptionalNodeIndex leftChild,
maths::CBoostedTreeNode::TOptionalNodeIndex rightChild) override;
void addIdentityEncoding(std::size_t inputColumnIndex) override;
Expand Down
4 changes: 3 additions & 1 deletion include/api/CInferenceModelDefinition.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,8 @@ class API_EXPORT CTree final : public CTrainedModel {
double threshold,
bool defaultLeft,
double leafValue,
size_t splitFeature,
std::size_t splitFeature,
std::size_t numberSamples,
const TOptionalNodeIndex& leftChild,
const TOptionalNodeIndex& rightChild,
const TOptionalDouble& splitGain);
Expand All @@ -175,6 +176,7 @@ class API_EXPORT CTree final : public CTrainedModel {
TOptionalNodeIndex m_LeftChild;
TOptionalNodeIndex m_RightChild;
std::size_t m_SplitFeature;
std::size_t m_NumberSamples;
double m_Threshold;
double m_LeafValue;
TOptionalDouble m_SplitGain;
Expand Down
25 changes: 17 additions & 8 deletions include/maths/CBoostedTree.h
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,9 @@ class MATHS_EXPORT CBoostedTreeNode final {
bool assignMissingToLeft,
double nodeValue,
double gain,
TOptionalNodeIndex leftChild,
TOptionalNodeIndex rightChild) = 0;
std::size_t numberSamples,
maths::CBoostedTreeNode::TOptionalNodeIndex leftChild,
valeriy42 marked this conversation as resolved.
Show resolved Hide resolved
maths::CBoostedTreeNode::TOptionalNodeIndex rightChild) = 0;
valeriy42 marked this conversation as resolved.
Show resolved Hide resolved
};

public:
Expand Down Expand Up @@ -306,19 +307,26 @@ class MATHS_EXPORT CBoostedTreeNode final {
//! Get the total curvature at the rows below this node.
double curvature() const { return m_Curvature; }

//! Set the number of samples to \p value.
void numberSamples(size_t value);
valeriy42 marked this conversation as resolved.
Show resolved Hide resolved

//! Get number of samples affected by the node.
size_t numberSamples() const;
valeriy42 marked this conversation as resolved.
Show resolved Hide resolved

//! Get the index of the left child node.
TNodeIndex leftChildIndex() const { return m_LeftChild.get(); }

//! Get the index of the right child node.
TNodeIndex rightChildIndex() const { return m_RightChild.get(); }

//! Split this node and add its child nodes to \p tree.
TSizeSizePr split(std::size_t splitFeature,
double splitValue,
bool assignMissingToLeft,
double gain,
double curvature,
TNodeVec& tree);
CBoostedTreeNode::TSizeSizePr split(std::size_t splitFeature,
valeriy42 marked this conversation as resolved.
Show resolved Hide resolved
double splitValue,
bool assignMissingToLeft,
double gain,
double curvature,
std::size_t numberSamples,
TNodeVec& tree);

//! Get the feature index of the split.
std::size_t splitFeature() const { return m_SplitFeature; }
Expand Down Expand Up @@ -348,6 +356,7 @@ class MATHS_EXPORT CBoostedTreeNode final {
double m_NodeValue = 0.0;
double m_Gain = 0.0;
double m_Curvature = 0.0;
std::size_t m_NumberSamples = 0;
};

//! \brief A boosted regression tree model.
Expand Down
8 changes: 7 additions & 1 deletion include/maths/CBoostedTreeLeafNodeStatistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <core/CPackedBitVector.h>
#include <core/CSmallVector.h>

#include <maths/CBoostedTree.h>
#include <maths/CBoostedTreeHyperparameters.h>
#include <maths/CBoostedTreeUtils.h>
#include <maths/COrderings.h>
Expand Down Expand Up @@ -45,6 +46,7 @@ class CBoostedTreeLeafNodeStatistics final {
using TImmutableRadixSetVec = std::vector<core::CImmutableRadixSet<double>>;
using TPtr = std::shared_ptr<CBoostedTreeLeafNodeStatistics>;
using TPtrPtrPr = std::pair<TPtr, TPtr>;
using TNodeVec = CBoostedTree::TNodeVec;

public:
CBoostedTreeLeafNodeStatistics(std::size_t id,
Expand Down Expand Up @@ -99,7 +101,8 @@ class CBoostedTreeLeafNodeStatistics final {
const TImmutableRadixSetVec& candidateSplits,
const TSizeVec& featureBag,
const CBoostedTreeNode& split,
bool leftChildHasFewerRows);
bool leftChildHasFewerRows,
TNodeVec& tree);

//! Order two leaves by decreasing gain in splitting them.
bool operator<(const CBoostedTreeLeafNodeStatistics& rhs) const;
Expand All @@ -116,6 +119,9 @@ class CBoostedTreeLeafNodeStatistics final {
//! Check if the left child has fewer rows than the right child.
bool leftChildHasFewerRows() const;

//! Get number of training samples in this leaf node.
std::size_t numberSamples() const;

//! Check if we should assign the missing feature rows to the left child
//! of the split.
bool assignMissingToLeft() const;
Expand Down
13 changes: 5 additions & 8 deletions include/maths/CTreeShapFeatureImportance.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,14 @@ class MATHS_EXPORT CTreeShapFeatureImportance {
//! by \p offset.
void shap(core::CDataFrame& frame, const CDataFrameCategoryEncoder& encoder, std::size_t offset);

//! Compute number of training samples from \p frame that pass every node in the \p tree.
static TDoubleVec samplesPerNode(const TTree& tree,
const core::CDataFrame& frame,
const CDataFrameCategoryEncoder& encoder,
std::size_t numThreads);
//! Return the array with number of training samples affected by every node of the \p tree.
static CTreeShapFeatureImportance::TDoubleVec numberSamples(const TTree& tree);
valeriy42 marked this conversation as resolved.
Show resolved Hide resolved

//! Recursively computes inner node values as weighted average of the children (leaf) values
//! \returns The maximum depth of the tree.
static std::size_t updateNodeValues(TTree& tree,
std::size_t nodeIndex,
const TDoubleVec& samplesPerNode,
const TDoubleVec& numberSamples,
std::size_t depth);

//! Get the reference to the trees.
Expand Down Expand Up @@ -126,7 +123,7 @@ class MATHS_EXPORT CTreeShapFeatureImportance {
//! Recursively traverses all paths in the \p tree and updates SHAP values once it hits a leaf.
//! Ref. Algorithm 2 in the paper by Lundberg et al.
void shapRecursive(const TTree& tree,
const TDoubleVec& samplesPerNode,
const TDoubleVec& numberSamples,
valeriy42 marked this conversation as resolved.
Show resolved Hide resolved
const CDataFrameCategoryEncoder& encoder,
const CEncodedDataFrameRowRef& encodedRow,
SPath& splitPath,
Expand All @@ -146,7 +143,7 @@ class MATHS_EXPORT CTreeShapFeatureImportance {
private:
TTreeVec m_Trees;
std::size_t m_NumberThreads;
TDoubleVecVec m_SamplesPerNode;
TDoubleVecVec m_NumberSamples;
valeriy42 marked this conversation as resolved.
Show resolved Hide resolved
};
}
}
Expand Down
6 changes: 4 additions & 2 deletions lib/api/CBoostedTreeInferenceModelBuilder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ void CBoostedTreeInferenceModelBuilder::addNode(std::size_t splitFeature,
bool assignMissingToLeft,
double nodeValue,
double gain,
std::size_t numberSamples,
maths::CBoostedTreeNode::TOptionalNodeIndex leftChild,
maths::CBoostedTreeNode::TOptionalNodeIndex rightChild) {
auto ensemble{static_cast<CEnsemble*>(m_Definition.trainedModel().get())};
Expand All @@ -97,8 +98,9 @@ void CBoostedTreeInferenceModelBuilder::addNode(std::size_t splitFeature,
if (tree == nullptr) {
HANDLE_FATAL(<< "Internal error. Tree points to a nullptr.")
}
tree->treeStructure().emplace_back(tree->size(), splitValue, assignMissingToLeft, nodeValue,
splitFeature, leftChild, rightChild, gain);
tree->treeStructure().emplace_back(tree->size(), splitValue, assignMissingToLeft,
nodeValue, splitFeature, numberSamples,
leftChild, rightChild, gain);
}

CBoostedTreeInferenceModelBuilder::CBoostedTreeInferenceModelBuilder(TStrVec fieldNames,
Expand Down
10 changes: 8 additions & 2 deletions lib/api/CInferenceModelDefinition.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ const std::string JSON_LEFT_CHILD_TAG{"left_child"};
const std::string JSON_LOGISTIC_REGRESSION_TAG{"logistic_regression"};
const std::string JSON_LT{"lt"};
const std::string JSON_NODE_INDEX_TAG{"node_index"};
const std::string JSON_NUMBER_SAMPLES_TAG{"number_samples"};
const std::string JSON_ONE_HOT_ENCODING_TAG{"one_hot_encoding"};
const std::string JSON_PREPROCESSORS_TAG{"preprocessors"};
const std::string JSON_RIGHT_CHILD_TAG{"right_child"};
Expand Down Expand Up @@ -79,6 +80,9 @@ void addJsonArray(const std::string& tag,
void CTree::CTreeNode::addToDocument(rapidjson::Value& parentObject,
TRapidJsonWriter& writer) const {
writer.addMember(JSON_NODE_INDEX_TAG, rapidjson::Value(m_NodeIndex).Move(), parentObject);
writer.addMember(
JSON_NUMBER_SAMPLES_TAG,
rapidjson::Value(static_cast<std::uint64_t>(m_NumberSamples)).Move(), parentObject);

if (m_LeftChild) {
// internal node
Expand Down Expand Up @@ -118,11 +122,13 @@ CTree::CTreeNode::CTreeNode(TNodeIndex nodeIndex,
bool defaultLeft,
double leafValue,
std::size_t splitFeature,
std::size_t numberSamples,
const TOptionalNodeIndex& leftChild,
const TOptionalNodeIndex& rightChild,
const TOptionalDouble& splitGain)
: m_DefaultLeft(defaultLeft), m_NodeIndex(nodeIndex), m_LeftChild(leftChild),
m_RightChild(rightChild), m_SplitFeature(splitFeature),
: m_DefaultLeft(defaultLeft), m_NodeIndex(nodeIndex),
m_LeftChild(leftChild), m_RightChild(rightChild),
m_SplitFeature(splitFeature), m_NumberSamples(numberSamples),
m_Threshold(threshold), m_LeafValue(leafValue), m_SplitGain(splitGain) {
}

Expand Down
5 changes: 3 additions & 2 deletions lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -299,8 +299,9 @@ BOOST_FIXTURE_TEST_CASE(testRegressionFeatureImportanceNoImportance, SFixture) {
// c1 explains 95% of the prediction value, i.e. the difference from the prediction is less than 5%.
BOOST_REQUIRE_CLOSE(c1, prediction, 5.0);
for (const auto& feature : {"c2", "c3", "c4"}) {
BOOST_REQUIRE_SMALL(readShapValue(result, feature), 2.0);
cNoImportanceMean.add(std::fabs(readShapValue(result, feature)));
double c = readShapValue(result, feature);
BOOST_REQUIRE_SMALL(c, 2.5);
valeriy42 marked this conversation as resolved.
Show resolved Hide resolved
cNoImportanceMean.add(std::fabs(c));
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@
},
"right_child": {
"type": "integer"
},
"number_samples": {
"description": "Number of training samples that were affected by the node.",
"type": "integer"
}
},
"required": [
Expand All @@ -75,7 +79,8 @@
"decision_type",
"default_left",
"left_child",
"right_child"
"right_child",
"number_samples"
],
"additionalProperties": false
},
Expand All @@ -88,11 +93,16 @@
},
"leaf_value": {
"type": "number"
},
"number_samples": {
"description": "Number of training samples that were affected by the node.",
"type": "integer"
}
},
"required": [
"node_index",
"leaf_value"
"leaf_value",
"number_samples"
],
"additionalProperties": false
},
Expand Down Expand Up @@ -234,10 +244,14 @@
"items": {
"type": "number"
}
},
"num_classes": {
"type": "integer"
}
},
"required": [
"weights"
"weights",
"num_classes"
]
}
},
Expand Down
18 changes: 16 additions & 2 deletions lib/maths/CBoostedTree.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ const std::string SPLIT_FEATURE_TAG{"split_feature"};
const std::string ASSIGN_MISSING_TO_LEFT_TAG{"assign_missing_to_left "};
const std::string NODE_VALUE_TAG{"node_value"};
const std::string SPLIT_VALUE_TAG{"split_value"};
const std::string NUMBER_SAMPLES_TAG{"number_samples"};

double LOG_EPSILON{std::log(100.0 * std::numeric_limits<double>::epsilon())};

Expand Down Expand Up @@ -349,6 +350,7 @@ CBoostedTreeNode::TSizeSizePr CBoostedTreeNode::split(std::size_t splitFeature,
bool assignMissingToLeft,
double gain,
double curvature,
std::size_t numberSamples,
TNodeVec& tree) {
m_SplitFeature = splitFeature;
m_SplitValue = splitValue;
Expand All @@ -357,6 +359,7 @@ CBoostedTreeNode::TSizeSizePr CBoostedTreeNode::split(std::size_t splitFeature,
m_RightChild = static_cast<TNodeIndex>(tree.size() + 1);
m_Gain = gain;
m_Curvature = curvature;
m_NumberSamples = numberSamples;
TSizeSizePr result{m_LeftChild.get(), m_RightChild.get()};
// Don't access members after calling resize because this object is likely an element of the vector being resized.
tree.resize(tree.size() + 2);
Expand All @@ -370,6 +373,7 @@ void CBoostedTreeNode::acceptPersistInserter(core::CStatePersistInserter& insert
core::CPersistUtils::persist(ASSIGN_MISSING_TO_LEFT_TAG, m_AssignMissingToLeft, inserter);
core::CPersistUtils::persist(NODE_VALUE_TAG, m_NodeValue, inserter);
core::CPersistUtils::persist(SPLIT_VALUE_TAG, m_SplitValue, inserter);
core::CPersistUtils::persist(NUMBER_SAMPLES_TAG, m_NumberSamples, inserter);
}

bool CBoostedTreeNode::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) {
Expand All @@ -388,6 +392,8 @@ bool CBoostedTreeNode::acceptRestoreTraverser(core::CStateRestoreTraverser& trav
core::CPersistUtils::restore(NODE_VALUE_TAG, m_NodeValue, traverser))
RESTORE(SPLIT_VALUE_TAG,
core::CPersistUtils::restore(SPLIT_VALUE_TAG, m_SplitValue, traverser))
RESTORE(NUMBER_SAMPLES_TAG,
core::CPersistUtils::restore(NUMBER_SAMPLES_TAG, m_NumberSamples, traverser))
} while (traverser.next());
return true;
}
Expand All @@ -412,8 +418,16 @@ std::ostringstream& CBoostedTreeNode::doPrint(std::string pad,
}

void CBoostedTreeNode::accept(CVisitor& visitor) const {
visitor.addNode(m_SplitFeature, m_SplitValue, m_AssignMissingToLeft,
m_NodeValue, m_Gain, m_LeftChild, m_RightChild);
visitor.addNode(m_SplitFeature, m_SplitValue, m_AssignMissingToLeft, m_NodeValue,
m_Gain, m_NumberSamples, m_LeftChild, m_RightChild);
}

// Set the number of samples recorded for this node to numberSamples.
// Uses std::size_t (rather than the unqualified size_t) to match the
// spelling used everywhere else in this file.
void CBoostedTreeNode::numberSamples(std::size_t numberSamples) {
    m_NumberSamples = numberSamples;
}

size_t CBoostedTreeNode::numberSamples() const {
return m_NumberSamples;
}

CBoostedTree::CBoostedTree(core::CDataFrame& frame,
Expand Down
Loading