Treeshap hypothesis tests #4671

Merged: 8 commits, Apr 13, 2022
1 change: 1 addition & 0 deletions cpp/include/cuml/explainer/tree_shap.hpp
@@ -31,6 +31,7 @@ class TreePathInfo {
public:
enum class ThresholdTypeEnum : std::uint8_t { kFloat, kDouble };
virtual ThresholdTypeEnum GetThresholdType() const = 0;
virtual ~TreePathInfo() {}
};

std::unique_ptr<TreePathInfo> extract_path_info(ModelHandle model);
2 changes: 2 additions & 0 deletions cpp/src/randomforest/randomforest.cu
@@ -250,6 +250,8 @@ void build_treelite_forest(ModelHandle* model_handle,
ASSERT(model != nullptr, "Invalid downcast to tl::ModelImpl");

// Determine number of outputs
ASSERT(forest->trees.size() == forest->rf_params.n_trees, "Inconsistent number of trees.");
ASSERT(forest->trees.size() > 0, "Empty forest.");
int num_outputs = forest->trees.front()->num_outputs;
ASSERT(num_outputs > 0, "Invalid forest");
for (const auto& tree : forest->trees) {
43 changes: 31 additions & 12 deletions python/cuml/ensemble/randomforest_common.pyx
@@ -216,20 +216,39 @@ class BaseRandomForestModel(Base):
self.treelite_serialized_model)

else:
if self.dtype not in [np.float32, np.float64]:
raise ValueError("Unknown dtype.")

if self.RF_type == CLASSIFICATION:
build_treelite_forest(
&tl_handle,
<RandomForestMetaData[float, int]*>
<uintptr_t> self.rf_forest,
<int> self.n_cols
)
if self.dtype==np.float32:
build_treelite_forest(
&tl_handle,
<RandomForestMetaData[float, int]*>
<uintptr_t> self.rf_forest,
<int> self.n_cols
)
elif self.dtype==np.float64:
build_treelite_forest(
&tl_handle,
<RandomForestMetaData[double, int]*>
<uintptr_t> self.rf_forest64,
<int> self.n_cols
)
else:
build_treelite_forest(
&tl_handle,
<RandomForestMetaData[float, float]*>
<uintptr_t> self.rf_forest,
<int> self.n_cols
)
if self.dtype==np.float32:
build_treelite_forest(
&tl_handle,
<RandomForestMetaData[float, float]*>
<uintptr_t> self.rf_forest,
<int> self.n_cols
)
elif self.dtype==np.float64:
build_treelite_forest(
&tl_handle,
<RandomForestMetaData[double, double]*>
<uintptr_t> self.rf_forest64,
<int> self.n_cols
)

self.treelite_handle = <uintptr_t> tl_handle
return self.treelite_handle
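The dtype dispatch added above routes forests trained on float64 data through the RandomForestMetaData[double, int] / [double, double] specializations (rf_forest64) when building the Treelite model, so a TreeExplainer can be constructed from them. Below is a minimal hedged sketch of that path, not part of the PR, assuming a GPU with cuML installed; note that the hypothesis strategy later in this PR still restricts cuML forests to float32 thresholds because of rapidsai/cuml#4670.

# Hedged sketch, not part of this PR: exercise the float64 branch added above.
import numpy as np
from cuml.ensemble import RandomForestRegressor
from cuml.experimental.explainer.tree_shap import TreeExplainer

rng = np.random.default_rng(0)
X = rng.standard_normal((64, 4)).astype(np.float64)
y = (2.0 * X[:, 0] + X[:, 1]).astype(np.float64)

model = RandomForestRegressor(n_estimators=4).fit(X, y)
# The float64 forest is exported via the rf_forest64 branch shown above;
# SHAP accuracy for 64-bit thresholds may still be affected by issue #4670.
explainer = TreeExplainer(model=model)
shap_values = explainer.shap_values(X)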
240 changes: 215 additions & 25 deletions python/cuml/test/explainer/test_gpu_treeshap.py
@@ -21,12 +21,15 @@
import pandas as pd
import cupy as cp
import cudf
from hypothesis import given, settings, assume, HealthCheck, strategies as st
from cuml.experimental.explainer.tree_shap import TreeExplainer
from cuml.common.import_utils import has_xgboost, has_lightgbm, has_shap
from cuml.common.import_utils import has_sklearn
from cuml.common.exceptions import NotFittedError
from cuml.ensemble import RandomForestRegressor as curfr
from cuml.ensemble import RandomForestClassifier as curfc
import cuml
from cuml.test.utils import as_type

if has_xgboost():
import xgboost as xgb
@@ -42,33 +45,43 @@

def make_classification_with_categorical(
*, n_samples, n_features, n_categorical, n_informative, n_redundant,
n_repeated, n_classes, random_state):
X, y = make_classification(n_samples=n_samples, n_features=n_features,
n_informative=n_informative,
n_redundant=n_redundant, n_repeated=n_repeated,
n_classes=n_classes, random_state=random_state)
X, y = X.astype(np.float32), y.astype(np.float32)
n_repeated, n_classes, random_state, numeric_dtype=np.float32):
X, y = make_classification(
n_samples=n_samples, n_features=n_features,
n_informative=n_informative, n_redundant=n_redundant,
n_repeated=n_repeated, n_classes=n_classes, random_state=random_state,
n_clusters_per_class=min(2, n_features))
X, y = X.astype(numeric_dtype), y.astype(numeric_dtype)

# Turn some columns into categorical, by taking quartiles
n = np.atleast_1d(y).shape[0]
X = pd.DataFrame({f'f{i}': X[:, i] for i in range(n_features)})
for i in range(n_categorical):
column = f'f{i}'
X[column] = pd.qcut(X[column], 4, labels=range(4))
n_bins = min(4, n)
X[column] = pd.qcut(X[column], n_bins, labels=range(n_bins))
# make sure each target exists
y[0:n_classes] = range(n_classes)

assert len(np.unique(y)) == n_classes
return X, y


def make_regression_with_categorical(
*, n_samples, n_features, n_categorical, n_informative, random_state):
*, n_samples, n_features, n_categorical, n_informative, random_state,
numeric_dtype=np.float32, n_targets=1):
X, y = make_regression(n_samples=n_samples, n_features=n_features,
n_informative=n_informative, n_targets=1,
n_informative=n_informative, n_targets=n_targets,
random_state=random_state)
X, y = X.astype(np.float32), y.astype(np.float32)
X, y = X.astype(numeric_dtype), y.astype(numeric_dtype)

# Turn some columns into categorical, by taking quartiles
n = np.atleast_1d(y).shape[0]
X = pd.DataFrame({f'f{i}': X[:, i] for i in range(n_features)})
for i in range(n_categorical):
column = f'f{i}'
X[column] = pd.qcut(X[column], 4, labels=range(4))
n_bins = min(4, n)
X[column] = pd.qcut(X[column], n_bins, labels=range(n_bins))
return X, y
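
A short hedged usage sketch of the two dataset helpers above (not part of the test suite): the f0..f{n-1} column names and the qcut-based categorical encoding follow the definitions in this module, and numpy is assumed to be imported as np at the top of the file.

# The first n_categorical columns come back as pandas 'category' dtype,
# and every class label is guaranteed to appear in y.
X, y = make_classification_with_categorical(
    n_samples=20, n_features=4, n_categorical=2, n_informative=4,
    n_redundant=0, n_repeated=0, n_classes=3, random_state=0,
    numeric_dtype=np.float64)
assert list(X.select_dtypes(include='category').columns) == ['f0', 'f1']
assert set(np.unique(y)) == {0.0, 1.0, 2.0}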


@@ -349,9 +362,9 @@ def test_xgb_classifier_with_categorical(n_classes):
n_samples = 100
n_features = 8
X, y = make_classification_with_categorical(
n_samples=n_samples, n_features=n_features, n_categorical=4,
n_informative=n_features, n_redundant=0, n_repeated=0,
n_classes=n_classes, random_state=2022)
n_samples=n_samples, n_features=n_features, n_categorical=4,
n_informative=n_features, n_redundant=0, n_repeated=0,
n_classes=n_classes, random_state=2022)

dtrain = xgb.DMatrix(X, y, enable_categorical=True)
params = {"tree_method": "gpu_hist", "max_depth": 6,
@@ -396,8 +409,8 @@ def test_xgb_regressor_with_categorical():
n_samples = 100
n_features = 8
X, y = make_regression_with_categorical(
n_samples=n_samples, n_features=n_features, n_categorical=4,
n_informative=n_features, random_state=2022)
n_samples=n_samples, n_features=n_features, n_categorical=4,
n_informative=n_features, random_state=2022)

dtrain = xgb.DMatrix(X, y, enable_categorical=True)
params = {"tree_method": "gpu_hist", "max_depth": 6,
@@ -425,16 +438,17 @@ def test_lightgbm_regressor_with_categorical():
n_features = 8
n_categorical = 8
X, y = make_regression_with_categorical(
n_samples=n_samples, n_features=n_features,
n_categorical=n_categorical, n_informative=n_features,
random_state=2022)
n_samples=n_samples, n_features=n_features,
n_categorical=n_categorical, n_informative=n_features,
random_state=2022)

dtrain = lgb.Dataset(X, label=y, categorical_feature=range(n_categorical))
params = {"num_leaves": 64, "seed": 0, "objective": "regression",
"metric": "rmse", "min_data_per_group": 1}
lgb_model = lgb.train(params, dtrain, num_boost_round=10,
valid_sets=[dtrain], valid_names=['train'])
assert count_categorical_split(treelite.Model.from_lightgbm(lgb_model)) > 0
assert count_categorical_split(
treelite.Model.from_lightgbm(lgb_model)) > 0

explainer = TreeExplainer(model=lgb_model)
out = explainer.shap_values(X)
@@ -455,10 +469,10 @@ def test_lightgbm_classifier_with_categorical(n_classes):
n_features = 8
n_categorical = 8
X, y = make_classification_with_categorical(
n_samples=n_samples, n_features=n_features,
n_categorical=n_categorical, n_informative=n_features,
n_redundant=0, n_repeated=0, n_classes=n_classes,
random_state=2022)
n_samples=n_samples, n_features=n_features,
n_categorical=n_categorical, n_informative=n_features,
n_redundant=0, n_repeated=0, n_classes=n_classes,
random_state=2022)

dtrain = lgb.Dataset(X, label=y, categorical_feature=range(n_categorical))
params = {"num_leaves": 64, "seed": 0, "min_data_per_group": 1}
@@ -471,7 +485,8 @@ def test_lightgbm_classifier_with_categorical(n_classes):
params["num_class"] = n_classes
lgb_model = lgb.train(params, dtrain, num_boost_round=10,
valid_sets=[dtrain], valid_names=['train'])
assert count_categorical_split(treelite.Model.from_lightgbm(lgb_model)) > 0
assert count_categorical_split(
treelite.Model.from_lightgbm(lgb_model)) > 0

# Insert NaN randomly into X
X_test = X.values.copy()
@@ -493,3 +508,178 @@ def test_lightgbm_classifier_with_categorical(n_classes):
np.testing.assert_almost_equal(out, ref_out, decimal=5)
np.testing.assert_almost_equal(explainer.expected_value,
ref_expected_value, decimal=5)


def learn_model(
draw, X, y, task, learner, n_estimators, n_targets):
if learner == 'xgb':
assume(has_xgboost())
if task == 'regression':
objective = draw(st.sampled_from(['reg:squarederror',
'reg:pseudohubererror']))
model = xgb.XGBRegressor(
n_estimators=n_estimators, tree_method='gpu_hist',
objective=objective, enable_categorical=True, verbosity=0).fit(
X, y)
elif task == 'classification':
valid_objectives = ['binary:logistic', 'binary:hinge',
'binary:logitraw', 'count:poisson', ]
if n_targets > 2:
valid_objectives += ['rank:pairwise', 'rank:ndcg', 'rank:map',
'multi:softmax', 'multi:softprob']

objective = draw(st.sampled_from(valid_objectives))
model = xgb.XGBClassifier(
n_estimators=n_estimators, tree_method='gpu_hist',
objective=objective, enable_categorical=True, verbosity=0).fit(
X, y)
return model.get_booster(), model.predict(X, output_margin=True)
elif learner == 'rf':
predict_model = 'GPU' if y.dtype == np.float32 else 'CPU'
if task == 'regression':
model = cuml.ensemble.RandomForestRegressor(
n_estimators=n_estimators)
model.fit(X, y)
pred = model.predict(X, predict_model=predict_model)
elif task == 'classification':
model = cuml.ensemble.RandomForestClassifier(
n_estimators=n_estimators)
model.fit(X, y)
pred = model.predict_proba(X)
return model, pred
elif learner == 'skl_rf':
assume(has_sklearn())
if task == 'regression':
model = sklrfr(
n_estimators=n_estimators)
model.fit(X, y)
pred = model.predict(X)
elif task == 'classification':
model = sklrfc(
n_estimators=n_estimators)
model.fit(X, y)
pred = model.predict_proba(X)
return model, pred
elif learner == 'lgbm':
assume(has_lightgbm())
if task == 'regression':
model = lgb.LGBMRegressor(
n_estimators=n_estimators).fit(X, y)
elif task == 'classification':
model = lgb.LGBMClassifier(
n_estimators=n_estimators).fit(X, y)
return model.booster_, model.predict(X, raw_score=True)


@st.composite
def shap_strategy(draw):
task = draw(st.sampled_from(['regression', 'classification']))

n_estimators = draw(st.integers(1, 16))
n_samples = draw(st.integers(2, 100))
n_features = draw(st.integers(2, 100))
learner = draw(st.sampled_from(['xgb', 'rf', 'skl_rf', 'lgbm']))
supports_categorical = learner in ['xgb', 'lgbm']
supports_nan = learner in ['xgb', 'lgbm']
if task == 'classification':
n_targets = draw(st.integers(2, 5))
else:
n_targets = 1
Comment on lines +585 to +587

Contributor: So n_targets means n_classes in the context of classification? Let's just use n_classes for this purpose, since it's confusing otherwise. (I was wondering if we were using an unreleased feature of XGBoost.)

Contributor Author (@RAMitchell, Apr 8, 2022): These tests will support multi-output regression. I am using n_targets as a more generic term.

n_targets = min(n_targets, n_features)
n_targets = min(n_targets, n_samples)

has_categoricals = draw(st.booleans()) and supports_categorical
dtype = draw(st.sampled_from([np.float32, np.float64]))
if has_categoricals:
n_categorical = draw(st.integers(1, n_features))
else:
n_categorical = 0

has_nan = not has_categoricals and supports_nan

# Filter issues and invalid examples here
if task == 'classification' and learner == 'rf':
# No way to predict_proba with RandomForestClassifier
# trained on 64-bit data
# https://github.com/rapidsai/cuml/issues/4663
assume(dtype == np.float32)
if task == 'regression' and learner == 'skl_rf':
# multi-output regression not working
# https://github.com/dmlc/treelite/issues/375
assume(n_targets == 1)

# 64 bit thresholds can fail
# https://github.com/rapidsai/cuml/issues/4670
if learner in ['rf', 'skl_rf']:
assume(dtype == np.float32)

# treelite considers a binary classification model to have
# n_classes=1, which produces an unexpected output shape
# in the shap values
if task == 'classification' and learner == 'skl_rf':
assume(n_targets > 2)

# ensure we get some variation in test datasets
dataset_seed = draw(st.integers(1, 5))
if task == 'classification':
X, y = make_classification_with_categorical(
n_samples=n_samples, n_features=n_features,
n_categorical=n_categorical, n_informative=n_features,
n_redundant=0, n_repeated=0, random_state=dataset_seed,
n_classes=n_targets, numeric_dtype=dtype)
else:
X, y = make_regression_with_categorical(
n_samples=n_samples, n_features=n_features,
n_categorical=n_categorical, n_informative=n_features,
random_state=dataset_seed, numeric_dtype=dtype,
n_targets=n_targets)

if has_nan:
# set about half the first column to nan
X.iloc[np.random.randint(0, n_samples, n_samples//2), 0] = np.nan

assert len(X.select_dtypes(include='category').columns) == n_categorical

model, preds = learn_model(
draw, X, y, task, learner, n_estimators, n_targets)

return X, y, model, preds


def check_efficiency(expected_value, pred, shap_values):
# shap values add up to prediction
if len(shap_values.shape) <= 2:
assert np.allclose(np.sum(shap_values, axis=-1) +
expected_value, pred, 1e-3, 1e-3)
else:
n_targets = shap_values.shape[0]
for i in range(n_targets):
assert np.allclose(
np.sum(shap_values[i],
axis=-1) + expected_value[i],
pred[:, i],
1e-3, 1e-3)
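
check_efficiency asserts the SHAP efficiency (local accuracy) property: for every row, the expected value plus the sum of per-feature attributions reproduces the raw model output, per output target when there are several. A tiny hedged illustration with hand-made numbers rather than a real model:

# Hedged illustration only; the values are made up so the property holds
# by construction.
expected_value = 0.5
shap_values = np.array([[0.1, -0.2],
                        [0.3, 0.4]])              # (n_rows, n_features)
pred = shap_values.sum(axis=-1) + expected_value  # [0.4, 1.2]
check_efficiency(expected_value, pred, shap_values)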


# Generating input data/models can be time consuming and triggers
# hypothesis HealthCheck
@settings(deadline=None, max_examples=20,
suppress_health_check=[HealthCheck.too_slow])
@given(shap_strategy())
def test_with_hypothesis(params):
X, y, model, preds = params
explainer = TreeExplainer(model=model)
out = explainer.shap_values(X)
check_efficiency(explainer.expected_value, preds, out)
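
Because all case generation lives in shap_strategy, a single concrete case can also be drawn outside the test for interactive debugging. This is a hedged aid only, not part of the suite, since hypothesis discourages calling .example() inside tests:

# .example() re-draws until the assume() calls inside the strategy pass, so
# the optional learners (xgboost, lightgbm, scikit-learn) must be installed
# for the branches that need them.
X, y, model, preds = shap_strategy().example()
print(type(model).__name__, X.shape, preds.shape)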


@settings(deadline=None)
@given(st.sampled_from(['numpy', 'cupy', 'cudf', 'pandas']))
def test_input_types(input_type):
# simple test to not crash on different input data-frames
X = np.array([[0.0, 2.0], [1.0, 0.5]])
y = np.array([0, 1])
X, y = as_type(input_type, X, y)
model = cuml.ensemble.RandomForestRegressor().fit(X, y)
explainer = TreeExplainer(model=model)
explainer.shap_values(X)