# tests/python_package_test/test_plotting.py

# coding: utf-8
import numpy as np
import pytest
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from lightgbm.compat import GRAPHVIZ_INSTALLED, MATPLOTLIB_INSTALLED, PANDAS_INSTALLED, pd_DataFrame

if MATPLOTLIB_INSTALLED:
    import matplotlib
    matplotlib.use('Agg')
if GRAPHVIZ_INSTALLED:
    import graphviz

from .utils import load_breast_cancer, make_synthetic_regression


@pytest.fixture(scope="module")
def breast_cancer_split():
    """Split the breast cancer data once per test module.

    Returns the result of ``train_test_split`` with 10% of the rows held out
    and a fixed ``random_state``, so every test sees the same partition.
    """
    features, target = load_breast_cancer(return_X_y=True)
    return train_test_split(features, target, test_size=0.1, random_state=1)


@pytest.fixture(scope="module")
def train_data(breast_cancer_split):
    """Wrap the training part of ``breast_cancer_split`` in a ``lgb.Dataset``."""
    train_features, _, train_target, _ = breast_cancer_split
    return lgb.Dataset(train_features, label=train_target)


@pytest.fixture
def params():
    """Return a new dict of training parameters for a small, quiet binary model."""
    return dict(objective="binary", verbose=-1, num_leaves=3)


@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason='matplotlib is not installed')
def test_plot_importance(params, breast_cancer_split, train_data):
    """Exercise ``lgb.plot_importance`` with a Booster and with sklearn estimators.

    Covers the default labels, custom and disabled labels, bar colors, the
    ``@importance_type@`` placeholder and the selection of the importance type.
    """
    X_train, _, y_train, _ = breast_cancer_split
    red = (1., 0, 0, 1.)

    def check_axes(axes, title, xlabel, ylabel):
        # checks shared by every labelled plot produced in this test
        assert isinstance(axes, matplotlib.axes.Axes)
        assert axes.get_title() == title
        assert axes.get_xlabel() == xlabel
        assert axes.get_ylabel() == ylabel
        assert len(axes.patches) <= 30

    def first_bar_bounds(model, **plot_kwargs):
        # bounds of the first bar drawn by a new importance plot of ``model``
        axes = lgb.plot_importance(model, **plot_kwargs)
        return axes.patches[0].get_extents().bounds

    booster = lgb.train(params, train_data, num_boost_round=10)
    check_axes(lgb.plot_importance(booster), 'Feature importance', 'Feature importance', 'Features')

    split_clf = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
    split_clf.fit(X_train, y_train)

    # a single color is expected on every bar
    red_axes = lgb.plot_importance(split_clf, color='r', title='t', xlabel='x', ylabel='y')
    check_axes(red_axes, 't', 'x', 'y')
    for bar in red_axes.patches:
        assert bar.get_facecolor() == red

    # a list of colors is expected bar by bar; ``None`` labels must leave the texts empty
    multi_axes = lgb.plot_importance(booster, color=['r', 'y', 'g', 'b'], title=None, xlabel=None, ylabel=None)
    check_axes(multi_axes, '', '', '')
    expected_colors = [
        red,  # r
        (.75, .75, 0, 1.),  # y
        (0, .5, 0, 1.),  # g
        (0, 0, 1., 1.),  # b
    ]
    for position, expected in enumerate(expected_colors):
        assert multi_axes.patches[position].get_facecolor() == expected

    # the placeholder is expected to be expanded in the x label only
    placeholder_axes = lgb.plot_importance(
        booster,
        title='t @importance_type@',
        xlabel='x @importance_type@',
        ylabel='y @importance_type@',
    )
    check_axes(placeholder_axes, 't @importance_type@', 'x split', 'y @importance_type@')

    gain_clf = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1, importance_type="gain")
    gain_clf.fit(X_train, y_train)

    split_clf_default = first_bar_bounds(split_clf)
    split_clf_as_split = first_bar_bounds(split_clf, importance_type="split")
    split_clf_as_gain = first_bar_bounds(split_clf, importance_type="gain")
    gain_clf_default = first_bar_bounds(gain_clf)
    gain_clf_as_split = first_bar_bounds(gain_clf, importance_type="split")
    gain_clf_as_gain = first_bar_bounds(gain_clf, importance_type="gain")

    # without an explicit ``importance_type`` the plot must match the estimator's own
    # setting; an explicit one must win regardless of the estimator
    assert split_clf_default == split_clf_as_split
    assert split_clf_default == gain_clf_as_split
    assert split_clf_as_gain == gain_clf_default
    assert split_clf_as_gain == gain_clf_as_gain
    assert split_clf_default != split_clf_as_gain


@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason='matplotlib is not installed')
def test_plot_split_value_histogram(params, breast_cancer_split, train_data):
    """Exercise ``lgb.plot_split_value_histogram`` by feature index and by feature name.

    Also checks title placeholders, bar colors, an explicit number of bins and
    the ``ValueError`` expected for feature 0.
    """
    X_train, _, y_train, _ = breast_cancer_split
    feature_index = 27

    def check_labels(axes, title, xlabel, ylabel):
        # label checks shared by every histogram in this test
        assert isinstance(axes, matplotlib.axes.Axes)
        assert axes.get_title() == title
        assert axes.get_xlabel() == xlabel
        assert axes.get_ylabel() == ylabel

    booster = lgb.train(params, train_data, num_boost_round=10)
    by_index = lgb.plot_split_value_histogram(booster, feature_index)
    check_labels(by_index, 'Split value histogram for feature with index 27', 'Feature split value', 'Count')
    assert len(by_index.patches) <= 2

    clf = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
    clf.fit(X_train, y_train)

    # same feature, this time addressed by name; the title placeholders must be expanded
    feature = clf.booster_.feature_name()[feature_index]
    by_name = lgb.plot_split_value_histogram(
        clf,
        feature,
        figsize=(10, 5),
        title='Histogram for feature @index/name@ @feature@',
        xlabel='x',
        ylabel='y',
        color='r',
    )
    check_labels(by_name, f'Histogram for feature name {feature}', 'x', 'y')
    assert len(by_name.patches) <= 2
    for bar in by_name.patches:
        assert bar.get_facecolor() == (1., 0, 0, 1.)  # red

    binned = lgb.plot_split_value_histogram(
        booster,
        feature_index,
        bins=10,
        color=['r', 'y', 'g', 'b'],
        title=None,
        xlabel=None,
        ylabel=None,
    )
    check_labels(binned, '', '', '')
    assert len(binned.patches) == 10
    expected_colors = [
        (1., 0, 0, 1.),  # r
        (.75, .75, 0, 1.),  # y
        (0, .5, 0, 1.),  # g
        (0, 0, 1., 1.),  # b
    ]
    for position, expected in enumerate(expected_colors):
        assert binned.patches[position].get_facecolor() == expected

    with pytest.raises(ValueError):
        lgb.plot_split_value_histogram(booster, 0)  # was not used in splitting


@pytest.mark.skipif(not MATPLOTLIB_INSTALLED or not GRAPHVIZ_INSTALLED,
                    reason='matplotlib or graphviz is not installed')
def test_plot_tree(breast_cancer_split):
    """Check that ``lgb.plot_tree`` rejects a bad tree index and honours ``figsize``."""
    X_train, _, y_train, _ = breast_cancer_split
    clf = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
    clf.fit(X_train, y_train)

    # only 10 trees are trained, so index 83 is out of range
    with pytest.raises(IndexError):
        lgb.plot_tree(clf, tree_index=83)

    requested_size = (15, 8)
    tree_axes = lgb.plot_tree(clf, tree_index=3, figsize=requested_size, show_info=['split_gain'])
    assert isinstance(tree_axes, matplotlib.axes.Axes)
    width, height = tree_axes.axes.get_figure().get_size_inches()
    assert (int(width), int(height)) == requested_size


@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
def test_create_tree_digraph(breast_cancer_split):
    """Check the ``graphviz.Digraph`` produced by ``lgb.create_tree_digraph``.

    A classifier with alternating monotone constraints is trained, then the
    test verifies that an out-of-range tree index raises ``IndexError``, that
    the graph name and node attributes are passed through, and that the graph
    body contains exactly the requested pieces of information.
    """
    import tempfile

    X_train, _, y_train, _ = breast_cancer_split

    # alternate -1 / 1 constraints over the features, pair by pair
    constraints = [-1, 1] * (X_train.shape[1] // 2)
    gbm = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1, monotone_constraints=constraints)
    gbm.fit(X_train, y_train)

    # only 10 trees are trained, so index 83 is out of range
    with pytest.raises(IndexError):
        lgb.create_tree_digraph(gbm, tree_index=83)

    graph = lgb.create_tree_digraph(gbm, tree_index=3,
                                    show_info=['split_gain', 'internal_value', 'internal_weight'],
                                    name='Tree4', node_attr={'color': 'red'})
    # render into a throwaway directory so the test leaves no files behind
    # in the current working directory
    with tempfile.TemporaryDirectory() as render_dir:
        graph.render(directory=render_dir, view=False)
    assert isinstance(graph, graphviz.Digraph)
    assert graph.name == 'Tree4'
    assert len(graph.node_attr) == 1
    assert graph.node_attr['color'] == 'red'
    assert len(graph.graph_attr) == 0
    assert len(graph.edge_attr) == 0
    graph_body = ''.join(graph.body)
    assert 'leaf' in graph_body
    # the three requested ``show_info`` entries must show up ...
    assert 'gain' in graph_body
    assert 'value' in graph_body
    assert 'weight' in graph_body
    # NOTE(review): presumably the fill colors used for monotone-constrained splits - confirm
    assert '#ffdddd' in graph_body
    assert '#ddffdd' in graph_body
    # ... and the entries that were not requested must not
    assert 'data' not in graph_body
    assert 'count' not in graph_body


@pytest.mark.parametrize('use_missing', [True, False])
@pytest.mark.parametrize('zero_as_missing', [True, False])
def test_numeric_split_direction(use_missing, zero_as_missing):
    """Check ``_determine_direction_for_numeric_split`` against the leaf ``predict`` reports.

    Some rows are zeroed (and, with ``use_missing``, some others set to NaN);
    following the plotting helper through the first tree must end in the same
    leaf that ``predict(..., pred_leaf=True)`` returns for such a row.
    """
    if use_missing and zero_as_missing:
        pytest.skip('use_missing and zero_as_missing both set to True')
    X, y = make_synthetic_regression()
    rng = np.random.RandomState(0)
    n_rows = X.shape[0]
    zero_mask = rng.rand(n_rows) < 0.05
    X[zero_mask, :] = 0
    if use_missing:
        # NaN rows are drawn only from the rows that were not zeroed
        nan_mask = np.logical_and(~zero_mask, rng.rand(n_rows) < 0.1)
        X[nan_mask, :] = np.nan
    bst = lgb.train(
        {
            'num_leaves': 127,
            'min_child_samples': 1,
            'use_missing': use_missing,
            'zero_as_missing': zero_as_missing,
        },
        lgb.Dataset(X, y),
        num_boost_round=1,
    )

    def leaf_reached_by(row):
        # follow ``row`` through the first tree using the plotting helper
        current = bst.dump_model()['tree_info'][0]['tree_structure']
        while 'decision_type' in current:
            direction = lgb.plotting._determine_direction_for_numeric_split(
                row[current['split_feature']],
                current['threshold'],
                current['missing_type'],
                current['default_left'],
            )
            if direction == 'left':
                current = current['left_child']
            else:
                current = current['right_child']
        return current['leaf_index']

    zero_case = X[zero_mask][[0]]
    expected_leaf_zero = bst.predict(zero_case, pred_leaf=True)[0]
    assert leaf_reached_by(zero_case[0]) == expected_leaf_zero

    if not use_missing:
        return

    nan_case = X[nan_mask][[0]]
    expected_leaf_nan = bst.predict(nan_case, pred_leaf=True)[0]
    assert leaf_reached_by(nan_case[0]) == expected_leaf_nan
    assert expected_leaf_zero != expected_leaf_nan


@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
def test_example_case_in_tree_digraph():
    """Check that ``create_tree_digraph`` highlights the path taken by ``example_case``.

    For each tree, the decision path of the example row is followed through the
    dumped model. Every node and edge on that path must be drawn in blue, every
    other node and edge in black, and the path must end in the leaf that
    ``predict(..., pred_leaf=True)`` reports. At least one categorical split
    must be encountered along the way.
    """
    rng = np.random.RandomState(0)
    x1 = rng.rand(100)
    cat = rng.randint(1, 3, size=x1.size)
    X = np.vstack([x1, cat]).T
    y = x1 + 2 * cat
    feature_name = ['x1', 'cat']
    ds = lgb.Dataset(X, y, feature_name=feature_name, categorical_feature=['cat'])

    num_round = 3
    bst = lgb.train({'num_leaves': 7}, ds, num_boost_round=num_round)
    mod = bst.dump_model()
    example_case = X[[0]]
    makes_categorical_splits = False
    for tree_index in range(num_round):
        graph = lgb.create_tree_digraph(bst, example_case=example_case, tree_index=tree_index)
        gbody = graph.body
        node = mod['tree_info'][tree_index]['tree_structure']
        # positions in ``gbody`` already verified as part of the highlighted path; they
        # refer to this tree's graph only, so the set must start empty for every tree,
        # otherwise earlier trees would exempt unrelated elements from the black check
        seen_indices = set()
        while 'decision_type' in node:  # iterate through the splits
            split_index = node['split_index']

            node_in_graph = [n for n in gbody if f'split{split_index}' in n and '->' not in n]
            assert len(node_in_graph) == 1
            seen_indices.add(gbody.index(node_in_graph[0]))

            # the root split has no incoming edge, so this list may be empty
            edge_to_node = [e for e in gbody if f'-> split{split_index}' in e]
            if node['decision_type'] == '<=':
                direction = lgb.plotting._determine_direction_for_numeric_split(
                    example_case[0][node['split_feature']], node['threshold'], node['missing_type'], node['default_left'])
            else:
                makes_categorical_splits = True
                direction = lgb.plotting._determine_direction_for_categorical_split(
                    example_case[0][node['split_feature']], node['threshold']
                )
            node = node['left_child'] if direction == 'left' else node['right_child']
            assert 'color=blue' in node_in_graph[0]
            if edge_to_node:
                assert len(edge_to_node) == 1
                assert 'color=blue' in edge_to_node[0]
                seen_indices.add(gbody.index(edge_to_node[0]))
        # we're in a leaf now
        leaf_index = node['leaf_index']
        leaf_in_graph = [n for n in gbody if f'leaf{leaf_index}' in n and '->' not in n]
        edge_to_leaf = [e for e in gbody if f'-> leaf{leaf_index}' in e]
        assert len(leaf_in_graph) == 1
        assert 'color=blue' in leaf_in_graph[0]
        assert len(edge_to_leaf) == 1
        assert 'color=blue' in edge_to_leaf[0]
        seen_indices.update([gbody.index(leaf_in_graph[0]), gbody.index(edge_to_leaf[0])])

        # check that the rest of the elements have black color
        remaining_elements = [
            element for position, element in enumerate(gbody)
            if position not in seen_indices and 'graph' not in element
        ]
        assert all('color=black' in element for element in remaining_elements)

        # check that we got to the expected leaf
        expected_leaf = bst.predict(example_case, start_iteration=tree_index, num_iteration=1, pred_leaf=True)[0]
        assert leaf_index == expected_leaf
    assert makes_categorical_splits


@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
@pytest.mark.parametrize('input_type', ['array', 'dataframe'])
def test_empty_example_case_on_tree_digraph_raises_error(input_type):
    """Check that ``create_tree_digraph`` rejects an ``example_case`` with no rows.

    Runs once with the data as returned by ``make_synthetic_regression`` and
    once wrapped in a pandas DataFrame (skipped when pandas is not installed).
    """
    X, y = make_synthetic_regression()
    if input_type == 'dataframe':
        if not PANDAS_INSTALLED:
            # positional message, as in the other skip of this module; the ``reason``
            # keyword is only accepted by pytest >= 7.0
            pytest.skip('pandas is not installed')
        X = pd_DataFrame(X)
    ds = lgb.Dataset(X, y)
    bst = lgb.train({'num_leaves': 3}, ds, num_boost_round=1)
    # zero-row slice of the training data
    example_case = X[:0]
    if input_type == 'dataframe':
        example_case = pd_DataFrame(example_case)
    with pytest.raises(ValueError, match='example_case must have a single row.'):
        lgb.create_tree_digraph(bst, tree_index=0, example_case=example_case)


@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason='matplotlib is not installed')
def test_plot_metrics(params, breast_cancer_split, train_data):
    """Exercise ``lgb.plot_metric`` with recorded evaluation results and a fitted estimator.

    Covers automatic and explicit metric selection, restricting the plotted
    datasets, the ``@metric@`` placeholder, figure options, the error raised
    for empty results and plotting straight from an sklearn estimator.
    """
    X_train, X_test, y_train, y_test = breast_cancer_split
    test_data = lgb.Dataset(X_test, y_test, reference=train_data)
    params["metric"] = {"binary_logloss", "binary_error"}

    def check_labels(axes, title, xlabel, ylabel):
        # label checks shared by the plots with a fully determined y label
        assert isinstance(axes, matplotlib.axes.Axes)
        assert axes.get_title() == title
        assert axes.get_xlabel() == xlabel
        assert axes.get_ylabel() == ylabel

    def check_legend(axes, expected_names):
        # the legend must list exactly these dataset names, in this order
        legend_names = [entry.get_text() for entry in axes.get_legend().get_texts()]
        assert legend_names == expected_names

    recorded = {}
    lgb.train(
        params,
        train_data,
        valid_sets=[train_data, test_data],
        valid_names=['v1', 'v2'],
        num_boost_round=10,
        callbacks=[lgb.record_evaluation(recorded)],
    )

    # two metrics were recorded and none is requested: a warning is expected and
    # either metric may end up on the y axis
    with pytest.warns(UserWarning, match="More than one metric available, picking one to plot."):
        auto_axes = lgb.plot_metric(recorded)
    assert isinstance(auto_axes, matplotlib.axes.Axes)
    assert auto_axes.get_title() == 'Metric during training'
    assert auto_axes.get_xlabel() == 'Iterations'
    assert auto_axes.get_ylabel() in {'binary_logloss', 'binary_error'}
    check_legend(auto_axes, ['v1', 'v2'])

    error_axes = lgb.plot_metric(recorded, metric='binary_error')
    check_labels(error_axes, 'Metric during training', 'Iterations', 'binary_error')
    check_legend(error_axes, ['v1', 'v2'])

    single_set_axes = lgb.plot_metric(recorded, metric='binary_logloss', dataset_names=['v2'])
    check_labels(single_set_axes, 'Metric during training', 'Iterations', 'binary_logloss')
    check_legend(single_set_axes, ['v2'])

    # the placeholder is expected to be expanded in the y label only
    custom_axes = lgb.plot_metric(
        recorded,
        metric='binary_logloss',
        dataset_names=['v1'],
        title='Metric @metric@',
        xlabel='Iterations @metric@',
        ylabel='Value of "@metric@"',
        figsize=(5, 5),
        dpi=600,
        grid=False
    )
    check_labels(custom_axes, 'Metric @metric@', 'Iterations @metric@', 'Value of "binary_logloss"')
    check_legend(custom_axes, ['v1'])
    custom_figure = custom_axes.get_figure()
    assert custom_figure.get_figheight() == 5
    assert custom_figure.get_figwidth() == 5
    assert custom_figure.get_dpi() == 600
    assert all(not line.get_visible() for line in custom_axes.get_xgridlines())
    assert all(not line.get_visible() for line in custom_axes.get_ygridlines())

    # training without validation sets records nothing, which plot_metric must reject
    nothing_recorded = {}
    lgb.train(
        params,
        train_data,
        num_boost_round=10,
        callbacks=[lgb.record_evaluation(nothing_recorded)],
    )
    with pytest.raises(ValueError, match="eval results cannot be empty."):
        lgb.plot_metric(nothing_recorded)

    clf = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
    estimator_axes = lgb.plot_metric(clf, title=None, xlabel=None, ylabel=None)
    check_labels(estimator_axes, '', '', '')
    check_legend(estimator_axes, ['valid_0'])