Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Plot apd #729

Merged
merged 7 commits into from
Aug 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions examples/sample_notebooks/evidently_metrics.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
"from evidently.metrics import ColumnDriftMetric\n",
"from evidently.metrics import DataDriftTable\n",
"from evidently.metrics import DatasetDriftMetric\n",
"from evidently.metrics import ColumnCategoryMetric\n",
"from evidently.metrics import ColumnDistributionMetric\n",
"from evidently.metrics import ColumnValuePlot\n",
"from evidently.metrics import ColumnQuantileMetric\n",
Expand Down Expand Up @@ -305,6 +306,7 @@
" ColumnCorrelationsMetric(column_name=\"education\"),\n",
" ColumnValueListMetric(column_name=\"relationship\", values=[\"Husband\", \"Unmarried\"]), \n",
" ColumnValueRangeMetric(column_name=\"age\", left=10, right=20),\n",
" ColumnCategoryMetric(column_name='education', category='Some-college'),\n",
" \n",
"])\n",
"\n",
Expand Down
4 changes: 3 additions & 1 deletion examples/sample_notebooks/evidently_tests.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,9 @@
" TestColumnNumberOfDifferentMissingValues(column_name='education'),\n",
" TestColumnAllConstantValues(column_name='education'),\n",
" TestColumnAllUniqueValues(column_name='education'),\n",
" TestColumnRegExp(column_name='education',reg_exp='^[0..9]')\n",
" TestColumnRegExp(column_name='education',reg_exp='^[0..9]'),\n",
" TestCategoryShare(column_name='education', category='Some-college', lt=0.5),\n",
" TestCategoryShare(column_name='age', category=27., lt=0.5)\n",
"])\n",
"\n",
"data_integrity_column_tests.run(reference_data=adult_ref, current_data=adult_cur)\n",
Expand Down
2 changes: 1 addition & 1 deletion src/evidently/calculations/data_drift.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def get_one_column_drift(
column_name,
datetime_column_name,
)
current_scatter["current"] = df
current_scatter["current (mean)"] = df
if prefix is None:
x_name = "Index binned"
else:
Expand Down
2 changes: 2 additions & 0 deletions src/evidently/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from .data_integrity.column_summary_metric import ColumnSummaryMetric
from .data_integrity.dataset_missing_values_metric import DatasetMissingValuesMetric
from .data_integrity.dataset_summary_metric import DatasetSummaryMetric
from .data_quality.column_category_metric import ColumnCategoryMetric
from .data_quality.column_correlations_metric import ColumnCorrelationsMetric
from .data_quality.column_distribution_metric import ColumnDistributionMetric
from .data_quality.column_quantile_metric import ColumnQuantileMetric
Expand Down Expand Up @@ -76,6 +77,7 @@
"ColumnSummaryMetric",
"DatasetMissingValuesMetric",
"DatasetSummaryMetric",
"ColumnCategoryMetric",
"ColumnCorrelationsMetric",
"ColumnDistributionMetric",
"ColumnInteractionPlot",
Expand Down
5 changes: 3 additions & 2 deletions src/evidently/metrics/data_drift/column_drift_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def get_one_column_drift(
column.name,
datetime_name,
)
current_scatter["current"] = df
current_scatter["current (mean)"] = df
if prefix is None:
x_name = "Index binned"
else:
Expand Down Expand Up @@ -372,9 +372,10 @@ def render_html(self, obj: ColumnDriftMetric) -> List[BaseWidgetInfo]:
std=(result.scatter.plot_shape["y0"] - result.scatter.plot_shape["y1"]) / 2,
xaxis_name=result.scatter.x_name,
xaxis_name_ref=None,
yaxis_name=result.column_name,
yaxis_name=f"{result.column_name} (mean +/- std)",
color_options=self.color_options,
return_json=False,
line_name="reference (mean)",
)
tabs.append(TabData("DATA DRIFT", plotly_figure(title="", figure=scatter_fig)))

Expand Down
3 changes: 2 additions & 1 deletion src/evidently/metrics/data_drift/data_drift_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,9 +201,10 @@ def _generate_column_params(
std=(data.scatter.plot_shape["y0"] - data.scatter.plot_shape["y1"]) / 2,
xaxis_name=data.scatter.x_name,
xaxis_name_ref=None,
yaxis_name=data.column_name,
yaxis_name=f"{data.column_name} (mean +/- std)",
color_options=self.color_options,
return_json=False,
line_name="reference (mean)",
)
scatter = plotly_figure(title="", figure=scatter_fig)
details.with_part("DATA DRIFT", info=scatter)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,9 +173,10 @@ def _generate_column_params(
std=(data.scatter.plot_shape["y0"] - data.scatter.plot_shape["y1"]) / 2,
xaxis_name=data.scatter.x_name,
xaxis_name_ref=None,
yaxis_name=data.column_name,
yaxis_name=f"{data.column_name} (mean +/- std)",
color_options=self.color_options,
return_json=False,
line_name="reference (mean)",
)
scatter = plotly_figure(title="", figure=scatter_fig)
details.with_part("DATA DRIFT", info=scatter)
Expand Down
111 changes: 111 additions & 0 deletions src/evidently/metrics/data_quality/column_category_metric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
from typing import Dict
from typing import List
from typing import Optional
from typing import Union

import pandas as pd

from evidently.base_metric import ColumnName
from evidently.base_metric import InputData
from evidently.base_metric import Metric
from evidently.base_metric import MetricResult
from evidently.model.widget import BaseWidgetInfo
from evidently.options.base import AnyOptions
from evidently.renderers.base_renderer import MetricRenderer
from evidently.renderers.base_renderer import default_renderer
from evidently.renderers.html_widgets import CounterData
from evidently.renderers.html_widgets import counter
from evidently.renderers.html_widgets import header_text


class CategoryStat(MetricResult):
    """Count and share of one category within a single column."""

    # Total number of rows in the column (including rows that do not match).
    all_num: int
    # Number of rows whose value equals the requested category.
    category_num: int
    # Fraction of rows equal to the category (mean of the equality mask).
    category_ratio: float


class ColumnCategoryMetricResult(MetricResult):
    """Result of ``ColumnCategoryMetric`` for one column and one category."""

    # Display name of the analysed column.
    column_name: str
    # The category value that was searched for.
    category: Union[int, float, str]
    # Statistics for the current dataset.
    current: CategoryStat
    # Statistics for the reference dataset; None when no reference column is available.
    reference: Optional[CategoryStat] = None
    # Value-count tables (columns "x" and "count", at most 10 rows each),
    # keyed by "current" and, when present, "reference".
    counts_of_values: Dict[str, pd.DataFrame]


def _top_value_counts(column: pd.Series, limit: int = 10) -> pd.DataFrame:
    """Return the first ``limit`` rows of the column's value counts (NaN included).

    The resulting frame has two columns, ``x`` (the value) and ``count``.
    """
    counts = column.value_counts(dropna=False).reset_index()
    counts.columns = ["x", "count"]
    top = counts.head(limit)
    # Row labels are stored as strings (presumably for serialization - confirm with consumers).
    top.index = top.index.astype("str")
    return top


def _category_stat(column: pd.Series, category: Union[int, float, str]) -> CategoryStat:
    """Build a ``CategoryStat`` for ``category`` in ``column``, computing the equality mask once."""
    matches = column == category
    return CategoryStat(
        all_num=len(column),
        category_num=matches.sum(),
        category_ratio=matches.mean(),
    )


class ColumnCategoryMetric(Metric[ColumnCategoryMetricResult]):
    """Calculates the count and share of rows in a column that equal a given category.

    Args:
        column_name: name (or ``ColumnName``) of the column to inspect.
        category: the value to look for in the column.
        options: optional metric options forwarded to the base ``Metric``.
    """

    column_name: ColumnName
    category: Union[int, float, str]

    def __init__(
        self, column_name: Union[str, ColumnName], category: Union[int, float, str], options: AnyOptions = None
    ) -> None:
        self.column_name = ColumnName.from_any(column_name)
        self.category = category
        super().__init__(options=options)

    def calculate(self, data: InputData) -> ColumnCategoryMetricResult:
        """Compute category statistics for the current and, if present, the reference data.

        Raises:
            ValueError: if the configured column is not present in ``data``.
        """
        if not data.has_column(self.column_name):
            raise ValueError(f"Column '{self.column_name.display_name}' was not found in data.")

        current_column = data.get_current_column(self.column_name)
        reference_column = data.get_reference_column(self.column_name)

        counts_of_values: Dict[str, pd.DataFrame] = {"current": _top_value_counts(current_column)}
        reference: Optional[CategoryStat] = None
        # The reference dataset is optional; its entries are only added when it exists.
        if reference_column is not None:
            counts_of_values["reference"] = _top_value_counts(reference_column)
            reference = _category_stat(reference_column, self.category)

        return ColumnCategoryMetricResult(
            column_name=self.column_name.display_name,
            category=self.category,
            current=_category_stat(current_column, self.category),
            reference=reference,
            counts_of_values=counts_of_values,
        )


@default_renderer(wrap_type=ColumnCategoryMetric)
class ColumnCategoryMetricRenderer(MetricRenderer):
    """HTML renderer for ``ColumnCategoryMetric``: a header followed by count/share counters."""

    def _get_count_info(self, stat: CategoryStat) -> str:
        """Format a stat as ``"<matches> out of <total> (<percent>%)"`` with the percent rounded to 3 places."""
        percents = round(stat.category_ratio * 100, 3)
        return f"{stat.category_num} out of {stat.all_num} ({percents}%)"

    def render_html(self, obj: ColumnCategoryMetric) -> List[BaseWidgetInfo]:
        """Build the widgets for the metric: a header and one counter per available dataset."""
        metric_result = obj.get_result()
        # "Category" must be spelled with a Latin "C" so the label is searchable and copyable.
        header = header_text(label=f"Column '{metric_result.column_name}'. Category '{metric_result.category}'.")

        counters = [
            CounterData.string(
                label="current",
                value=self._get_count_info(metric_result.current),
            ),
        ]
        # The reference counter is shown only when reference statistics were computed.
        if metric_result.reference is not None:
            counters.append(
                CounterData.string(
                    label="reference",
                    value=self._get_count_info(metric_result.reference),
                ),
            )

        return [header, counter(counters=counters)]
4 changes: 4 additions & 0 deletions src/evidently/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
from .data_integrity_tests import TestShareOfRowsWithMissingValues
from .data_quality_tests import TestAllColumnsMostCommonValueShare
from .data_quality_tests import TestCatColumnsOutOfListValues
from .data_quality_tests import TestCategoryCount
from .data_quality_tests import TestCategoryShare
from .data_quality_tests import TestColumnQuantile
from .data_quality_tests import TestColumnValueMax
from .data_quality_tests import TestColumnValueMean
Expand Down Expand Up @@ -93,6 +95,8 @@
"TestTNR",
"TestTPR",
"TestAllFeaturesValueDrift",
"TestCategoryCount",
"TestCategoryShare",
"TestColumnDrift",
"TestCustomFeaturesValueDrift",
"TestEmbeddingsDrift",
Expand Down
Loading