From 0dda2fee99892b70dedf9f0380520c28a932f0b9 Mon Sep 17 00:00:00 2001 From: murilommen Date: Mon, 19 Sep 2022 18:22:57 -0300 Subject: [PATCH] refreshing examples and to_html() method details: - examples on our README were not using the latest whylogs.core.constraints.factories the same way as on flytesnacks. - WhylogsDatasetProfileTransformer.to_html() now returns a ProfileSummary HTML string Signed-off-by: murilommen --- plugins/flytekit-whylogs/README.md | 30 ++++----- .../flytekitplugins/whylogs/schema.py | 6 +- plugins/flytekit-whylogs/requirements.txt | 58 ++++++++++++++++--- plugins/flytekit-whylogs/setup.py | 2 +- plugins/flytekit-whylogs/tests/test_schema.py | 35 ++++++----- 5 files changed, 89 insertions(+), 42 deletions(-) diff --git a/plugins/flytekit-whylogs/README.md b/plugins/flytekit-whylogs/README.md index aeaff969e5..827d4b9cbc 100644 --- a/plugins/flytekit-whylogs/README.md +++ b/plugins/flytekit-whylogs/README.md @@ -15,15 +15,17 @@ pip install flytekitplugins-whylogs To generate profiles, you can add a task like the following: ```python +import whylogs as why from whylogs.core import DatasetProfileView -import whylogs as ylog import pandas as pd +from flytekit import task + @task def profile(df: pd.DataFrame) -> DatasetProfileView: - result = ylog.log(df) # Various overloads for different common data types exist - profile = result.view() + result = why.log(df) # Various overloads for different common data types exist + profile_view = result.view() - return profile + return profile_view ``` @@ -37,21 +39,19 @@ if the data in the workflow doesn't conform to some configured constraints, like min/max values on features, data types on features, etc. 
```python +from whylogs.core.constraints.factories import greater_than_number, mean_between_range + @task -def validate_data(profile: DatasetProfileView): - column = profile.get_column("my_column") - print(column.to_summary_dict()) # To see available things you can validate against - builder = ConstraintsBuilder(profile) - numConstraint = MetricConstraint( - name='numbers between 0 and 4 only', - condition=lambda x: x.min > 0 and x.max < 4, - metric_selector=MetricsSelector(metric_name='distribution', column_name='my_column')) - builder.add_constraint(numConstraint) +def validate_data(profile_view: DatasetProfileView): + builder = ConstraintsBuilder(dataset_profile_view=profile_view) + builder.add_constraint(greater_than_number(column_name="my_column", number=0.14)) + builder.add_constraint(mean_between_range(column_name="my_other_column", lower=2, upper=3)) constraint = builder.build() valid = constraint.validate() - if(not valid): + if valid is False: + print(constraint.report()) raise Exception("Invalid data found") ``` -Check out our [constraints notebook](https://github.com/whylabs/whylogs/blob/1.0.x/python/examples/basic/MetricConstraints.ipynb) for more examples. +If you want to learn more about whylogs, check out our [example notebooks](https://github.com/whylabs/whylogs/tree/mainline/python/examples). 
diff --git a/plugins/flytekit-whylogs/flytekitplugins/whylogs/schema.py b/plugins/flytekit-whylogs/flytekitplugins/whylogs/schema.py index 71247255f7..46646a012a 100644 --- a/plugins/flytekit-whylogs/flytekitplugins/whylogs/schema.py +++ b/plugins/flytekit-whylogs/flytekitplugins/whylogs/schema.py @@ -1,6 +1,7 @@ from typing import Type from whylogs.core import DatasetProfileView +from whylogs.viz.extensions.reports.profile_summary import ProfileSummaryReport from flytekit import BlobType, FlyteContext from flytekit.extend import T, TypeEngine, TypeTransformer @@ -42,9 +43,8 @@ def to_python_value(self, ctx: FlyteContext, lv: Literal, expected_python_type: def to_html( self, ctx: FlyteContext, python_val: DatasetProfileView, expected_python_type: Type[DatasetProfileView] ) -> str: - pandas_profile = str(python_val.to_pandas().to_html()) - header = str("

Profile View

\n") - return header + pandas_profile + report = ProfileSummaryReport(target_view=python_val) + return report.report().data TypeEngine.register(WhylogsDatasetProfileTransformer()) diff --git a/plugins/flytekit-whylogs/requirements.txt b/plugins/flytekit-whylogs/requirements.txt index 9001bc05e0..ea1a572cf4 100644 --- a/plugins/flytekit-whylogs/requirements.txt +++ b/plugins/flytekit-whylogs/requirements.txt @@ -6,21 +6,61 @@ # -e file:.#egg=flytekitplugins-whylogs # via -r requirements.in -flake8==4.0.1 +appnope==0.1.3 + # via ipython +asttokens==2.0.8 + # via stack-data +backcall==0.2.0 + # via ipython +decorator==5.1.1 + # via ipython +executing==1.0.0 + # via stack-data +ipython==8.5.0 # via whylogs -mccabe==0.6.1 - # via flake8 +jedi==0.18.1 + # via ipython +matplotlib-inline==0.1.6 + # via ipython +numpy==1.23.3 + # via scipy +parso==0.8.3 + # via jedi +pexpect==4.8.0 + # via ipython +pickleshare==0.7.5 + # via ipython +prompt-toolkit==3.0.31 + # via ipython protobuf==3.20.1 # via # flytekitplugins-whylogs # whylogs -pycodestyle==2.8.0 - # via flake8 -pyflakes==2.4.0 - # via flake8 +ptyprocess==0.7.0 + # via pexpect +pure-eval==0.2.2 + # via stack-data +pybars3==0.9.7 + # via whylogs +pygments==2.13.0 + # via ipython +pymeta3==0.5.1 + # via pybars3 +scipy==1.9.1 + # via whylogs +six==1.16.0 + # via asttokens +stack-data==0.5.0 + # via ipython +traitlets==5.4.0 + # via + # ipython + # matplotlib-inline typing-extensions==4.3.0 # via whylogs -whylogs==1.0.6 +wcwidth==0.2.5 + # via prompt-toolkit +whylogs[viz]==1.1.0 # via flytekitplugins-whylogs -whylogs-sketching==3.4.1.dev2 +whylogs-sketching==3.4.1.dev3 # via whylogs diff --git a/plugins/flytekit-whylogs/setup.py b/plugins/flytekit-whylogs/setup.py index 54af3c474e..ce10e877f6 100644 --- a/plugins/flytekit-whylogs/setup.py +++ b/plugins/flytekit-whylogs/setup.py @@ -4,7 +4,7 @@ microlib_name = f"flytekitplugins-{PLUGIN_NAME}" -plugin_requires = ["protobuf>=3.15,<4.0.0", "whylogs", "whylogs[viz]"] 
+plugin_requires = ["protobuf>=3.15,<4.0.0", "whylogs[viz]>=1.0.8"] __version__ = "0.0.0+develop" diff --git a/plugins/flytekit-whylogs/tests/test_schema.py b/plugins/flytekit-whylogs/tests/test_schema.py index 8fffae1c75..c8d6487553 100644 --- a/plugins/flytekit-whylogs/tests/test_schema.py +++ b/plugins/flytekit-whylogs/tests/test_schema.py @@ -1,21 +1,19 @@ from datetime import datetime +from typing import Type import pandas as pd -import pytest import whylogs as why from whylogs.core import DatasetProfileView +from flytekitplugins.whylogs.schema import WhylogsDatasetProfileTransformer +from flytekit.core.context_manager import FlyteContextManager from flytekit import task, workflow -@pytest.fixture -def input_data(): - return pd.DataFrame({"a": [1, 2, 3, 4]}) - - @task -def whylogs_profiling(data: pd.DataFrame) -> DatasetProfileView: - result = why.log(pandas=data) +def whylogs_profiling() -> DatasetProfileView: + df = pd.DataFrame({"a": [1, 2, 3, 4]}) + result = why.log(pandas=df) return result.view() @@ -25,18 +23,27 @@ def fetch_whylogs_datetime(profile_view: DatasetProfileView) -> datetime: @workflow -def whylogs_wf(data: pd.DataFrame) -> datetime: - profile_view = whylogs_profiling(data=data) +def whylogs_wf() -> datetime: + profile_view = whylogs_profiling() return fetch_whylogs_datetime(profile_view=profile_view) -def test_task_returns_whylogs_profile_view(input_data): - actual_profile = whylogs_profiling(data=input_data) +def test_task_returns_whylogs_profile_view() -> None: + actual_profile = whylogs_profiling() assert actual_profile is not None assert isinstance(actual_profile, DatasetProfileView) -def test_profile_view_gets_passed_on_tasks(input_data): - result = whylogs_wf(data=input_data) +def test_profile_view_gets_passed_on_tasks() -> None: + result = whylogs_wf() assert result is not None assert isinstance(result, datetime) + + +def test_to_html_method() -> None: + tf = WhylogsDatasetProfileTransformer() + profile_view = whylogs_profiling() + 
report = tf.to_html(FlyteContextManager.current_context(), profile_view, Type[DatasetProfileView]) + + assert isinstance(report, str) + assert "Profile Visualizer" in report