feat(dashboard): html report framework (#115)

* docs: added tracking for docs (#99) * bump up version to 0.2.2 (#101) Co-authored-by: Subhankar <[email protected]> * Update README.md (#102) * build(deps): bump cryptography from 41.0.3 to 41.0.4 (#103) Bumps [cryptography](https://github.com/pyca/cryptography) from 41.0.3 to 41.0.4. - [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst) - [Commits](pyca/cryptography@41.0.3...41.0.4) --- updated-dependencies: - dependency-name: cryptography dependency-type: indirect ... Signed-off-by: dependabot[bot] <[email protected]> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * feat(metrics): implemented sum metric + tests (#109) * feat(metrics): implemented standard deviation + tests (#108) * feat(validation): metric validation framework (#107) * build(deps): bump urllib3 from 1.26.16 to 1.26.17 (#114) Bumps [urllib3](https://github.com/urllib3/urllib3) from 1.26.16 to 1.26.17. - [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](urllib3/urllib3@1.26.16...1.26.17) --- updated-dependencies: - dependency-name: urllib3 dependency-type: indirect ... Signed-off-by: dependabot[bot] <[email protected]> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * feat(dashboard): html report framework --------- Signed-off-by: dependabot[bot] <[email protected]> Co-authored-by: Subhankar Biswas <[email protected]> Co-authored-by: Subhankar <[email protected]> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Niyas Hameed <[email protected]>
datachecks · Oct 11, 2023 · 71b404a · 71b404a
1 parent a908163
commit 71b404a
Show file tree

Hide file tree

Showing 65 changed files with 9,960 additions and 748 deletions.
diff --git a/.gitignore b/.gitignore
@@ -125,4 +125,5 @@ processor/runnotebook.ipynb
 
 # Poetry
 #poetry.lock
-_internal_*
+_internal_*
+datachecks_report.html
diff --git a/datachecks/cli/cli.py b/datachecks/cli/cli.py
@@ -21,7 +21,7 @@
 import click
 from loguru import logger
 from rich import print
-from rich.table import Table
+from rich.table import Table, Text
 
 from datachecks.__version__ import __version__
 from datachecks.core import Configuration, Inspect, load_configuration
@@ -106,47 +106,70 @@ def inspect(
 
 def _build_metric_cli_table(*, inspect_output: InspectOutput):
+    """Build the rich ``Table`` that lists every generated metric.
+
+    The table has the columns Metric Name, Data Source, Metric Type, Value,
+    Valid and Reason. One row is added per metric, with the cells produced by
+    ``_build_row``: for ``DataSourceMetrics`` entries the table metrics are
+    added first and the index metrics second; for any other entry the metrics
+    are read directly from its ``metrics`` mapping.
+
+    Args:
+        inspect_output: Inspection result whose ``metrics`` mapping is rendered.
+
+    Returns:
+        The populated ``Table``.
+    """
     table = Table(
-        title="List of Generated Metrics", show_header=True, header_style="bold blue"
+        title="List of Generated Metrics",
+        show_header=True,
+        header_style="bold blue",
+    )
+    table.add_column(
+        "Metric Name",
+        style="cyan",
+        no_wrap=True,
     )
-    table.add_column("Metric Name", style="cyan", no_wrap=True)
     table.add_column("Data Source", style="magenta")
     table.add_column("Metric Type", style="magenta")
     table.add_column("Value", justify="right", style="green")
+    table.add_column("Valid", justify="right")
+    table.add_column("Reason", justify="right")
+
     for data_source_name, ds_metrics in inspect_output.metrics.items():
+        # NOTE(review): `row` is not read anywhere in this function as shown.
+        row = None
         if isinstance(ds_metrics, DataSourceMetrics):
             for tabel_name, table_metrics in ds_metrics.table_metrics.items():
                 for metric_identifier, metric in table_metrics.metrics.items():
                     table.add_row(
-                        f"{metric.tags.get('metric_name')}",
-                        f"{data_source_name}",
-                        f"{metric.metric_type}",
-                        f"{metric.value}",
+                        *_build_row(metric),
                     )
             for index_name, index_metrics in ds_metrics.index_metrics.items():
                 for metric_identifier, metric in index_metrics.metrics.items():
                     table.add_row(
-                        f"{metric.tags.get('metric_name')}",
-                        f"{data_source_name}",
-                        f"{metric.metric_type}",
-                        f"{metric.value}",
+                        *_build_row(metric),
                     )
         else:
             for metric_identifier, metric in ds_metrics.metrics.items():
                 table.add_row(
-                    f"{metric.tags.get('metric_name')}",
-                    f"",
-                    f"{metric.metric_type}",
-                    f"{metric.value}",
+                    *_build_row(metric),
                 )
 
     return table
 
 
 def _build_html_report(*, inspect_output: InspectOutput, report_path: str):
+    """Render the inspection output as an HTML report and write it to a file.
+
+    The inspection output is first logged at info level. The report gets a
+    dashboard id of the form ``dcs_dashboard_<uuid4 without dashes>``.
+
+    Args:
+        inspect_output: Inspection result handed to ``DashboardInfoBuilder``.
+        report_path: Path of the file the HTML is written to, opened in "w"
+            mode with UTF-8 encoding.
+    """
+    logger.info(inspect_output)
     template_params = TemplateParams(
         dashboard_id="dcs_dashboard_" + str(uuid.uuid4()).replace("-", ""),
         dashboard_info=DashboardInfoBuilder(inspect_output).build(),
     )
 
     with open(report_path, "w", encoding="utf-8") as out_file:
         out_file.write(html_template(template_params))
+
+
+def _build_row(metric):
+    """Convert a metric value into the cells of one CLI table row.
+
+    The cells are, in order: metric name, data source, metric type, value,
+    validation status and failure reason. The status cell is a styled
+    ``Text``: an unstyled "-" when ``is_valid`` is ``None``, a red "Failed"
+    when it is falsy and a green "Passed" otherwise. A missing reason is
+    rendered as "-".
+    """
+    # Pick the label and its colour together so the two cannot drift apart.
+    if metric.is_valid is None:
+        status_label, status_style = "-", ""
+    elif not metric.is_valid:
+        status_label, status_style = "Failed", "red"
+    else:
+        status_label, status_style = "Passed", "green"
+
+    reason_cell = metric.reason if metric.reason is not None else "-"
+    return (
+        metric.tags.get("metric_name"),
+        metric.data_source,
+        metric.metric_type,
+        str(metric.value),
+        Text(status_label, style=status_style),
+        reason_cell,
+    )
diff --git a/datachecks/core/common/models/configuration.py b/datachecks/core/common/models/configuration.py
@@ -18,6 +18,7 @@
 
 from datachecks.core.common.models.data_source_resource import Field, Index, Table
 from datachecks.core.common.models.metric import MetricsType
+from datachecks.core.common.models.validation import Validation
 
 
 class DataSourceType(str, Enum):
@@ -85,6 +86,7 @@ class MetricConfiguration:
     metric_type: MetricsType
     expression: Optional[str] = None
     resource: Optional[Union[Table, Index, Field]] = None
+    validation: Optional[Validation] = None
     filters: Optional[MetricsFilterConfiguration] = None
 
     def __post_init__(self):

diff --git a/datachecks/core/common/models/metric.py b/datachecks/core/common/models/metric.py
@@ -62,6 +62,8 @@ class MetricValue:
     table_name: Optional[str] = None
     index_name: Optional[str] = None
     field_name: Optional[str] = None
+    is_valid: Optional[bool] = None
+    reason: Optional[str] = None
     tags: Dict[str, str] = None
 
 

diff --git a/datachecks/core/common/models/validation.py b/datachecks/core/common/models/validation.py
@@ -0,0 +1,39 @@
+#  Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+from dataclasses import dataclass
+from enum import Enum
+from typing import Optional
+
+
+class ConditionType(str, Enum):
+    """Comparison operators that a threshold condition can use."""
+
+    # Each value is also the name of the matching ``Threshold`` field.
+    GTE = "gte"  # greater than or equal to
+    LTE = "lte"  # less than or equal to
+    GT = "gt"  # strictly greater than
+    LT = "lt"  # strictly less than
+    EQ = "eq"  # equal to
+
+
+@dataclass
+class Threshold:
+    """Numeric bounds for a metric value; a bound left as ``None`` is unset."""
+
+    # One optional bound per comparison; field names equal ConditionType values.
+    gte: Optional[float] = None
+    lte: Optional[float] = None
+    gt: Optional[float] = None
+    lt: Optional[float] = None
+    eq: Optional[float] = None
+
+
+@dataclass
+class Validation:
+    """Validation settings attached to a metric configuration."""
+
+    # Bounds the metric value is checked against.
+    threshold: Threshold
diff --git a/datachecks/core/configuration/configuration_parser.py b/datachecks/core/configuration/configuration_parser.py
@@ -14,7 +14,9 @@
 
 from typing import Dict, List, Union
 
-from pyparsing import Forward, Group, Suppress, Word, alphas, delimitedList, nums
+from pyparsing import Combine, Group, Literal
+from pyparsing import Optional as OptionalParsing
+from pyparsing import Word, delimitedList, nums, oneOf
 
 from datachecks.core.common.errors import DataChecksConfigurationError
 from datachecks.core.common.models.configuration import (
@@ -27,8 +29,21 @@
 )
 from datachecks.core.common.models.data_source_resource import Field, Index, Table
 from datachecks.core.common.models.metric import MetricsType
+from datachecks.core.common.models.validation import (
+    ConditionType,
+    Threshold,
+    Validation,
+)
 from datachecks.core.configuration.config_loader import parse_config
 
+# Maps each comparison symbol accepted in a threshold string to the
+# ConditionType member it stands for.
+CONDITION_TYPE_MAPPING = {
+    ">=": ConditionType.GTE,
+    "<=": ConditionType.LTE,
+    "=": ConditionType.EQ,
+    "<": ConditionType.LT,
+    ">": ConditionType.GT,
+}
+
 
 def parse_data_source_yaml_configurations(
     data_source_yaml_configurations: List[dict],
@@ -74,6 +89,41 @@ def _parse_resource_index(resource_str: str) -> Index:
     return Index(data_source=splits[0], name=splits[1])
 
 
+def _parse_threshold_str(threshold: str) -> Threshold:
+    """Parse a threshold expression such as ``">= 10 & < 20.5"``.
+
+    The expression is one or more ``<operator> <number>`` conditions joined by
+    ``&``. Operators are ``>=``, ``<=``, ``=``, ``<`` and ``>``; numbers may be
+    negative and may have a decimal part.
+
+    Args:
+        threshold: Threshold expression taken from the validation configuration.
+
+    Returns:
+        A ``Threshold`` in which the field of every parsed operator holds its
+        number. When an operator occurs more than once, the last value wins.
+
+    Raises:
+        DataChecksConfigurationError: If the expression cannot be parsed in
+            full, including when unparsed text follows a valid condition.
+    """
+    try:
+        comparison = oneOf(">= <= = < >").setParseAction(
+            lambda t: CONDITION_TYPE_MAPPING[t[0]]
+        )
+        number = Combine(
+            OptionalParsing(Literal("-"))
+            + Word(nums)
+            + OptionalParsing(Literal(".") + Word(nums))
+        ).setParseAction(lambda t: float(t[0]))
+
+        # Each group is exactly one (condition type, number) pair.
+        conditions = delimitedList(Group(comparison + number), delim="&")
+        # parseAll=True rejects trailing text instead of silently ignoring it.
+        parsed = conditions.parseString(threshold, parseAll=True)
+        # Use the enum's plain string value as the keyword name.
+        return Threshold(
+            **{condition_type.value: bound for condition_type, bound in parsed}
+        )
+
+    except Exception as e:
+        raise DataChecksConfigurationError(
+            f"Invalid threshold configuration {threshold}: {str(e)}"
+        ) from e
+
+def _parse_validation_configuration(validation_config: dict) -> Validation:
+    """Build a ``Validation`` from a metric's ``validation`` config section.
+
+    Raises:
+        DataChecksConfigurationError: If the section has no ``threshold`` entry.
+    """
+    if "threshold" not in validation_config:
+        raise DataChecksConfigurationError(
+            f"Invalid validation configuration {validation_config}"
+        )
+    parsed_threshold = _parse_threshold_str(threshold=validation_config["threshold"])
+    return Validation(threshold=parsed_threshold)
+
+
 def _parse_resource_field(resource_str: str, belongs_to: str) -> Field:
     splits = resource_str.split(".")
     if len(splits) != 3:
@@ -121,7 +171,6 @@ def parse_metric_configurations(
                 ),
                 expression=expression_str,
             )
-            metric_configurations[metric_configuration.name] = metric_configuration
         else:
             resource_str = metric_yaml_configuration["resource"]
             data_source_name = resource_str.split(".")[0]
@@ -145,7 +194,14 @@ def parse_metric_configurations(
                 metric_configuration.filter = MetricsFilterConfiguration(
                     where=metric_yaml_configuration["filters"]["where"]
                 )
-            metric_configurations[metric_configuration.name] = metric_configuration
+        if (
+            "validation" in metric_yaml_configuration
+            and metric_yaml_configuration["validation"] is not None
+        ):
+            metric_configuration.validation = _parse_validation_configuration(
+                metric_yaml_configuration["validation"]
+            )
+        metric_configurations[metric_configuration.name] = metric_configuration
 
     return metric_configurations
 

diff --git a/datachecks/core/metric/base.py b/datachecks/core/metric/base.py
@@ -15,12 +15,12 @@
 import datetime
 import json
 from abc import ABC
-from typing import Dict, Optional, Union
+from typing import Optional, Tuple, Union
 
 from loguru import logger
 
-from datachecks.core.common.errors import DataChecksMetricGenerationError
 from datachecks.core.common.models.metric import MetricsType, MetricValue
+from datachecks.core.common.models.validation import ConditionType
 from datachecks.core.datasource.base import DataSource
 from datachecks.core.datasource.search_datasource import SearchIndexDataSource
 from datachecks.core.datasource.sql_datasource import SQLDataSource
@@ -104,6 +104,9 @@ def __init__(
                     self.filter_query = json.loads(filters.where)
                 elif isinstance(data_source, SQLDataSource):
                     self.filter_query = filters.where
+        self.validation = None
+        if "validation" in kwargs and kwargs["validation"] is not None:
+            self.validation = kwargs["validation"]
 
     def get_metric_identity(self):
         MetricIdentity.generate_identity(
@@ -122,15 +125,28 @@ def get_metric_value(self, **kwargs) -> Union[MetricValue, None]:
             tags = {
                 "metric_name": self.name,
             }
+            if self.metric_type.value == MetricsType.COMBINED.value:
+                value = MetricValue(
+                    identity=self.get_metric_identity(),
+                    metric_type=self.metric_type.value,
+                    value=metric_value,
+                    expression=self.expression,
+                    timestamp=datetime.datetime.utcnow().isoformat(),
+                    tags=tags,
+                )
+            else:
+                value = MetricValue(
+                    identity=self.get_metric_identity(),
+                    metric_type=self.metric_type.value,
+                    value=metric_value,
+                    timestamp=datetime.datetime.utcnow().isoformat(),
+                    data_source=self.data_source.data_source_name,
+                    expression=self.expression,
+                    tags=tags,
+                )
+            if self.validation is not None and self.validation.threshold is not None:
+                value.is_valid, value.reason = self.validate_metric(metric_value)
 
-            value = MetricValue(
-                identity=self.get_metric_identity(),
-                metric_type=self.metric_type.value,
-                value=metric_value,
-                timestamp=datetime.datetime.utcnow().isoformat(),
-                data_source=self.data_source.data_source_name,
-                tags=tags,
-            )
             if (
                 "index_name" in self.__dict__
                 and self.__dict__["index_name"] is not None
@@ -153,6 +169,41 @@ def get_metric_value(self, **kwargs) -> Union[MetricValue, None]:
             logger.error(f"Failed to generate metric {self.name}: {str(e)}")
             return None
 
+    def validate_metric(self, metric_value) -> Tuple[bool, Optional[str]]:
+        """Check a metric value against the configured threshold.
+
+        The threshold fields are visited in their stored order and unset
+        (``None``) bounds are skipped. The first violated bound ends the check.
+
+        Args:
+            metric_value: The generated metric value to validate.
+
+        Returns:
+            ``(False, reason)`` for the first violated bound, otherwise
+            ``(True, None)``.
+        """
+        for condition, limit in self.validation.threshold.__dict__.items():
+            if limit is None:
+                continue
+
+            failure = None
+            if ConditionType.GTE == condition and metric_value < limit:
+                failure = f"Less than threshold of {limit}"
+            elif ConditionType.LTE == condition and metric_value > limit:
+                failure = f"Greater than threshold of {limit}"
+            elif ConditionType.GT == condition and metric_value <= limit:
+                failure = f"Less than or equal to threshold of {limit}"
+            elif ConditionType.LT == condition and metric_value >= limit:
+                failure = f"Greater than or equal to threshold of {limit}"
+            elif ConditionType.EQ == condition and metric_value != limit:
+                failure = f"Not equal to {limit}"
+
+            if failure is not None:
+                return False, failure
+        return True, None
+
 
 class FieldMetrics(Metric, ABC):
     def __init__(

diff --git a/datachecks/core/metric/combined_metric.py b/datachecks/core/metric/combined_metric.py
@@ -97,26 +97,4 @@ def _generate_metric_value(self, metric_values: List[MetricValue]):
         Generate the metric value for this metric
         """
         expression_data = self._metric_expression_parser(self.expression, metric_values)
-        return self._perform_operation(expression_data)
-
-    def get_metric_value(
-        self, metric_values: List[MetricValue]
-    ) -> Union[MetricValue, None]:
-        try:
-            metric_value = self._generate_metric_value(metric_values)
-            tags = {
-                "metric_name": self.name,
-            }
-
-            value = MetricValue(
-                identity=self.get_metric_identity(),
-                metric_type=self.metric_type.value,
-                value=metric_value,
-                expression=self.expression,
-                timestamp=datetime.datetime.utcnow().isoformat(),
-                tags=tags,
-            )
-            return value
-        except Exception as e:
-            logger.error(f"Failed to generate metric {self.name}: {str(e)}")
-            return None
+        return round(self._perform_operation(expression_data), 2)