Skip to content

Commit

Permalink
[Subhankar] update dcs configuration structure
Browse files Browse the repository at this point in the history
  • Loading branch information
Subhankar authored and Subhankar committed Jul 8, 2024
1 parent ffd1853 commit ab8cae1
Show file tree
Hide file tree
Showing 53 changed files with 2,379 additions and 722 deletions.
143 changes: 72 additions & 71 deletions datachecks/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@
from rich.table import Table, Text

from datachecks.__version__ import __version__
from datachecks.core import Configuration, Inspect, load_configuration
from datachecks.core.common.models.metric import DataSourceMetrics
from datachecks.core import Configuration, Inspect
from datachecks.core.configuration.configuration_parser_v1 import load_configuration

# from datachecks.core.common.models.metric import DataSourceMetrics
from datachecks.core.inspect import InspectOutput
from datachecks.report.dashboard import DashboardInfoBuilder, html_template
from datachecks.report.models import TemplateParams
Expand All @@ -51,28 +53,28 @@ def main():
default=None,
help="Specify the file path for configuration",
)
# Disabled for now
# Disabled for now TODO: Enable in future for validations
# @click.option(
# "--auto-profile",
# is_flag=True,
# help="Specify if the inspection should do auto-profile of all data sources",
# )
@click.option(
"--html-report",
is_flag=True,
help="Specify if the inspection should generate HTML report",
)
@click.option(
"--report-path",
required=False,
default="datachecks_report.html",
help="Specify the file path for HTML report",
)
# @click.option(
# "--html-report",
# is_flag=True,
# help="Specify if the inspection should generate HTML report",
# )
# @click.option(
# "--report-path",
# required=False,
# default="datachecks_report.html",
# help="Specify the file path for HTML report",
# )
def inspect(
config_path: Union[str, None],
# auto_profile: bool = False, # Disabled for now
html_report: bool = False,
report_path: str = "datachecks_report.html",
# html_report: bool = False,
# report_path: str = "datachecks_report.html",
):
"""
Starts the datachecks inspection
Expand All @@ -84,21 +86,20 @@ def inspect(
f"Invalid value for '-C' / '--config-path': File '{config_path}' does not exist."
)
configuration: Configuration = load_configuration(config_path)

# inspector = Inspect(configuration=configuration, auto_profile=auto_profile) # Disabled for now
inspector = Inspect(configuration=configuration)

print("Starting [bold blue]datachecks[/bold blue] inspection...", ":zap:")
output: InspectOutput = inspector.run()

print("[bold green]Inspection completed successfully![/bold green] :tada:")
print(f"Inspection took {inspector.execution_time_taken} seconds")
if html_report:
print(f"Generating HTML report at {report_path}")
_build_html_report(inspect_output=output, report_path=report_path)
print(f"HTML report generated at {report_path}")
else:
print(_build_metric_cli_table(inspect_output=output))
# Disable for now
# if html_report:
# print(f"Generating HTML report at {report_path}")
# _build_html_report(inspect_output=output, report_path=report_path)
# print(f"HTML report generated at {report_path}")
# else:
print(_build_metric_cli_table(inspect_output=output))
sys.exit(0)

except Exception as e:
Expand All @@ -108,39 +109,60 @@ def inspect(

def _build_metric_cli_table(*, inspect_output: InspectOutput):
table = Table(
title="List of Generated Metrics",
title="List of Validations",
show_header=True,
header_style="bold blue",
)
table.add_column(
"Metric Name",
style="cyan",
no_wrap=True,
)
table.add_column("Validation Name", style="cyan", no_wrap=True)
table.add_column("Data Source", style="magenta")
table.add_column("Metric Type", style="magenta")
table.add_column("Validation Type", style="magenta")
table.add_column("Value", justify="right", style="green")
table.add_column("Valid", justify="right")
table.add_column("Is Valid", justify="right")
table.add_column("Reason", justify="right")

for data_source_name, ds_metrics in inspect_output.metrics.items():
row = None
if isinstance(ds_metrics, DataSourceMetrics):
for tabel_name, table_metrics in ds_metrics.table_metrics.items():
for metric_identifier, metric in table_metrics.metrics.items():
table.add_row(
*_build_row(metric),
)
for index_name, index_metrics in ds_metrics.index_metrics.items():
for metric_identifier, metric in index_metrics.metrics.items():
table.add_row(
*_build_row(metric),
)
else:
for metric_identifier, metric in ds_metrics.metrics.items():
table.add_row(
*_build_row(metric),
)
for identy, validation_info in inspect_output.validations.items():
_validity_style = (
""
if validation_info.is_valid is None
else "red"
if not validation_info.is_valid
else "green"
)
value = (
validation_info.name,
validation_info.data_source_name,
validation_info.validation_function,
str(validation_info.value),
Text(
"-"
if validation_info.is_valid is None
else "Failed"
if not validation_info.is_valid
else "Passed",
style=_validity_style,
),
"-" if validation_info.reason is None else validation_info.reason,
)
table.add_row(*value)

# for data_source_name, ds_metrics in inspect_output.metrics.items():
# row = None
# if isinstance(ds_metrics, DataSourceMetrics):
# for tabel_name, table_metrics in ds_metrics.table_metrics.items():
# for metric_identifier, metric in table_metrics.metrics.items():
# table.add_row(
# *_build_row(metric),
# )
# for index_name, index_metrics in ds_metrics.index_metrics.items():
# for metric_identifier, metric in index_metrics.metrics.items():
# table.add_row(
# *_build_row(metric),
# )
# else:
# for metric_identifier, metric in ds_metrics.metrics.items():
# table.add_row(
# *_build_row(metric),
# )

return table

Expand All @@ -153,24 +175,3 @@ def _build_html_report(*, inspect_output: InspectOutput, report_path: str):

with open(report_path, "w", encoding="utf-8") as out_file:
out_file.write(html_template(template_params))


def _build_row(metric):
    """
    Build the cells of one CLI table row for a metric.

    Returns a 6-tuple: the ``metric_name`` tag, the data source, the metric
    type, the value rendered with ``str``, a ``Text`` validity label and the
    reason (``"-"`` when the metric carries no reason).
    """
    # Validity is tri-state: None means "not evaluated" and is rendered as an
    # unstyled dash; otherwise the label is coloured by pass/fail.
    if metric.is_valid is None:
        label, colour = "-", ""
    elif metric.is_valid:
        label, colour = "Passed", "green"
    else:
        label, colour = "Failed", "red"

    return (
        metric.tags.get("metric_name"),
        metric.data_source,
        metric.metric_type,
        str(metric.value),
        Text(label, style=colour),
        metric.reason if metric.reason is not None else "-",
    )
97 changes: 93 additions & 4 deletions datachecks/core/common/models/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Optional, Union

from datachecks.core.common.models.data_source_resource import Field, Index, Table
from datachecks.core.common.models.metric import MetricsType
from datachecks.core.common.models.validation import Validation
from datachecks.core.common.models.validation import (
Threshold,
Validation,
ValidationFunction,
ValidationFunctionType,
)


class DataSourceType(str, Enum):
Expand All @@ -33,6 +38,11 @@ class DataSourceType(str, Enum):
DATABRICKS = "databricks"


class DataSourceLanguageSupport(str, Enum):
    """
    Query language identifiers that can be attached to a data source.

    Members are plain strings because the enum mixes in ``str``.
    """

    SQL = "sql"
    # NOTE(review): presumably the Elasticsearch query DSL - confirm.
    DSL_ES = "dsl_es"


@dataclass
class DataSourceConnectionConfiguration:
"""
Expand Down Expand Up @@ -70,6 +80,84 @@ class DataSourceConfiguration:
name: str
type: DataSourceType
connection_config: DataSourceConnectionConfiguration
language_support: Optional[DataSourceLanguageSupport] = None


# Matches "<function>(<field>)", e.g. "min(age)". Group 1 is the function
# name; group 2 is the field name (word characters, spaces and hyphens).
_ON_FIELD_PATTERN = re.compile(r"^(\w+)\(([ \w-]+)\)$")


@dataclass
class ValidationConfig:
    """
    Configuration of a single validation.

    ``on`` is either the bare name of a dataset-level validation function
    (for example the value of ``ValidationFunction.COUNT_ROWS``) or a
    field-level expression of the form ``function(field)``. It is parsed once
    in ``__post_init__``; the parsed function, its type and the field name are
    exposed through the ``get_*`` properties.

    The remaining attributes (``threshold``, ``where``, ``query``, ``regex``,
    ``values``) are optional settings that this class only stores.

    Raises:
        ValueError: if ``on`` is missing, is not a well-formed expression,
            names an unknown validation function, or applies a dataset-level
            function to a field.
    """

    name: str
    on: str
    threshold: Optional[Threshold] = None
    where: Optional[str] = None
    query: Optional[str] = None
    regex: Optional[str] = None
    values: Optional[List] = None

    def _on_field_validation(self):
        """Parse ``on`` and record the validation function, type and field."""
        if self.on is None:
            raise ValueError("on field is required")
        dataset_validation_functions = [
            ValidationFunction.FAILED_ROWS,
            ValidationFunction.COUNT_ROWS,
            ValidationFunction.COUNT_DOCUMENTS,
            ValidationFunction.CUSTOM_SQL,
            ValidationFunction.COMPARE_COUNT_ROWS,
        ]
        # Strip once and use the stripped value everywhere, so that a padded
        # dataset function name is accepted the same way a padded
        # "function(field)" expression is.
        on = self.on.strip()

        if on in dataset_validation_functions:
            self._validation_function_type = ValidationFunctionType.DATASET
            self._validation_function = ValidationFunction(on)
            self._validation_field_name = None
            return

        self._validation_function_type = ValidationFunctionType.FIELD
        match = _ON_FIELD_PATTERN.match(on)
        if match is None:
            raise ValueError(f"on field must be a valid function, was {self.on}")
        column_validation_function, field_name = match.groups()

        # Membership is tested against the enum members themselves, which
        # relies on members comparing equal to their string values.
        if column_validation_function not in list(ValidationFunction):
            raise ValueError(
                f"{column_validation_function} is not a valid validation function"
            )

        if column_validation_function in dataset_validation_functions:
            raise ValueError(
                f"{column_validation_function} is a table function, should not have column name"
            )

        self._validation_function = ValidationFunction(column_validation_function)
        self._validation_field_name = field_name

    def __post_init__(self):
        self._on_field_validation()

    @property
    def get_validation_function(self) -> ValidationFunction:
        """The validation function parsed from ``on``."""
        return ValidationFunction(self._validation_function)

    @property
    def get_validation_function_type(self) -> ValidationFunctionType:
        """Whether the validation targets a whole dataset or a single field."""
        return self._validation_function_type

    @property
    def get_validation_field_name(self) -> Optional[str]:
        """The field named in ``on``, or ``None`` for dataset-level validations."""
        return self._validation_field_name or None


@dataclass
class ValidationConfigByDataset:
    """
    Group of validation configurations that share one data source and dataset.
    """

    # NOTE(review): presumably the name of a configured data source - confirm.
    data_source: str
    # Dataset the validations in this group apply to.
    dataset: str
    # NOTE(review): presumably keyed by validation name - confirm against the
    # configuration parser.
    validations: Dict[str, ValidationConfig]


@dataclass
Expand Down Expand Up @@ -135,6 +223,7 @@ class Configuration:
Configuration for the data checks
"""

data_sources: Dict[str, DataSourceConfiguration]
metrics: Dict[str, MetricConfiguration]
data_sources: Optional[Dict[str, DataSourceConfiguration]] = None
validations: Optional[Dict[str, ValidationConfigByDataset]] = None
metrics: Optional[Dict[str, MetricConfiguration]] = None
storage: Optional[MetricStorageConfiguration] = None
11 changes: 11 additions & 0 deletions datachecks/core/common/models/data_source_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,17 @@
from typing import Optional, Union


@dataclass
class Dataset:
    """
    Dataset resource, identified by its name and the data source it belongs to.
    """

    # Name of the dataset.
    name: str
    # NOTE(review): presumably the name of the owning data source - confirm.
    data_source: str
    # Optional free-text description; defaults to None.
    description: Optional[str] = None


@dataclass
class Table:
"""
Expand Down
Loading

0 comments on commit ab8cae1

Please sign in to comment.