Skip to content

Commit

Permalink
feat: add valid and invalid values functions
Browse files Browse the repository at this point in the history
- contains fixes for uuid validation
  • Loading branch information
Ryuk-me committed Aug 16, 2024
1 parent c53f80f commit a169be9
Show file tree
Hide file tree
Showing 8 changed files with 408 additions and 2 deletions.
4 changes: 3 additions & 1 deletion datachecks/core/common/models/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,16 @@ class ValidationFunction(str, Enum):
# Custom SQL
CUSTOM_SQL = "custom_sql"

# Validity validations 43
# Validity validations 45
# ========================================
COUNT_INVALID_VALUES = "count_invalid_values"
PERCENT_INVALID_VALUES = "percent_invalid_values"
COUNT_VALID_VALUES = "count_valid_values"
PERCENT_VALID_VALUES = "percent_valid_values"
COUNT_INVALID_REGEX = "count_invalid_regex"
PERCENT_INVALID_REGEX = "percent_invalid_regex"
COUNT_VALID_REGEX = "count_valid_regex"
PERCENT_VALID_REGEX = "percent_valid_regex"

# -- String Format
STRING_LENGTH_MAX = "string_length_max"
Expand Down
34 changes: 33 additions & 1 deletion datachecks/core/datasource/sql_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ def query_string_pattern_validity(
:param filters: filter condition
:return: count of valid values, count of total row count
"""
filters = f"AND {filters}" if filters else ""
filters = f"WHERE {filters}" if filters else ""
qualified_table_name = self.qualified_table_name(table)

if not regex_pattern and not predefined_regex_pattern:
Expand All @@ -421,3 +421,35 @@ def query_string_pattern_validity(
"""
result = self.fetchone(query)
return result[0], result[1]

def query_valid_invalid_values_validity(
    self,
    table: str,
    field: str,
    regex_pattern: str = None,
    filters: str = None,
    values: List[str] = None,
) -> Tuple[int, int]:
    """
    Count the rows whose field matches a list of valid values or a regex.

    When ``values`` is given it takes precedence and the match is an
    ``IN (...)`` test; otherwise the field is matched against
    ``regex_pattern`` with the ``~`` operator.

    :param table: table name
    :param field: column name
    :param regex_pattern: regex pattern, used only when ``values`` is empty
    :param filters: filter condition, placed after ``WHERE`` when provided
    :param values: list of valid values
    :return: count of matching (valid) rows, total row count
    :raises ValueError: if neither ``values`` nor ``regex_pattern`` is given
    """
    if not values and not regex_pattern:
        # Without this guard the query would compare against the text 'None'.
        raise ValueError("Either values or regex_pattern must be provided")

    where_clause = f"WHERE {filters}" if filters else ""
    qualified_table_name = self.qualified_table_name(table)

    if values:
        # Double embedded single quotes so a value such as O'Brien stays
        # inside its SQL string literal instead of terminating it.
        values_str = ", ".join(
            "'{}'".format(str(value).replace("'", "''")) for value in values
        )
        match_query = f"CASE WHEN {field} IN ({values_str}) THEN 1 ELSE 0 END"
    else:
        escaped_pattern = regex_pattern.replace("'", "''")
        match_query = f"CASE WHEN {field} ~ '{escaped_pattern}' THEN 1 ELSE 0 END"

    query = f"""
        SELECT SUM({match_query}) AS valid_count, COUNT(*) as total_count
        FROM {qualified_table_name}
        {where_clause}
    """
    result = self.fetchone(query)
    # SUM over zero rows is NULL; report it as 0 to honour the int contract.
    return result[0] or 0, result[1]
5 changes: 5 additions & 0 deletions datachecks/core/validation/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,17 @@ def __init__(

self.threshold = validation_config.threshold
self.where_filter = None
self.values = None
self.regex_pattern = validation_config.regex

if validation_config.where:
if data_source.language_support == DataSourceLanguageSupport.DSL_ES:
self.where_filter = json.loads(validation_config.where)
elif data_source.language_support == DataSourceLanguageSupport.SQL:
self.where_filter = validation_config.where
if validation_config.values:
if data_source.language_support == DataSourceLanguageSupport.SQL:
self.values = validation_config.values

def get_validation_identity(self) -> str:
return ValidationIdentity.generate_identity(
Expand Down
20 changes: 20 additions & 0 deletions datachecks/core/validation/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,18 @@
CountDistinctValidation,
CountDuplicateValidation,
)
from datachecks.core.validation.validity_validation import ( # noqa F401 this is used in globals
CountInvalidRegex,
CountInvalidValues,
CountUUIDValidation,
CountValidRegex,
CountValidValues,
PercentInvalidRegex,
PercentInvalidValues,
PercentUUIDValidation,
PercentValidRegex,
PercentValidValues,
)


class ValidationManager:
Expand All @@ -69,6 +81,14 @@ class ValidationManager:
ValidationFunction.FRESHNESS.value: "FreshnessValueMetric",
ValidationFunction.COUNT_UUID.value: "CountUUIDValidation",
ValidationFunction.PERCENT_UUID.value: "PercentUUIDValidation",
ValidationFunction.COUNT_INVALID_VALUES.value: "CountInvalidValues",
ValidationFunction.PERCENT_INVALID_VALUES.value: "PercentInvalidValues",
ValidationFunction.COUNT_VALID_VALUES.value: "CountValidValues",
ValidationFunction.PERCENT_VALID_VALUES.value: "PercentValidValues",
ValidationFunction.COUNT_INVALID_REGEX.value: "CountInvalidRegex",
ValidationFunction.PERCENT_INVALID_REGEX.value: "PercentInvalidRegex",
ValidationFunction.COUNT_VALID_REGEX.value: "CountValidRegex",
ValidationFunction.PERCENT_VALID_REGEX.value: "PercentValidRegex",
}

def __init__(
Expand Down
180 changes: 180 additions & 0 deletions datachecks/core/validation/validity_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def _generate_metric_value(self, **kwargs) -> Union[float, int]:
valid_count, total_count = self.data_source.query_string_pattern_validity(
table=self.dataset_name,
field=self.field_name,
regex_pattern=self.regex_pattern,
predefined_regex_pattern="uuid",
filters=self.where_filter if self.where_filter is not None else None,
)
Expand All @@ -40,6 +41,7 @@ def _generate_metric_value(self, **kwargs) -> Union[float, int]:
valid_count, total_count = self.data_source.query_string_pattern_validity(
table=self.dataset_name,
field=self.field_name,
regex_pattern=self.regex_pattern,
predefined_regex_pattern="uuid",
filters=self.where_filter if self.where_filter is not None else None,
)
Expand All @@ -48,3 +50,181 @@ def _generate_metric_value(self, **kwargs) -> Union[float, int]:
raise NotImplementedError(
"UUID validation is only supported for SQL data sources"
)


class CountInvalidValues(Validation):
    """Count the rows whose field value is NOT in the configured valid values."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Return the number of rows whose field is outside ``self.values``.

        :return: total row count minus the count of rows matching the list
        :raises ValueError: if no values are configured
        :raises NotImplementedError: if the data source is not a SQLDataSource
        """
        if self.values is None:
            raise ValueError("Values are required for count_invalid_values validation")
        if isinstance(self.data_source, SQLDataSource):
            (
                valid_count,
                total_count,
            ) = self.data_source.query_valid_invalid_values_validity(
                table=self.dataset_name,
                field=self.field_name,
                values=self.values,
                filters=self.where_filter if self.where_filter is not None else None,
            )
            # The query counts rows that match the list (valid rows), so the
            # invalid count is the remainder. The valid count can come back as
            # None when no rows were scanned; treat that as zero.
            return total_count - (valid_count or 0)
        else:
            raise NotImplementedError(
                "Valid/Invalid values validation is only supported for SQL data sources"
            )


class PercentInvalidValues(Validation):
    """Percentage of rows whose field value is NOT in the configured valid values."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Return the share of rows whose field is outside ``self.values``.

        :return: percentage rounded to two decimals, or 0 when there are no rows
        :raises ValueError: if no values are configured
        :raises NotImplementedError: if the data source is not a SQLDataSource
        """
        if self.values is None:
            raise ValueError(
                "Values are required for percent_invalid_values validation"
            )
        if isinstance(self.data_source, SQLDataSource):
            (
                valid_count,
                total_count,
            ) = self.data_source.query_valid_invalid_values_validity(
                table=self.dataset_name,
                field=self.field_name,
                values=self.values,
                filters=self.where_filter if self.where_filter is not None else None,
            )
            if total_count > 0:
                # The query counts rows that match the list (valid rows), so
                # the invalid count is the remainder.
                invalid_count = total_count - (valid_count or 0)
                return round(invalid_count / total_count * 100, 2)
            return 0
        else:
            raise NotImplementedError(
                "Valid/Invalid values validation is only supported for SQL data sources"
            )


class CountValidValues(Validation):
    """Count the rows whose field value is in the configured valid values."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Return the number of rows whose field is one of ``self.values``.

        :return: the first element of the tuple returned by the data source query
        :raises ValueError: if no values are configured
        :raises NotImplementedError: if the data source is not a SQLDataSource
        """
        if self.values is None:
            raise ValueError("Values are required for count_valid_values validation")
        if not isinstance(self.data_source, SQLDataSource):
            raise NotImplementedError(
                "Valid/Invalid values validation is only supported for SQL data sources"
            )

        counts = self.data_source.query_valid_invalid_values_validity(
            table=self.dataset_name,
            field=self.field_name,
            values=self.values,
            filters=self.where_filter,
        )
        matching_rows, _row_total = counts
        return matching_rows


class PercentValidValues(Validation):
    """Percentage of rows whose field value is in the configured valid values."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Return the share of rows whose field is one of ``self.values``.

        :return: percentage rounded to two decimals, or 0 when there are no rows
        :raises ValueError: if no values are configured
        :raises NotImplementedError: if the data source is not a SQLDataSource
        """
        if self.values is None:
            raise ValueError("Values are required for percent_valid_values validation")
        if not isinstance(self.data_source, SQLDataSource):
            raise NotImplementedError(
                "Valid/Invalid values validation is only supported for SQL data sources"
            )

        counts = self.data_source.query_valid_invalid_values_validity(
            table=self.dataset_name,
            field=self.field_name,
            values=self.values,
            filters=self.where_filter,
        )
        matching_rows, row_total = counts
        if row_total > 0:
            return round(matching_rows / row_total * 100, 2)
        return 0


class CountInvalidRegex(Validation):
    """Count the rows whose field value does NOT match the configured regex."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Return the number of rows whose field fails ``self.regex_pattern``.

        :return: total row count minus the count of rows matching the regex
        :raises ValueError: if no regex pattern is configured
        :raises NotImplementedError: if the data source is not a SQLDataSource
        """
        if self.regex_pattern is None:
            raise ValueError(
                "Regex pattern is required for count_invalid_regex validation"
            )
        if isinstance(self.data_source, SQLDataSource):
            (
                valid_count,
                total_count,
            ) = self.data_source.query_valid_invalid_values_validity(
                table=self.dataset_name,
                field=self.field_name,
                regex_pattern=self.regex_pattern,
                filters=self.where_filter if self.where_filter is not None else None,
            )
            # The query counts rows that match the regex (valid rows), so the
            # invalid count is the remainder. The valid count can come back as
            # None when no rows were scanned; treat that as zero.
            return total_count - (valid_count or 0)
        else:
            raise NotImplementedError(
                "Valid/Invalid values validation is only supported for SQL data sources"
            )


class PercentInvalidRegex(Validation):
    """Percentage of rows whose field value does NOT match the configured regex."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Return the share of rows whose field fails ``self.regex_pattern``.

        :return: percentage rounded to two decimals, or 0 when there are no rows
        :raises ValueError: if no regex pattern is configured
        :raises NotImplementedError: if the data source is not a SQLDataSource
        """
        if self.regex_pattern is None:
            raise ValueError(
                "Regex pattern is required for percent_invalid_regex validation"
            )
        if isinstance(self.data_source, SQLDataSource):
            (
                valid_count,
                total_count,
            ) = self.data_source.query_valid_invalid_values_validity(
                table=self.dataset_name,
                field=self.field_name,
                regex_pattern=self.regex_pattern,
                filters=self.where_filter if self.where_filter is not None else None,
            )
            if total_count > 0:
                # The query counts rows that match the regex (valid rows), so
                # the invalid count is the remainder.
                invalid_count = total_count - (valid_count or 0)
                return round(invalid_count / total_count * 100, 2)
            return 0
        else:
            raise NotImplementedError(
                "Valid/Invalid values validation is only supported for SQL data sources"
            )


class CountValidRegex(Validation):
    """Count the rows whose field value matches the configured regex."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Return the number of rows whose field matches ``self.regex_pattern``.

        :return: the first element of the tuple returned by the data source query
        :raises ValueError: if no regex pattern is configured
        :raises NotImplementedError: if the data source is not a SQLDataSource
        """
        if self.regex_pattern is None:
            raise ValueError(
                "Regex pattern is required for count_valid_regex validation"
            )
        if not isinstance(self.data_source, SQLDataSource):
            raise NotImplementedError(
                "Valid/Invalid values validation is only supported for SQL data sources"
            )

        counts = self.data_source.query_valid_invalid_values_validity(
            table=self.dataset_name,
            field=self.field_name,
            regex_pattern=self.regex_pattern,
            filters=self.where_filter,
        )
        matching_rows, _row_total = counts
        return matching_rows


class PercentValidRegex(Validation):
    """Percentage of rows whose field value matches the configured regex."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Return the share of rows whose field matches ``self.regex_pattern``.

        :return: percentage rounded to two decimals, or 0 when there are no rows
        :raises ValueError: if no regex pattern is configured
        :raises NotImplementedError: if the data source is not a SQLDataSource
        """
        if self.regex_pattern is None:
            raise ValueError(
                "Regex pattern is required for percent_valid_regex validation"
            )
        if not isinstance(self.data_source, SQLDataSource):
            raise NotImplementedError(
                "Valid/Invalid values validation is only supported for SQL data sources"
            )

        counts = self.data_source.query_valid_invalid_values_validity(
            table=self.dataset_name,
            field=self.field_name,
            regex_pattern=self.regex_pattern,
            filters=self.where_filter,
        )
        matching_rows, row_total = counts
        if row_total > 0:
            return round(matching_rows / row_total * 100, 2)
        return 0
Loading

0 comments on commit a169be9

Please sign in to comment.