def query_valid_invalid_values_validity(
    self,
    table: str,
    field: str,
    regex_pattern: str = None,
    filters: str = None,
    values: List[str] = None,
) -> Tuple[int, int]:
    """
    Count the rows whose ``field`` matches a list of values or a regex.

    When ``values`` is non-empty the match condition is ``field IN (values)``;
    otherwise ``field ~ regex_pattern`` is used. The method only reports how
    many rows match; the caller decides whether a match means "valid" or
    "invalid".

    :param table: table name
    :param field: column name
    :param regex_pattern: regex pattern, used only when ``values`` is empty
    :param filters: optional SQL condition, emitted after ``WHERE``
    :param values: list of values to match the column against
    :return: (count of matching rows, total count of rows considered)
    :raises ValueError: if neither ``values`` nor ``regex_pattern`` is given
    """
    if not values and regex_pattern is None:
        # Without this guard the query would silently match the text 'None'.
        raise ValueError("Either values or regex_pattern must be provided")

    def quote(text) -> str:
        # Double embedded single quotes so a value such as O'Brien cannot
        # terminate the SQL string literal early.
        return "'" + str(text).replace("'", "''") + "'"

    where_clause = f"WHERE {filters}" if filters else ""
    qualified_table_name = self.qualified_table_name(table)
    if values:
        values_str = ", ".join(quote(value) for value in values)
        match_condition = f"{field} IN ({values_str})"
    else:
        match_condition = f"{field} ~ {quote(regex_pattern)}"
    query = f"""
        SELECT SUM(CASE WHEN {match_condition} THEN 1 ELSE 0 END) AS valid_count, COUNT(*) as total_count
        FROM {qualified_table_name}
        {where_clause}
    """
    result = self.fetchone(query)
    # SUM over zero rows yields NULL; report that as a count of 0.
    return result[0] or 0, result[1]
PercentUUIDValidation, + PercentValidRegex, + PercentValidValues, +) class ValidationManager: @@ -69,6 +81,14 @@ class ValidationManager: ValidationFunction.FRESHNESS.value: "FreshnessValueMetric", ValidationFunction.COUNT_UUID.value: "CountUUIDValidation", ValidationFunction.PERCENT_UUID.value: "PercentUUIDValidation", + ValidationFunction.COUNT_INVALID_VALUES.value: "CountInvalidValues", + ValidationFunction.PERCENT_INVALID_VALUES.value: "PercentInvalidValues", + ValidationFunction.COUNT_VALID_VALUES.value: "CountValidValues", + ValidationFunction.PERCENT_VALID_VALUES.value: "PercentValidValues", + ValidationFunction.COUNT_INVALID_REGEX.value: "CountInvalidRegex", + ValidationFunction.PERCENT_INVALID_REGEX.value: "PercentInvalidRegex", + ValidationFunction.COUNT_VALID_REGEX.value: "CountValidRegex", + ValidationFunction.PERCENT_VALID_REGEX.value: "PercentValidRegex", } def __init__( diff --git a/datachecks/core/validation/validity_validation.py b/datachecks/core/validation/validity_validation.py index cf4b0cbc..56df38c3 100644 --- a/datachecks/core/validation/validity_validation.py +++ b/datachecks/core/validation/validity_validation.py @@ -24,6 +24,7 @@ def _generate_metric_value(self, **kwargs) -> Union[float, int]: valid_count, total_count = self.data_source.query_string_pattern_validity( table=self.dataset_name, field=self.field_name, + regex_pattern=self.regex_pattern, predefined_regex_pattern="uuid", filters=self.where_filter if self.where_filter is not None else None, ) @@ -40,6 +41,7 @@ def _generate_metric_value(self, **kwargs) -> Union[float, int]: valid_count, total_count = self.data_source.query_string_pattern_validity( table=self.dataset_name, field=self.field_name, + regex_pattern=self.regex_pattern, predefined_regex_pattern="uuid", filters=self.where_filter if self.where_filter is not None else None, ) @@ -48,3 +50,181 @@ def _generate_metric_value(self, **kwargs) -> Union[float, int]: raise NotImplementedError( "UUID validation is only supported 
def _matching_row_counts(validation, **match_kwargs):
    """
    Run the values/regex match query for *validation*.

    :param validation: the validation instance supplying data source, dataset,
        field and where filter
    :param match_kwargs: either ``values=...`` or ``regex_pattern=...``
    :return: (count of rows matching the values/regex, total row count)
    :raises NotImplementedError: if the data source is not a SQL data source
    """
    if not isinstance(validation.data_source, SQLDataSource):
        raise NotImplementedError(
            "Valid/Invalid values validation is only supported for SQL data sources"
        )
    (
        match_count,
        total_count,
    ) = validation.data_source.query_valid_invalid_values_validity(
        table=validation.dataset_name,
        field=validation.field_name,
        filters=validation.where_filter,
        **match_kwargs,
    )
    # The match count is a SQL SUM, which is NULL (None) when no rows were
    # scanned; report that as 0 so count metrics never return None.
    return match_count or 0, total_count


def _as_percentage(match_count, total_count):
    """Return match_count as a percentage of total_count (2 decimals), 0 if empty."""
    return round(match_count / total_count * 100, 2) if total_count > 0 else 0


# NOTE(review): the "invalid" and "valid" variants below return the same
# figure -- the number of rows matching the configured values/regex. The
# configured values/regex therefore describe the invalid set for the
# "invalid" validations and the valid set for the "valid" ones; confirm
# that this is the intended contract.


class CountInvalidValues(Validation):
    """Metric: number of rows whose field is one of the configured ``values``."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        if self.values is None:
            raise ValueError("Values are required for count_invalid_values validation")
        invalid_count, _ = _matching_row_counts(self, values=self.values)
        return invalid_count


class PercentInvalidValues(Validation):
    """Metric: percentage of rows whose field is one of the configured ``values``."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        if self.values is None:
            raise ValueError(
                "Values are required for percent_invalid_values validation"
            )
        invalid_count, total_count = _matching_row_counts(self, values=self.values)
        return _as_percentage(invalid_count, total_count)


class CountValidValues(Validation):
    """Metric: number of rows whose field is one of the configured ``values``."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        if self.values is None:
            raise ValueError("Values are required for count_valid_values validation")
        valid_count, _ = _matching_row_counts(self, values=self.values)
        return valid_count


class PercentValidValues(Validation):
    """Metric: percentage of rows whose field is one of the configured ``values``."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        if self.values is None:
            raise ValueError("Values are required for percent_valid_values validation")
        valid_count, total_count = _matching_row_counts(self, values=self.values)
        return _as_percentage(valid_count, total_count)


class CountInvalidRegex(Validation):
    """Metric: number of rows whose field matches the configured regex."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        if self.regex_pattern is None:
            raise ValueError(
                "Regex pattern is required for count_invalid_regex validation"
            )
        invalid_count, _ = _matching_row_counts(
            self, regex_pattern=self.regex_pattern
        )
        return invalid_count


class PercentInvalidRegex(Validation):
    """Metric: percentage of rows whose field matches the configured regex."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        if self.regex_pattern is None:
            raise ValueError(
                "Regex pattern is required for percent_invalid_regex validation"
            )
        invalid_count, total_count = _matching_row_counts(
            self, regex_pattern=self.regex_pattern
        )
        return _as_percentage(invalid_count, total_count)


class CountValidRegex(Validation):
    """Metric: number of rows whose field matches the configured regex."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        if self.regex_pattern is None:
            raise ValueError(
                "Regex pattern is required for count_valid_regex validation"
            )
        valid_count, _ = _matching_row_counts(self, regex_pattern=self.regex_pattern)
        return valid_count


class PercentValidRegex(Validation):
    """Metric: percentage of rows whose field matches the configured regex."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        if self.regex_pattern is None:
            raise ValueError(
                "Regex pattern is required for percent_valid_regex validation"
            )
        valid_count, total_count = _matching_row_counts(
            self, regex_pattern=self.regex_pattern
        )
        return _as_percentage(valid_count, total_count)
invalid according to the given values. + +**Example** +```yaml title="dcs_config.yaml" + - invalid values count for species: + on: count_invalid_values(species) + values: ["versicolor"] +``` + +## Percent Invalid Values +The percent invalid values validation checks the percentage of entries in a dataset that are invalid according to the given values. + +**Example** + +```yaml title="dcs_config.yaml" +validations for iris_db.iris: + - invalid values percentage for species: + on: percent_invalid_values(species) + values: ["versicolor"] +``` + +## Count Valid Values + +The count valid values validation checks how many entries in a dataset are valid according to the given values. + +**Example** + +```yaml title="dcs_config.yaml" + - valid values count for species: + on: count_valid_values(species) + values: ["setosa", "virginica"] +``` + +## Percent Valid Values + +The percent valid values validation checks the percentage of entries in a dataset that are valid according to the given values. + +**Example** + +```yaml title="dcs_config.yaml" +validations for iris_db.iris: + - valid values percentage for species: + on: percent_valid_values(species) + values: ["setosa", "virginica"] + threshold: "> 65" +``` + +## Count Invalid Regex + +The count invalid regex validation checks how many entries in a dataset are invalid according to a given regex pattern. + +**Example** + +```yaml title="dcs_config.yaml" + - invalid regex count for species: + on: count_invalid_regex(species) + regex: "^(setosa|virginica)$" +``` + +## Percent Invalid Regex + +The percent invalid regex validation checks the percentage of entries in a dataset that are invalid according to a given regex pattern. 
+ +**Example** + +```yaml title="dcs_config.yaml" +validations for iris_db.iris: + - invalid regex percentage for species: + on: percent_invalid_regex(species) + regex: "^(setosa|virginica)$" + threshold: "> 10" +``` + +## Count Valid Regex + +The count valid regex validation checks how many entries in a dataset are valid according to a given regex pattern. + +**Example** + +```yaml title="dcs_config.yaml" + - valid regex count for species: + on: count_valid_regex(species) + regex: "^(setosa|virginica)$" +``` + +## Percent Valid Regex + +The percent valid regex validation checks the percentage of entries in a dataset that are valid according to a given regex pattern. + +**Example** + +```yaml title="dcs_config.yaml" +validations for iris_db.iris: + - valid regex percentage for species: + on: percent_valid_regex(species) + regex: "^(setosa|virginica)$" + threshold: "> 90" ``` \ No newline at end of file diff --git a/examples/configurations/postgres/example_postgres_config.yaml b/examples/configurations/postgres/example_postgres_config.yaml index 2b39e2e4..60124b8a 100644 --- a/examples/configurations/postgres/example_postgres_config.yaml +++ b/examples/configurations/postgres/example_postgres_config.yaml @@ -64,3 +64,45 @@ validations for iris_pgsql.dcs_iris: on: min(sepal_length) threshold: "= 4" where: "species = 'virginica'" + + # Validity Validations + - dcs_iris invalid values count species: + on: count_invalid_values(species) + values: ["versicolor"] + threshold: "< 10" + + - dcs_iris invalid values percent species: + on: percent_invalid_values(species) + values: ["virginica"] + + - dcs_iris valid values count species: + on: count_valid_values(species) + values: ["virginica", "setosa"] + + - dcs_iris valid values percent species: + on: percent_valid_values(species) + values: ["setosa"] + + - dcs_iris count invalid regex species: + on: count_invalid_regex(species) + regex: ".e.*" + + - dcs_iris count invalid regex percent species: + on: 
def test_should_run_count_invalid_values(
    self, postgres_datasource: PostgresDataSource
):
    """Matching the ``name`` column against the value list ["thor"] hits 1 of 6 rows."""
    query_kwargs = {"table": self.TABLE_NAME, "field": "name", "values": ["thor"]}
    matched_rows, row_total = postgres_datasource.query_valid_invalid_values_validity(
        **query_kwargs
    )
    assert matched_rows == 1
    assert row_total == 6

def test_should_run_count_invalid_regex(
    self, postgres_datasource: PostgresDataSource
):
    """Matching the ``name`` column against the regex ".la.*" hits 2 of 6 rows."""
    query_kwargs = {
        "table": self.TABLE_NAME,
        "field": "name",
        "regex_pattern": ".la.*",
    }
    matched_rows, row_total = postgres_datasource.query_valid_invalid_values_validity(
        **query_kwargs
    )
    assert matched_rows == 2
    assert row_total == 6