def query_negative_metric(
    self, table: str, field: str, operation: str, filters: Union[str, None] = None
) -> Union[int, float]:
    """Return the count or percentage of rows whose ``field`` is negative.

    :param table: table name; qualified through ``self.qualified_table_name``
    :param field: column compared against zero (``field < 0``)
    :param operation: ``"percent"`` returns the share of negative rows as a
        percentage rounded to two decimals; any other value returns the raw
        count of negative rows
    :param filters: optional SQL predicate restricting the rows considered
        for both the negative count and the total count
    :return: the negative-row count, or the percentage of negative rows
        (``0.0`` when no rows match, so there is nothing to divide by)
    """
    qualified_table_name = self.qualified_table_name(table)

    negative_query = (
        f"SELECT COUNT(*) FROM {qualified_table_name} WHERE {field} < 0"
    )
    total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name}"

    if filters:
        # Parenthesise the caller's predicate: without this, a filter such as
        # "a = 1 OR b = 2" would bind as "(field < 0 AND a = 1) OR b = 2" and
        # count rows that are not negative at all.
        negative_query += f" AND ({filters})"
        total_count_query += f" WHERE {filters}"

    if operation != "percent":
        return self.fetchone(negative_query)[0]

    # NULLIF turns a zero denominator into NULL so an empty table (or a filter
    # matching nothing) yields NULL instead of a division-by-zero error.
    percent_query = (
        f"SELECT (CAST(({negative_query}) AS float) / "
        f"NULLIF(CAST(({total_count_query}) AS float), 0)) * 100"
    )
    result = self.fetchone(percent_query)[0]
    if result is None:
        return 0.0
    return round(result, 2)
class CountNegativeValidation(Validation):
    """Validation whose metric is the number of negative values in a field."""

    def _generate_metric_value(self, **kwargs) -> int:
        """Ask the SQL data source for the negative-value count of the field.

        Raises:
            ValueError: if the configured data source is not a SQLDataSource.
        """
        source = self.data_source
        if not isinstance(source, SQLDataSource):
            raise ValueError("Unsupported data source type for CountNegativeValidation")
        return source.query_negative_metric(
            table=self.dataset_name,
            field=self.field_name,
            operation="count",
            filters=self.where_filter,
        )


class PercentNegativeValidation(Validation):
    """Validation whose metric is the percentage of negative values in a field."""

    def _generate_metric_value(self, **kwargs) -> float:
        """Ask the SQL data source for the negative-value percentage of the field.

        Raises:
            ValueError: if the configured data source is not a SQLDataSource.
        """
        source = self.data_source
        if not isinstance(source, SQLDataSource):
            raise ValueError(
                "Unsupported data source type for PercentNegativeValidation"
            )
        return source.query_negative_metric(
            table=self.dataset_name,
            field=self.field_name,
            operation="percent",
            filters=self.where_filter,
        )
b/docs/validations/validity.md @@ -475,4 +475,33 @@ validations for product_db.products: - price zero percent: on: percent_zero(price) threshold: "< 10" + +# **Numeric Negative Value Validations** + +The Numeric Negative Value Validations detect negative values in numeric fields within a dataset and ensure that they do not exceed or fall below a specified threshold. + +## **COUNT_NEGATIVE** + +This validation counts the number of negative values present in a given numeric field. + +**Example** + +```yaml +validations for product_db.products: + - negative value count should be less than 2: + on: count_negative(price) + threshold: "< 2" +``` + +## **PERCENT_NEGATIVE** + +This validation calculates the percentage of negative values in a numeric field, relative to the total number of records. + +**Example** + +```yaml +validations for product_db.products: + - negative value percentage should be less than 40%: + on: percent_negative(price) + threshold: "< 40" ``` diff --git a/examples/configurations/postgres/example_postgres_config.yaml b/examples/configurations/postgres/example_postgres_config.yaml index 623d6fe2..bc27090b 100644 --- a/examples/configurations/postgres/example_postgres_config.yaml +++ b/examples/configurations/postgres/example_postgres_config.yaml @@ -48,6 +48,15 @@ validations for iris_pgsql.dcs_iris: threshold: "< 10" + # **Negative Value Validations** + - price negative value count: + on: count_negative(price) + threshold: "< 2" + + - price negative value percentage: + on: percent_negative(price) + threshold: "< 40" + # Uniqueness Metrics - species duplicate count: on: count_duplicate(species) diff --git a/tests/core/configuration/test_configuration_v1.py b/tests/core/configuration/test_configuration_v1.py index 069d9734..aa756b34 100644 --- a/tests/core/configuration/test_configuration_v1.py +++ b/tests/core/configuration/test_configuration_v1.py @@ -993,3 +993,35 @@ def test_should_parse_percent_zero_validation(): .threshold.lt == 10 ) + + +def 
def test_should_parse_count_negative_validation():
    """A ``count_negative(price)`` rule parses to ``ValidationFunction.COUNT_NEGATIVE``."""
    yaml_string = """
    validations for product_db.products:
      - count_negative for price should be less than 2:
          on: count_negative(price)
          threshold: "< 2"
    """
    parsed = load_configuration_from_yaml_str(yaml_string)
    dataset_validations = parsed.validations["product_db.products"].validations
    rule = dataset_validations["count_negative for price should be less than 2"]
    assert rule.get_validation_function == ValidationFunction.COUNT_NEGATIVE


def test_should_parse_percent_negative_validation():
    """A ``percent_negative(price)`` rule parses to ``ValidationFunction.PERCENT_NEGATIVE``."""
    yaml_string = """
    validations for product_db.products:
      - percent_negative for price should be less than 40%:
          on: percent_negative(price)
          threshold: "< 40"
    """
    parsed = load_configuration_from_yaml_str(yaml_string)
    dataset_validations = parsed.validations["product_db.products"].validations
    rule = dataset_validations["percent_negative for price should be less than 40%"]
    assert rule.get_validation_function == ValidationFunction.PERCENT_NEGATIVE
invalid usa_state_code -- invalid usa_zip_code -- invalid cusip -- invalid perm_id ('captain america', '{(utc_now - datetime.timedelta(days=3)).strftime("%Y-%m-%d")}', 90, 80, 'shield', 'e7194aaa-5516-4362-a5ff-6ff971976b', '(123) 456-7890', 'john.doe@.com ', 'NY', '12-345', 34.0522, -118.2437,'000-12-3456', 'B01HL06', - 'CDR300OS7OJENVEDDW89','037833100','BBG000BL2H25','US5949181045', '1234567890123456789', 1000), -- invalid weapon_id --invalid email -- invalid usa_zip_code -- invalid ssn + 'CDR300OS7OJENVEDDW89','037833100','BBG000BL2H25','US5949181045', '1234567890123456789', 1000, -50.0), -- invalid weapon_id --invalid email -- invalid usa_zip_code -- invalid ssn ('iron man', '{(utc_now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")}', 50, 70, 'suit', '1739c676-6108-4dd2-8984-2459df744936', '123 456 7890', 'contact@company..org', 'XY', '85001', 37.7749, -122.4194,'859-99-9999','4155586', - 'VXQ400F1OBWAVPBJP86','594918104','BBG000B3YB97','US38259P5088', '123456789012345678', 0), -- invalid email -- invalid usa_state_code -- invalid lei -- invalid perm_id + 'VXQ400F1OBWAVPBJP86','594918104','BBG000B3YB97','US38259P5088', '123456789012345678', 0, -150.0), -- invalid email -- invalid usa_state_code -- invalid lei -- invalid perm_id ('hawk eye', '{(utc_now - datetime.timedelta(days=5)).strftime("%Y-%m-%d")}', 40, 60, 'bow', '1739c676-6108-4dd2-8984-2459df746', '+1 123-456-7890', 'user@@example.com', 'TX', '30301', 51.1657, 10.4515,'123-45-67890','12345', - 'FKRD00GCEYWDCNYLNF60','38259P508','BBG000B57Y12','US83165F1026', '5647382910564738291', 90), -- invalid weapon_id --invalid email -- invalid ssn -- invalid sedol + 'FKRD00GCEYWDCNYLNF60','38259P508','BBG000B57Y12','US83165F1026', '5647382910564738291', 90, 50.0), -- invalid weapon_id --invalid email -- invalid ssn -- invalid sedol ('clark kent', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}', 35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8', '09123.456.7890', 'contact@company.org', 'ZZ', 
'123456', 51.5074, -0.1278,'666-45-6789','34A56B7', - '6R5J00FMIANQQH6JMN56','83165F102','BBG000B9XRY','US0231351067', '1234-5678-9012-3456-78X', 0), -- invalid weapon_id -- invalid phone -- invalid usa_state_code -- invalid usa_zip_code -- invalid ssn -- invalid sedol -- invalid figi -- invalid perm_id + '6R5J00FMIANQQH6JMN56','83165F102','BBG000B9XRY','US0231351067', '1234-5678-9012-3456-78X', 0, -25.0), -- invalid weapon_id -- invalid phone -- invalid usa_state_code -- invalid usa_zip_code -- invalid ssn -- invalid sedol -- invalid figi -- invalid perm_id ('black widow', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}', 35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8032c73', '+1 (123) 456-7890', 'jane_smith123@domain.co.uk', 'FL', '90210', 483.8566, 2.3522,'001-01-0001','456VGHY', - '0FPB00BBRHUYOE7DSK19','023135106','BBG000B6R530','US037833100', '2345-6789-0123-4567-890', 70) -- invalid isin -- invalid sedol + '0FPB00BBRHUYOE7DSK19','023135106','BBG000B6R530','US037833100', '2345-6789-0123-4567-890', 70, 30.0) -- invalid isin -- invalid sedol """ postgresql_connection.execute(text(insert_query)) @@ -611,7 +612,7 @@ def test_should_return_90th_percentile_age( percentile_90 = postgres_datasource.query_get_percentile( table=self.TABLE_NAME, field="age", percentile=0.9 ) - assert percentile_90 == 1500 + assert percentile_90 == 1500 # Expected 90th percentile value. 
def test_should_return_count_negative(
    self, postgres_datasource: PostgresDataSource
):
    """The seeded rows hold three negative ``price`` values (-50, -150, -25)."""
    query_args = {"table": self.TABLE_NAME, "field": "price", "operation": "count"}
    assert postgres_datasource.query_negative_metric(**query_args) == 3


def test_should_return_percent_negative(
    self, postgres_datasource: PostgresDataSource
):
    """Three negative prices out of six seeded rows is 50 percent."""
    query_args = {"table": self.TABLE_NAME, "field": "price", "operation": "percent"}
    measured = postgres_datasource.query_negative_metric(**query_args)
    assert round(measured, 2) == 50.0