feat(core): simplified evaluate & report methods (#29)
Features:
- Centralized main interface into evaluate method (see the usage sketch below)

Fixes:
- Default plot in DataQuality to False
- Solved DeprecationWarning on pd.Series definition (closes #36)
- (data-relations) dtypes definition bug fix
- (engines): added clean_warnings on DQ and new engines

Docs:
- Update README quickstart to new flow
- Added get_warnings() example
- Fix docstring on engine evaluate
UrbanoFonseca authored Sep 22, 2021
1 parent 1163aae commit 50a6ca2
Showing 20 changed files with 479 additions and 1,467 deletions.
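
For orientation, here is a minimal sketch of the centralized flow this commit introduces, assuming the top-level import and the demo dataset used in the README quickstart below; treat it as illustrative rather than the exact README content.

```python
import pandas as pd
from ydata_quality import DataQuality  # top-level import assumed, as in the quickstart

df = pd.read_csv('./datasets/transformed/census_10k.csv')
dq = DataQuality(df=df)

# evaluate() is now the single entry point: it runs every engine and, with the
# default summary=True, also prints the warning report that report() used to produce.
results = dq.evaluate()

# Detailed inspection goes through the stored warnings instead of a public report().
warnings = dq.get_warnings()
```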
31 changes: 19 additions & 12 deletions README.md
@@ -25,22 +25,29 @@ df = pd.read_csv('./datasets/transformed/census_10k.csv')
# create a DataQuality object from the main class that holds all quality modules
dq = DataQuality(df=df)

# run the tests
# run the tests and output a summary of the quality tests
results = dq.evaluate()

# Output a report of the quality issues found by the engines
dq.report()
```
```
Warnings count by priority:
Warnings:
TOTAL: 5 warning(s)
Priority 1: 1 warning(s)
Priority 2: 3 warning(s)
TOTAL: 4 warning(s)
List of warnings sorted by priority:
[DUPLICATE COLUMNS] Found 1 columns with exactly the same feature values as other columns. (Priority 1: heavy impact expected)
[EXACT DUPLICATES] Found 3 instances with exact duplicate feature values. (Priority 2: usage allowed, limited human intelligibility)
[FLATLINES] Found 4627 flatline events with a minimum length of 5 among the columns {'marital-status', 'workclass', 'income', 'native-country', 'capital-gain', 'capital-loss', 'education', 'occupation', 'workclass2', 'sex', 'education-num', 'hours-per-week', 'relationship', 'race'}. (Priority 2: usage allowed, limited human intelligibility)
[PREDEFINED ERRONEOUS DATA] Found 1960 ED values in the dataset. (Priority 2: usage allowed, limited human intelligibility)
Priority 2: 4 warning(s)
Priority 1 - heavy impact expected:
* [DUPLICATES - DUPLICATE COLUMNS] Found 1 columns with exactly the same feature values as other columns.
Priority 2 - usage allowed, limited human intelligibility:
* [DATA RELATIONS - HIGH COLLINEARITY - NUMERICAL] Found 3 numerical variables with high Variance Inflation Factor (VIF>5.0). The variables listed in results are highly collinear with other variables in the dataset. These will make model explainability harder and potentially give way to issues like overfitting. Depending on your end goal you might want to remove the highest VIF variables.
* [ERRONEOUS DATA - PREDEFINED ERRONEOUS DATA] Found 1960 ED values in the dataset.
* [DATA RELATIONS - HIGH COLLINEARITY - CATEGORICAL] Found 10 categorical variables with significant collinearity (p-value < 0.05). The variables listed in results are highly collinear with other variables in the dataset and sorted descending according to propensity. These will make model explainability harder and potentially give way to issues like overfitting. Depending on your end goal you might want to remove variables following the provided order.
* [DUPLICATES - EXACT DUPLICATES] Found 3 instances with exact duplicate feature values.
```


On top of the summary, you can retrieve a list of detected warnings for detailed inspection.
```python
# retrieve a list of data quality warnings
warnings = dq.get_warnings()
```
# Examples

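Continuing the quickstart above, a sketch of how the returned warnings can be inspected; the attribute names (category, test, priority, data) are assumed from the QualityWarning construction visible in the erroneous_data engine further down.

```python
# Illustrative inspection loop over the stored warnings (attribute names assumed).
for warning in dq.get_warnings():
    print(f"[{warning.category} - {warning.test}] priority {warning.priority}")
    print(warning.data)  # engine-specific payload backing the warning
```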
2 changes: 1 addition & 1 deletion src/ydata_quality/bias_fairness/engine.py
@@ -71,7 +71,7 @@ def sensitive_predictability(self, th=0.5, adjusted_metric=True):
"""
drop_features = self.sensitive_features + [self.label] # features to remove in prediction

performances = pd.Series(index=self.sensitive_features)
performances = pd.Series(index=self.sensitive_features, dtype=str)
for feat in performances.index:
data = self.df.drop(columns=[x for x in drop_features if x != feat]) # drop all except target
performances[feat] = baseline_performance(df=data, label=feat, adjusted_metric=adjusted_metric)
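The `dtype=str` addition above is the DeprecationWarning fix from the changelog: on pandas 1.x, creating an empty Series without an explicit dtype warns that the default dtype will change from float64 to object. A minimal reproduction, assuming a pandas 1.x environment:

```python
import pandas as pd

sensitive_features = ['race', 'sex']  # illustrative feature names

# Before: on pandas 1.x this emits a DeprecationWarning about the default dtype
# of an empty Series changing from float64 to object in a future version.
performances = pd.Series(index=sensitive_features)

# After: declaring the dtype up front silences the warning.
performances = pd.Series(index=sensitive_features, dtype=str)
```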
19 changes: 13 additions & 6 deletions src/ydata_quality/core/data_quality.py
@@ -38,7 +38,7 @@ def __init__(self,
corr_th: float = 0.8,
vif_th: float = 5,
p_th: float = 0.05,
plot: bool = True,
plot: bool = False,
severity: str= 'ERROR'):
"""
Engines:
@@ -154,14 +154,21 @@ def __store_warnings(self):
for engine in self.engines.values():
self._warnings += engine.get_warnings()

def evaluate(self):
"Runs all the individual data quality checks and aggregates the results."
results = {name: engine.evaluate(*self._eval_args.get(name,[])) for name, engine in self.engines.items()}
def evaluate(self, summary: bool = True) -> dict:
"""Runs all the individual data quality checks and aggregates the results.
Arguments:
summary (bool): if True, prints a report containing all the warnings detected during the data quality analysis.
"""
results = {name: engine.evaluate(*self._eval_args.get(name,[]), summary=False) for name, engine in self.engines.items()}
self.__store_warnings() # fetch all warnings from the engines
self.__clean_warnings()
if summary:
self._report()
return results

def report(self):
def _report(self):
"Prints a report containing all the warnings detected during the data quality analysis."
self.__store_warnings() # fetch all warnings from the engines
self.__clean_warnings()
if not self._warnings:
print(f'{WarningStyling.OKAY}No warnings found.{WarningStyling.ENDC}')
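A sketch of the new summary switch on DataQuality.evaluate, continuing from the quickstart object dq; the engine names inside the results dict are illustrative.

```python
# summary=False runs every engine silently and just returns the aggregated results.
quiet_results = dq.evaluate(summary=False)

# The default summary=True additionally prints the warning report.
results = dq.evaluate()

# One results entry per engine; inside each, a failed test shows up as an "[ERROR] ..." string.
for engine_name, engine_results in results.items():
    print(engine_name, type(engine_results).__name__)
```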
12 changes: 9 additions & 3 deletions src/ydata_quality/core/engine.py
@@ -117,7 +117,7 @@ def tests(self):
"List of individual tests available for the data quality checks."
return self._tests

def report(self):
def _report(self):
"Prints a report containing all the warnings detected during the data quality analysis."
self.__clean_warnings()
if not self._warnings:
@@ -133,8 +133,12 @@ def report(self):
print(warn_list[0].priority)
print(*(f"\t{warn}" for warn in warn_list), sep='\n')

def evaluate(self):
"Runs all the indidividual tests available within the same suite. Returns a dict of (name: results)."
def evaluate(self, summary: bool = True):
"""Runs all the individual tests available within the same suite. Returns a dict of (name: results).
Arguments:
summary (bool): if True, prints a report containing all the warnings detected during the data quality analysis.
"""
self._warnings = list() # reset the warnings
results = {}
for test in self.tests:
@@ -143,4 +147,6 @@ def evaluate(self):
except Exception as exc: # print a Warning and log the message
self._logger.warning('Skipping %s due to failure during computation. See results folder of this test for further details.', test)
results[test] = "[ERROR] Test failed to compute. Original exception: "+f"{exc}"
if summary:
self._report()
return results
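The base engine keeps its keep-going error handling: each test runs independently, and a failure becomes an error entry in the results dict instead of aborting the suite. A self-contained sketch of that pattern (names are illustrative, not the library's):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("quality_suite")

def run_checks(checks: dict) -> dict:
    """Run each check independently; record failures instead of raising."""
    results = {}
    for name, check in checks.items():
        try:
            results[name] = check()
        except Exception as exc:  # mirror the engine: warn and keep going
            logger.warning("Skipping %s due to failure during computation.", name)
            results[name] = "[ERROR] Test failed to compute. Original exception: " + f"{exc}"
    return results

# Tiny demo: one passing check, one failing check.
print(run_checks({"ok": lambda: 42, "boom": lambda: 1 / 0}))
```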
7 changes: 6 additions & 1 deletion src/ydata_quality/data_expectations/engine.py
@@ -168,7 +168,8 @@ def _expectation_level_assessment(self, results_json: dict) -> pd.DataFrame:
return (expectation_level_report, {idx: expectations_summary[idx] for idx in expectation_level_report.index})

def evaluate(self, results_json_path: str, df: pd.DataFrame = None, error_tol: int = 0,
rel_error_tol: Optional[float] = None, minimum_coverage: Optional[float] = 0.75) -> dict:
rel_error_tol: Optional[float] = None, minimum_coverage: Optional[float] = 0.75,
summary: bool = True) -> dict:
"""Runs tests to the validation run results and reports based on found errors.
Args:
@@ -177,6 +178,7 @@ def evaluate(self, results_json_path: str, df: pd.DataFrame = None, error_tol: i
error_tol (int): Defines how many failed expectations are tolerated.
rel_error_tol (float): Defines the maximum fraction of failed expectations, overrides error_tol.
minimum_coverage (float): Minimum expected fraction of DataFrame columns covered by the expectation suite.
summary (bool): if True, prints a report containing all the warnings detected during the data quality analysis.
"""
df = df if isinstance(df, pd.DataFrame) else None
results = {}
@@ -191,4 +193,7 @@ def evaluate(self, results_json_path: str, df: pd.DataFrame = None, error_tol: i
self._logger.error("A valid DataFrame was not passed, skipping coverage fraction test.")
results['Overall Assessment'] = self._overall_assessment(results_json_path, error_tol, rel_error_tol)
results['Expectation Level Assessment'] = self._expectation_level_assessment(results_json_path)
self.__clean_warnings()
if summary:
self._report()
return results
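A hypothetical call into the data expectations engine with the new summary flag; the class name, import path and zero-argument constructor are assumptions, while the evaluate() parameters mirror the signature above.

```python
from ydata_quality.data_expectations import DataExpectationsReporter  # assumed import

der = DataExpectationsReporter()
results = der.evaluate(
    'validation_results.json',   # hypothetical path to a validation run's JSON results
    df=df,                       # optional: enables the coverage fraction test
    error_tol=0,
    minimum_coverage=0.75,
    summary=False,               # keep the results dict, skip the printed report
)
```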
8 changes: 6 additions & 2 deletions src/ydata_quality/data_relations/engine.py
@@ -51,7 +51,7 @@ def dtypes(self, df_dtypes: Tuple[pd.DataFrame, dict]):
dtypes[col] = dtype
self._dtypes = dtypes

def evaluate(self, df: pd.DataFrame, dtypes: Optional[dict] = None, label: str=None, corr_th: float=0.8, vif_th: float=5, p_th: float=0.05, plot: bool=True) -> dict:
def evaluate(self, df: pd.DataFrame, dtypes: Optional[dict] = None, label: str=None, corr_th: float=0.8, vif_th: float=5, p_th: float=0.05, plot: bool=True, summary=True) -> dict:
"""Runs tests to the validation run results and reports based on found errors.
Note, we perform standard normalization of numerical features in order to unbias VIF and partial correlation methods.
This bias correction produces results equivalent to adding a constant feature to the dataset.
@@ -65,10 +65,11 @@ def evaluate(self, df: pd.DataFrame, dtypes: Optional[dict] = None, label: str=N
vif_th (float): Variance Inflation Factor threshold for numerical independence test, typically 5-10 is recommended. Defaults to 5.
p_th (float): Fraction of the right tail of the chi squared CDF defining threshold for categorical independence test. Defaults to 0.05.
plot (bool): Pass True to produce all available graphical outputs, False to suppress all graphical output.
summary (bool): if True, prints a report containing all the warnings detected during the data quality analysis.
"""
assert label in df.columns or not label, "The provided label name does not exist as a column in the dataset"
self.dtypes = (df, dtypes) # Consider refactoring QualityEngine dtypes (df as argument of setter)
df = standard_normalize(df, dtypes)
df = standard_normalize(df, self.dtypes)
results = {}
corr_mat, _ = correlation_matrix(df, self.dtypes, True)
p_corr_mat = partial_correlation_matrix(corr_mat)
@@ -85,6 +86,9 @@
if label:
results['Feature Importance'] = self._feature_importance(corr_mat, p_corr_mat, label, corr_th)
results['High Collinearity'] = self._high_collinearity_detection(df, self.dtypes, label, vif_th, p_th=p_th)
self.__clean_warnings()
if summary:
self._report()
return results

def _confounder_detection(self, corr_mat: pd.DataFrame, par_corr_mat: pd.DataFrame, corr_th: float) -> List[Tuple[str, str]]:
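A hypothetical run of the data relations engine; the class name and import path are assumptions, while the arguments mirror the new evaluate() signature above (and 'income' as label is illustrative, matching the census quickstart).

```python
from ydata_quality.data_relations import DataRelationsDetector  # assumed import

drd = DataRelationsDetector()
results = drd.evaluate(
    df,
    dtypes=None,      # let the engine infer dtypes (hence the self.dtypes fix above)
    label='income',   # illustrative label from the census demo data
    corr_th=0.8, vif_th=5, p_th=0.05,
    plot=False,       # graphical output stays opt-in
    summary=True,     # print the warning report at the end
)
```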
31 changes: 18 additions & 13 deletions src/ydata_quality/drift/engine.py
@@ -179,7 +179,7 @@ def _2sample_feat_good_fit(self, ref_sample: pd.Series, test_sample: pd.Series)
statistic_value, p_value = -1, None
return statistic_value, p_value, test_name

def ref_covariate_drift(self, p_thresh: float= 0.05) -> pd.DataFrame:
def ref_covariate_drift(self, p_thresh: float= 0.05, plot: bool = False) -> pd.DataFrame:
"""Controls covariate drift in reference subsamples.
The controlled metric is the number of features with no drift detection.
This % is plotted against the size of the reference subsample.
@@ -196,7 +196,7 @@ def ref_covariate_drift(self, p_thresh: float= 0.05) -> pd.DataFrame:
holdout.drop(self.label, axis=1, inplace=True)
leftover_fractions = np.arange(0.2, 1.2, 0.2)
perc_index = ["{0:.0%}".format(fraction) for fraction in leftover_fractions]
control_metric = pd.Series(index=perc_index)
control_metric = pd.Series(index=perc_index, dtype=str)
bonferroni_p = p_thresh/len(covariates.columns) # Bonferroni correction
all_p_vals = pd.DataFrame(index=perc_index, columns=covariates.columns)
for idx, fraction in enumerate(leftover_fractions):
@@ -209,20 +209,23 @@ def ref_covariate_drift(self, p_thresh: float= 0.05) -> pd.DataFrame:
all_p_vals.iloc[idx] = p_vals
control_metric.iloc[idx] = 100*len([p for p in p_vals if p > bonferroni_p])/len(p_vals)
all_p_vals['Corrected p-value threshold'] = bonferroni_p
control_metric.plot(title='Reference sample covariate features no drift(%)',
xlabel='Percentage of remaining sample used',
ylabel='Percentage of no drift features',
ylim = (0, 104), style='.-')
plt.show()
if plot:
control_metric.plot(title='Reference sample covariate features no drift(%)',
xlabel='Percentage of remaining sample used',
ylabel='Percentage of no drift features',
ylim = (0, 104), style='.-')
plt.show()
return all_p_vals

def ref_label_drift(self, p_thresh: float= 0.05):
def ref_label_drift(self, p_thresh: float= 0.05, plot: bool = False):
"""Controls label drift in the reference sample (df).
The p-value of the test is plotted against the size of the reference subsample.
A monotonic increase of this metric is expected as we increase the subsample size.
The dtype is used to decide the test to be applied to the label (chi squared or KS).
Args:
p_thresh (float): The p_threshold used for the test."""
p_thresh (float): The p_threshold used for the test.
plot (bool): if True, produces graphical outputs.
"""
if self.label is None:
self._logger.warning("No label was provided. Test skipped.")
return
@@ -237,10 +240,12 @@ def ref_label_drift(self, p_thresh: float= 0.05):
test_sample = downsample)
p_values['Label p-value'].iloc[idx] = p_val
p_values['p-value threshold'] = p_thresh
p_values.plot(title='Reference sample label p-values',
xlabel='Percentage of remaining sample used',
ylabel=f'{test_name} test p-value', style='.-')
plt.show()
if plot:
p_values.plot(title='Reference sample label p-values',
xlabel='Percentage of remaining sample used',
ylabel=f'{test_name} test p-value', style='.-')
plt.show()
return p_values

def sample_covariate_drift(self, p_thresh: float= 0.05) -> pd.DataFrame:
"""Detects covariate drift in the test sample (measured against the full reference sample).
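A hypothetical use of the new plot switches on the drift engine; the class name and constructor arguments are assumptions, while the methods and flags come from the diff above.

```python
from ydata_quality.drift import DriftAnalyser  # assumed import and class name

analyser = DriftAnalyser(ref=df, label='income')  # constructor arguments assumed

# Plots are now opt-in; both methods still return their p-value tables.
covariate_p_vals = analyser.ref_covariate_drift(p_thresh=0.05, plot=True)
label_p_vals = analyser.ref_label_drift(p_thresh=0.05, plot=False)
```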
4 changes: 2 additions & 2 deletions src/ydata_quality/erroneous_data/engine.py
@@ -98,11 +98,11 @@ def flatlines(self, th: int=5, skip: list=[]):
self.store_warning(
QualityWarning(
test='Flatlines', category='Erroneous Data', priority=2, data=flatlines,
description=f"Found {total_flatlines} flatline events with a minimun length of {th} among the columns {set(flatlines.keys())}."
description=f"Found {total_flatlines} flatline events with a minimun length of {th:.0f} among the columns {set(flatlines.keys())}."
))
return flatlines
else:
self._logger.info("No flatline events with a minimum length of %f were found.", th)
self._logger.info(f"No flatline events with a minimum length of {th:.0f} were found.")

def predefined_erroneous_data(self, skip: list=[], short: bool = True):
"""Runs a check against a list of predefined erroneous data values.
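The formatting tweak above matters for the readability of the flatlines messages: with the old %f placeholder an integer threshold renders as 5.000000, whereas {th:.0f} keeps it a whole number. A standalone illustration:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("flatlines_demo")

th = 5
logger.info("No flatline events with a minimum length of %f were found.", th)     # -> 5.000000
logger.info(f"No flatline events with a minimum length of {th:.0f} were found.")  # -> 5
```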
2 changes: 1 addition & 1 deletion src/ydata_quality/missings/engine.py
@@ -154,7 +154,7 @@ def predict_missings(self, col: Union[List[str], str, None] = None, th=0.8):
# Calculate the performance for each feature
results = pd.Series(
{c: predict_missingness(df=self.df, feature=c) for c in cols},
name='predict_missings'
name='predict_missings', dtype=object
)

# Subset for performances above threshold
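A hypothetical call into the missings engine, whose results Series now declares dtype=object; the class name, import path and constructor are assumptions based on the package layout.

```python
from ydata_quality.missings import MissingsProfiler  # assumed import and class name

mp = MissingsProfiler(df=df)  # constructor argument assumed
# Per-column performance of predicting missingness; only columns scoring above th are returned.
good_predictors = mp.predict_missings(th=0.8)
```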