feat(core): simplified evaluate & report methods (#29)
Features:
- Centralized main interface into evaluate method (see the usage sketch below)

Fixes:
- Default plot in DataQuality to False
- Solved DeprecationWarning on pd.Series definition (closes #36)
- (data-relations) dtypes definition bug fix
- (engines): added clean_warnings on DQ and new engines

Docs:
- Update README quickstart to new flow
- Added get_warnings() example
- Fix docstring on engine evaluate
UrbanoFonseca authored Sep 22, 2021
1 parent 1163aae commit 50a6ca2
Showing 20 changed files with 479 additions and 1,467 deletions.
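
For orientation, here is a minimal sketch of the centralized flow this commit introduces, assuming the top-level import and the demo dataset used in the README quickstart below; treat it as illustrative rather than the exact README content.

```python
import pandas as pd
from ydata_quality import DataQuality  # top-level import assumed, as in the quickstart

df = pd.read_csv('./datasets/transformed/census_10k.csv')
dq = DataQuality(df=df)

# evaluate() is now the single entry point: it runs every engine and, with the
# default summary=True, also prints the warning report that report() used to produce.
results = dq.evaluate()

# Detailed inspection goes through the stored warnings instead of a public report().
warnings = dq.get_warnings()
```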
31 changes: 19 additions & 12 deletions README.md
@@ -25,22 +25,29 @@ df = pd.read_csv('./datasets/transformed/census_10k.csv')
# create a DataQuality object from the main class that holds all quality modules
dq = DataQuality(df=df)

# run the tests
# run the tests and output a summary of the quality tests
results = dq.evaluate()

# Output a report of the quality issues found by the engines
dq.report()
```
```
Warnings count by priority:
Warnings:
TOTAL: 5 warning(s)
Priority 1: 1 warning(s)
Priority 2: 3 warning(s)
TOTAL: 4 warning(s)
List of warnings sorted by priority:
[DUPLICATE COLUMNS] Found 1 columns with exactly the same feature values as other columns. (Priority 1: heavy impact expected)
[EXACT DUPLICATES] Found 3 instances with exact duplicate feature values. (Priority 2: usage allowed, limited human intelligibility)
[FLATLINES] Found 4627 flatline events with a minimum length of 5 among the columns {'marital-status', 'workclass', 'income', 'native-country', 'capital-gain', 'capital-loss', 'education', 'occupation', 'workclass2', 'sex', 'education-num', 'hours-per-week', 'relationship', 'race'}. (Priority 2: usage allowed, limited human intelligibility)
[PREDEFINED ERRONEOUS DATA] Found 1960 ED values in the dataset. (Priority 2: usage allowed, limited human intelligibility)
Priority 2: 4 warning(s)
Priority 1 - heavy impact expected:
* [DUPLICATES - DUPLICATE COLUMNS] Found 1 columns with exactly the same feature values as other columns.
Priority 2 - usage allowed, limited human intelligibility:
* [DATA RELATIONS - HIGH COLLINEARITY - NUMERICAL] Found 3 numerical variables with high Variance Inflation Factor (VIF>5.0). The variables listed in results are highly collinear with other variables in the dataset. These will make model explainability harder and potentially give way to issues like overfitting. Depending on your end goal you might want to remove the highest VIF variables.
* [ERRONEOUS DATA - PREDEFINED ERRONEOUS DATA] Found 1960 ED values in the dataset.
* [DATA RELATIONS - HIGH COLLINEARITY - CATEGORICAL] Found 10 categorical variables with significant collinearity (p-value < 0.05). The variables listed in results are highly collinear with other variables in the dataset and sorted descending according to propensity. These will make model explainability harder and potentially give way to issues like overfitting. Depending on your end goal you might want to remove variables following the provided order.
* [DUPLICATES - EXACT DUPLICATES] Found 3 instances with exact duplicate feature values.
```


On top of the summary, you can retrieve a list of detected warnings for detailed inspection.
```python
# retrieve a list of data quality warnings
warnings = dq.get_warnings()
```
# Examples

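Continuing the quickstart above, a sketch of how the returned warnings can be inspected; the attribute names (category, test, priority, data) are assumed from the QualityWarning construction visible in the erroneous_data engine further down.

```python
# Illustrative inspection loop over the stored warnings (attribute names assumed).
for warning in dq.get_warnings():
    print(f"[{warning.category} - {warning.test}] priority {warning.priority}")
    print(warning.data)  # engine-specific payload backing the warning
```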
2 changes: 1 addition & 1 deletion src/ydata_quality/bias_fairness/engine.py
@@ -71,7 +71,7 @@ def sensitive_predictability(self, th=0.5, adjusted_metric=True):
"""
drop_features = self.sensitive_features + [self.label] # features to remove in prediction

performances = pd.Series(index=self.sensitive_features)
performances = pd.Series(index=self.sensitive_features, dtype=str)
for feat in performances.index:
data = self.df.drop(columns=[x for x in drop_features if x != feat]) # drop all except target
performances[feat] = baseline_performance(df=data, label=feat, adjusted_metric=adjusted_metric)
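The `dtype=str` addition above is the DeprecationWarning fix from the changelog: on pandas 1.x, creating an empty Series without an explicit dtype warns that the default dtype will change from float64 to object. A minimal reproduction, assuming a pandas 1.x environment:

```python
import pandas as pd

sensitive_features = ['race', 'sex']  # illustrative feature names

# Before: on pandas 1.x this emits a DeprecationWarning about the default dtype
# of an empty Series changing from float64 to object in a future version.
performances = pd.Series(index=sensitive_features)

# After: declaring the dtype up front silences the warning.
performances = pd.Series(index=sensitive_features, dtype=str)
```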
19 changes: 13 additions & 6 deletions src/ydata_quality/core/data_quality.py
@@ -38,7 +38,7 @@ def __init__(self,
corr_th: float = 0.8,
vif_th: float = 5,
p_th: float = 0.05,
plot: bool = True,
plot: bool = False,
severity: str= 'ERROR'):
"""
Engines:
@@ -154,14 +154,21 @@ def __store_warnings(self):
for engine in self.engines.values():
self._warnings += engine.get_warnings()

def evaluate(self):
"Runs all the individual data quality checks and aggregates the results."
results = {name: engine.evaluate(*self._eval_args.get(name,[])) for name, engine in self.engines.items()}
def evaluate(self, summary: bool = True) -> dict:
"""Runs all the individual data quality checks and aggregates the results.
Arguments:
summary (bool): if True, prints a report containing all the warnings detected during the data quality analysis.
"""
results = {name: engine.evaluate(*self._eval_args.get(name,[]), summary=False) for name, engine in self.engines.items()}
self.__store_warnings() # fetch all warnings from the engines
self.__clean_warnings()
if summary:
self._report()
return results

def report(self):
def _report(self):
"Prints a report containing all the warnings detected during the data quality analysis."
self.__store_warnings() # fetch all warnings from the engines
self.__clean_warnings()
if not self._warnings:
print(f'{WarningStyling.OKAY}No warnings found.{WarningStyling.ENDC}')
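A sketch of the new summary switch on DataQuality.evaluate, continuing from the quickstart object dq; the engine names inside the results dict are illustrative.

```python
# summary=False runs every engine silently and just returns the aggregated results.
quiet_results = dq.evaluate(summary=False)

# The default summary=True additionally prints the warning report.
results = dq.evaluate()

# One results entry per engine; inside each, a failed test shows up as an "[ERROR] ..." string.
for engine_name, engine_results in results.items():
    print(engine_name, type(engine_results).__name__)
```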
12 changes: 9 additions & 3 deletions src/ydata_quality/core/engine.py
@@ -117,7 +117,7 @@ def tests(self):
"List of individual tests available for the data quality checks."
return self._tests

def report(self):
def _report(self):
"Prints a report containing all the warnings detected during the data quality analysis."
self.__clean_warnings()
if not self._warnings:
@@ -133,8 +133,12 @@ def report(self):
print(warn_list[0].priority)
print(*(f"\t{warn}" for warn in warn_list), sep='\n')

def evaluate(self):
"Runs all the indidividual tests available within the same suite. Returns a dict of (name: results)."
def evaluate(self, summary: bool = True):
"""Runs all the individual tests available within the same suite. Returns a dict of (name: results).
Arguments:
summary (bool): if True, prints a report containing all the warnings detected during the data quality analysis.
"""
self._warnings = list() # reset the warnings
results = {}
for test in self.tests:
@@ -143,4 +147,6 @@ def evaluate(self):
except Exception as exc: # print a Warning and log the message
self._logger.warning('Skipping %s due to failure during computation. See results folder of this test for further details.', test)
results[test] = "[ERROR] Test failed to compute. Original exception: "+f"{exc}"
if summary:
self._report()
return results
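The base engine keeps its keep-going error handling: each test runs independently, and a failure becomes an error entry in the results dict instead of aborting the suite. A self-contained sketch of that pattern (names are illustrative, not the library's):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("quality_suite")

def run_checks(checks: dict) -> dict:
    """Run each check independently; record failures instead of raising."""
    results = {}
    for name, check in checks.items():
        try:
            results[name] = check()
        except Exception as exc:  # mirror the engine: warn and keep going
            logger.warning("Skipping %s due to failure during computation.", name)
            results[name] = "[ERROR] Test failed to compute. Original exception: " + f"{exc}"
    return results

# Tiny demo: one passing check, one failing check.
print(run_checks({"ok": lambda: 42, "boom": lambda: 1 / 0}))
```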
7 changes: 6 additions & 1 deletion src/ydata_quality/data_expectations/engine.py
@@ -168,7 +168,8 @@ def _expectation_level_assessment(self, results_json: dict) -> pd.DataFrame:
return (expectation_level_report, {idx: expectations_summary[idx] for idx in expectation_level_report.index})

def evaluate(self, results_json_path: str, df: pd.DataFrame = None, error_tol: int = 0,
rel_error_tol: Optional[float] = None, minimum_coverage: Optional[float] = 0.75) -> dict:
rel_error_tol: Optional[float] = None, minimum_coverage: Optional[float] = 0.75,
summary: bool = True) -> dict:
"""Runs tests to the validation run results and reports based on found errors.
Args:
@@ -177,6 +178,7 @@ def evaluate(self, results_json_path: str, df: pd.DataFrame = None, error_tol: i
error_tol (int): Defines how many failed expectations are tolerated.
rel_error_tol (float): Defines the maximum fraction of failed expectations, overrides error_tol.
minimum_coverage (float): Minimum expected fraction of DataFrame columns covered by the expectation suite.
summary (bool): if True, prints a report containing all the warnings detected during the data quality analysis.
"""
df = df if isinstance(df, pd.DataFrame) else None
results = {}
@@ -191,4 +193,7 @@ def evaluate(self, results_json_path: str, df: pd.DataFrame = None, error_tol: i
self._logger.error("A valid DataFrame was not passed, skipping coverage fraction test.")
results['Overall Assessment'] = self._overall_assessment(results_json_path, error_tol, rel_error_tol)
results['Expectation Level Assessment'] = self._expectation_level_assessment(results_json_path)
self.__clean_warnings()
if summary:
self._report()
return results
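A hypothetical call into the data expectations engine with the new summary flag; the class name, import path and zero-argument constructor are assumptions, while the evaluate() parameters mirror the signature above.

```python
from ydata_quality.data_expectations import DataExpectationsReporter  # assumed import

der = DataExpectationsReporter()
results = der.evaluate(
    'validation_results.json',   # hypothetical path to a validation run's JSON results
    df=df,                       # optional: enables the coverage fraction test
    error_tol=0,
    minimum_coverage=0.75,
    summary=False,               # keep the results dict, skip the printed report
)
```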
8 changes: 6 additions & 2 deletions src/ydata_quality/data_relations/engine.py
@@ -51,7 +51,7 @@ def dtypes(self, df_dtypes: Tuple[pd.DataFrame, dict]):
dtypes[col] = dtype
self._dtypes = dtypes

def evaluate(self, df: pd.DataFrame, dtypes: Optional[dict] = None, label: str=None, corr_th: float=0.8, vif_th: float=5, p_th: float=0.05, plot: bool=True) -> dict:
def evaluate(self, df: pd.DataFrame, dtypes: Optional[dict] = None, label: str=None, corr_th: float=0.8, vif_th: float=5, p_th: float=0.05, plot: bool=True, summary=True) -> dict:
"""Runs tests to the validation run results and reports based on found errors.
Note, we perform standard normalization of numerical features in order to unbias VIF and partial correlation methods.
This bias correction produces results equivalent to adding a constant feature to the dataset.
@@ -65,10 +65,11 @@ def evaluate(self, df: pd.DataFrame, dtypes: Optional[dict] = None, label: str=N
vif_th (float): Variance Inflation Factor threshold for numerical independence test, typically 5-10 is recommended. Defaults to 5.
p_th (float): Fraction of the right tail of the chi squared CDF defining threshold for categorical independence test. Defaults to 0.05.
plot (bool): Pass True to produce all available graphical outputs, False to suppress all graphical output.
summary (bool): if True, prints a report containing all the warnings detected during the data quality analysis.
"""
assert label in df.columns or not label, "The provided label name does not exist as a column in the dataset"
self.dtypes = (df, dtypes) # Consider refactoring QualityEngine dtypes (df as argument of setter)
df = standard_normalize(df, dtypes)
df = standard_normalize(df, self.dtypes)
results = {}
corr_mat, _ = correlation_matrix(df, self.dtypes, True)
p_corr_mat = partial_correlation_matrix(corr_mat)
@@ -85,6 +86,9 @@
if label:
results['Feature Importance'] = self._feature_importance(corr_mat, p_corr_mat, label, corr_th)
results['High Collinearity'] = self._high_collinearity_detection(df, self.dtypes, label, vif_th, p_th=p_th)
self.__clean_warnings()
if summary:
self._report()
return results

def _confounder_detection(self, corr_mat: pd.DataFrame, par_corr_mat: pd.DataFrame, corr_th: float) -> List[Tuple[str, str]]:
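A hypothetical run of the data relations engine; the class name and import path are assumptions, while the arguments mirror the new evaluate() signature above (and 'income' as label is illustrative, matching the census quickstart).

```python
from ydata_quality.data_relations import DataRelationsDetector  # assumed import

drd = DataRelationsDetector()
results = drd.evaluate(
    df,
    dtypes=None,      # let the engine infer dtypes (hence the self.dtypes fix above)
    label='income',   # illustrative label from the census demo data
    corr_th=0.8, vif_th=5, p_th=0.05,
    plot=False,       # graphical output stays opt-in
    summary=True,     # print the warning report at the end
)
```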
31 changes: 18 additions & 13 deletions src/ydata_quality/drift/engine.py
@@ -179,7 +179,7 @@ def _2sample_feat_good_fit(self, ref_sample: pd.Series, test_sample: pd.Series)
statistic_value, p_value = -1, None
return statistic_value, p_value, test_name

def ref_covariate_drift(self, p_thresh: float= 0.05) -> pd.DataFrame:
def ref_covariate_drift(self, p_thresh: float= 0.05, plot: bool = False) -> pd.DataFrame:
"""Controls covariate drift in reference subsamples.
The controlled metric is the number of features with no drift detection.
This % is plotted against the size of the reference subsample.
@@ -196,7 +196,7 @@ def ref_covariate_drift(self, p_thresh: float= 0.05) -> pd.DataFrame:
holdout.drop(self.label, axis=1, inplace=True)
leftover_fractions = np.arange(0.2, 1.2, 0.2)
perc_index = ["{0:.0%}".format(fraction) for fraction in leftover_fractions]
control_metric = pd.Series(index=perc_index)
control_metric = pd.Series(index=perc_index, dtype=str)
bonferroni_p = p_thresh/len(covariates.columns) # Bonferroni correction
all_p_vals = pd.DataFrame(index=perc_index, columns=covariates.columns)
for idx, fraction in enumerate(leftover_fractions):
@@ -209,20 +209,23 @@ def ref_covariate_drift(self, p_thresh: float= 0.05) -> pd.DataFrame:
all_p_vals.iloc[idx] = p_vals
control_metric.iloc[idx] = 100*len([p for p in p_vals if p > bonferroni_p])/len(p_vals)
all_p_vals['Corrected p-value threshold'] = bonferroni_p
control_metric.plot(title='Reference sample covariate features no drift(%)',
xlabel='Percentage of remaining sample used',
ylabel='Percentage of no drift features',
ylim = (0, 104), style='.-')
plt.show()
if plot:
control_metric.plot(title='Reference sample covariate features no drift(%)',
xlabel='Percentage of remaining sample used',
ylabel='Percentage of no drift features',
ylim = (0, 104), style='.-')
plt.show()
return all_p_vals

def ref_label_drift(self, p_thresh: float= 0.05):
def ref_label_drift(self, p_thresh: float= 0.05, plot: bool = False):
"""Controls label drift in the reference sample (df).
The p-value of the test is plotted against the size of the reference subsample.
A monotonic increase of this metric is expected as we increase the subsample size.
The dtype is used to decide the test to be applied to the label (chi squared or KS).
Args:
p_thresh (float): The p_threshold used for the test."""
p_thresh (float): The p_threshold used for the test.
plot (bool): if True, produces graphical outputs.
"""
if self.label is None:
self._logger.warning("No label was provided. Test skipped.")
return
@@ -237,10 +240,12 @@ def ref_label_drift(self, p_thresh: float= 0.05):
test_sample = downsample)
p_values['Label p-value'].iloc[idx] = p_val
p_values['p-value threshold'] = p_thresh
p_values.plot(title='Reference sample label p-values',
xlabel='Percentage of remaining sample used',
ylabel=f'{test_name} test p-value', style='.-')
plt.show()
if plot:
p_values.plot(title='Reference sample label p-values',
xlabel='Percentage of remaining sample used',
ylabel=f'{test_name} test p-value', style='.-')
plt.show()
return p_values

def sample_covariate_drift(self, p_thresh: float= 0.05) -> pd.DataFrame:
"""Detects covariate drift in the test sample (measured against the full reference sample).
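A hypothetical use of the new plot switches on the drift engine; the class name and constructor arguments are assumptions, while the methods and flags come from the diff above.

```python
from ydata_quality.drift import DriftAnalyser  # assumed import and class name

analyser = DriftAnalyser(ref=df, label='income')  # constructor arguments assumed

# Plots are now opt-in; both methods still return their p-value tables.
covariate_p_vals = analyser.ref_covariate_drift(p_thresh=0.05, plot=True)
label_p_vals = analyser.ref_label_drift(p_thresh=0.05, plot=False)
```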
4 changes: 2 additions & 2 deletions src/ydata_quality/erroneous_data/engine.py
@@ -98,11 +98,11 @@ def flatlines(self, th: int=5, skip: list=[]):
self.store_warning(
QualityWarning(
test='Flatlines', category='Erroneous Data', priority=2, data=flatlines,
description=f"Found {total_flatlines} flatline events with a minimun length of {th} among the columns {set(flatlines.keys())}."
description=f"Found {total_flatlines} flatline events with a minimun length of {th:.0f} among the columns {set(flatlines.keys())}."
))
return flatlines
else:
self._logger.info("No flatline events with a minimum length of %f were found.", th)
self._logger.info(f"No flatline events with a minimum length of {th:.0f} were found.")

def predefined_erroneous_data(self, skip: list=[], short: bool = True):
"""Runs a check against a list of predefined erroneous data values.
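The formatting tweak above matters for the readability of the flatlines messages: with the old %f placeholder an integer threshold renders as 5.000000, whereas {th:.0f} keeps it a whole number. A standalone illustration:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("flatlines_demo")

th = 5
logger.info("No flatline events with a minimum length of %f were found.", th)     # -> 5.000000
logger.info(f"No flatline events with a minimum length of {th:.0f} were found.")  # -> 5
```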
2 changes: 1 addition & 1 deletion src/ydata_quality/missings/engine.py
@@ -154,7 +154,7 @@ def predict_missings(self, col: Union[List[str], str, None] = None, th=0.8):
# Calculate the performance for each feature
results = pd.Series(
{c: predict_missingness(df=self.df, feature=c) for c in cols},
name='predict_missings'
name='predict_missings', dtype=object
)

# Subset for performances above threshold
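A hypothetical call into the missings engine, whose results Series now declares dtype=object; the class name, import path and constructor are assumptions based on the package layout.

```python
from ydata_quality.missings import MissingsProfiler  # assumed import and class name

mp = MissingsProfiler(df=df)  # constructor argument assumed
# Per-column performance of predicting missingness; only columns scoring above th are returned.
good_predictors = mp.predict_missings(th=0.8)
```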