generated from ydataai/opensource-template
-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(missings): added missings mvp (#3)
Features: - High occurrence of nulls - Missing values correlations - Missing values predictor - Performance drop - Store / get warning (core)
- Loading branch information
UrbanoFonseca
authored
Jul 8, 2021
1 parent
b0a3556
commit 6a95c37
Showing
8 changed files
with
696 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
pandas==1.2.* | ||
pydantic==1.8.2 | ||
scikit-learn==0.24.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
""" | ||
Tools to guarantee data quality on missing values. | ||
""" | ||
from ydata_quality.missings.engine import MissingsProfiler | ||
|
||
__all__ = [ | ||
"MissingsProfiler" | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
""" | ||
Implementation of MissingProfiler engine to run missing value analysis. | ||
""" | ||
from typing import List, Optional, Union | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from ydata_quality.core import QualityEngine, QualityWarning | ||
from ydata_quality.utils.modelling import (baseline_performance, | ||
performance_per_missing_value, | ||
predict_missingness) | ||
|
||
|
||
class MissingsProfiler(QualityEngine):
    """Main class to run missing value analysis."""

    def __init__(self, df: pd.DataFrame, target: Optional[str] = None):
        """
        Args:
            df (pd.DataFrame): reference DataFrame used to run the missing value analysis.
            target (str, optional): name of the target column; read by performance_drop.
        """
        super().__init__(df=df)
        # Stored directly (not through the setter) so that target=None is accepted.
        self._target = target
        # Names of the test methods of this engine.
        # NOTE(review): presumably consumed by QualityEngine to run all tests - confirm.
        self._tests = ["nulls_higher_than", "high_missing_correlations", "predict_missings"]

    @property
    def target(self):
        """Name of the target column (None when not defined)."""
        return self._target

    @target.setter
    def target(self, target: str):
        """Set the target column, which must be one of the DataFrame columns."""
        if target not in self.df.columns:
            raise Exception(f'Provided target ({target}) must belong to the dataframe columns ({list(self.df.columns)}).')
        self._target = target

    def _get_null_cols(self, col: Optional[str] = None) -> List[str]:
        """Returns list of given column or all columns with null values in DataFrame if None."""
        if col is None:
            return list(self.df.columns[self.null_count(minimal=False) > 0])
        return col if isinstance(col, list) else [col]

    def __get_prediction_type(self):
        """Decide whether to use classification or regression setting, based on target."""
        # TODO: Improve prediction type guesstimate based on alternative heuristics (e.g. dtypes, value_counts)
        # nunique() ignores missing values; a plain set() would count every NaN
        # as a distinct value and misclassify a binary target that has nulls.
        if self.df[self.target].nunique() == 2:  # binary classification
            return 'classification'
        return 'regression'

    def null_count(self, col: Union[List[str], str, None] = None, normalize=False, minimal=True):
        """Returns the count of null values.

        Args:
            col (optional, str): name of column to calculate nulls. If none, calculates for full dataframe.
            normalize (bool): flag to return nulls as proportion of total rows. Defaults to False.
            minimal (bool): flag to drop zero-nulls when computed for all columns.
        """
        # if col is not provided, calculate for full dataset
        count = self.df.isnull().sum() if col is None else self.df[col].isnull().sum()
        # if normalize, return as proportion of total rows
        if normalize:
            count = count / len(self.df)
        # subset to columns that actually have nulls
        if col is None and minimal:
            count = count[count > 0]
        return count

    def nulls_higher_than(self, th=0.2):
        """Returns the columns whose ratio of missing values is at or above the threshold.

        Stores a QualityWarning when at least one column is found.

        Args:
            th (float): threshold on the ratio of missing values. Defaults to 0.2.

        Returns:
            The ratios of the flagged columns, or None when no column is flagged.
        """
        ratios = self.null_count(col=None, normalize=True)
        high_ratios = ratios[ratios >= th]
        if len(high_ratios) == 0:
            return None
        self.store_warning(
            QualityWarning(
                test='High Missings', category='Missings', priority=3, data=high_ratios,
                description=f"Found {len(high_ratios)} columns with more than {th*100:.1f}% of missing values."
            )
        )
        return high_ratios

    def missing_correlations(self):
        """Calculate the correlations between missing values in feature values.

        # TODO: Replace standard correlation coefficient by Cramer's V / Theil's U.
        """
        nulls = self.df.loc[:, self.null_count(minimal=False) > 0]  # drop columns without nulls
        return nulls.isnull().corr()

    def high_missing_correlations(self, th: float = 0.5):
        """Returns the feature pairs with high correlation of missing values.

        Stores a QualityWarning when at least one pair is found.

        Args:
            th (float): threshold on the absolute correlation. Defaults to 0.5.

        Returns:
            pd.Series: absolute correlations named 'missings_corr', indexed by the
            sorted feature names joined with '_', in descending order.
        """
        corrs = self.missing_correlations().abs()  # compute the absolute correlation
        # Blank out the diagonal so that same-column pairs never pass the threshold.
        # Done with mask() rather than writing into .values, which is not guaranteed
        # to be a writable view of the frame.
        corrs = corrs.mask(np.eye(len(corrs), dtype=bool))
        pairs = corrs[corrs > th].melt(ignore_index=False).reset_index().dropna()  # subset by threshold

        # TODO: For acyclical correlation measures (e.g. Theil's U), store direction as well

        # create the sorted pairs of feature names
        names = ['_'.join(sorted(pair)) for pair in zip(pairs['index'], pairs['variable'])]
        pairs = pairs.assign(features=names)
        pairs = pairs.drop_duplicates('features')  # deduplicate combination pairs
        pairs = pairs.sort_values(by='value', ascending=False)  # sort by correlation
        # Select the column explicitly instead of squeeze(): squeezing a single-row
        # frame collapses it to a scalar, which breaks len() below.
        corrs = pairs.set_index('features')['value'].rename('missings_corr')

        if len(corrs) > 0:
            self.store_warning(
                QualityWarning(
                    test='High Missing Correlations', category='Missings', priority=3, data=corrs,
                    description=f"Found {len(corrs)} feature pairs with correlation "\
                                f"of missing values higher than defined threshold ({th})."
                )
            )
        return corrs

    def performance_drop(self, col: Union[List[str], str, None] = None, normalize=True):
        """Calculate the drop in performance when the feature values of a given column are missing.

        Performance is measured by "AU-ROC" for binary classification and "Mean Squared Error" for regression.

        Args:
            col (Union[List[str], str, None], optional): reference for comparing performances between valued and missing value instances.
                If None, calculates performance_drop for all columns with missing values.
            normalize (bool): performance as ratio over baseline performance achieved for entire dataset.

        Returns:
            pd.DataFrame with one column per analysed feature, or None when no
            target is defined and the test is skipped.
        """
        # Guarantee that target is defined. Otherwise skip
        if self.target is None:
            print('Argument "target" must be defined to calculate performance_drop metric. Skipping test.')
            return None

        # Parse the columns for which to calculate the drop in performance on missings
        cols = self._get_null_cols(col)

        # Guesstimate the prediction type
        prediction_type = self.__get_prediction_type()
        results = pd.DataFrame({
            c: performance_per_missing_value(df=self.df, feature=c, target=self.target, type=prediction_type)
            for c in cols
        })

        # Normalize the results with a baseline performance.
        if normalize:
            baseline = baseline_performance(df=self.df, target=self.target, type=prediction_type)
            results = results / baseline

        return results

    def predict_missings(self, col: Union[List[str], str, None] = None, th=0.8):
        """Calculates the performance score of a baseline model trained to predict missingness of a specific feature.

        Performance is measured on "AU-ROC" for a binary classifier trained to predict occurrence of missing values.
        High performances signal that the occurrence of missing values for a specific feature may be impacted by the
        feature values of all the remaining features.

        Args:
            col (Union[List[str], str, None], optional): reference for predicting occurrence of missing values.
                If None, calculates predict_missings for all columns with missing values.
            th (float): performance threshold to generate a QualityWarning.

        Returns:
            pd.Series: score per analysed feature, named 'predict_missings'.
        """
        # Parse the columns for which to calculate the missingness performance
        cols = self._get_null_cols(col)
        # Calculate the performance for each feature
        results = pd.Series(
            {c: predict_missingness(df=self.df, feature=c) for c in cols},
            name='predict_missings'
        )

        # Subset for performances above threshold
        high_perfs = results[results > th]

        # Generate a QualityWarning if any high
        if len(high_perfs) > 0:
            self.store_warning(
                QualityWarning(
                    test='Missingness Prediction', category='Missings', priority=2, data=high_perfs,
                    description=f'Found {len(high_perfs)} features with prediction performance of missingness above threshold ({th}).'
                )
            )
        return results
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
""" | ||
Utilities based on building baseline machine learning models. | ||
""" | ||
|
||
import pandas as pd | ||
from sklearn.exceptions import ConvergenceWarning | ||
from sklearn.impute import SimpleImputer | ||
from sklearn.linear_model import LinearRegression, LogisticRegression | ||
from sklearn.metrics import mean_squared_error, roc_auc_score | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.pipeline import Pipeline | ||
from sklearn.utils._testing import ignore_warnings | ||
|
||
# Baseline model for classification: imputes missing values with a default
# SimpleImputer, then fits a default LogisticRegression.
BASELINE_CLASSIFIER = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('classifier', LogisticRegression()),
])

# Baseline model for regression: same imputation followed by a default
# LinearRegression. The final step keeps the 'classifier' name.
BASELINE_REGRESSION = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('classifier', LinearRegression()),
])
|
||
|
||
@ignore_warnings(category=ConvergenceWarning)
def baseline_predictions(df: pd.DataFrame, target: str, type='classification'):
    """Train a baseline model and predict for a test set.

    Only the numeric columns of df are used as model inputs.

    Args:
        df (pd.DataFrame): dataset holding both the features and the target column.
        target (str): name of the column of df to predict.
        type (str): 'classification' selects BASELINE_CLASSIFIER; any other value
            selects BASELINE_REGRESSION.

    Returns:
        tuple: (y_pred, X_test, y_test). For classification, y_pred is column 1 of
        predict_proba; otherwise it is the output of predict. X_test keeps every
        column (not only the numeric ones) so callers can group by any feature.
    """
    # 1. Define the baseline model
    is_classification = type == 'classification'
    model = BASELINE_CLASSIFIER if is_classification else BASELINE_REGRESSION

    # 2. Train overall model
    X, y = df.drop(target, axis=1), df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    model.fit(X_train.select_dtypes('number'), y_train)

    # 3. Predict. The branch mirrors the model selection above, so every value
    # of `type` that gets a regression model also gets regression predictions
    # (previously an unknown `type` left y_pred undefined).
    numeric_test = X_test.select_dtypes('number')
    if is_classification:
        y_pred = model.predict_proba(numeric_test)[:, 1]
    else:
        y_pred = model.predict(numeric_test)

    # 4. Return both the predictions and X_test, y_test to analyze the performances
    return y_pred, X_test, y_test
|
||
def baseline_performance(df: pd.DataFrame, target: str, type='classification'):
    """Score the baseline model on its held-out test set.

    Uses roc_auc_score when type is 'classification' and mean_squared_error
    for any other value.
    """
    # Baseline predictions together with the matching ground truth.
    y_pred, _, y_test = baseline_predictions(df=df, target=target, type=type)

    if type == 'classification':
        return roc_auc_score(y_test, y_pred)
    return mean_squared_error(y_test, y_pred)
|
||
|
||
def performance_per_feature_values(df: pd.DataFrame, feature: str, target: str, type='classification'):
    """Performance achieved per each value of a groupby feature.

    Args:
        df (pd.DataFrame): dataset holding the features and the target column.
        feature (str): column whose distinct values define the groups.
        target (str): name of the column of df to predict.
        type (str): 'classification' scores with roc_auc_score; any other value
            scores with mean_squared_error.

    Returns:
        dict: maps each non-missing value of the feature found in the test set
        to the metric computed on the test rows holding that value.
    """
    # 1. Define the baseline performance metric
    metric = roc_auc_score if type == 'classification' else mean_squared_error

    # 2. Get the baseline predictions
    y_pred, X_test, y_test = baseline_predictions(df=df, target=target, type=type)

    # 3. Get the performances per feature value
    values = X_test[feature]
    results = {}
    # Missing values are skipped: NaN never compares equal to itself, so its
    # group would be empty and the metric would raise on zero samples.
    for value in values.dropna().unique().tolist():
        mask = (values == value).to_numpy()  # computed once, shared by both selections
        results[value] = metric(y_test[mask], y_pred[mask])
    return results
|
||
def performance_per_missing_value(df: pd.DataFrame, feature: str, target: str, type='classification'):
    """Performance difference between valued and missing values in feature.

    Args:
        df (pd.DataFrame): dataset holding the features and the target column.
        feature (str): column whose missing / valued rows are compared.
        target (str): name of the column of df to predict.
        type (str): 'classification' scores with roc_auc_score; any other value
            scores with mean_squared_error.

    Returns:
        dict: metric under the keys 'missing' and 'valued'. A key is NaN when the
        test set has no row in that group.
    """
    # 1. Define the baseline performance metric
    metric = roc_auc_score if type == 'classification' else mean_squared_error

    # 2. Get the baseline predictions
    y_pred, X_test, y_test = baseline_predictions(df=df, target=target, type=type)

    # 3. Get the performance per valued vs missing feature
    missing_mask = X_test[feature].isna().to_numpy()
    results = {}
    for label, mask in (('missing', missing_mask), ('valued', ~missing_mask)):
        # The metrics raise on zero samples; an empty group (e.g. every null of
        # the feature fell in the training split) is reported as NaN instead.
        results[label] = metric(y_test[mask], y_pred[mask]) if mask.any() else float('nan')
    return results
|
||
@ignore_warnings(category=ConvergenceWarning)
def predict_missingness(df: pd.DataFrame, feature: str):
    """Score how well a baseline classifier predicts whether `feature` is missing.

    The label is the missing-value indicator of the feature; the inputs are the
    numeric columns among the remaining ones. Returns the area under the ROC
    curve measured on a held-out test set.
    """
    # Build the label on a new frame so the caller's DataFrame is left untouched.
    target = f'is_missing_{feature}'
    frame = df.assign(**{target: df[feature].isna()})

    # Neither the feature itself nor its indicator may be used as an input.
    inputs = frame.drop(columns=[feature, target])
    labels = frame[target]
    X_train, X_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.3, random_state=42)

    # Fit the baseline classifier on the numeric columns only.
    classifier = BASELINE_CLASSIFIER
    classifier.fit(X_train.select_dtypes('number'), y_train)

    # Column 1 of predict_proba is used as the score.
    scores = classifier.predict_proba(X_test.select_dtypes('number'))[:, 1]
    return roc_auc_score(y_test, scores)
|
Oops, something went wrong.