Skip to content

Commit

Permalink
Cleanup of simple_imputer (#346)
Browse files Browse the repository at this point in the history
* cleanup of simple_imputer

* Fixed doc and typo

* Fixed docs

* Made changes, added test

* Fixed init statement

* Fixed docs

* Flake'd
  • Loading branch information
eddiebergman authored Dec 1, 2021
1 parent 4dd22fd commit 8f9e9f6
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 56 deletions.
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from typing import Any, Dict, List, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
CategoricalHyperparameter
)
from ConfigSpace.hyperparameters import CategoricalHyperparameter

import numpy as np

Expand All @@ -15,92 +13,143 @@


class SimpleImputer(BaseImputer):
"""
Impute missing values for categorical columns with '!missing!'
(In case of numpy data, the constant value is set to -1, under
the assumption that categorical data is fit with an Ordinal Scaler)
"""An imputer for categorical and numerical columns
Impute missing values for categorical columns with 'constant_!missing!'
Note:
In case of numpy data, the constant value is set to -1, under the assumption
that categorical data is fit with an Ordinal Encoder.
Attributes:
random_state (Optional[np.random.RandomState]):
The random state to use for the imputer.
numerical_strategy (str: default='mean'):
The strategy to use for imputing numerical columns.
Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
categorical_strategy (str: default='most_frequent'):
The strategy to use for imputing categorical columns.
Can be one of ['most_frequent', 'constant_!missing!']
"""

def __init__(self,
random_state: Optional[Union[np.random.RandomState, int]] = None,
numerical_strategy: str = 'mean',
categorical_strategy: str = 'most_frequent'):
def __init__(
self,
random_state: Optional[np.random.RandomState] = None,
numerical_strategy: str = 'mean',
categorical_strategy: str = 'most_frequent'
):
"""
Note:
'constant_zero' as numerical_strategy uses 0 as the fill_value while
'constant_!missing!' as categorical_strategy uses a fill_value of -1.
This behaviour should probably be fixed.
"""
super().__init__()
self.random_state = random_state
self.numerical_strategy = numerical_strategy
self.categorical_strategy = categorical_strategy

def fit(self, X: Dict[str, Any], y: Any = None) -> BaseImputer:
"""
The fit function calls the fit function of the underlying model
and returns the transformed array.
def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer:
"""Selects the imputers for the categorical and numerical columns and returns self.
Args:
X (np.ndarray): input features
y (Optional[np.ndarray]): input labels
X (Dict[str, Any]):
The fit dictionary; `X['dataset_properties']` must provide the
`categorical_columns` and `numerical_columns` entries
y (Optional[np.ndarray]):
The labels for the input features `X`
Returns:
instance of self
SimpleImputer:
returns self
"""
self.check_requirements(X, y)
categorical_columns = X['dataset_properties']['categorical_columns'] \
if isinstance(X['dataset_properties']['categorical_columns'], List) else []
if len(categorical_columns) != 0:

# Choose an imputer for any categorical columns
categorical_columns = X['dataset_properties']['categorical_columns']

if isinstance(categorical_columns, List) and len(categorical_columns) != 0:
if self.categorical_strategy == 'constant_!missing!':
self.preprocessor['categorical'] = SklearnSimpleImputer(strategy='constant',
# Train data is numpy
# as of this point, where
# Ordinal Encoding is using
# for categorical. Only
# Numbers are allowed
# fill_value='!missing!',
fill_value=-1,
copy=False)
# Train data is numpy as of this point, where an Ordinal Encoding is used
# for categoricals. Only Numbers are allowed for `fill_value`
imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False)
self.preprocessor['categorical'] = imputer
else:
self.preprocessor['categorical'] = SklearnSimpleImputer(strategy=self.categorical_strategy,
copy=False)
numerical_columns = X['dataset_properties']['numerical_columns'] \
if isinstance(X['dataset_properties']['numerical_columns'], List) else []
if len(numerical_columns) != 0:
imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False)
self.preprocessor['categorical'] = imputer

# Choose an imputer for any numerical columns
numerical_columns = X['dataset_properties']['numerical_columns']

if isinstance(numerical_columns, List) and len(numerical_columns) > 0:
if self.numerical_strategy == 'constant_zero':
self.preprocessor['numerical'] = SklearnSimpleImputer(strategy='constant',
fill_value=0,
copy=False)
imputer = SklearnSimpleImputer(strategy='constant', fill_value=0, copy=False)
self.preprocessor['numerical'] = imputer
else:
self.preprocessor['numerical'] = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False)
imputer = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False)
self.preprocessor['numerical'] = imputer

return self

@staticmethod
def get_hyperparameter_search_space(
dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='numerical_strategy',
value_range=("mean", "median",
"most_frequent",
"constant_zero"),
default_value="mean",
),
numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
hyperparameter='numerical_strategy',
value_range=("mean", "median", "most_frequent", "constant_zero"),
default_value="mean",
),
categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
hyperparameter='categorical_strategy',
value_range=("most_frequent",
"constant_!missing!"),
default_value="most_frequent")
value_range=("most_frequent", "constant_!missing!"),
default_value="most_frequent"
)
) -> ConfigurationSpace:
"""Get the hyperparameter search space for the SimpleImputer
Args:
dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]])
Properties that describe the dataset
Note: Not actually Optional, just adhering to its supertype
numerical_strategy (HyperparameterSearchSpace: default = ...)
The strategy to use for numerical imputation
categorical_strategy (HyperparameterSearchSpace: default = ...)
The strategy to use for categorical imputation
Returns:
ConfigurationSpace
The space of possible configurations for a SimpleImputer with the given
`dataset_properties`
"""
cs = ConfigurationSpace()
assert dataset_properties is not None, "To create hyperparameter search space" \
", dataset_properties should not be None"
if len(dataset_properties['numerical_columns']) \
if isinstance(dataset_properties['numerical_columns'], List) else 0 != 0:

if dataset_properties is None:
raise ValueError("SimpleImputer requires `dataset_properties` for generating"
" a search space.")

if (
isinstance(dataset_properties['numerical_columns'], List)
and len(dataset_properties['numerical_columns']) != 0
):
add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter)

if len(dataset_properties['categorical_columns']) \
if isinstance(dataset_properties['categorical_columns'], List) else 0 != 0:
if (
isinstance(dataset_properties['categorical_columns'], List)
and len(dataset_properties['categorical_columns'])
):
add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter)

return cs

@staticmethod
def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
) -> Dict[str, Union[str, bool]]:
def get_properties(
dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
) -> Dict[str, Union[str, bool]]:
"""Get the properties of the SimpleImputer class and what it can handle
Returns:
Dict[str, Union[str, bool]]:
A dict from property names to values
"""
return {
'shortname': 'SimpleImputer',
'name': 'Simple Imputer',
Expand Down
12 changes: 12 additions & 0 deletions test/test_pipeline/components/preprocessing/test_imputers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import numpy as np
from numpy.testing import assert_array_equal

import pytest

from sklearn.base import BaseEstimator, clone
from sklearn.compose import make_column_transformer

Expand Down Expand Up @@ -213,6 +215,16 @@ def test_constant_imputation(self):
[7.0, '0', 9],
[4.0, '0', '0']], dtype=str))

def test_imputation_without_dataset_properties_raises_error(self):
    """Querying the search space without dataset properties must fail.

    `dataset_properties` is annotated as `Optional` on
    `SimpleImputer.get_hyperparameter_search_space`, yet the imputer
    cannot build a search space without it.

    Expects:
        * A ValueError is raised when no dataset_properties are passed
    """
    query_search_space = SimpleImputer.get_hyperparameter_search_space

    # Called with no arguments so that `dataset_properties` takes its
    # default value of None.
    with pytest.raises(ValueError):
        query_search_space()


if __name__ == '__main__':
unittest.main()

0 comments on commit 8f9e9f6

Please sign in to comment.