Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ADD] scalers from autosklearn #372

Merged
merged 9 commits into from
Feb 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ def get_hyperparameter_search_space(self,
'RandomKitchenSinks',
'Nystroem',
'PolynomialFeatures',
'PowerTransformer',
'TruncatedSVD',
]
for default_ in defaults:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from typing import Any, Dict, Optional, Union

import numpy as np

from sklearn.preprocessing import PowerTransformer as SklearnPowerTransformer

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler


class PowerTransformer(BaseScaler):
    """
    Scaler that applies a power transform so numerical features become
    as close to Gaussian as possible, reducing variance and skewness.

    The underlying sklearn transformer is configured with the
    `yeo-johnson` method, which also standardises the output to zero
    mean and unit variance.
    """

    def __init__(self,
                 random_state: Optional[np.random.RandomState] = None):
        super().__init__()
        # Stored for API consistency with the other scaler components;
        # the yeo-johnson transform itself is deterministic.
        self.random_state = random_state

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:
        """
        Validate the fit dictionary and register the (unfitted) sklearn
        transformer for the numerical columns.

        Args:
            X (Dict[str, Any]): fit dictionary carrying dataset requirements
            y (Any): unused, kept for API compatibility

        Returns:
            BaseScaler: self
        """
        self.check_requirements(X, y)

        transformer = SklearnPowerTransformer(method='yeo-johnson', copy=False)
        self.preprocessor['numerical'] = transformer
        return self

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        """Return static metadata describing this component."""
        return {
            'shortname': 'PowerTransformer',
            'name': 'PowerTransformer',
            'handles_sparse': False
        }
nabenabe0928 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
CategoricalHyperparameter,
UniformIntegerHyperparameter
)

import numpy as np

from sklearn.preprocessing import QuantileTransformer as SklearnQuantileTransformer

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter


class QuantileTransformer(BaseScaler):
    """
    Transform the features to follow a uniform or a normal distribution
    using quantiles information.

    For more details of each attribute, see:
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
    """
    def __init__(
        self,
        n_quantiles: int = 1000,
        output_distribution: str = "normal",  # Literal["normal", "uniform"]
        random_state: Optional[np.random.RandomState] = None
    ):
        super().__init__()
        self.random_state = random_state
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:
        """
        Validate the fit dictionary and register the (unfitted) sklearn
        transformer for the numerical columns.

        Args:
            X (Dict[str, Any]): fit dictionary carrying dataset requirements
            y (Any): unused, kept for API compatibility

        Returns:
            BaseScaler: self
        """
        self.check_requirements(X, y)

        # Fix: forward ``random_state``. sklearn's QuantileTransformer
        # subsamples the training data (``subsample=1e5`` by default) using
        # its ``random_state``; without forwarding it, fitting on large
        # datasets is not reproducible even though this component accepts
        # a random state.
        self.preprocessor['numerical'] = SklearnQuantileTransformer(n_quantiles=self.n_quantiles,
                                                                    output_distribution=self.output_distribution,
                                                                    random_state=self.random_state,
                                                                    copy=False)
        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        n_quantiles: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="n_quantiles",
                                                                           value_range=(10, 2000),
                                                                           default_value=1000,
                                                                           ),
        output_distribution: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="output_distribution",
                                                                                   value_range=("uniform", "normal"),
                                                                                   default_value="normal",
                                                                                   )
    ) -> ConfigurationSpace:
        """Build the ConfigSpace search space for this scaler's hyperparameters."""
        cs = ConfigurationSpace()

        # TODO parametrize like the Random Forest as n_quantiles = n_features^param
        add_hyperparameter(cs, n_quantiles, UniformIntegerHyperparameter)
        add_hyperparameter(cs, output_distribution, CategoricalHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        """Return static metadata describing this component."""
        return {
            'shortname': 'QuantileTransformer',
            'name': 'QuantileTransformer',
            'handles_sparse': False
        }
nabenabe0928 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
UniformFloatHyperparameter,
)

import numpy as np

from sklearn.preprocessing import RobustScaler as SklearnRobustScaler

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler
from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter


class RobustScaler(BaseScaler):
    """
    Remove the median and scale features according to the quantile_range to make
    the features robust to outliers.

    ``q_min`` and ``q_max`` are expressed as fractions in (0, 1); they are
    converted to percentiles before being passed to sklearn.

    For more details of the preprocessor, see:
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
    """
    def __init__(
        self,
        q_min: float = 0.25,
        q_max: float = 0.75,
        random_state: Optional[np.random.RandomState] = None
    ):
        super().__init__()
        self.add_fit_requirements([
            FitRequirement('issparse', (bool,), user_defined=True, dataset_property=True)])
        # Stored for API consistency with the other scaler components;
        # RobustScaler itself is deterministic.
        self.random_state = random_state
        self.q_min = q_min
        self.q_max = q_max

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:
        """
        Validate the fit dictionary and register the (unfitted) sklearn
        transformer for the numerical columns.

        Args:
            X (Dict[str, Any]): fit dictionary carrying dataset requirements
            y (Any): unused, kept for API compatibility

        Returns:
            BaseScaler: self
        """
        self.check_requirements(X, y)
        # Centering a sparse matrix would densify it, so disable it there.
        with_centering = bool(not X['dataset_properties']['issparse'])

        # Fix: sklearn's ``quantile_range`` expects percentiles in (0, 100)
        # (its default is (25.0, 75.0)), while q_min/q_max here are fractions
        # in (0, 1). Passing them through unscaled (e.g. (0.25, 0.75)) selects
        # a nearly degenerate quantile range and thus a near-zero scale, so
        # convert fractions to percentiles first.
        self.preprocessor['numerical'] = SklearnRobustScaler(quantile_range=(self.q_min * 100.0,
                                                                             self.q_max * 100.0),
                                                             with_centering=with_centering,
                                                             copy=False)

        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        q_min: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="q_min",
                                                                     value_range=(0.001, 0.3),
                                                                     default_value=0.25),
        q_max: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="q_max",
                                                                     value_range=(0.7, 0.999),
                                                                     default_value=0.75)
    ) -> ConfigurationSpace:
        """Build the ConfigSpace search space for this scaler's hyperparameters."""
        cs = ConfigurationSpace()

        add_hyperparameter(cs, q_min, UniformFloatHyperparameter)
        add_hyperparameter(cs, q_max, UniformFloatHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        """Return static metadata; sparse input is supported (without centering)."""
        return {
            'shortname': 'RobustScaler',
            'name': 'RobustScaler',
            'handles_sparse': True
        }
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,21 @@ def get_hyperparameter_search_space(self,
raise ValueError("no scalers found, please add a scaler")

if default is None:
defaults = ['StandardScaler', 'Normalizer', 'MinMaxScaler', 'NoScaler']
defaults = [
'StandardScaler',
'Normalizer',
'MinMaxScaler',
'PowerTransformer',
'QuantileTransformer',
'RobustScaler',
'NoScaler'
]
for default_ in defaults:
if default_ in available_scalers:
if include is not None and default_ not in include:
continue
if exclude is not None and default_ in exclude:
continue
default = default_
break

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def random_state():
return 11


@pytest.fixture(params=['TruncatedSVD', 'PolynomialFeatures', 'PowerTransformer',
# NOTE(review): 'PowerTransformer' was removed from this parametrization —
# elsewhere in this change it is added as a scaling component, so it is
# presumably no longer a feature preprocessor; confirm against the registry.
@pytest.fixture(params=['TruncatedSVD', 'PolynomialFeatures',
                        'Nystroem', 'KernelPCA', 'RandomKitchenSinks'])
def preprocessor(request):
    """Parametrized fixture yielding each feature-preprocessor name in turn."""
    return request.param
Expand Down
Loading