Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Fixes for Tabular Regression #235

Merged
merged 18 commits into from
Jun 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
include requirements.txt
include autoPyTorch/utils/logging.yaml
include autoPyTorch/configs/default_pipeline_options.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/catboost.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/rotation_forest.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/random_forest.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/knn.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/svm.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/extra_trees.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/lgb.json
include autoPyTorch/configs/greedy_portfolio.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/catboost.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/rotation_forest.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/random_forest.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/knn.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/svm.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/extra_trees.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/lgb.json
31 changes: 14 additions & 17 deletions autoPyTorch/api/base_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash
from autoPyTorch.optimizer.smbo import AutoMLSMBO
from autoPyTorch.pipeline.base_pipeline import BasePipeline
from autoPyTorch.pipeline.components.setup.traditional_ml.classifier_models import get_available_classifiers
from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import get_available_traditional_learners
from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score, get_metrics
from autoPyTorch.utils.common import FitRequirement, replace_string_bool_to_bool
Expand Down Expand Up @@ -590,7 +590,7 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs:
memory_limit = self._memory_limit
if memory_limit is not None:
memory_limit = int(math.ceil(memory_limit))
available_classifiers = get_available_classifiers()
available_classifiers = get_available_traditional_learners()
dask_futures = []

total_number_classifiers = len(available_classifiers)
Expand Down Expand Up @@ -892,21 +892,18 @@ def _search(
# ============> Run traditional ml

if enable_traditional_pipeline:
if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS:
self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...")
else:
traditional_task_name = 'runTraditional'
self._stopwatch.start_task(traditional_task_name)
elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name)
# We want time for at least 1 Neural network in SMAC
time_for_traditional = int(
self._time_for_task - elapsed_time - func_eval_time_limit_secs
)
self._do_traditional_prediction(
func_eval_time_limit_secs=func_eval_time_limit_secs,
time_left=time_for_traditional,
)
self._stopwatch.stop_task(traditional_task_name)
traditional_task_name = 'runTraditional'
self._stopwatch.start_task(traditional_task_name)
elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name)
# We want time for at least 1 Neural network in SMAC
time_for_traditional = int(
self._time_for_task - elapsed_time - func_eval_time_limit_secs
)
self._do_traditional_prediction(
func_eval_time_limit_secs=func_eval_time_limit_secs,
time_left=time_for_traditional,
)
self._stopwatch.stop_task(traditional_task_name)

# ============> Starting ensemble
elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name)
Expand Down
10 changes: 7 additions & 3 deletions autoPyTorch/api/tabular_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def search(
budget: Optional[float] = None,
total_walltime_limit: int = 100,
func_eval_time_limit_secs: Optional[int] = None,
enable_traditional_pipeline: bool = False,
enable_traditional_pipeline: bool = True,
memory_limit: Optional[int] = 4096,
smac_scenario_args: Optional[Dict[str, Any]] = None,
get_smac_object_callback: Optional[Callable] = None,
Expand Down Expand Up @@ -151,7 +151,7 @@ def search(
total_walltime_limit // 2 to allow enough time to fit
at least 2 individual machine learning algorithms.
Set to np.inf in case no time limit is desired.
enable_traditional_pipeline (bool), (default=False):
enable_traditional_pipeline (bool), (default=True):
Whether to fit traditional machine learning pipelines
(e.g. CatBoost, Random Forest) before starting the
neural architecture search.
memory_limit (Optional[int]), (default=4096): Memory
Expand Down Expand Up @@ -187,7 +187,11 @@ def search(
configurations, similar to (...herepathtogreedy...).
Additionally, the keyword 'greedy' is supported,
which would use the default portfolio from
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`.
Although portfolio selection is supported for tabular
regression, the current portfolio was built using
classification datasets. We will update the portfolio
to also cover tabular regression datasets.

Returns:
self
Expand Down
100 changes: 81 additions & 19 deletions autoPyTorch/evaluation/abstract_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import autoPyTorch.pipeline.tabular_classification
import autoPyTorch.pipeline.tabular_regression
import autoPyTorch.pipeline.traditional_tabular_classification
import autoPyTorch.pipeline.traditional_tabular_regression
from autoPyTorch.automl_common.common.utils.backend import Backend
from autoPyTorch.constants import (
CLASSIFICATION_TASKS,
Expand Down Expand Up @@ -64,7 +65,7 @@ class MyTraditionalTabularClassificationPipeline(BaseEstimator):
Attributes:
dataset_properties (Dict[str, Any]):
A dictionary containing dataset specific information
random_state (Optional[Union[int, np.random.RandomState]]):
random_state (Optional[np.random.RandomState]):
Object that contains a seed and allows for reproducible results
init_params (Optional[Dict]):
An optional dictionary that is passed to the pipeline's steps. It complies
Expand All @@ -73,18 +74,18 @@ class MyTraditionalTabularClassificationPipeline(BaseEstimator):

def __init__(self, config: str,
dataset_properties: Dict[str, Any],
random_state: Optional[Union[int, np.random.RandomState]] = None,
random_state: Optional[np.random.RandomState] = None,
init_params: Optional[Dict] = None):
self.config = config
self.dataset_properties = dataset_properties
self.random_state = random_state
self.init_params = init_params
self.pipeline = autoPyTorch.pipeline.traditional_tabular_classification.\
self.pipeline = autoPyTorch.pipeline.traditional_tabular_classification. \
TraditionalTabularClassificationPipeline(dataset_properties=dataset_properties,
random_state=self.random_state)
configuration_space = self.pipeline.get_hyperparameter_search_space()
default_configuration = configuration_space.get_default_configuration().get_dictionary()
default_configuration['model_trainer:tabular_classifier:classifier'] = config
default_configuration['model_trainer:tabular_traditional_model:traditional_learner'] = config
self.configuration = Configuration(configuration_space, default_configuration)
self.pipeline.set_hyperparameters(self.configuration)

Expand All @@ -100,18 +101,15 @@ def predict(self, X: Union[np.ndarray, pd.DataFrame],
batch_size: int = 1000) -> np.array:
return self.pipeline.predict(X, batch_size=batch_size)

def estimator_supports_iterative_fit(self) -> bool: # pylint: disable=R0201
return False

def get_additional_run_info(self) -> Dict[str, Any]: # pylint: disable=R0201
def get_additional_run_info(self) -> Dict[str, Any]:
"""
Can be used to return additional info for the run.
Returns:
Dict[str, Any]:
Currently contains
1. pipeline_configuration: the configuration of the pipeline, i.e, the traditional model used
2. trainer_configuration: the parameters for the traditional model used.
Can be found in autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs
Can be found in autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs
"""
return {'pipeline_configuration': self.configuration,
'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config(),
Expand All @@ -126,6 +124,71 @@ def get_default_pipeline_options() -> Dict[str, Any]:
TraditionalTabularClassificationPipeline.get_default_pipeline_options()


class MyTraditionalTabularRegressionPipeline(BaseEstimator):
    """
    A wrapper class that holds a pipeline for traditional regression.
    Estimators like CatBoost and Random Forest are considered traditional machine
    learning models and are fitted before neural architecture search.

    This class is an interface to fit a pipeline containing a traditional machine
    learning model, and is the final object that is stored for inference.

    Attributes:
        dataset_properties (Dict[str, Any]):
            A dictionary containing dataset specific information
        random_state (Optional[np.random.RandomState]):
            Object that contains a seed and allows for reproducible results
        init_params (Optional[Dict]):
            An optional dictionary that is passed to the pipeline's steps. It
            fulfils a similar function as kwargs
    """

    def __init__(self, config: str,
                 dataset_properties: Dict[str, Any],
                 random_state: Optional[np.random.RandomState] = None,
                 init_params: Optional[Dict] = None):
        self.config = config
        self.dataset_properties = dataset_properties
        self.random_state = random_state
        self.init_params = init_params
        # Build the underlying traditional-regression pipeline and pin its
        # hyperparameter configuration to the requested traditional learner.
        self.pipeline = autoPyTorch.pipeline.traditional_tabular_regression. \
            TraditionalTabularRegressionPipeline(dataset_properties=dataset_properties,
                                                 random_state=self.random_state)
        configuration_space = self.pipeline.get_hyperparameter_search_space()
        default_configuration = configuration_space.get_default_configuration().get_dictionary()
        # Only the learner choice is overridden; every other hyperparameter
        # keeps its search-space default.
        default_configuration['model_trainer:tabular_traditional_model:traditional_learner'] = config
        self.configuration = Configuration(configuration_space, default_configuration)
        self.pipeline.set_hyperparameters(self.configuration)

    def fit(self, X: Dict[str, Any], y: Any,
            sample_weight: Optional[np.ndarray] = None) -> object:
        """Fit the wrapped pipeline on ``X``/``y``.

        NOTE(review): ``sample_weight`` is accepted for API compatibility but is
        not forwarded to the underlying pipeline.
        """
        return self.pipeline.fit(X, y)

    def predict(self, X: Union[np.ndarray, pd.DataFrame],
                batch_size: int = 1000) -> np.array:
        """Predict with the wrapped pipeline, processing ``batch_size`` rows at a time."""
        return self.pipeline.predict(X, batch_size=batch_size)

    def get_additional_run_info(self) -> Dict[str, Any]:
        """
        Can be used to return additional info for the run.
        Returns:
            Dict[str, Any]:
            Currently contains
                1. pipeline_configuration: the configuration of the pipeline, i.e, the traditional model used
                2. trainer_configuration: the parameters for the traditional model used.
                Can be found in autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs
        """
        return {'pipeline_configuration': self.configuration,
                'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config()}

    def get_pipeline_representation(self) -> Dict[str, str]:
        """Return a human-readable representation of the wrapped pipeline."""
        return self.pipeline.get_pipeline_representation()

    @staticmethod
    def get_default_pipeline_options() -> Dict[str, Any]:
        """Return the default pipeline options of the wrapped pipeline class."""
        return autoPyTorch.pipeline.traditional_tabular_regression.\
            TraditionalTabularRegressionPipeline.get_default_pipeline_options()


class DummyClassificationPipeline(DummyClassifier):
"""
A wrapper class that holds a pipeline for dummy classification.
Expand Down Expand Up @@ -175,9 +238,6 @@ def predict(self, X: Union[np.ndarray, pd.DataFrame],
new_X = np.ones((X.shape[0], 1))
return super(DummyClassificationPipeline, self).predict(new_X).astype(np.float32)

def estimator_supports_iterative_fit(self) -> bool: # pylint: disable=R0201
return False

def get_additional_run_info(self) -> Dict: # pylint: disable=R0201
return {'configuration_origin': 'DUMMY'}

Expand Down Expand Up @@ -234,12 +294,15 @@ def predict(self, X: Union[np.ndarray, pd.DataFrame],
new_X = np.ones((X.shape[0], 1))
return super(DummyRegressionPipeline, self).predict(new_X).astype(np.float32)

def estimator_supports_iterative_fit(self) -> bool: # pylint: disable=R0201
return False

def get_additional_run_info(self) -> Dict: # pylint: disable=R0201
return {'configuration_origin': 'DUMMY'}

def get_pipeline_representation(self) -> Dict[str, str]:
return {
'Preprocessing': 'None',
'Estimator': 'Dummy',
}

@staticmethod
def get_default_pipeline_options() -> Dict[str, Any]:
return {'budget_type': 'epochs',
Expand Down Expand Up @@ -401,8 +464,7 @@ def __init__(self, backend: Backend,
if isinstance(self.configuration, int):
self.pipeline_class = DummyRegressionPipeline
elif isinstance(self.configuration, str):
raise ValueError("Only tabular classifications tasks "
"are currently supported with traditional methods")
self.pipeline_class = MyTraditionalTabularRegressionPipeline
elif isinstance(self.configuration, Configuration):
self.pipeline_class = autoPyTorch.pipeline.tabular_regression.TabularRegressionPipeline
else:
Expand All @@ -415,8 +477,7 @@ def __init__(self, backend: Backend,
if self.task_type in TABULAR_TASKS:
self.pipeline_class = MyTraditionalTabularClassificationPipeline
else:
raise ValueError("Only tabular classifications tasks "
"are currently supported with traditional methods")
raise ValueError("Only tabular tasks are currently supported with traditional methods")
elif isinstance(self.configuration, Configuration):
if self.task_type in TABULAR_TASKS:
self.pipeline_class = autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline
Expand Down Expand Up @@ -446,6 +507,7 @@ def __init__(self, backend: Backend,
'y_test': self.y_test,
'backend': self.backend,
'logger_port': logger_port,
'optimize_metric': self.metric.name
})
assert self.pipeline_class is not None, "Could not infer pipeline class"
pipeline_config = pipeline_config if pipeline_config is not None \
Expand Down
Loading