Skip to content
This repository has been archived by the owner on Oct 9, 2023. It is now read-only.

Refactor tabular data to use classification targets handling #1114

Merged
merged 6 commits into from
Jan 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

- Fixed `InstanceSegmentationData.from_voc` ([#1102](https://github.com/PyTorchLightning/lightning-flash/pull/1102))

- Fixed a bug when loading tabular data for prediction without a target field / column ([#1114](https://github.com/PyTorchLightning/lightning-flash/pull/1114))

### Removed

## [0.6.0] - 2021-12-13
Expand Down
7 changes: 6 additions & 1 deletion docs/source/api/tabular.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ ______________
~classification.model.TabularClassifier
~classification.data.TabularClassificationData

classification.input.TabularClassificationDataFrameInput
classification.input.TabularClassificationCSVInput

Regression
__________

Expand All @@ -31,6 +34,9 @@ __________
~regression.model.TabularRegressor
~regression.data.TabularRegressionData

regression.input.TabularRegressionDataFrameInput
regression.input.TabularRegressionCSVInput

Forecasting
___________

Expand All @@ -56,5 +62,4 @@ __________________
~data.TabularData

input.TabularDataFrameInput
input.TabularCSVInput
input.TabularDeserializer
2 changes: 1 addition & 1 deletion flash/tabular/classification/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def tabular_classification():
"categorical_fields",
"num_features",
"cat_dims",
"output_dim",
"num_classes",
},
)

Expand Down
95 changes: 94 additions & 1 deletion flash/tabular/classification/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,101 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict, List, Optional, Type, Union

from flash.core.data.data_pipeline import DataPipelineState
from flash.core.data.io.input import Input
from flash.core.data.io.input_transform import INPUT_TRANSFORM_TYPE, InputTransform
from flash.core.utilities.imports import _PANDAS_AVAILABLE
from flash.core.utilities.stages import RunningStage
from flash.tabular.classification.input import TabularClassificationCSVInput, TabularClassificationDataFrameInput
from flash.tabular.data import TabularData

if _PANDAS_AVAILABLE:
from pandas.core.frame import DataFrame
else:
DataFrame = object


class TabularClassificationData(TabularData):
    """Tabular data module for classification, constructible from data frames or CSV files."""

    # Classification data module: the regression flag is off.
    is_regression = False

    @classmethod
    def from_data_frame(
        cls,
        categorical_fields: Optional[Union[str, List[str]]] = None,
        numerical_fields: Optional[Union[str, List[str]]] = None,
        target_fields: Optional[Union[str, List[str]]] = None,
        parameters: Optional[Dict[str, Any]] = None,
        train_data_frame: Optional[DataFrame] = None,
        val_data_frame: Optional[DataFrame] = None,
        test_data_frame: Optional[DataFrame] = None,
        predict_data_frame: Optional[DataFrame] = None,
        train_transform: INPUT_TRANSFORM_TYPE = InputTransform,
        val_transform: INPUT_TRANSFORM_TYPE = InputTransform,
        test_transform: INPUT_TRANSFORM_TYPE = InputTransform,
        predict_transform: INPUT_TRANSFORM_TYPE = InputTransform,
        input_cls: Type[Input] = TabularClassificationDataFrameInput,
        transform_kwargs: Optional[Dict] = None,
        **data_module_kwargs: Any,
    ) -> "TabularClassificationData":
        """Build the data module from one data frame per running stage.

        One ``input_cls`` instance is created for each of the training, validating, testing and predicting
        stages. All four share a single ``DataPipelineState`` and receive the same field arguments.

        Args:
            categorical_fields: Forwarded to every ``input_cls`` instance.
            numerical_fields: Forwarded to every ``input_cls`` instance.
            target_fields: Forwarded to every ``input_cls`` instance.
            parameters: Forwarded to the training input; also used for the other stages when the training
                input is falsy.
            train_data_frame: Data given to the training input.
            val_data_frame: Data given to the validating input.
            test_data_frame: Data given to the testing input.
            predict_data_frame: Data given to the predicting input.
            train_transform: Passed as ``transform`` to the training input.
            val_transform: Passed as ``transform`` to the validating input.
            test_transform: Passed as ``transform`` to the testing input.
            predict_transform: Passed as ``transform`` to the predicting input.
            input_cls: Class instantiated once per stage.
            transform_kwargs: Forwarded to every ``input_cls`` instance.
            data_module_kwargs: Extra keyword arguments passed to the ``cls`` constructor.

        Returns:
            The constructed data module.
        """
        shared_kwargs = {
            "data_pipeline_state": DataPipelineState(),
            "transform_kwargs": transform_kwargs,
            "input_transforms_registry": cls.input_transforms_registry,
            "categorical_fields": categorical_fields,
            "numerical_fields": numerical_fields,
            "target_fields": target_fields,
            "parameters": parameters,
        }

        train_input = input_cls(RunningStage.TRAINING, train_data_frame, transform=train_transform, **shared_kwargs)

        # A truthy training input supplies the parameters used by the remaining stages;
        # otherwise the caller-provided ``parameters`` stay in place.
        if train_input:
            shared_kwargs["parameters"] = train_input.parameters

        val_input = input_cls(RunningStage.VALIDATING, val_data_frame, transform=val_transform, **shared_kwargs)
        test_input = input_cls(RunningStage.TESTING, test_data_frame, transform=test_transform, **shared_kwargs)
        predict_input = input_cls(
            RunningStage.PREDICTING, predict_data_frame, transform=predict_transform, **shared_kwargs
        )

        return cls(train_input, val_input, test_input, predict_input, **data_module_kwargs)

    @classmethod
    def from_csv(
        cls,
        categorical_fields: Optional[Union[str, List[str]]] = None,
        numerical_fields: Optional[Union[str, List[str]]] = None,
        target_fields: Optional[Union[str, List[str]]] = None,
        parameters: Optional[Dict[str, Any]] = None,
        train_file: Optional[str] = None,
        val_file: Optional[str] = None,
        test_file: Optional[str] = None,
        predict_file: Optional[str] = None,
        train_transform: INPUT_TRANSFORM_TYPE = InputTransform,
        val_transform: INPUT_TRANSFORM_TYPE = InputTransform,
        test_transform: INPUT_TRANSFORM_TYPE = InputTransform,
        predict_transform: INPUT_TRANSFORM_TYPE = InputTransform,
        input_cls: Type[Input] = TabularClassificationCSVInput,
        transform_kwargs: Optional[Dict] = None,
        **data_module_kwargs: Any,
    ) -> "TabularClassificationData":
        """Build the data module from one CSV file per running stage.

        One ``input_cls`` instance is created for each of the training, validating, testing and predicting
        stages. All four share a single ``DataPipelineState`` and receive the same field arguments.

        Args:
            categorical_fields: Forwarded to every ``input_cls`` instance.
            numerical_fields: Forwarded to every ``input_cls`` instance.
            target_fields: Forwarded to every ``input_cls`` instance.
            parameters: Forwarded to the training input; also used for the other stages when the training
                input is falsy.
            train_file: File given to the training input.
            val_file: File given to the validating input.
            test_file: File given to the testing input.
            predict_file: File given to the predicting input.
            train_transform: Passed as ``transform`` to the training input.
            val_transform: Passed as ``transform`` to the validating input.
            test_transform: Passed as ``transform`` to the testing input.
            predict_transform: Passed as ``transform`` to the predicting input.
            input_cls: Class instantiated once per stage.
            transform_kwargs: Forwarded to every ``input_cls`` instance.
            data_module_kwargs: Extra keyword arguments passed to the ``cls`` constructor.

        Returns:
            The constructed data module.
        """
        shared_kwargs = {
            "data_pipeline_state": DataPipelineState(),
            "transform_kwargs": transform_kwargs,
            "input_transforms_registry": cls.input_transforms_registry,
            "categorical_fields": categorical_fields,
            "numerical_fields": numerical_fields,
            "target_fields": target_fields,
            "parameters": parameters,
        }

        train_input = input_cls(RunningStage.TRAINING, train_file, transform=train_transform, **shared_kwargs)

        # A truthy training input supplies the parameters used by the remaining stages;
        # otherwise the caller-provided ``parameters`` stay in place.
        if train_input:
            shared_kwargs["parameters"] = train_input.parameters

        val_input = input_cls(RunningStage.VALIDATING, val_file, transform=val_transform, **shared_kwargs)
        test_input = input_cls(RunningStage.TESTING, test_file, transform=test_transform, **shared_kwargs)
        predict_input = input_cls(RunningStage.PREDICTING, predict_file, transform=predict_transform, **shared_kwargs)

        return cls(train_input, val_input, test_input, predict_input, **data_module_kwargs)
62 changes: 62 additions & 0 deletions flash/tabular/classification/input.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict, List, Optional, Union

from flash import DataKeys
from flash.core.data.io.classification_input import ClassificationInput
from flash.core.data.utilities.data_frame import read_csv, resolve_targets
from flash.core.utilities.imports import _PANDAS_AVAILABLE
from flash.tabular.input import TabularDataFrameInput

if _PANDAS_AVAILABLE:
from pandas.core.frame import DataFrame
else:
DataFrame = object


class TabularClassificationDataFrameInput(TabularDataFrameInput, ClassificationInput):
    """Data-frame input for tabular classification that attaches targets outside of prediction."""

    def load_data(
        self,
        data_frame: DataFrame,
        categorical_fields: Optional[Union[str, List[str]]] = None,
        numerical_fields: Optional[Union[str, List[str]]] = None,
        target_fields: Optional[Union[str, List[str]]] = None,
        parameters: Dict[str, Any] = None,
    ):
        """Turn a data frame into a list of per-row sample dictionaries.

        Args:
            data_frame: The data frame to load.
            categorical_fields: Passed to ``self.preprocess``.
            numerical_fields: Passed to ``self.preprocess``.
            target_fields: Passed to ``resolve_targets``; not read when predicting.
            parameters: Passed to ``self.preprocess``.

        Returns:
            A list of dictionaries holding ``DataKeys.INPUT`` (a pair of the categorical and numerical
            values) and, unless predicting, ``DataKeys.TARGET``.
        """
        cat_vars, num_vars = self.preprocess(data_frame, categorical_fields, numerical_fields, parameters)

        # Prediction never touches the target fields, so data without them loads fine.
        if self.predicting:
            return [{DataKeys.INPUT: (cat, num)} for cat, num in zip(cat_vars, num_vars)]

        targets = resolve_targets(data_frame, target_fields)
        self.load_target_metadata(targets)
        return [
            {DataKeys.INPUT: (cat, num), DataKeys.TARGET: target}
            for cat, num, target in zip(cat_vars, num_vars, targets)
        ]

    def load_sample(self, sample: Dict[str, Any]) -> Any:
        """Format the sample's target via ``self.format_target`` when one is present, then return the sample."""
        if DataKeys.TARGET not in sample:
            return sample

        sample[DataKeys.TARGET] = self.format_target(sample[DataKeys.TARGET])
        return sample


class TabularClassificationCSVInput(TabularClassificationDataFrameInput):
    """CSV-backed variant of the data-frame input: reads the file, then defers to the parent loader."""

    def load_data(
        self,
        file: Optional[str],
        categorical_fields: Optional[Union[str, List[str]]] = None,
        numerical_fields: Optional[Union[str, List[str]]] = None,
        target_fields: Optional[Union[str, List[str]]] = None,
        parameters: Dict[str, Any] = None,
    ):
        """Read ``file`` with ``read_csv`` and load the result through the parent ``load_data``.

        Args:
            file: The CSV file to read, or ``None``.
            categorical_fields: Forwarded to the parent ``load_data``.
            numerical_fields: Forwarded to the parent ``load_data``.
            target_fields: Forwarded to the parent ``load_data``.
            parameters: Forwarded to the parent ``load_data``.

        Returns:
            Whatever the parent ``load_data`` returns, or ``None`` when ``file`` is ``None``.
        """
        # No file for this stage: nothing to load.
        if file is None:
            return None

        data_frame = read_csv(file)
        return super().load_data(data_frame, categorical_fields, numerical_fields, target_fields, parameters)
8 changes: 4 additions & 4 deletions flash/tabular/classification/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class TabularClassifier(ClassificationAdapterTask):
embedding_sizes: List of (num_classes, emb_dim) to form categorical embeddings.
cat_dims: Number of distinct values for each categorical column
num_features: Number of columns in table
output_dim: Number of classes to classify
num_classes: Number of classes to classify
backbone: name of the model to use
loss_fn: Loss function for training, defaults to cross entropy.
optimizer: Optimizer to use for training.
Expand All @@ -59,7 +59,7 @@ def __init__(
categorical_fields: list,
cat_dims: list,
num_features: int,
output_dim: int,
num_classes: int,
backbone: str = "tabnet",
loss_fn: Callable = F.cross_entropy,
optimizer: OPTIMIZER_TYPE = "Adam",
Expand All @@ -77,7 +77,7 @@ def __init__(
categorical_fields=categorical_fields,
cat_dims=cat_dims,
num_features=num_features,
output_dim=output_dim,
output_dim=num_classes,
backbone=backbone,
backbone_kwargs=backbone_kwargs,
loss_fn=loss_fn,
Expand All @@ -102,7 +102,7 @@ def from_data(cls, datamodule, **kwargs) -> "TabularClassifier":
categorical_fields=datamodule.categorical_fields,
cat_dims=datamodule.cat_dims,
num_features=datamodule.num_features,
output_dim=datamodule.output_dim,
num_classes=datamodule.num_classes,
**kwargs,
)
return model
Expand Down
4 changes: 0 additions & 4 deletions flash/tabular/classification/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,10 @@ def _pre_transform(
codes: Dict,
mean: DataFrame,
std: DataFrame,
target: str = None,
target_codes: Dict = None,
) -> DataFrame:
df = _impute(df, num_cols)
df = _normalize(df, num_cols, mean=mean, std=std)
df = _categorize(df, cat_cols, codes=codes)
if target_codes and target:
df = _categorize(df, [target], codes=target_codes)
return df


Expand Down
107 changes: 2 additions & 105 deletions flash/tabular/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict, List, Optional, Type, Union
from typing import Any, Dict, List, Optional

from flash.core.data.data_module import DataModule
from flash.core.data.data_pipeline import DataPipelineState
from flash.core.data.io.input import Input
from flash.core.data.io.input_transform import INPUT_TRANSFORM_TYPE, InputTransform
from flash.core.data.io.input_transform import InputTransform
from flash.core.data.io.output_transform import OutputTransform
from flash.core.utilities.imports import _PANDAS_AVAILABLE
from flash.core.utilities.stages import RunningStage
from flash.tabular.input import TabularCSVInput, TabularDataFrameInput

if _PANDAS_AVAILABLE:
from pandas.core.frame import DataFrame
else:
DataFrame = object


class TabularData(DataModule):
"""Data module for tabular tasks."""

input_transform_cls = InputTransform
output_transform_cls = OutputTransform

is_regression: bool = False

@property
def parameters(self) -> Optional[Dict[str, Any]]:
"""The parameters dictionary created from the train data when constructing the ``TabularData`` object."""
Expand Down Expand Up @@ -70,93 +57,3 @@ def embedding_sizes(self) -> list:
# embedding_dimensions = number_of_categories**0.25
emb_dims = [max(int(n ** 0.25), 16) for n in self.cat_dims]
return list(zip(self.cat_dims, emb_dims))

@property
def output_dim(self) -> int:
return self.num_classes if not self.is_regression else 1

@classmethod
def from_data_frame(
cls,
categorical_fields: Optional[Union[str, List[str]]] = None,
numerical_fields: Optional[Union[str, List[str]]] = None,
target_fields: Optional[str] = None,
parameters: Optional[Dict[str, Any]] = None,
train_data_frame: Optional[DataFrame] = None,
val_data_frame: Optional[DataFrame] = None,
test_data_frame: Optional[DataFrame] = None,
predict_data_frame: Optional[DataFrame] = None,
train_transform: INPUT_TRANSFORM_TYPE = InputTransform,
val_transform: INPUT_TRANSFORM_TYPE = InputTransform,
test_transform: INPUT_TRANSFORM_TYPE = InputTransform,
predict_transform: INPUT_TRANSFORM_TYPE = InputTransform,
input_cls: Type[Input] = TabularDataFrameInput,
transform_kwargs: Optional[Dict] = None,
**data_module_kwargs: Any,
) -> "TabularData":

ds_kw = dict(
data_pipeline_state=DataPipelineState(),
transform_kwargs=transform_kwargs,
input_transforms_registry=cls.input_transforms_registry,
categorical_fields=categorical_fields,
numerical_fields=numerical_fields,
target_field=target_fields,
is_regression=cls.is_regression,
parameters=parameters,
)

train_input = input_cls(RunningStage.TRAINING, train_data_frame, transform=train_transform, **ds_kw)

ds_kw["parameters"] = train_input.parameters if train_input else parameters

return cls(
train_input,
input_cls(RunningStage.VALIDATING, val_data_frame, transform=val_transform, **ds_kw),
input_cls(RunningStage.TESTING, test_data_frame, transform=test_transform, **ds_kw),
input_cls(RunningStage.PREDICTING, predict_data_frame, transform=predict_transform, **ds_kw),
**data_module_kwargs,
)

@classmethod
def from_csv(
cls,
categorical_fields: Optional[Union[str, List[str]]] = None,
numerical_fields: Optional[Union[str, List[str]]] = None,
target_fields: Optional[str] = None,
parameters: Optional[Dict[str, Any]] = None,
train_file: Optional[str] = None,
val_file: Optional[str] = None,
test_file: Optional[str] = None,
predict_file: Optional[str] = None,
train_transform: INPUT_TRANSFORM_TYPE = InputTransform,
val_transform: INPUT_TRANSFORM_TYPE = InputTransform,
test_transform: INPUT_TRANSFORM_TYPE = InputTransform,
predict_transform: INPUT_TRANSFORM_TYPE = InputTransform,
input_cls: Type[Input] = TabularCSVInput,
transform_kwargs: Optional[Dict] = None,
**data_module_kwargs: Any,
) -> "TabularData":

ds_kw = dict(
data_pipeline_state=DataPipelineState(),
transform_kwargs=transform_kwargs,
input_transforms_registry=cls.input_transforms_registry,
categorical_fields=categorical_fields,
numerical_fields=numerical_fields,
target_field=target_fields,
is_regression=cls.is_regression,
parameters=parameters,
)

train_input = input_cls(RunningStage.TRAINING, train_file, transform=train_transform, **ds_kw)

ds_kw["parameters"] = train_input.parameters if train_input else parameters

return cls(
train_input,
input_cls(RunningStage.VALIDATING, val_file, transform=val_transform, **ds_kw),
input_cls(RunningStage.TESTING, test_file, transform=test_transform, **ds_kw),
input_cls(RunningStage.PREDICTING, predict_file, transform=predict_transform, **ds_kw),
**data_module_kwargs,
)
Loading