[LAML-4] Add SAINT #131

Open

wants to merge 67 commits into base: master

67 commits
2c1412e
changed logging
D1MK4real Jul 19, 2023
bb95575
lint fix
D1MK4real Jul 19, 2023
3d76b96
changed logging
D1MK4real Jul 19, 2023
697ebfd
Added timm cv-library, bug-fix in multilabel linear model, added soft…
D1MK4real Jul 25, 2023
7f2616d
Merge branch 'bugfix/blender_weights' into NODE
D1MK4real Jul 31, 2023
af4eb74
Merge branch 'master' into timm_cv
dev-rinchin Aug 2, 2023
a9d1466
Added NODE neural network, added NODE example with tuning params
D1MK4real Aug 3, 2023
0608183
Merge branch 'master' into timm_cv
D1MK4real Aug 3, 2023
6415c28
added tutorial run
D1MK4real Aug 3, 2023
a2680de
new example
D1MK4real Aug 3, 2023
cc269dd
Merge branch 'master' into NODE
D1MK4real Aug 3, 2023
1f4b29d
changed autonlp params
D1MK4real Aug 7, 2023
12c54b6
Merge branch 'timm_cv' into autoint++
D1MK4real Aug 8, 2023
81c444f
add autoint
D1MK4real Aug 10, 2023
eaa9001
added flatten versions of embeddings
D1MK4real Aug 10, 2023
294383f
not done still
D1MK4real Aug 15, 2023
984f4b0
not done still
D1MK4real Aug 15, 2023
fbc5076
not done still
D1MK4real Aug 15, 2023
3a62edb
-Lambda + MP
D1MK4real Aug 15, 2023
dc003fa
changed on comments
D1MK4real Aug 15, 2023
e7c9c91
Merge branch 'NODE' into autoint++
D1MK4real Aug 15, 2023
32bae0a
changes on comments
D1MK4real Aug 15, 2023
ade7c46
Merge branch 'timm_cv' into autoint++
D1MK4real Aug 15, 2023
a58df59
added changes on comments
D1MK4real Aug 15, 2023
4038d9e
resolve merge conflicts
D1MK4real Aug 15, 2023
788d381
resolve merge conflicts
D1MK4real Aug 15, 2023
f6fdb58
resolve merge conflicts
D1MK4real Aug 15, 2023
a7fe9be
PLR + SOFTEmb
D1MK4real Aug 16, 2023
f58b4c5
no-changes
D1MK4real Aug 28, 2023
e4e56ea
resolve merge conflicts
D1MK4real Aug 29, 2023
a50a90f
beautiful CV tutorial
D1MK4real Aug 29, 2023
c66b3c2
added some changes on comments
D1MK4real Aug 29, 2023
6b8648e
resolve mc
D1MK4real Aug 29, 2023
d045b8e
removed useless function
D1MK4real Aug 29, 2023
eb12a7f
Merge branch 'autoint++' into tabnet
D1MK4real Aug 29, 2023
382b385
removed for-for
D1MK4real Aug 29, 2023
4955b2e
WeightedEmbedder bugfix
D1MK4real Aug 30, 2023
6b332d5
Merge branch 'autoint++' into tabnet
D1MK4real Aug 30, 2023
1bdf9d5
delete unused import
D1MK4real Aug 30, 2023
b51e4de
changed link
D1MK4real Aug 30, 2023
439ed48
Merge branch 'autoint++' into tabnet
D1MK4real Aug 30, 2023
6d4a74e
add tabnet/plr/softemb
D1MK4real Aug 30, 2023
2557c4c
bugfix
D1MK4real Sep 1, 2023
42fd85f
changed import links
D1MK4real Sep 1, 2023
98aa0c7
changed import links
D1MK4real Sep 1, 2023
b462b01
resolve merge conflicts
D1MK4real Sep 6, 2023
7a8bf65
changed import links
D1MK4real Sep 6, 2023
94fdd76
bugfix
D1MK4real Sep 6, 2023
4e1aa5c
bugfix
D1MK4real Sep 6, 2023
1c4170e
some new changes
D1MK4real Sep 7, 2023
2035113
now we don't count VC for cat features for every embedding
D1MK4real Sep 7, 2023
0afe07f
no embedder bugfix
D1MK4real Sep 8, 2023
39beb9e
scheduler params
D1MK4real Sep 8, 2023
6294e9f
bfixs
D1MK4real Sep 11, 2023
81fab51
bfixs
D1MK4real Sep 11, 2023
8ca132f
merge conflicts
D1MK4real Sep 11, 2023
99d77f8
mlp embedder
D1MK4real Sep 12, 2023
41e547f
no descr
D1MK4real Sep 20, 2023
799ded2
no-verify
D1MK4real Oct 2, 2023
57aa2a5
no-verify
D1MK4real Oct 2, 2023
ef7316b
starting changing
D1MK4real Oct 4, 2023
50962dd
more changes
D1MK4real Oct 12, 2023
586ae59
Descr
D1MK4real Oct 16, 2023
db89a90
Merge branch 'master' into saint
D1MK4real Oct 16, 2023
45c7166
Descr
D1MK4real Oct 16, 2023
81e9db4
Descr
D1MK4real Oct 16, 2023
2238f7e
added poolings
D1MK4real Oct 16, 2023
1 change: 1 addition & 0 deletions lightautoml/automl/presets/tabular_presets.py
@@ -609,6 +609,7 @@ def create_automl(self, **fit_args):
"autoint",
"tabnet",
"fttransformer",
"saint",
]
available_nn_models = available_nn_models + [x + "_tuned" for x in available_nn_models]
nn_models = [
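This hunk registers "saint" (and, through the `_tuned` list comprehension below it, "saint_tuned") as a valid NN model name in the tabular preset. A minimal, hypothetical usage sketch — assuming the model name is passed through `use_algos` the same way as the other entries in `available_nn_models`:

```python
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

# Hypothetical sketch: select the new SAINT network like any other NN
# model from available_nn_models; "saint_tuned" would enable parameter
# tuning, mirroring the "_tuned" suffix handling in create_automl.
automl = TabularAutoML(
    task=Task("binary"),
    general_params={"use_algos": [["saint"]]},
)
```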
48 changes: 48 additions & 0 deletions lightautoml/dataset/base.py
@@ -365,6 +365,19 @@ def shape(self) -> Tuple[Optional[int], Optional[int]]:
        return rows, cols

    # static methods - how to make 1d slice, 2d slice, concat of feature matrix etc ...
    @staticmethod
    def _vstack(datasets: Sequence[Any]) -> Any:
        """Abstract method - define vertical stack of feature arrays.

        Args:
            datasets: Sequence of feature arrays.

        Returns: # noqa DAR202
            Single feature array.

        """
        raise NotImplementedError("Vertical stack not implemented.")

    @staticmethod
    def _hstack(datasets: Sequence[Any]) -> Any:
        """Abstract method - define horizontal stack of feature arrays.
@@ -472,7 +485,42 @@ def concat(cls, datasets: Sequence["LAMLDataset"]) -> "LAMLDataset":
        dataset.set_data(data, features, roles)

        return dataset

    @classmethod
    def vconcat(cls, datasets: Sequence["LAMLDataset"]) -> "LAMLDataset":
        """Concat multiple datasets along the rows.

        Default behavior - takes empty dataset from datasets[0]
        and stacks the data of all the others under it.

        Args:
            datasets: Sequence of datasets.

        Returns:
            Concatenated dataset.

        """
        for check in cls._concat_checks:
            check(datasets)

        dataset = datasets[0].empty()
        data = []
        features = [*datasets[0].features]
        roles = {**datasets[0].roles}
        atrs = set(dataset._array_like_attrs)

        for ds in datasets:
            data.append(ds.data)
            for atr in ds._array_like_attrs:
                if atr not in atrs:
                    dataset._array_like_attrs.append(atr)
                    dataset.__dict__[atr] = ds.__dict__[atr]
                    atrs.update({atr})

        data = cls._vstack(data)
        dataset.set_data(data, features, roles)

        return dataset

    def drop_features(self, droplist: Sequence[str]):
        """Inplace drop columns from dataset.
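`vconcat` mirrors the existing `concat` classmethod but stacks rows instead of columns: features and roles come from `datasets[0]`, and array-like attributes (target, folds, weights) are merged from the first dataset that defines each of them. A minimal sketch of the intended semantics, assuming the `NumpyDataset` constructor and concat checks behave as on master:

```python
import numpy as np

from lightautoml.dataset.np_pd_dataset import NumpyDataset
from lightautoml.dataset.roles import NumericRole

# Two datasets with identical features and roles - vconcat should stack
# their rows, the way concat stacks their columns.
roles = {"a": NumericRole(np.float32), "b": NumericRole(np.float32)}
top = NumpyDataset(np.zeros((3, 2)), features=["a", "b"], roles=roles)
bottom = NumpyDataset(np.ones((3, 2)), features=["a", "b"], roles=roles)

stacked = NumpyDataset.vconcat([top, bottom])
assert stacked.shape == (6, 2)
```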
36 changes: 36 additions & 0 deletions lightautoml/dataset/np_pd_dataset.py
@@ -212,6 +212,18 @@ def _hstack(datasets: Sequence[np.ndarray]) -> np.ndarray:

"""
return np.hstack(datasets)
@staticmethod
def _vstack(datasets: Sequence[np.ndarray]) -> np.ndarray:
"""Concatenate function for numpy arrays.

Args:
datasets: Sequence of np.ndarray.

Returns:
Stacked features array.

"""
return np.vstack(datasets)

@staticmethod
def _get_rows(data: np.ndarray, k: IntIdx) -> np.ndarray:
@@ -400,6 +412,17 @@ def _hstack(datasets: Sequence[Union[sparse.csr_matrix, np.ndarray]]) -> sparse.csr_matrix:

"""
return sparse.hstack(datasets, format="csr")
def _vstack(datasets: Sequence[Union[sparse.csr_matrix, np.ndarray]]) -> sparse.csr_matrix:
"""Concatenate function for sparse and numpy arrays.

Args:
datasets: Sequence of csr_matrix or np.ndarray.

Returns:
Sparse matrix.

"""
return sparse.vstack(datasets, format="csr")

def __init__(
self,
@@ -609,6 +632,19 @@ def _hstack(datasets: Sequence[DataFrame]) -> DataFrame:

"""
return pd.concat(datasets, axis=1)

@staticmethod
def _vstack(datasets: Sequence[DataFrame]) -> DataFrame:
"""Define how to concat features arrays.

Args:
datasets: Sequence of tables.

Returns:
concatenated table.

"""
return pd.concat(datasets, axis=0)

@staticmethod
def _get_rows(data: DataFrame, k: IntIdx) -> FrameOrSeries:
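All three backend implementations reduce to the library's native vertical stack; a self-contained illustration of the shape semantics they share:

```python
import numpy as np
import pandas as pd
from scipy import sparse

a, b = np.zeros((2, 3)), np.ones((4, 3))

# numpy backend: rows are stacked, column counts must match
assert np.vstack([a, b]).shape == (6, 3)

# sparse backend: dense inputs are coerced, result is a single CSR matrix
assert sparse.vstack([sparse.csr_matrix(a), b], format="csr").shape == (6, 3)

# pandas backend: axis=0 concat is the DataFrame analogue of np.vstack
assert pd.concat([pd.DataFrame(a), pd.DataFrame(b)], axis=0).shape == (6, 3)
```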
115 changes: 115 additions & 0 deletions lightautoml/dataset/utils.py
@@ -158,3 +158,118 @@ def concatenate(datasets: Sequence[LAMLDataset]) -> LAMLDataset:
        datasets = [datasets[n]] + [x for (y, x) in enumerate(datasets) if n != y]

    return conc(datasets)


def get_common_vconcat(
    datasets: Sequence[LAMLDataset],
) -> Tuple[Callable, Optional[type]]:
    """Get concatenation function for datasets of different types.

    Takes multiple datasets as input, checks that it's ok
    to concatenate them, and returns the matching function.

    Args:
        datasets: Sequence of datasets.

    Returns:
        Function that is able to concatenate datasets.

    """
    # TODO: Add pandas + numpy via transforming to numpy?
    dataset_types = set([type(x) for x in datasets])

    # general - if single type, concatenation for that type
    if len(dataset_types) == 1:
        klass = list(dataset_types)[0]
        return klass.vconcat, None

    # np and sparse goes to sparse
    elif dataset_types == {NumpyDataset, CSRSparseDataset}:
        return CSRSparseDataset.vconcat, CSRSparseDataset

    elif dataset_types == {NumpyDataset, PandasDataset}:
        return numpy_and_pandas_vconcat, None

    elif (dataset_types == {NumpyDataset, SeqNumpyPandasDataset}) or (
        dataset_types == {PandasDataset, SeqNumpyPandasDataset}
    ):
        return numpy_or_pandas_and_seq_vconcat, None

    raise TypeError("Unable to concatenate dataset types {0}".format(list(dataset_types)))


def numpy_and_pandas_vconcat(datasets: Sequence[Union[NumpyDataset, PandasDataset]]) -> PandasDataset:
    """Concat numpy and pandas datasets.

    Args:
        datasets: Sequence of datasets to concatenate.

    Returns:
        Concatenated dataset.

    """
    datasets = [x.to_pandas() for x in datasets]

    return PandasDataset.vconcat(datasets)


def numpy_or_pandas_and_seq_vconcat(
    datasets: Sequence[Union[NumpyDataset, PandasDataset, SeqNumpyPandasDataset]]
) -> Union[NumpyDataset, PandasDataset]:
    """Concat plain and sequential datasets.

    If both datasets have the same size, concat them as plain,
    otherwise include the seq dataset inside the plain one.

    Args:
        datasets: One plain and one seq dataset.

    Returns:
        Concatenated dataset.

    """
    assert len(datasets) == 2, "should be 1 sequential and 1 plain dataset"
    # split the pair into the sequential and the plain dataset
    for n, dataset in enumerate(datasets):
        if type(dataset) == SeqNumpyPandasDataset:
            seq_dataset = dataset
        else:
            plain_dataset = dataset

    if len(seq_dataset.data) == len(plain_dataset):
        return SeqNumpyPandasDataset.vconcat([seq_dataset, plain_dataset.to_pandas()])
    else:
        if hasattr(plain_dataset, "seq_data"):
            plain_dataset.seq_data[seq_dataset.name] = seq_dataset
        else:
            plain_dataset.seq_data = {seq_dataset.name: seq_dataset}

        return plain_dataset


def vconcatenate(datasets: Sequence[LAMLDataset]) -> LAMLDataset:
    """Dataset concatenation function.

    Checks that the datasets have a common concat function, then applies it.
    Target/folds/weights etc. are assumed to be taken from the first one.

    Args:
        datasets: Sequence of datasets.

    Returns:
        Dataset with concatenated features.

    """
    conc, klass = get_common_vconcat([ds for ds in datasets if ds is not None])

    # this part is made to avoid setting first dataset of required type
    if klass is not None:

        n = 0
        for n, ds in enumerate(datasets):
            if type(ds) is klass:
                break

        datasets = [datasets[n]] + [x for (y, x) in enumerate(datasets) if n != y]

    return conc(datasets)
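`vconcatenate` mirrors the existing `concatenate` dispatcher. A usage sketch under the same assumptions as above — here mixed numpy/pandas inputs should route through `numpy_and_pandas_vconcat`:

```python
import numpy as np
import pandas as pd

from lightautoml.dataset.np_pd_dataset import NumpyDataset, PandasDataset
from lightautoml.dataset.roles import NumericRole
from lightautoml.dataset.utils import vconcatenate

roles = {"f0": NumericRole(np.float32)}
np_part = NumpyDataset(np.zeros((2, 1)), features=["f0"], roles=roles)
pd_part = PandasDataset(pd.DataFrame({"f0": [1.0, 2.0]}), roles=roles)

# Mixed types dispatch to numpy_and_pandas_vconcat, which casts both
# parts to pandas before stacking rows; same-type inputs would go
# straight to their class's vconcat.
stacked = vconcatenate([np_part, pd_part])
assert stacked.shape == (4, 1)
```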
6 changes: 3 additions & 3 deletions lightautoml/ml_algo/base.py
@@ -16,7 +16,7 @@

import numpy as np

from lightautoml.validation.base import TrainValidIterator
from lightautoml.validation.base import HoldoutIterator, TrainValidIterator

from ..dataset.base import LAMLDataset
from ..dataset.np_pd_dataset import CSRSparseDataset
@@ -271,8 +271,8 @@ def fit_predict(self, train_valid_iterator: TrainValidIterator) -> NumpyDataset:
"===== Start working with \x1b[1mfold {}\x1b[0m for \x1b[1m{}\x1b[0m =====".format(n, self._name)
)
self.timer.set_control_point()

model, pred = self.fit_predict_single_fold(train, valid)
self.params['is_holdout'] = isinstance(train_valid_iterator,HoldoutIterator)
model, pred = self.fit_predict_single_fold(train, valid, 0)
self.models.append(model)
preds_arr[idx] += pred.reshape((pred.shape[0], -1))
counter_arr[idx] += 1