Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

issue 466: fixed parsing of int targets when loading file in CSV format #467

Merged
merged 1 commit into from
Jul 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion amlb/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,9 @@ def __setattr__(self, name, value):
def __json__(self):
return self.__dict__

def __repr__(self):
    # Debug representation built from the instance's attributes via the
    # shared `repr_def` helper (public attributes only, per its default).
    return repr_def(self)

def estimate_system_params(self):
on_unfulfilled = rconfig().benchmarks.on_unfulfilled_constraint
mode = re.split(r"\W+", rconfig().run_mode, maxsplit=1)[0]
Expand Down Expand Up @@ -537,7 +540,7 @@ def run(self):

result = meta_result = None
try:
log.info("Running task %s on framework %s with config:\n%s", task_config.name, self.benchmark.framework_name, repr_def(task_config))
log.info("Running task %s on framework %s with config:\n%s", task_config.name, self.benchmark.framework_name, task_config)
json_dump(task_config, task_config.output_metadata_file, style='pretty')
meta_result = self.benchmark.framework_module.run(self._dataset, task_config)
except Exception as e:
Expand Down
15 changes: 11 additions & 4 deletions amlb/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def __init__(self, index, name, data_type, values=None, has_missing_values=False
self.index = index
self.name = name
self.data_type = data_type.lower() if data_type is not None else None
self.values = self.normalize(values).tolist() if values is not None else None
self.values = values
self.has_missing_values = has_missing_values
self.is_target = is_target
# print(self)
Expand Down Expand Up @@ -80,8 +80,16 @@ def one_hot_encoder(self):
def normalize(self, arr):
    """Canonicalize raw feature values: coerce to strings, trim
    surrounding whitespace, and lowercase, returning a numpy array."""
    as_strings = np.asarray(arr).astype(str)
    stripped = np.char.strip(as_strings)
    return np.char.lower(stripped)

@property
def values(self):
    # Distinct values of the feature in their normalized form
    # (None when no values were provided).
    return self._values

@values.setter
def values(self, values):
    # Normalize (string-coerce, strip, lowercase — see `normalize`) on
    # assignment so every consumer sees one canonical list form.
    self._values = self.normalize(values).tolist() if values is not None else None

def __repr__(self):
return repr_def(self)
return repr_def(self, 'all')


class Datasplit(ABC):
Expand Down Expand Up @@ -134,8 +142,7 @@ def y(self) -> DF:
@lazy_property
@profile(logger=log)
def data_enc(self) -> AM:
data = np.where(self.data.notna(), self.data.values, None)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this no longer required (or was it superfluous to begin with)?

encoded_cols = [f.label_encoder.transform(data[:, f.index]) for f in self.dataset.features]
encoded_cols = [f.label_encoder.transform(self.data.iloc[:, f.index]) for f in self.dataset.features]
# optimize mem usage : frameworks use either raw data or encoded ones,
# so we can clear the cached raw data once they've been encoded
self.release(['data', 'X', 'y'])
Expand Down
37 changes: 31 additions & 6 deletions amlb/datasets/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from ..data import Dataset, DatasetType, Datasplit, Feature
from ..datautils import read_csv, to_data_frame
from ..resources import config as rconfig
from ..utils import Namespace as ns, as_list, lazy_property, list_all_files, memoize, path_from_split, profile, split_path
from ..utils import Namespace as ns, as_list, lazy_property, list_all_files, memoize, path_from_split, profile, repr_def, split_path

from .fileutils import is_archive, is_valid_url, unarchive_file, get_file_handler

Expand Down Expand Up @@ -125,6 +125,9 @@ def _extract_train_test_paths(self, dataset, fold=None):
else:
raise ValueError(f"Invalid dataset description: {dataset}")

def __repr__(self):
    # Debug representation via the shared `repr_def` helper
    # (public attributes only, per its default).
    return repr_def(self)


class FileDataset(Dataset):

Expand Down Expand Up @@ -166,6 +169,9 @@ def _get_metadata(self, prop):
meta = self._train.load_metadata()
return meta[prop]

def __repr__(self):
    # Debug representation via the shared `repr_def` helper;
    # 'all' includes private attributes as well as public ones.
    return repr_def(self, 'all')


class FileDatasplit(Datasplit):

Expand Down Expand Up @@ -212,11 +218,17 @@ def _set_feature_as_target(self, target: Feature):
ds_type = self.dataset._type
if ds_type and DatasetType[ds_type] in [DatasetType.binary, DatasetType.multiclass]:
if not target.is_categorical():
log.warning("Forcing target column %s as 'category' for classification problems: was originally detected as '%s'.",
log.warning("Forcing target column `%s` as 'category' for classification problems: was originally detected as '%s'.",
target.name, target.data_type)
# target.data_type = 'category'
self._convert_to_categorical(target)
target.is_target = True

def _convert_to_categorical(self, feature: Feature):
    # Force the feature to be treated as categorical; used to coerce
    # numeric target columns for classification problems.
    feature.data_type = 'category'
Comment on lines +223 to +227
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason to extract the one-line function? The function does not seem to be used anywhere else (according to my IDE).


def __repr__(self):
    # Debug representation via the shared `repr_def` helper;
    # 'all' includes private attributes as well as public ones.
    return repr_def(self, 'all')


class ArffDataset(FileDataset):

Expand Down Expand Up @@ -292,8 +304,10 @@ class CsvDataset(FileDataset):
def __init__(self, train_path, test_path,
target=None, features=None, type=None):
# todo: handle auto-split (if test_path is None): requires loading the training set, split, save
super().__init__(CsvDatasplit(self, train_path), CsvDatasplit(self, test_path),
super().__init__(None, None,
target=target, features=features, type=type)
self._train = CsvDatasplit(self, train_path)
self._test = CsvDatasplit(self, test_path)
Comment on lines +307 to +310
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the difference here? And if this is kept, please update the signature of the parent class' __init__ to mark the parameters as optional.

self._dtypes = None


Expand All @@ -310,7 +324,12 @@ def _ensure_loaded(self):
# df = df.convert_dtypes()
dt_conversions = {name: 'category'
for name, dtype in zip(df.dtypes.index, df.dtypes.values)
if pat.is_string_dtype(dtype) or pat.is_object_dtype(dtype)}
if pat.is_string_dtype(dtype)
or pat.is_object_dtype(dtype)
or (name == self.dataset._target
and self.dataset._type is not None
and DatasetType[self.dataset._type] in [DatasetType.binary, DatasetType.multiclass])
}
# we could be a bit more clever in the future and convert 'string' to category iff len(distinct values) << nrows
if dt_conversions:
df = df.astype(dt_conversions, copy=False)
Expand All @@ -337,8 +356,9 @@ def load_metadata(self):
for f in features:
col = self._ds.iloc[:, f.index]
f.has_missing_values = col.hasnans
# f.dtype = self._ds.dtypes[f.name]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# f.dtype = self._ds.dtypes[f.name]

if f.is_categorical():
f.values = sorted(self._ds.dtypes[f.name].categories.values)
f.values = self._unique_values(f.name)

target = self._find_target_feature(features)
self._set_feature_as_target(target)
Expand All @@ -359,6 +379,11 @@ def release(self, properties=None):
super().release(properties)
self._ds = None

def _unique_values(self, col_name: str):
dt = self._ds.dtypes[col_name]
return sorted(dt.categories.values if hasattr(dt, 'categories')
else self._ds[col_name].unique())


class FileConverter:
format = None
Expand Down
7 changes: 5 additions & 2 deletions amlb/datautils.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,9 @@ def __init__(self, type='label', target=True, encoded_type=int,
else:
raise ValueError("Encoder `type` should be one of {}.".format(['label', 'one-hot']))

def __repr__(self):
    # Debug representation built from the instance's attributes via the
    # shared `repr_def` helper (public attributes only, per its default).
    return repr_def(self)

@property
def _ignore_missing(self):
return self.for_target or self.missing_policy == 'ignore'
Expand Down Expand Up @@ -211,8 +214,8 @@ def transform(self, vec, **params):
:param params:
:return:
"""
if log.isEnabledFor(5): # logging.TRACE
log.debug("Transforming %s using %s", vec, repr_def(self))
if log.isEnabledFor(logging.TRACE):
log.debug("Transforming %s using %s", vec, self)

return_value = lambda v: v
if isinstance(vec, str):
Expand Down
1 change: 0 additions & 1 deletion amlb/frameworks/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ def load_framework_definitions(frameworks_file: Union[str, List[str]], config: N
frameworks = _load_and_merge_framework_definitions(frameworks_file, config)
for tag, defs in frameworks:
_sanitize_and_add_defaults(defs, config)
log.debug("Available framework definitions:\n%s", frameworks)
return frameworks


Expand Down
1 change: 1 addition & 0 deletions amlb/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ def framework_definition(self, name, tag=None):
if tag not in self._frameworks:
raise ValueError("Incorrect tag `{}`: only those among {} are allowed.".format(tag, self.config.frameworks.tags))
frameworks = self._frameworks[tag]
log.debug("Available framework definitions:\n%s", frameworks)
framework = next((f for n, f in frameworks if n.lower() == lname), None)
if not framework:
raise ValueError("Incorrect framework `{}`: not listed in {}.".format(name, self.config.frameworks.definition_file))
Expand Down
26 changes: 22 additions & 4 deletions amlb/utils/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,11 +234,29 @@ def __json__(self):
return Namespace.dict(self)


def repr_def(obj, show_private=False):
def _attributes(obj, filtr='all'):
attrs = vars(obj)
if filtr is None or filtr == 'all':
return attrs
elif filtr == 'public':
return {k: v for k, v in attrs.items() if not k.startswith('_')}
elif filtr == 'private':
return {k: v for k, v in attrs.items() if k.startswith('_')}
elif isinstance(filtr, list):
return {k: v for k, v in attrs.items() if k in filtr}
else:
assert callable(filtr)
return {k: v for k, v in attrs.items() if filtr(k)}
Comment on lines +247 to +249
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
else:
assert callable(filtr)
return {k: v for k, v in attrs.items() if filtr(k)}
elif callable(filtr):
return {k: v for k, v in attrs.items() if filtr(k)}
else:
raise ValueError("`filtr` must be a callable, a list, `None` or one of 'all', 'private', or 'public'.")

Or similar, so the error is interpretable should it ever be raised. Technically you could split it into TypeError or ValueError depending on which way it breaks, but I'd be OK with not splitting hairs here.



def _classname(obj):
return type(obj).__name__
Comment on lines +252 to +253
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same question about the use of extracting a once-used one-liner.



def repr_def(obj, attributes='public'):
    """Return a debug representation of `obj` as ``ClassName({attrs})``.

    :param obj: any object exposing a `__dict__`.
    :param attributes: filter forwarded to `_attributes` ('all', 'public',
        'private', a list of names, or a predicate on names).
    """
    # NOTE(review): the scraped span mixed pre- and post-change diff lines
    # (duplicated `cls=`/`attrs=` keyword arguments); only the updated
    # implementation is kept here.
    return "{cls}({attrs!r})".format(
        cls=_classname(obj),
        attrs=_attributes(obj, attributes)
    )


Expand Down
3 changes: 2 additions & 1 deletion frameworks/H2OAutoML/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ def run(dataset: Dataset, config: TaskConfig):
train=dict(path=dataset.train.path),
test=dict(path=dataset.test.path),
target=dict(index=dataset.target.index),
domains=dict(cardinalities=[0 if f.values is None else len(f.values) for f in dataset.features])
domains=dict(cardinalities=[0 if f.values is None else len(f.values) for f in dataset.features]),
format=dataset.train.format
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
format=dataset.train.format
format=dataset.train.format,

To make future updates easier to parse.

)

config.ext.monitoring = rconfig().monitoring
Expand Down
4 changes: 4 additions & 0 deletions frameworks/H2OAutoML/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ def run(dataset, config):
test = h2o.import_file(dataset.test.path, destination_frame=frame_name('test', config), **import_kwargs)
# test.impute(method='mean')

if config.type == 'classification' and dataset.format == 'csv':
train[dataset.target.index] = train[dataset.target.index].asfactor()
test[dataset.target.index] = test[dataset.target.index].asfactor()

log.info("Running model on task %s, fold %s.", config.name, config.fold)
log.debug("Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
config.max_runtime_seconds, config.cores, sort_metric)
Expand Down
2 changes: 1 addition & 1 deletion runbenchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@
# now_str = datetime_iso(time=False, no_sep=True)
if args.profiling:
logging.TRACE = logging.INFO
log_levels = ns({logger: level.upper()
log_levels = ns({logger: int(level) if level.isnumeric() else level.upper()
for logger, level in [d.split(':') for d in args.logging.split(',')]} if ':' in args.logging
else dict(console=args.logging.upper(), app=args.logging.upper(), root=args.logging.upper()) if args.logging
else {}) | ns(console='INFO', app='DEBUG', root='INFO') # adding defaults if needed
Expand Down
16 changes: 16 additions & 0 deletions tests/unit/amlb/datasets/file/resources/iris_num_test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
sepallength,sepalwidth,petallength,petalwidth,class
5.0,3.5,1.6,0.6,1
5.8,4.0,1.2,0.2,1
4.9,3.1,1.5,0.1,1
5.1,3.3,1.7,0.5,1
5.4,3.7,1.5,0.2,1
5.7,2.8,4.1,1.3,2
6.3,2.3,4.4,1.3,2
6.2,2.9,4.3,1.3,2
6.0,2.2,4.0,1.0,2
5.8,2.6,4.0,1.2,2
6.0,2.2,5.0,1.5,3
6.4,2.7,5.3,1.9,3
6.7,3.3,5.7,2.5,3
6.5,3.0,5.5,1.8,3
7.2,3.2,6.0,1.8,3
Loading