issue 466: fixed parsing of int targets when loading file in CSV format #467
@@ -13,7 +13,7 @@
 from ..data import Dataset, DatasetType, Datasplit, Feature
 from ..datautils import read_csv, to_data_frame
 from ..resources import config as rconfig
-from ..utils import Namespace as ns, as_list, lazy_property, list_all_files, memoize, path_from_split, profile, split_path
+from ..utils import Namespace as ns, as_list, lazy_property, list_all_files, memoize, path_from_split, profile, repr_def, split_path

 from .fileutils import is_archive, is_valid_url, unarchive_file, get_file_handler
@@ -125,6 +125,9 @@ def _extract_train_test_paths(self, dataset, fold=None):
         else:
             raise ValueError(f"Invalid dataset description: {dataset}")

+    def __repr__(self):
+        return repr_def(self)


 class FileDataset(Dataset):
@@ -166,6 +169,9 @@ def _get_metadata(self, prop):
         meta = self._train.load_metadata()
         return meta[prop]

+    def __repr__(self):
+        return repr_def(self, 'all')


 class FileDatasplit(Datasplit):
@@ -212,11 +218,17 @@ def _set_feature_as_target(self, target: Feature):
         ds_type = self.dataset._type
         if ds_type and DatasetType[ds_type] in [DatasetType.binary, DatasetType.multiclass]:
             if not target.is_categorical():
-                log.warning("Forcing target column %s as 'category' for classification problems: was originally detected as '%s'.",
+                log.warning("Forcing target column `%s` as 'category' for classification problems: was originally detected as '%s'.",
                             target.name, target.data_type)
                 # target.data_type = 'category'
+                self._convert_to_categorical(target)
         target.is_target = True

+    def _convert_to_categorical(self, feature: Feature):
+        feature.data_type = 'category'
Review comment on lines +223 to +227: Is there a reason to extract the one-line function? The function does not seem to be used anywhere else (according to my IDE).
+    def __repr__(self):
+        return repr_def(self, 'all')


 class ArffDataset(FileDataset):
@@ -292,8 +304,10 @@ class CsvDataset(FileDataset):

     def __init__(self, train_path, test_path,
                  target=None, features=None, type=None):
         # todo: handle auto-split (if test_path is None): requires loading the training set, split, save
-        super().__init__(CsvDatasplit(self, train_path), CsvDatasplit(self, test_path),
+        super().__init__(None, None,
                          target=target, features=features, type=type)
+        self._train = CsvDatasplit(self, train_path)
+        self._test = CsvDatasplit(self, test_path)
Review comment on lines +307 to +310: What's the difference here? And if this is kept, please update the signature of the parent class'
         self._dtypes = None
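One observable difference in the new version (described here only from the code shown, not as a claim about the author's intent): the splits are now created after the parent initializer has run, so the `self` passed to each CsvDatasplit already has the parent's attributes set, whereas before it was handed over while still mid-construction. A minimal generic sketch of the pattern, with invented class names:

class BaseDataset:
    def __init__(self, train, test, target=None):
        # the parent sets its own attributes first
        self.target = target
        self._train = train
        self._test = test

class Split:
    def __init__(self, dataset, path):
        # the split keeps a back-reference to its dataset
        self.dataset = dataset
        self.path = path

class CsvLikeDataset(BaseDataset):
    def __init__(self, train_path, test_path, target=None):
        super().__init__(None, None, target=target)   # run the parent initializer first
        self._train = Split(self, train_path)          # splits now see an initialized dataset
        self._test = Split(self, test_path)

d = CsvLikeDataset('train.csv', 'test.csv', target='class')
print(d._train.dataset.target)  # 'class': already set when the split was built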
@@ -310,7 +324,12 @@ def _ensure_loaded(self):
             # df = df.convert_dtypes()
             dt_conversions = {name: 'category'
                               for name, dtype in zip(df.dtypes.index, df.dtypes.values)
-                              if pat.is_string_dtype(dtype) or pat.is_object_dtype(dtype)}
+                              if pat.is_string_dtype(dtype)
+                              or pat.is_object_dtype(dtype)
+                              or (name == self.dataset._target
+                                  and self.dataset._type is not None
+                                  and DatasetType[self.dataset._type] in [DatasetType.binary, DatasetType.multiclass])
+                              }
             # we could be a bit more clever in the future and convert 'string' to category iff len(distinct values) << nrows
             if dt_conversions:
                 df = df.astype(dt_conversions, copy=False)
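The effect of the widened condition can be illustrated with a small standalone pandas sketch (not the benchmark's own code; the frame, column names and flags below are invented): an integer target column is neither a string nor an object dtype, so before this change it was never converted, while string columns already were.

import pandas as pd
import pandas.api.types as pat

# toy frame standing in for a loaded CSV split of a multiclass task
df = pd.DataFrame({'sepallength': [5.0, 5.8, 6.3],
                   'species': ['a', 'b', 'b'],   # object dtype: converted before and after the change
                   'class': [1, 1, 2]})          # int64 target: only converted with the new condition

target_name = 'class'       # stands in for self.dataset._target
is_classification = True    # stands in for the binary/multiclass check on self.dataset._type

dt_conversions = {name: 'category'
                  for name, dtype in zip(df.dtypes.index, df.dtypes.values)
                  if pat.is_string_dtype(dtype)
                  or pat.is_object_dtype(dtype)
                  or (name == target_name and is_classification)}

df = df.astype(dt_conversions, copy=False)
print(df.dtypes)  # both 'species' and 'class' are now category dtype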
@@ -337,8 +356,9 @@ def load_metadata(self):
         for f in features:
             col = self._ds.iloc[:, f.index]
             f.has_missing_values = col.hasnans
             # f.dtype = self._ds.dtypes[f.name]
Review comment on the line above with a suggested change (content not preserved here).
             if f.is_categorical():
-                f.values = sorted(self._ds.dtypes[f.name].categories.values)
+                f.values = self._unique_values(f.name)

         target = self._find_target_feature(features)
         self._set_feature_as_target(target)
@@ -359,6 +379,11 @@ def release(self, properties=None):
         super().release(properties)
         self._ds = None

+    def _unique_values(self, col_name: str):
+        dt = self._ds.dtypes[col_name]
+        return sorted(dt.categories.values if hasattr(dt, 'categories')
+                      else self._ds[col_name].unique())


 class FileConverter:
     format = None
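A rough standalone illustration of the helper's two branches (a toy frame, not the actual CsvDatasplit): a pandas category dtype exposes its categories directly, while any other dtype falls back to the column's distinct values.

import pandas as pd

df = pd.DataFrame({'species': pd.Series(['b', 'a', 'b'], dtype='category'),
                   'class': [3, 1, 2]})  # plain int64 column, its dtype has no .categories

def unique_values(ds, col_name):
    # same logic as CsvDatasplit._unique_values above
    dt = ds.dtypes[col_name]
    return sorted(dt.categories.values if hasattr(dt, 'categories')
                  else ds[col_name].unique())

print(unique_values(df, 'species'))  # ['a', 'b'], read from the categorical dtype
print(unique_values(df, 'class'))    # [1, 2, 3], computed from the raw values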
@@ -234,11 +234,29 @@ def __json__(self):
         return Namespace.dict(self)


-def repr_def(obj, show_private=False):
+def _attributes(obj, filtr='all'):
+    attrs = vars(obj)
+    if filtr is None or filtr == 'all':
+        return attrs
+    elif filtr == 'public':
+        return {k: v for k, v in attrs.items() if not k.startswith('_')}
+    elif filtr == 'private':
+        return {k: v for k, v in attrs.items() if k.startswith('_')}
+    elif isinstance(filtr, list):
+        return {k: v for k, v in attrs.items() if k in filtr}
+    else:
+        assert callable(filtr)
+        return {k: v for k, v in attrs.items() if filtr(k)}
Review comment on lines +247 to +249 (with a suggested change, not preserved here): Or similar, so the error is interpretable should it ever be raised. Technically you could split it into
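The body of that suggested change is not preserved in this view; purely as an illustration of the reviewer's point, and not the actual suggestion, the fallback branch of `_attributes` could raise a descriptive error instead of relying on a bare assert:

def _attributes(obj, filtr='all'):
    # illustrative variant only: same behaviour as the helper above, but the last
    # branch fails with a readable message instead of an AssertionError
    attrs = vars(obj)
    if filtr is None or filtr == 'all':
        return attrs
    elif filtr == 'public':
        return {k: v for k, v in attrs.items() if not k.startswith('_')}
    elif filtr == 'private':
        return {k: v for k, v in attrs.items() if k.startswith('_')}
    elif isinstance(filtr, list):
        return {k: v for k, v in attrs.items() if k in filtr}
    elif callable(filtr):
        return {k: v for k, v in attrs.items() if filtr(k)}
    else:
        raise ValueError(f"filtr must be None, 'all', 'public', 'private', "
                         f"a list of attribute names or a callable, got {filtr!r}")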
+def _classname(obj):
+    return type(obj).__name__

Review comment on lines +252 to +253: Same question about the use of extracting a once-used one-liner.
+def repr_def(obj, attributes='public'):
     return "{cls}({attrs!r})".format(
-        cls=type(obj).__name__,
-        attrs=(vars(obj) if show_private
-               else {k: v for k, v in vars(obj).items() if k.startswith('_')})
+        cls=_classname(obj),
+        attrs=_attributes(obj, attributes)
     )
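With the reworked helpers in scope, the attribute filter becomes a per-call choice; a quick usage sketch (the Split class and its attributes are invented for illustration):

class Split:
    def __init__(self):
        self.path = 'train.csv'   # public attribute
        self._data = None         # private attribute

    def __repr__(self):
        return repr_def(self)     # default filter: public attributes only

s = Split()
print(repr(s))                             # Split({'path': 'train.csv'})
print(repr_def(s, 'all'))                  # includes both path and _data
print(repr_def(s, ['path']))               # only the listed attribute names
print(repr_def(s, lambda k: 'data' in k))  # any predicate on the attribute name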
@@ -23,7 +23,8 @@ def run(dataset: Dataset, config: TaskConfig):
         train=dict(path=dataset.train.path),
         test=dict(path=dataset.test.path),
         target=dict(index=dataset.target.index),
-        domains=dict(cardinalities=[0 if f.values is None else len(f.values) for f in dataset.features])
+        domains=dict(cardinalities=[0 if f.values is None else len(f.values) for f in dataset.features]),
+        format=dataset.train.format
Review comment (with a suggested change, not preserved here): To make future updates easier to parse.
     )

     config.ext.monitoring = rconfig().monitoring
@@ -0,0 +1,16 @@
+sepallength,sepalwidth,petallength,petalwidth,class
+5.0,3.5,1.6,0.6,1
+5.8,4.0,1.2,0.2,1
+4.9,3.1,1.5,0.1,1
+5.1,3.3,1.7,0.5,1
+5.4,3.7,1.5,0.2,1
+5.7,2.8,4.1,1.3,2
+6.3,2.3,4.4,1.3,2
+6.2,2.9,4.3,1.3,2
+6.0,2.2,4.0,1.0,2
+5.8,2.6,4.0,1.2,2
+6.0,2.2,5.0,1.5,3
+6.4,2.7,5.3,1.9,3
+6.7,3.3,5.7,2.5,3
+6.5,3.0,5.5,1.8,3
+7.2,3.2,6.0,1.8,3
Review comment: Why is this no longer required (or was it superfluous to begin with)?