Skip to content

Commit

Permalink
Fix NonMatchingSplitsSizesError/ExpectedMoreSplits in no-code Hub datasets when passing data_dir/data_files (#6925)
Browse files Browse the repository at this point in the history

* Do not use exported dataset infos in some cases

* Add regression tests
  • Loading branch information
albertvillanova committed Jun 3, 2024
1 parent 18cebaf commit 67b6909
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 1 deletion.
7 changes: 6 additions & 1 deletion src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -1235,7 +1235,12 @@ def get_module(self) -> DatasetModule:
pass
metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
if config.USE_PARQUET_EXPORT: # maybe don't use the infos from the parquet export
# Use the infos from the parquet export except in some cases:
if self.data_dir or self.data_files or (self.revision and self.revision != "main"):
use_exported_dataset_infos = False
else:
use_exported_dataset_infos = True
if config.USE_PARQUET_EXPORT and use_exported_dataset_infos:
try:
exported_dataset_infos = _dataset_viewer.get_exported_dataset_infos(
dataset=self.name, revision=self.revision, token=self.download_config.token
Expand Down
15 changes: 15 additions & 0 deletions tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -1267,6 +1267,21 @@ def test_load_dataset_cached_local_script(dataset_loading_script_dir, data_dir,
assert f"Dataset '{SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST}' doesn't exist on the Hub" in str(exc_info.value)


@pytest.mark.integration
@pytest.mark.parametrize(
    "kwargs, expected_train_num_rows, expected_test_num_rows",
    [
        ({}, 2, 2),
        ({"data_dir": "data1"}, 1, 1),  # GH-6918: NonMatchingSplitsSizesError
        ({"data_files": "data1/train.txt"}, 1, None),  # GH-6939: ExpectedMoreSplits
    ],
)
def test_load_dataset_without_script_from_hub(kwargs, expected_train_num_rows, expected_test_num_rows):
    """Check split sizes when loading a script-less Hub dataset with optional ``data_dir``/``data_files``.

    Args:
        kwargs: Extra keyword arguments forwarded to ``load_dataset``.
        expected_train_num_rows: Expected number of rows in the ``"train"`` split.
        expected_test_num_rows: Expected number of rows in the ``"test"`` split,
            or ``None`` when no ``"test"`` split must be present.
    """
    dataset = load_dataset(SAMPLE_DATASET_IDENTIFIER3, **kwargs)
    assert dataset["train"].num_rows == expected_train_num_rows
    # Compare against None explicitly: an expected count of 0 is a legitimate
    # value and must not be mistaken for "the test split should be absent".
    if expected_test_num_rows is None:
        assert "test" not in dataset
    else:
        assert dataset["test"].num_rows == expected_test_num_rows


@pytest.mark.integration
@pytest.mark.parametrize("stream_from_cache, ", [False, True])
def test_load_dataset_cached_from_hub(stream_from_cache, caplog):
Expand Down

0 comments on commit 67b6909

Please sign in to comment.