From 67b6909996dff985e42f12ecc663994d0ef7f4e3 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Fri, 31 May 2024 19:10:37 +0200
Subject: [PATCH] Fix NonMatchingSplitsSizesError/ExpectedMoreSplits in
 no-code Hub datasets when passing data_dir/data_files (#6925)

* Do not use exported dataset infos in some cases

* Add regression tests
---
 src/datasets/load.py |  7 ++++++-
 tests/test_load.py   | 15 +++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/datasets/load.py b/src/datasets/load.py
index fd7aa401094..824817843fd 100644
--- a/src/datasets/load.py
+++ b/src/datasets/load.py
@@ -1235,7 +1235,12 @@ def get_module(self) -> DatasetModule:
             pass
         metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
         dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
-        if config.USE_PARQUET_EXPORT:  # maybe don't use the infos from the parquet export
+        # Use the infos from the parquet export except in some cases:
+        if self.data_dir or self.data_files or (self.revision and self.revision != "main"):
+            use_exported_dataset_infos = False
+        else:
+            use_exported_dataset_infos = True
+        if config.USE_PARQUET_EXPORT and use_exported_dataset_infos:
             try:
                 exported_dataset_infos = _dataset_viewer.get_exported_dataset_infos(
                     dataset=self.name, revision=self.revision, token=self.download_config.token
diff --git a/tests/test_load.py b/tests/test_load.py
index 4b2b9cbf58c..c7c413ae10b 100644
--- a/tests/test_load.py
+++ b/tests/test_load.py
@@ -1267,6 +1267,21 @@ def test_load_dataset_cached_local_script(dataset_loading_script_dir, data_dir,
     assert f"Dataset '{SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST}' doesn't exist on the Hub" in str(exc_info.value)
 
 
+@pytest.mark.integration
+@pytest.mark.parametrize(
+    "kwargs, expected_train_num_rows, expected_test_num_rows",
+    [
+        ({}, 2, 2),
+        ({"data_dir": "data1"}, 1, 1),  # GH-6918: NonMatchingSplitsSizesError
+        ({"data_files": "data1/train.txt"}, 1, None),  # GH-6939: ExpectedMoreSplits
+    ],
+)
+def test_load_dataset_without_script_from_hub(kwargs, expected_train_num_rows, expected_test_num_rows):
+    dataset = load_dataset(SAMPLE_DATASET_IDENTIFIER3, **kwargs)
+    assert dataset["train"].num_rows == expected_train_num_rows
+    assert (dataset["test"].num_rows == expected_test_num_rows) if expected_test_num_rows else ("test" not in dataset)
+
+
 @pytest.mark.integration
 @pytest.mark.parametrize("stream_from_cache, ", [False, True])
 def test_load_dataset_cached_from_hub(stream_from_cache, caplog):
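
Note: a minimal repro sketch of the behavior this patch fixes. The repo id
"user/dataset-dirs" is a hypothetical stand-in for the no-code Hub dataset
behind SAMPLE_DATASET_IDENTIFIER3, assuming a layout consistent with the test
expectations above (data1/train.txt, data1/test.txt, data2/train.txt,
data2/test.txt, one row per file):

    from datasets import load_dataset

    # Loading the full dataset verifies split sizes against the parquet-export
    # infos, as before this patch.
    ds = load_dataset("user/dataset-dirs")  # hypothetical repo id
    assert ds["train"].num_rows == 2

    # Before this patch, passing data_dir still verified the selected subset
    # against the exported infos for the *full* dataset, raising
    # NonMatchingSplitsSizesError (GH-6918).
    ds = load_dataset("user/dataset-dirs", data_dir="data1")
    assert ds["train"].num_rows == 1

    # Likewise, passing data_files raised ExpectedMoreSplits, since the
    # exported infos list a "test" split that the selected files do not
    # provide (GH-6939).
    ds = load_dataset("user/dataset-dirs", data_files="data1/train.txt")
    assert "test" not in ds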