From 7de06490adb9148a66621b56e71503d4e4e54a6b Mon Sep 17 00:00:00 2001 From: Wauplin Date: Fri, 7 Jun 2024 13:20:17 +0200 Subject: [PATCH 1/3] Better error handling in dataset_module_factory --- src/datasets/load.py | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 824817843fd..216fcc12ebf 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -37,6 +37,7 @@ import yaml from fsspec.core import url_to_fs from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem +from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError, RevisionNotFoundError from . import config from .arrow_dataset import Dataset @@ -1836,28 +1837,23 @@ def dataset_module_factory( token=download_config.token, timeout=100.0, ) - except Exception as e: # noqa catch any exception of hf_hub and consider that the dataset doesn't exist - if isinstance( - e, - ( - OfflineModeIsEnabled, - requests.exceptions.ConnectTimeout, - requests.exceptions.ConnectionError, - ), - ): - raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({type(e).__name__})") - elif "404" in str(e): - msg = f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed" - raise DatasetNotFoundError(msg + f" at revision '{revision}'" if revision else msg) - elif "401" in str(e): - msg = f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed" - msg = msg + f" at revision '{revision}'" if revision else msg - raise DatasetNotFoundError( - msg - + f". If the dataset is private or gated, make sure to log in with `huggingface-cli login` or visit the dataset page at https://huggingface.co/datasets/{path} to ask for access." - ) - else: - raise e + except ( + OfflineModeIsEnabled, + requests.exceptions.ConnectTimeout, + requests.exceptions.ConnectionError, + ) as e: + raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e + except GatedRepoError as e: + raise DatasetNotFoundError( + f"Dataset '{path}' is a gated dataset on the Hub. Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access." + ) from e + except RevisionNotFoundError as e: + raise DatasetNotFoundError( + f"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub." + ) from e + except RepositoryNotFoundError as e: + raise DatasetNotFoundError(f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed.") from e + if filename in [sibling.rfilename for sibling in dataset_info.siblings]: # contains a dataset script fs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=download_config.token) if _require_custom_configs or (revision and revision != "main"): From ef8f7cee79ffb070d9b5190f21128fc523b3d3ee Mon Sep 17 00:00:00 2001 From: Wauplin Date: Fri, 7 Jun 2024 13:37:42 +0200 Subject: [PATCH 2/3] fix test --- tests/test_load.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/test_load.py b/tests/test_load.py index c7c413ae10b..1986b3ccf0a 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -1048,13 +1048,9 @@ def test_load_dataset_from_hub(self): str(context.exception), ) with self.assertRaises(DatasetNotFoundError) as context: - datasets.load_dataset("_dummy", revision="0.0.0") + datasets.load_dataset("HuggingFaceFW/fineweb-edu", revision="0.0.0") self.assertIn( - "Dataset '_dummy' doesn't exist on the Hub", - str(context.exception), - ) - self.assertIn( - "at revision '0.0.0'", + "Revision '0.0.0' doesn't exist for dataset 'HuggingFaceFW/fineweb-edu' on the Hub.", str(context.exception), ) for offline_simulation_mode in list(OfflineSimulationMode): From 62350f5cddfe056c71ca2bb760250f76b5a49394 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Fri, 7 Jun 2024 13:44:44 +0200 Subject: [PATCH 3/3] Add custom message on GatedRepoError --- src/datasets/load.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 216fcc12ebf..976d4942e4a 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1844,9 +1844,12 @@ def dataset_module_factory( ) as e: raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e except GatedRepoError as e: - raise DatasetNotFoundError( - f"Dataset '{path}' is a gated dataset on the Hub. Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access." - ) from e + message = f"Dataset '{path}' is a gated dataset on the Hub." + if "401 Client Error" in str(e): + message += " You must be authenticated to access it." + elif "403 Client Error" in str(e): + message += f" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access." + raise DatasetNotFoundError(message) from e except RevisionNotFoundError as e: raise DatasetNotFoundError( f"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub."