diff --git a/src/datasets/load.py b/src/datasets/load.py index 2dab9f7a7e6..dbd6abad46a 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -37,6 +37,7 @@ import yaml from fsspec.core import url_to_fs from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem +from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError, RevisionNotFoundError from . import config from .arrow_dataset import Dataset @@ -1836,28 +1837,26 @@ def dataset_module_factory( token=download_config.token, timeout=100.0, ) - except Exception as e: # noqa catch any exception of hf_hub and consider that the dataset doesn't exist - if isinstance( - e, - ( - OfflineModeIsEnabled, - requests.exceptions.ConnectTimeout, - requests.exceptions.ConnectionError, - ), - ): - raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({type(e).__name__})") - elif "404" in str(e): - msg = f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed" - raise DatasetNotFoundError(msg + f" at revision '{revision}'" if revision else msg) - elif "401" in str(e): - msg = f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed" - msg = msg + f" at revision '{revision}'" if revision else msg - raise DatasetNotFoundError( - msg - + f". If the dataset is private or gated, make sure to log in with `huggingface-cli login` or visit the dataset page at https://huggingface.co/datasets/{path} to ask for access." - ) - else: - raise e + except ( + OfflineModeIsEnabled, + requests.exceptions.ConnectTimeout, + requests.exceptions.ConnectionError, + ) as e: + raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e + except GatedRepoError as e: + message = f"Dataset '{path}' is a gated dataset on the Hub." + if "401 Client Error" in str(e): + message += " You must be authenticated to access it." + elif "403 Client Error" in str(e): + message += f" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access." + raise DatasetNotFoundError(message) from e + except RevisionNotFoundError as e: + raise DatasetNotFoundError( + f"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub." + ) from e + except RepositoryNotFoundError as e: + raise DatasetNotFoundError(f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed.") from e + if filename in [sibling.rfilename for sibling in dataset_info.siblings]: # contains a dataset script fs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=download_config.token) if _require_custom_configs or (revision and revision != "main"): diff --git a/tests/test_load.py b/tests/test_load.py index 2c02e834892..3547a0c00a0 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -1067,13 +1067,9 @@ def test_load_dataset_from_hub(self): str(context.exception), ) with self.assertRaises(DatasetNotFoundError) as context: - datasets.load_dataset("_dummy", revision="0.0.0") + datasets.load_dataset("HuggingFaceFW/fineweb-edu", revision="0.0.0") self.assertIn( - "Dataset '_dummy' doesn't exist on the Hub", - str(context.exception), - ) - self.assertIn( - "at revision '0.0.0'", + "Revision '0.0.0' doesn't exist for dataset 'HuggingFaceFW/fineweb-edu' on the Hub.", str(context.exception), ) for offline_simulation_mode in list(OfflineSimulationMode):