Skip to content

Commit

Permalink
Better error handling in dataset_module_factory (#6959)
Browse files: browse the repository at this point in the history
* Better error handling in dataset_module_factory

* fix test

* Add custom message on GatedRepoError
  • Loading branch information
Wauplin authored Jun 10, 2024
1 parent 97513be commit 9510252
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 28 deletions.
43 changes: 21 additions & 22 deletions src/datasets/load.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -37,6 +37,7 @@
import yaml
from fsspec.core import url_to_fs
from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError, RevisionNotFoundError

from . import config
from .arrow_dataset import Dataset
Expand Down Expand Up @@ -1836,28 +1837,26 @@ def dataset_module_factory(
token=download_config.token,
timeout=100.0,
)
except Exception as e: # noqa catch any exception of hf_hub and consider that the dataset doesn't exist
if isinstance(
e,
(
OfflineModeIsEnabled,
requests.exceptions.ConnectTimeout,
requests.exceptions.ConnectionError,
),
):
raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({type(e).__name__})")
elif "404" in str(e):
msg = f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed"
raise DatasetNotFoundError(msg + f" at revision '{revision}'" if revision else msg)
elif "401" in str(e):
msg = f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed"
msg = msg + f" at revision '{revision}'" if revision else msg
raise DatasetNotFoundError(
msg
+ f". If the dataset is private or gated, make sure to log in with `huggingface-cli login` or visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
)
else:
raise e
except (
OfflineModeIsEnabled,
requests.exceptions.ConnectTimeout,
requests.exceptions.ConnectionError,
) as e:
raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e
except GatedRepoError as e:
message = f"Dataset '{path}' is a gated dataset on the Hub."
if "401 Client Error" in str(e):
message += " You must be authenticated to access it."
elif "403 Client Error" in str(e):
message += f" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
raise DatasetNotFoundError(message) from e
except RevisionNotFoundError as e:
raise DatasetNotFoundError(
f"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub."
) from e
except RepositoryNotFoundError as e:
raise DatasetNotFoundError(f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed.") from e

if filename in [sibling.rfilename for sibling in dataset_info.siblings]: # contains a dataset script
fs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=download_config.token)
if _require_custom_configs or (revision and revision != "main"):
Expand Down
8 changes: 2 additions & 6 deletions tests/test_load.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -1067,13 +1067,9 @@ def test_load_dataset_from_hub(self):
str(context.exception),
)
with self.assertRaises(DatasetNotFoundError) as context:
datasets.load_dataset("_dummy", revision="0.0.0")
datasets.load_dataset("HuggingFaceFW/fineweb-edu", revision="0.0.0")
self.assertIn(
"Dataset '_dummy' doesn't exist on the Hub",
str(context.exception),
)
self.assertIn(
"at revision '0.0.0'",
"Revision '0.0.0' doesn't exist for dataset 'HuggingFaceFW/fineweb-edu' on the Hub.",
str(context.exception),
)
for offline_simulation_mode in list(OfflineSimulationMode):
Expand Down

0 comments on commit 9510252

Please sign in to comment.