Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better error handling in dataset_module_factory #6959

Merged
merged 3 commits on Jun 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 21 additions & 22 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import yaml
from fsspec.core import url_to_fs
from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError, RevisionNotFoundError

from . import config
from .arrow_dataset import Dataset
Expand Down Expand Up @@ -1836,28 +1837,26 @@ def dataset_module_factory(
token=download_config.token,
timeout=100.0,
)
except Exception as e: # noqa catch any exception of hf_hub and consider that the dataset doesn't exist
if isinstance(
e,
(
OfflineModeIsEnabled,
requests.exceptions.ConnectTimeout,
requests.exceptions.ConnectionError,
),
):
raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({type(e).__name__})")
elif "404" in str(e):
msg = f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed"
raise DatasetNotFoundError(msg + f" at revision '{revision}'" if revision else msg)
elif "401" in str(e):
msg = f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed"
msg = msg + f" at revision '{revision}'" if revision else msg
raise DatasetNotFoundError(
msg
+ f". If the dataset is private or gated, make sure to log in with `huggingface-cli login` or visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
)
else:
raise e
except (
OfflineModeIsEnabled,
requests.exceptions.ConnectTimeout,
requests.exceptions.ConnectionError,
) as e:
raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e
except GatedRepoError as e:
message = f"Dataset '{path}' is a gated dataset on the Hub."
if "401 Client Error" in str(e):
message += " You must be authenticated to access it."
elif "403 Client Error" in str(e):
message += f" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
raise DatasetNotFoundError(message) from e
except RevisionNotFoundError as e:
raise DatasetNotFoundError(
f"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub."
) from e
except RepositoryNotFoundError as e:
raise DatasetNotFoundError(f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed.") from e

if filename in [sibling.rfilename for sibling in dataset_info.siblings]: # contains a dataset script
fs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=download_config.token)
if _require_custom_configs or (revision and revision != "main"):
Expand Down
8 changes: 2 additions & 6 deletions tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -1048,13 +1048,9 @@ def test_load_dataset_from_hub(self):
str(context.exception),
)
with self.assertRaises(DatasetNotFoundError) as context:
datasets.load_dataset("_dummy", revision="0.0.0")
datasets.load_dataset("HuggingFaceFW/fineweb-edu", revision="0.0.0")
self.assertIn(
"Dataset '_dummy' doesn't exist on the Hub",
str(context.exception),
)
self.assertIn(
"at revision '0.0.0'",
"Revision '0.0.0' doesn't exist for dataset 'HuggingFaceFW/fineweb-edu' on the Hub.",
str(context.exception),
)
for offline_simulation_mode in list(OfflineSimulationMode):
Expand Down
Loading