Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better error handling in dataset_module_factory #6959

Merged
merged 3 commits into from
Jun 10, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 18 additions & 22 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import yaml
from fsspec.core import url_to_fs
from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError, RevisionNotFoundError

from . import config
from .arrow_dataset import Dataset
Expand Down Expand Up @@ -1836,28 +1837,23 @@ def dataset_module_factory(
token=download_config.token,
timeout=100.0,
)
except Exception as e: # noqa catch any exception of hf_hub and consider that the dataset doesn't exist
if isinstance(
e,
(
OfflineModeIsEnabled,
requests.exceptions.ConnectTimeout,
requests.exceptions.ConnectionError,
),
):
raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({type(e).__name__})")
elif "404" in str(e):
msg = f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed"
raise DatasetNotFoundError(msg + f" at revision '{revision}'" if revision else msg)
elif "401" in str(e):
msg = f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed"
msg = msg + f" at revision '{revision}'" if revision else msg
raise DatasetNotFoundError(
msg
+ f". If the dataset is private or gated, make sure to log in with `huggingface-cli login` or visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
)
else:
raise e
except (
OfflineModeIsEnabled,
requests.exceptions.ConnectTimeout,
requests.exceptions.ConnectionError,
) as e:
raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e
except GatedRepoError as e:
raise DatasetNotFoundError(
f"Dataset '{path}' is a gated dataset on the Hub. Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe mention that the user may have to log in?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in 62350f5.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks.

) from e
except RevisionNotFoundError as e:
raise DatasetNotFoundError(
f"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub."
) from e
except RepositoryNotFoundError as e:
raise DatasetNotFoundError(f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed.") from e

if filename in [sibling.rfilename for sibling in dataset_info.siblings]: # contains a dataset script
fs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=download_config.token)
if _require_custom_configs or (revision and revision != "main"):
Expand Down
8 changes: 2 additions & 6 deletions tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -1048,13 +1048,9 @@ def test_load_dataset_from_hub(self):
str(context.exception),
)
with self.assertRaises(DatasetNotFoundError) as context:
datasets.load_dataset("_dummy", revision="0.0.0")
datasets.load_dataset("HuggingFaceFW/fineweb-edu", revision="0.0.0")
self.assertIn(
"Dataset '_dummy' doesn't exist on the Hub",
str(context.exception),
)
self.assertIn(
"at revision '0.0.0'",
"Revision '0.0.0' doesn't exist for dataset 'HuggingFaceFW/fineweb-edu' on the Hub.",
str(context.exception),
)
for offline_simulation_mode in list(OfflineSimulationMode):
Expand Down
Loading