Skip to content

Commit

Permalink
Add support for fsspec>=2023.9.0 (#6244)
Browse files Browse the repository at this point in the history
* Add support for `fsspec>=2023.9.0`

* Fixes

* Style

* Fix mock fs for files in nested directories

* Nit

* More fixes

* Nit

* Remove print

* Update tests/test_data_files.py

Co-authored-by: Quentin Lhoest <[email protected]>

* Address some more comments

---------

Co-authored-by: Quentin Lhoest <[email protected]>
  • Loading branch information
2 people authored and albertvillanova committed Oct 24, 2023
1 parent 6011e88 commit 4dd7840
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 42 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@
"multiprocess",
# to save datasets locally or on any filesystem
# minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143
"fsspec[http]>=2023.1.0,<2023.9.0", # Temporary pin
"fsspec[http]>=2023.1.0",
# for data streaming via http
"aiohttp",
# To get datasets from the Datasets Hub on huggingface.co
Expand Down
1 change: 1 addition & 0 deletions src/datasets/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

# Imports
DILL_VERSION = version.parse(importlib.metadata.version("dill"))
FSSPEC_VERSION = version.parse(importlib.metadata.version("fsspec"))
PANDAS_VERSION = version.parse(importlib.metadata.version("pandas"))
PYARROW_VERSION = version.parse(importlib.metadata.version("pyarrow"))

Expand Down
62 changes: 31 additions & 31 deletions src/datasets/data_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from fsspec import get_fs_token_paths
from fsspec.implementations.http import HTTPFileSystem
from huggingface_hub import HfFileSystem
from packaging import version
from tqdm.contrib.concurrent import thread_map

from . import config
Expand Down Expand Up @@ -42,23 +43,17 @@ class EmptyDatasetError(FileNotFoundError):
Split.TEST: ["test", "testing", "eval", "evaluation"],
}
NON_WORDS_CHARS = "-._ 0-9"
KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"]
if config.FSSPEC_VERSION < version.parse("2023.9.0"):
KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"]
else:
KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**/*[{sep}/]{keyword}[{sep}/]**"]

DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]
DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME = {
split: [
pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
for keyword in SPLIT_KEYWORDS[split]
for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
]
for split in DEFAULT_SPLITS
}
DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
split: [
pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
for keyword in SPLIT_KEYWORDS[split]
for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
for pattern in KEYWORDS_IN_PATH_NAME_BASE_PATTERNS
]
for split in DEFAULT_SPLITS
}
Expand All @@ -69,16 +64,21 @@ class EmptyDatasetError(FileNotFoundError):

ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED]
ALL_DEFAULT_PATTERNS = [
DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME,
DEFAULT_PATTERNS_SPLIT_IN_FILENAME,
DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME,
DEFAULT_PATTERNS_ALL,
]
METADATA_PATTERNS = [
"metadata.csv",
"**/metadata.csv",
"metadata.jsonl",
"**/metadata.jsonl",
] # metadata file for ImageFolder and AudioFolder
if config.FSSPEC_VERSION < version.parse("2023.9.0"):
METADATA_PATTERNS = [
"metadata.csv",
"**/metadata.csv",
"metadata.jsonl",
"**/metadata.jsonl",
] # metadata file for ImageFolder and AudioFolder
else:
METADATA_PATTERNS = [
"**/metadata.csv",
"**/metadata.jsonl",
] # metadata file for ImageFolder and AudioFolder
WILDCARD_CHARACTERS = "*[]"
FILES_TO_IGNORE = [
"README.md",
Expand Down Expand Up @@ -297,10 +297,10 @@ def resolve_pattern(
- data/** to match all the files inside "data" and its subdirectories
The patterns are resolved using the fsspec glob.
Here are some behaviors specific to fsspec glob that are different from glob.glob, Path.glob, Path.match or fnmatch:
- '*' matches only first level items
- '**' matches all items
- '**/*' matches all at least second level items
glob.glob, Path.glob, Path.match or fnmatch do not support ** with a prefix/suffix other than a forward slash /.
For instance, this means **.json is the same as *.json. On the contrary, the fsspec glob has no limits regarding the ** prefix/suffix,
resulting in **.json being equivalent to **/*.json.
More generally:
- '*' matches any character except a forward-slash (to match just the file or directory name)
Expand Down Expand Up @@ -417,7 +417,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
Output:
{"train": ["**train*"], "test": ["**test*"]}
{'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}
Input:
Expand All @@ -435,7 +436,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
Output:
{"train": ["**train*/**"], "test": ["**test*/**"]}
{'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}
Input:
Expand All @@ -452,11 +454,9 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
Output:
{
"train": ["data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
"test": ["data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
"random": ["data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
}
{'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}
In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
"""
Expand Down
22 changes: 12 additions & 10 deletions tests/test_data_files.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import copy
import os
from pathlib import Path, PurePath
from pathlib import Path
from typing import List
from unittest.mock import patch

Expand Down Expand Up @@ -493,15 +493,16 @@ def mock_fs(file_paths: List[str]):
Example:
```py
>>> fs = mock_fs(["data/train.txt", "data.test.txt"])
>>> DummyTestFS = mock_fs(["data/train.txt", "data.test.txt"])
>>> fs = DummyTestFS()
>>> assert fsspec.get_filesystem_class("mock").__name__ == "DummyTestFS"
>>> assert type(fs).__name__ == "DummyTestFS"
>>> print(fs.glob("**"))
["data", "data/train.txt", "data.test.txt"]
```
"""

dir_paths = {file_path.rsplit("/")[0] for file_path in file_paths if "/" in file_path}
dir_paths = {file_path.rsplit("/", 1)[0] for file_path in file_paths if "/" in file_path}
fs_contents = [{"name": dir_path, "type": "directory"} for dir_path in dir_paths] + [
{"name": file_path, "type": "file", "size": 10} for file_path in file_paths
]
Expand Down Expand Up @@ -619,16 +620,17 @@ def resolver(pattern):
["metadata.jsonl"],
["metadata.csv"],
# nested metadata files
["data/metadata.jsonl", "data/train/metadata.jsonl"],
["data/metadata.csv", "data/train/metadata.csv"],
["metadata.jsonl", "data/metadata.jsonl"],
["metadata.csv", "data/metadata.csv"],
],
)
def test_get_metadata_files_patterns(metadata_files):
DummyTestFS = mock_fs(metadata_files)
fs = DummyTestFS()

def resolver(pattern):
return [PurePath(path) for path in set(metadata_files) if PurePath(path).match(pattern)]
return [file_path for file_path in fs.glob(pattern) if fs.isfile(file_path)]

patterns = _get_metadata_files_patterns(resolver)
matched = [path for path in metadata_files for pattern in patterns if PurePath(path).match(pattern)]
# Use a set to remove the difference in behavior between PurePath.match and matching via fsspec.glob
assert len(set(matched)) == len(metadata_files)
assert sorted(set(matched)) == sorted(metadata_files)
matched = [file_path for pattern in patterns for file_path in resolver(pattern)]
assert sorted(matched) == sorted(metadata_files)

0 comments on commit 4dd7840

Please sign in to comment.