diff --git a/docs/source/api.rst b/docs/source/api.rst
index 9fdcd1f17..94c4cdd5b 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -224,6 +224,7 @@ Other Known Implementations
 - `ossfs`_ for Alibaba Cloud (Aliyun) Object Storage System (OSS)
 - `p9fs`_ for 9P (Plan 9 Filesystem Protocol) servers
 - `s3fs`_ for Amazon S3 and other compatible stores
+- `tosfs`_ for ByteDance Volcano Engine Tinder Object Storage (TOS)
 - `wandbfs`_ to access Wandb run data (experimental)
 - `webdav4`_ for WebDAV
 - `xrootd`_ for xrootd, with protocol "root://"
@@ -243,6 +244,7 @@ Other Known Implementations
 .. _ossfs: https://github.com/fsspec/ossfs
 .. _p9fs: https://github.com/pbchekin/p9fs-py
 .. _s3fs: https://s3fs.readthedocs.io/en/latest/
+.. _tosfs: https://tosfs.readthedocs.io/en/latest/
 .. _wandbfs: https://github.com/jkulhanek/wandbfs
 .. _webdav4: https://github.com/skshetry/webdav4
 .. _xrootd: https://github.com/CoffeaTeam/fsspec-xrootd
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 48b4e3971..fea257b44 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -1,6 +1,49 @@
 Changelog
 =========
 
+2024.10.0
+---------
+
+Fixes
+
+- Performance of memoryFS rm (#1725)
+- Performance of git FS info (#1712)
+- Avoid git hex for newer pygit (#1703)
+- Test fixes for zip (#1700, #1691)
+- Missing open_async for dirFS (#1698)
+- Handle pathlib in zip (#1689)
+- Skip tests needing kerchunk if not installed (#1689)
+- Allow repeated kwargs in unchain (#1673)
+
+Other
+
+- Code style (#1704, #1706)
+- Allow pyarrow in referenceFS parquet (#1692)
+- Don't hardcode test port for parallel runs (#1690)
+
+
+2024.9.0
+--------
+
+Enhancements
+
+- Fewer stat calls in localFS (#1659)
+- Faster find in ZIP (#1664)
+
+Fixes
+
+- Paths without "/" in dirFS (#1638)
+- Paths with "/" in FTP (#1643, #1644)
+- ls in parquet-based nested reference sets, and append (#1645, #1657)
+- Exception handling for SMB (#1650)
+
+
+Other
+
+- Style (#1640, #1641, #1660)
+- Docs: xrootd (#1646)
+- CI back on miniconda (#1658)
+
 2024.6.1
 --------
diff --git a/fsspec/asyn.py b/fsspec/asyn.py
index a040efc4b..de41839ea 100644
--- a/fsspec/asyn.py
+++ b/fsspec/asyn.py
@@ -344,6 +344,10 @@ async def _rm(self, path, recursive=False, batch_size=None, **kwargs):
     async def _cp_file(self, path1, path2, **kwargs):
         raise NotImplementedError
 
+    async def _mv_file(self, path1, path2):
+        await self._cp_file(path1, path2)
+        await self._rm_file(path1)
+
     async def _copy(
         self,
         path1,
@@ -1072,7 +1076,7 @@ async def flush(self, force=False):
         self.offset = 0
         try:
             await self._initiate_upload()
-        except:  # noqa: E722
+        except:
             self.closed = True
             raise
diff --git a/fsspec/core.py b/fsspec/core.py
index de66be28d..1954667fe 100644
--- a/fsspec/core.py
+++ b/fsspec/core.py
@@ -346,7 +346,10 @@ def _un_chain(path, kwargs):
         kws = kwargs.pop(protocol, {})
         if bit is bits[0]:
             kws.update(kwargs)
-        kw = dict(**extra_kwargs, **kws)
+        kw = dict(
+            **{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
+            **kws,
+        )
         bit = cls._strip_protocol(bit)
         if (
             protocol in {"blockcache", "filecache", "simplecache"}
@@ -578,7 +581,7 @@ def expand_paths_if_needed(paths, mode, num, fs, name_function):
     paths = list(paths)
 
     if "w" in mode:  # read mode
-        if sum([1 for p in paths if "*" in p]) > 1:
+        if sum(1 for p in paths if "*" in p) > 1:
             raise ValueError(
                 "When writing data, only one filename mask can be specified."
             )
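An illustrative aside, not part of the patch: the `_un_chain` change above lets an explicit storage option repeat a value that the chained URL already implies, while a genuinely conflicting value still fails. A minimal sketch of the resulting behavior, mirroring the `test_repeated_argument` test added near the end of this patch (it assumes the third-party `adlfs` package is installed; `DIR`/`ACCOUNT`/`DATA` are that test's placeholder names):

```python
import fsspec

# "ACCOUNT" is implied by the URL and repeated as an explicit kwarg; since
# the two values agree, the duplicate is tolerated instead of raising.
fs, url = fsspec.core.url_to_fs(
    "az://DIR@ACCOUNT.blob.core.windows.net/DATA",
    anon=False,
    account_name="ACCOUNT",
)
assert fs.storage_options == {"account_name": "ACCOUNT", "anon": False}

# A conflicting repeat is still rejected:
# fsspec.core.url_to_fs(
#     "az://DIR@ACCOUNT.blob.core.windows.net/DATA", account_name="OTHER"
# )  # raises TypeError
```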
diff --git a/fsspec/implementations/arrow.py b/fsspec/implementations/arrow.py
index f9fea70d2..530df901a 100644
--- a/fsspec/implementations/arrow.py
+++ b/fsspec/implementations/arrow.py
@@ -128,7 +128,7 @@ def cp_file(self, path1, path2, **kwargs):
                 with self.open(tmp_fname, "wb") as rstream:
                     shutil.copyfileobj(lstream, rstream)
             self.fs.move(tmp_fname, path2)
-        except BaseException:  # noqa
+        except BaseException:
             with suppress(FileNotFoundError):
                 self.fs.delete_file(tmp_fname)
             raise
diff --git a/fsspec/implementations/dbfs.py b/fsspec/implementations/dbfs.py
index 19f2ffc19..30c2947b0 100644
--- a/fsspec/implementations/dbfs.py
+++ b/fsspec/implementations/dbfs.py
@@ -79,7 +79,7 @@ def ls(self, path, detail=True, **kwargs):
             if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                 raise FileNotFoundError(e.message) from e
 
-            raise e
+            raise
         files = r["files"]
         out = [
             {
@@ -125,7 +125,7 @@ def makedirs(self, path, exist_ok=True):
             if e.error_code == "RESOURCE_ALREADY_EXISTS":
                 raise FileExistsError(e.message) from e
 
-            raise e
+            raise
         self.invalidate_cache(self._parent(path))
 
     def mkdir(self, path, create_parents=True, **kwargs):
@@ -171,7 +171,7 @@ def rm(self, path, recursive=False, **kwargs):
                 # Using the same exception as the os module would use here
                 raise OSError(e.message) from e
 
-            raise e
+            raise
         self.invalidate_cache(self._parent(path))
 
     def mv(
@@ -216,7 +216,7 @@ def mv(
             elif e.error_code == "RESOURCE_ALREADY_EXISTS":
                 raise FileExistsError(e.message) from e
 
-            raise e
+            raise
         self.invalidate_cache(self._parent(source_path))
         self.invalidate_cache(self._parent(destination_path))
 
@@ -299,7 +299,7 @@ def _create_handle(self, path, overwrite=True):
             if e.error_code == "RESOURCE_ALREADY_EXISTS":
                 raise FileExistsError(e.message) from e
 
-            raise e
+            raise
 
     def _close_handle(self, handle):
         """
@@ -316,7 +316,7 @@ def _close_handle(self, handle):
             if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                 raise FileNotFoundError(e.message) from e
 
-            raise e
+            raise
 
     def _add_data(self, handle, data):
         """
@@ -346,7 +346,7 @@ def _add_data(self, handle, data):
             elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
                 raise ValueError(e.message) from e
 
-            raise e
+            raise
 
     def _get_data(self, path, start, end):
         """
@@ -376,7 +376,7 @@ def _get_data(self, path, start, end):
             elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
                 raise ValueError(e.message) from e
 
-            raise e
+            raise
 
     def invalidate_cache(self, path=None):
         if path is None:
diff --git a/fsspec/implementations/dirfs.py b/fsspec/implementations/dirfs.py
index 6a28909da..0bee86469 100644
--- a/fsspec/implementations/dirfs.py
+++ b/fsspec/implementations/dirfs.py
@@ -370,3 +370,15 @@ def open(
             *args,
             **kwargs,
         )
+
+    async def open_async(
+        self,
+        path,
+        *args,
+        **kwargs,
+    ):
+        return await self.fs.open_async(
+            self._join(path),
+            *args,
+            **kwargs,
+        )
diff --git a/fsspec/implementations/ftp.py b/fsspec/implementations/ftp.py
index 9e245d39f..f3471b996 100644
--- a/fsspec/implementations/ftp.py
+++ b/fsspec/implementations/ftp.py
@@ -2,7 +2,7 @@
 import sys
 import uuid
 import warnings
-from ftplib import FTP, Error, error_perm
+from ftplib import FTP, FTP_TLS, Error, error_perm
 from typing import Any
 
 from ..spec import AbstractBufferedFile, AbstractFileSystem
@@ -27,6 +27,7 @@ def __init__(
         tempdir=None,
         timeout=30,
         encoding="utf-8",
+        tls=False,
         **kwargs,
     ):
         """
@@ -56,28 +57,37 @@
             Timeout of the ftp connection in seconds
         encoding: str
             Encoding to use for directories and filenames in FTP connection
+        tls: bool
+            Use FTP-TLS, by default False
         """
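An illustrative aside, not part of the patch: a usage sketch for the new `tls` flag documented above. The host, port, and credentials are those served by the `ftp_tls.py` test helper added later in this patch; any FTPS-capable server would do:

```python
import fsspec

# tls=True makes the filesystem connect via ftplib.FTP_TLS; once connected,
# prot_p() (called in the __init__ body just below) switches the data
# channel to protected mode.
fs = fsspec.filesystem(
    "ftp",
    host="localhost",
    port=2121,
    username="user",
    password="pass",
    tls=True,
)
print(fs.ls("/", detail=False))
```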
super().__init__(**kwargs) self.host = host self.port = port self.tempdir = tempdir or "/tmp" - self.cred = username, password, acct + self.cred = username or "", password or "", acct or "" self.timeout = timeout self.encoding = encoding if block_size is not None: self.blocksize = block_size else: self.blocksize = 2**16 + self.tls = tls self._connect() + if self.tls: + self.ftp.prot_p() def _connect(self): + if self.tls: + ftp_cls = FTP_TLS + else: + ftp_cls = FTP if sys.version_info >= (3, 9): - self.ftp = FTP(timeout=self.timeout, encoding=self.encoding) + self.ftp = ftp_cls(timeout=self.timeout, encoding=self.encoding) elif self.encoding: warnings.warn("`encoding` not supported for python<3.9, ignoring") - self.ftp = FTP(timeout=self.timeout) + self.ftp = ftp_cls(timeout=self.timeout) else: - self.ftp = FTP(timeout=self.timeout) + self.ftp = ftp_cls(timeout=self.timeout) self.ftp.connect(self.host, self.port) self.ftp.login(*self.cred) diff --git a/fsspec/implementations/git.py b/fsspec/implementations/git.py index 7c34d93e0..7b9d3539a 100644 --- a/fsspec/implementations/git.py +++ b/fsspec/implementations/git.py @@ -55,6 +55,8 @@ def _path_to_object(self, path, ref): tree = comm.tree for part in parts: if part and isinstance(tree, pygit2.Tree): + if part not in tree: + raise FileNotFoundError(path) tree = tree[part] return tree @@ -69,46 +71,32 @@ def _get_kwargs_from_urls(path): out["ref"], path = path.split("@", 1) return out + @staticmethod + def _object_to_info(obj, path=None): + # obj.name and obj.filemode are None for the root tree! + is_dir = isinstance(obj, pygit2.Tree) + return { + "type": "directory" if is_dir else "file", + "name": ( + "/".join([path, obj.name or ""]).lstrip("/") if path else obj.name + ), + "hex": str(obj.id), + "mode": "100644" if obj.filemode is None else f"{obj.filemode:o}", + "size": 0 if is_dir else obj.size, + } + def ls(self, path, detail=True, ref=None, **kwargs): - path = self._strip_protocol(path) - tree = self._path_to_object(path, ref) - if isinstance(tree, pygit2.Tree): - out = [] - for obj in tree: - if isinstance(obj, pygit2.Tree): - out.append( - { - "type": "directory", - "name": "/".join([path, obj.name]).lstrip("/"), - "hex": obj.hex, - "mode": f"{obj.filemode:o}", - "size": 0, - } - ) - else: - out.append( - { - "type": "file", - "name": "/".join([path, obj.name]).lstrip("/"), - "hex": obj.hex, - "mode": f"{obj.filemode:o}", - "size": obj.size, - } - ) - else: - obj = tree - out = [ - { - "type": "file", - "name": obj.name, - "hex": obj.hex, - "mode": f"{obj.filemode:o}", - "size": obj.size, - } - ] - if detail: - return out - return [o["name"] for o in out] + tree = self._path_to_object(self._strip_protocol(path), ref) + return [ + GitFileSystem._object_to_info(obj, path) + if detail + else GitFileSystem._object_to_info(obj, path)["name"] + for obj in (tree if isinstance(tree, pygit2.Tree) else [tree]) + ] + + def info(self, path, ref=None, **kwargs): + tree = self._path_to_object(self._strip_protocol(path), ref) + return GitFileSystem._object_to_info(tree, path) def ukey(self, path, ref=None): return self.info(path, ref=ref)["hex"] diff --git a/fsspec/implementations/http.py b/fsspec/implementations/http.py index 7b5a38bb3..47dfb88f9 100644 --- a/fsspec/implementations/http.py +++ b/fsspec/implementations/http.py @@ -358,9 +358,10 @@ def _open( kw = self.kwargs.copy() kw["asynchronous"] = self.asynchronous kw.update(kwargs) - size = size or self.info(path, **kwargs)["size"] + info = {} + size = size or info.update(self.info(path, 
**kwargs)) or info["size"] session = sync(self.loop, self.set_session) - if block_size and size: + if block_size and size and info.get("partial", True): return HTTPFile( self, path, @@ -520,9 +521,9 @@ async def _isdir(self, path): class HTTPFile(AbstractBufferedFile): """ - A file-like object pointing to a remove HTTP(S) resource + A file-like object pointing to a remote HTTP(S) resource - Supports only reading, with read-ahead of a predermined block-size. + Supports only reading, with read-ahead of a predetermined block-size. In the case that the server does not supply the filesize, only reading of the complete file in one go is supported. @@ -835,10 +836,6 @@ async def _file_info(url, session, size_policy="head", **kwargs): async with r: r.raise_for_status() - # TODO: - # recognise lack of 'Accept-Ranges', - # or 'Accept-Ranges': 'none' (not 'bytes') - # to mean streaming only, no random access => return None if "Content-Length" in r.headers: # Some servers may choose to ignore Accept-Encoding and return # compressed content, in which case the returned size is unreliable. @@ -853,6 +850,11 @@ async def _file_info(url, session, size_policy="head", **kwargs): if "Content-Type" in r.headers: info["mimetype"] = r.headers["Content-Type"].partition(";")[0] + if r.headers.get("Accept-Ranges") == "none": + # Some servers may explicitly discourage partial content requests, but + # the lack of "Accept-Ranges" does not always indicate they would fail + info["partial"] = False + info["url"] = str(r.url) for checksum_field in ["ETag", "Content-MD5", "Digest"]: diff --git a/fsspec/implementations/local.py b/fsspec/implementations/local.py index 9881606f1..f032d8aeb 100644 --- a/fsspec/implementations/local.py +++ b/fsspec/implementations/local.py @@ -79,6 +79,14 @@ def info(self, path, **kwargs): t = "file" else: t = "other" + + size = out.st_size + if link: + try: + out2 = path.stat(follow_symlinks=True) + size = out2.st_size + except OSError: + size = 0 path = self._strip_protocol(path.path) else: # str or path-like @@ -87,6 +95,7 @@ def info(self, path, **kwargs): link = stat.S_ISLNK(out.st_mode) if link: out = os.stat(path, follow_symlinks=True) + size = out.st_size if stat.S_ISDIR(out.st_mode): t = "directory" elif stat.S_ISREG(out.st_mode): @@ -95,20 +104,15 @@ def info(self, path, **kwargs): t = "other" result = { "name": path, - "size": out.st_size, + "size": size, "type": t, "created": out.st_ctime, "islink": link, } for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]: result[field] = getattr(out, f"st_{field}") - if result["islink"]: + if link: result["destination"] = os.readlink(path) - try: - out2 = os.stat(path, follow_symlinks=True) - result["size"] = out2.st_size - except OSError: - result["size"] = 0 return result def lexists(self, path, **kwargs): diff --git a/fsspec/implementations/memory.py b/fsspec/implementations/memory.py index d06db4c38..b65939148 100644 --- a/fsspec/implementations/memory.py +++ b/fsspec/implementations/memory.py @@ -250,6 +250,10 @@ def created(self, path): except KeyError as e: raise FileNotFoundError(path) from e + def isfile(self, path): + path = self._strip_protocol(path) + return path in self.store + def rm(self, path, recursive=False, maxdepth=None): if isinstance(path, str): path = self._strip_protocol(path) @@ -257,14 +261,14 @@ def rm(self, path, recursive=False, maxdepth=None): path = [self._strip_protocol(p) for p in path] paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth) for p in reversed(paths): + if self.isfile(p): 
+ self.rm_file(p) # If the expanded path doesn't exist, it is only because the expanded # path was a directory that does not exist in self.pseudo_dirs. This # is possible if you directly create files without making the # directories first. - if not self.exists(p): + elif not self.exists(p): continue - if self.isfile(p): - self.rm_file(p) else: self.rmdir(p) diff --git a/fsspec/implementations/reference.py b/fsspec/implementations/reference.py index 7c01881a5..3d22ddc40 100644 --- a/fsspec/implementations/reference.py +++ b/fsspec/implementations/reference.py @@ -7,7 +7,7 @@ import os from itertools import chain from functools import lru_cache -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal import fsspec.core @@ -104,7 +104,13 @@ def pd(self): return pd def __init__( - self, root, fs=None, out_root=None, cache_size=128, categorical_threshold=10 + self, + root, + fs=None, + out_root=None, + cache_size=128, + categorical_threshold=10, + engine: Literal["fastparquet", "pyarrow"] = "fastparquet", ): """ @@ -126,16 +132,25 @@ def __init__( Encode urls as pandas.Categorical to reduce memory footprint if the ratio of the number of unique urls to total number of refs for each variable is greater than or equal to this number. (default 10) + engine: Literal["fastparquet","pyarrow"] + Engine choice for reading parquet files. (default is "fastparquet") """ + self.root = root self.chunk_sizes = {} self.out_root = out_root or self.root self.cat_thresh = categorical_threshold + self.engine = engine self.cache_size = cache_size self.url = self.root + "/{field}/refs.{record}.parq" # TODO: derive fs from `root` self.fs = fsspec.filesystem("file") if fs is None else fs + from importlib.util import find_spec + + if self.engine == "pyarrow" and find_spec("pyarrow") is None: + raise ImportError("engine choice `pyarrow` is not installed.") + def __getattr__(self, item): if item in ("_items", "record_size", "zmetadata"): self.setup() @@ -159,8 +174,8 @@ def open_refs(field, record): path = self.url.format(field=field, record=record) data = io.BytesIO(self.fs.cat_file(path)) try: - df = self.pd.read_parquet(data, engine="fastparquet") - refs = {c: df[c].values for c in df.columns} + df = self.pd.read_parquet(data, engine=self.engine) + refs = {c: df[c].to_numpy() for c in df.columns} except IOError: refs = None return refs @@ -466,18 +481,28 @@ def write(self, field, record, base_url=None, storage_options=None): fn = f"{base_url or self.out_root}/{field}/refs.{record}.parq" self.fs.mkdirs(f"{base_url or self.out_root}/{field}", exist_ok=True) + + if self.engine == "pyarrow": + df_backend_kwargs = {"write_statistics": False} + elif self.engine == "fastparquet": + df_backend_kwargs = { + "stats": False, + "object_encoding": object_encoding, + "has_nulls": has_nulls, + } + else: + raise NotImplementedError(f"{self.engine} not supported") + df.to_parquet( fn, - engine="fastparquet", + engine=self.engine, storage_options=storage_options or getattr(self.fs, "storage_options", None), compression="zstd", index=False, - stats=False, - object_encoding=object_encoding, - has_nulls=has_nulls, - # **kwargs, + **df_backend_kwargs, ) + partition.clear() self._items.pop((field, record)) @@ -489,6 +514,7 @@ def flush(self, base_url=None, storage_options=None): base_url: str Location of the output """ + # write what we have so far and clear sub chunks for thing in list(self._items): if isinstance(thing, tuple): @@ -1003,9 +1029,11 @@ def _process_gen(self, gens): out = {} for gen in gens: dimension = { - 
k: v - if isinstance(v, list) - else range(v.get("start", 0), v["stop"], v.get("step", 1)) + k: ( + v + if isinstance(v, list) + else range(v.get("start", 0), v["stop"], v.get("step", 1)) + ) for k, v in gen["dimensions"].items() } products = ( diff --git a/fsspec/implementations/smb.py b/fsspec/implementations/smb.py index a3c2d1b2d..db6b3f5c3 100644 --- a/fsspec/implementations/smb.py +++ b/fsspec/implementations/smb.py @@ -202,7 +202,7 @@ def _connect(self): else: # All another ValueError exceptions should be raised, as they are not # related to network issues. - raise exc + raise except Exception as exc: # Save the exception and retry to connect. This except might be dropped # in the future, once all exceptions suited for retry are identified. diff --git a/fsspec/implementations/tests/ftp_tls.py b/fsspec/implementations/tests/ftp_tls.py new file mode 100644 index 000000000..6d1359bfa --- /dev/null +++ b/fsspec/implementations/tests/ftp_tls.py @@ -0,0 +1,38 @@ +import os + +from pyftpdlib.authorizers import DummyAuthorizer +from pyftpdlib.handlers import TLS_FTPHandler +from pyftpdlib.servers import FTPServer + + +def ftp(): + """Script to run FTP server that accepts TLS""" + # Set up FTP server parameters + FTP_HOST = "localhost" + FTP_PORT = 2121 + FTP_DIRECTORY = os.path.dirname(os.path.abspath(__file__)) + + # Instantiate a dummy authorizer + authorizer = DummyAuthorizer() + authorizer.add_user( + "user", + "pass", + FTP_DIRECTORY, + "elradfmwMT", + ) + authorizer.add_anonymous(FTP_DIRECTORY) + + # Instantiate TLS_FTPHandler with required parameters + handler = TLS_FTPHandler + handler.certfile = os.path.join(os.path.dirname(__file__), "keycert.pem") + handler.authorizer = authorizer + + # Instantiate FTP server with TLS handler and authorizer + server = FTPServer((FTP_HOST, FTP_PORT), handler) + server.authorizer = authorizer + + server.serve_forever() + + +if __name__ == "__main__": + ftp() diff --git a/fsspec/implementations/tests/keycert.pem b/fsspec/implementations/tests/keycert.pem new file mode 100644 index 000000000..2093f1d15 --- /dev/null +++ b/fsspec/implementations/tests/keycert.pem @@ -0,0 +1,24 @@ +-----BEGIN EC PARAMETERS----- +BggqhkjOPQMBBw== +-----END EC PARAMETERS----- +-----BEGIN EC PRIVATE KEY----- +MHcCAQEEIBTg1e61mzYYPJ+MDkOWCSevnT1HUaaK9iopgTGyDoIuoAoGCCqGSM49 +AwEHoUQDQgAEDy3E+4WgohcRUlaSZBndEZQBTyoRztCSoaDbhZkqsPFBbeaGJ5zA +E7qX+9LICDezAUsCiq2RYltOqDCsELteiQ== +-----END EC PRIVATE KEY----- +-----BEGIN CERTIFICATE----- +MIICdzCCAh2gAwIBAgIUNN4kmTSxbLOoQXLFiYOs2XeK1jIwCgYIKoZIzj0EAwIw +gY8xCzAJBgNVBAYTAk5MMRUwEwYDVQQIDAxadWlkLUhvbGxhbmQxDjAMBgNVBAcM +BURlbGZ0MRAwDgYDVQQKDAdXaGlmZmxlMQ0wCwYDVQQLDARERVZBMRIwEAYDVQQD +DAlCYXJ0dmFuRXMxJDAiBgkqhkiG9w0BCQEWFWJhcnQudmFuZXNAd2hpZmZsZS5u +bDAgFw0yNDA0MTgxMDI0NDFaGA8yMjk4MDIwMTEwMjQ0MVowgY8xCzAJBgNVBAYT +Ak5MMRUwEwYDVQQIDAxadWlkLUhvbGxhbmQxDjAMBgNVBAcMBURlbGZ0MRAwDgYD +VQQKDAdXaGlmZmxlMQ0wCwYDVQQLDARERVZBMRIwEAYDVQQDDAlCYXJ0dmFuRXMx +JDAiBgkqhkiG9w0BCQEWFWJhcnQudmFuZXNAd2hpZmZsZS5ubDBZMBMGByqGSM49 +AgEGCCqGSM49AwEHA0IABA8txPuFoKIXEVJWkmQZ3RGUAU8qEc7QkqGg24WZKrDx +QW3mhiecwBO6l/vSyAg3swFLAoqtkWJbTqgwrBC7XomjUzBRMB0GA1UdDgQWBBRb +1nPqritk/P2cbDzTw9SQ9vO7JDAfBgNVHSMEGDAWgBRb1nPqritk/P2cbDzTw9SQ +9vO7JDAPBgNVHRMBAf8EBTADAQH/MAoGCCqGSM49BAMCA0gAMEUCIBcvCFS4AD3p +Ix1v8pp3hcMvGFIQLeczh4kXkPfZWvBkAiEAiTCqsdKhZi8k814H6FFkaoQVIjTe +iUtUlW6RfyDNZ9E= +-----END CERTIFICATE----- diff --git a/fsspec/implementations/tests/test_arrow.py b/fsspec/implementations/tests/test_arrow.py index af706c530..b9cbb2137 100644 --- 
a/fsspec/implementations/tests/test_arrow.py +++ b/fsspec/implementations/tests/test_arrow.py @@ -5,7 +5,7 @@ pyarrow_fs = pytest.importorskip("pyarrow.fs") FileSystem = pyarrow_fs.FileSystem -from fsspec.implementations.arrow import ArrowFSWrapper, HadoopFileSystem # noqa +from fsspec.implementations.arrow import ArrowFSWrapper, HadoopFileSystem # noqa: E402 @pytest.fixture(scope="function") diff --git a/fsspec/implementations/tests/test_dirfs.py b/fsspec/implementations/tests/test_dirfs.py index 990ab11de..46ddd0c1a 100644 --- a/fsspec/implementations/tests/test_dirfs.py +++ b/fsspec/implementations/tests/test_dirfs.py @@ -589,6 +589,16 @@ def test_sign(mocker, dirfs): dirfs.fs.sign.assert_called_once_with(f"{PATH}/file", *ARGS, **KWARGS) +@pytest.mark.asyncio +async def test_open_async(mocker, adirfs): + adirfs.fs.open_async = mocker.AsyncMock() + assert ( + await adirfs.open_async("file", *ARGS, **KWARGS) + == adirfs.fs.open_async.return_value + ) + adirfs.fs.open_async.assert_called_once_with(f"{PATH}/file", *ARGS, **KWARGS) + + def test_open(mocker, dirfs): dirfs.fs.open = mocker.Mock() assert dirfs.open("file", *ARGS, **KWARGS) == dirfs.fs.open.return_value diff --git a/fsspec/implementations/tests/test_ftp.py b/fsspec/implementations/tests/test_ftp.py index 7bf0c0e5d..e480ecaff 100644 --- a/fsspec/implementations/tests/test_ftp.py +++ b/fsspec/implementations/tests/test_ftp.py @@ -2,6 +2,7 @@ import subprocess import sys import time +from ftplib import FTP, FTP_TLS import pytest @@ -17,7 +18,7 @@ def ftp(): pytest.importorskip("pyftpdlib") P = subprocess.Popen( - [sys.executable, "-m", "pyftpdlib", "-d", here], + [sys.executable, os.path.join(here, "ftp_tls.py")], stderr=subprocess.STDOUT, stdout=subprocess.PIPE, ) @@ -29,9 +30,31 @@ def ftp(): P.wait() -def test_basic(ftp): +@pytest.mark.parametrize( + "tls,exp_cls", + ( + (False, FTP), + (True, FTP_TLS), + ), +) +def test_tls(ftp, tls, exp_cls): host, port = ftp - fs = FTPFileSystem(host, port) + fs = FTPFileSystem(host, port, tls=tls) + assert isinstance(fs.ftp, exp_cls) + + +@pytest.mark.parametrize( + "tls,username,password", + ( + (False, "", ""), + (True, "", ""), + (False, "user", "pass"), + (True, "user", "pass"), + ), +) +def test_basic(ftp, tls, username, password): + host, port = ftp + fs = FTPFileSystem(host, port, username, password, tls=tls) assert fs.ls("/", detail=False) == sorted(os.listdir(here)) out = fs.cat(f"/{os.path.basename(__file__)}") assert out == open(__file__, "rb").read() diff --git a/fsspec/implementations/tests/test_git.py b/fsspec/implementations/tests/test_git.py index ffa7b47d9..2aeb544a1 100644 --- a/fsspec/implementations/tests/test_git.py +++ b/fsspec/implementations/tests/test_git.py @@ -61,6 +61,39 @@ def test_refs(repo): assert f.read() == b"data3" +def _check_FileNotFoundError(f, *args, **kwargs): + with pytest.raises(FileNotFoundError): + f(*args, **kwargs) + + +def test_file_existence_checks(repo): + d, sha = repo + + fs, _ = fsspec.url_to_fs(f"git://{d}:abranch@") + + assert fs.lexists("inner") + assert fs.exists("inner") + assert fs.isdir("inner") + assert fs.info("inner") + assert fs.ls("inner") + + assert fs.lexists("inner/file1") + assert fs.exists("inner/file1") + assert fs.info("inner/file1") + assert fs.ls("inner/file1") + + assert not fs.lexists("non-existing-file") + assert not fs.exists("non-existing-file") + + assert not fs.isfile("non-existing-file") + assert not fs.isdir("non-existing-file") + + _check_FileNotFoundError(fs.info, "non-existing-file") + 
_check_FileNotFoundError(fs.size, "non-existing-file") + _check_FileNotFoundError(fs.ls, "non-existing-file") + _check_FileNotFoundError(fs.open, "non-existing-file") + + def test_url(repo): d, sha = repo fs, _, paths = fsspec.core.get_fs_token_paths(f"git://file1::file://{d}") diff --git a/fsspec/implementations/tests/test_http.py b/fsspec/implementations/tests/test_http.py index 81e438a81..91b16ce99 100644 --- a/fsspec/implementations/tests/test_http.py +++ b/fsspec/implementations/tests/test_http.py @@ -16,20 +16,20 @@ def test_list(server): h = fsspec.filesystem("http") - out = h.glob(server + "/index/*") - assert out == [server + "/index/realfile"] + out = h.glob(server.address + "/index/*") + assert out == [server.realfile] def test_list_invalid_args(server): with pytest.raises(TypeError): h = fsspec.filesystem("http", use_foobar=True) - h.glob(server + "/index/*") + h.glob(server.address + "/index/*") def test_list_cache(server): h = fsspec.filesystem("http", use_listings_cache=True) - out = h.glob(server + "/index/*") - assert out == [server + "/index/realfile"] + out = h.glob(server.address + "/index/*") + assert out == [server.realfile] def test_list_cache_with_expiry_time_cached(server): @@ -40,14 +40,14 @@ def test_list_cache_with_expiry_time_cached(server): # By querying the filesystem with "use_listings_cache=True", # the cache will automatically get populated. - out = h.glob(server + "/index/*") - assert out == [server + "/index/realfile"] + out = h.glob(server.address + "/index/*") + assert out == [server.realfile] # Verify cache content. assert len(h.dircache) == 1 - out = h.glob(server + "/index/*") - assert out == [server + "/index/realfile"] + out = h.glob(server.address + "/index/*") + assert out == [server.realfile] def test_list_cache_with_expiry_time_purged(server): @@ -58,26 +58,26 @@ def test_list_cache_with_expiry_time_purged(server): # By querying the filesystem with "use_listings_cache=True", # the cache will automatically get populated. - out = h.glob(server + "/index/*") - assert out == [server + "/index/realfile"] + out = h.glob(server.address + "/index/*") + assert out == [server.realfile] assert len(h.dircache) == 1 # Verify cache content. - assert server + "/index/" in h.dircache - assert len(h.dircache.get(server + "/index/")) == 1 + assert server.address + "/index/" in h.dircache + assert len(h.dircache.get(server.address + "/index/")) == 1 # Wait beyond the TTL / cache expiry time. time.sleep(0.31) # Verify that the cache item should have been purged. - cached_items = h.dircache.get(server + "/index/") + cached_items = h.dircache.get(server.address + "/index/") assert cached_items is None # Verify that after clearing the item from the cache, # it can get populated again. - out = h.glob(server + "/index/*") - assert out == [server + "/index/realfile"] - cached_items = h.dircache.get(server + "/index/") + out = h.glob(server.address + "/index/*") + assert out == [server.realfile] + cached_items = h.dircache.get(server.address + "/index/") assert len(cached_items) == 1 @@ -89,8 +89,8 @@ def test_list_cache_reuse(server): # By querying the filesystem with "use_listings_cache=True", # the cache will automatically get populated. - out = h.glob(server + "/index/*") - assert out == [server + "/index/realfile"] + out = h.glob(server.address + "/index/*") + assert out == [server.realfile] # Verify cache content. 
assert len(h.dircache) == 1 @@ -114,53 +114,53 @@ def test_ls_raises_filenotfound(server): h = fsspec.filesystem("http") with pytest.raises(FileNotFoundError): - h.ls(server + "/not-a-key") + h.ls(server.address + "/not-a-key") def test_list_cache_with_max_paths(server): h = fsspec.filesystem("http", use_listings_cache=True, max_paths=5) - out = h.glob(server + "/index/*") - assert out == [server + "/index/realfile"] + out = h.glob(server.address + "/index/*") + assert out == [server.realfile] def test_list_cache_with_skip_instance_cache(server): h = fsspec.filesystem("http", use_listings_cache=True, skip_instance_cache=True) - out = h.glob(server + "/index/*") - assert out == [server + "/index/realfile"] + out = h.glob(server.address + "/index/*") + assert out == [server.realfile] def test_glob_return_subfolders(server): h = fsspec.filesystem("http") - out = h.glob(server + "/simple/*") + out = h.glob(server.address + "/simple/*") assert set(out) == { - server + "/simple/dir/", - server + "/simple/file", + server.address + "/simple/dir/", + server.address + "/simple/file", } def test_isdir(server): h = fsspec.filesystem("http") - assert h.isdir(server + "/index/") - assert not h.isdir(server + "/index/realfile") - assert not h.isdir(server + "doesnotevenexist") + assert h.isdir(server.address + "/index/") + assert not h.isdir(server.realfile) + assert not h.isdir(server.address + "doesnotevenexist") def test_policy_arg(server): h = fsspec.filesystem("http", size_policy="get") - out = h.glob(server + "/index/*") - assert out == [server + "/index/realfile"] + out = h.glob(server.address + "/index/*") + assert out == [server.realfile] def test_exists(server): h = fsspec.filesystem("http") - assert not h.exists(server + "/notafile") + assert not h.exists(server.address + "/notafile") with pytest.raises(FileNotFoundError): - h.cat(server + "/notafile") + h.cat(server.address + "/notafile") def test_read(server): h = fsspec.filesystem("http") - out = server + "/index/realfile" + out = server.realfile with h.open(out, "rb") as f: assert f.read() == data with h.open(out, "rb", block_size=0) as f: @@ -174,7 +174,7 @@ def test_file_pickle(server): # via HTTPFile h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"}) - out = server + "/index/realfile" + out = server.realfile with fsspec.open(out, headers={"give_length": "true", "head_ok": "true"}) as f: pic = pickle.loads(pickle.dumps(f)) @@ -188,7 +188,7 @@ def test_file_pickle(server): # via HTTPStreamFile h = fsspec.filesystem("http") - out = server + "/index/realfile" + out = server.realfile with h.open(out, "rb") as f: out = pickle.dumps(f) assert f.read() == data @@ -198,7 +198,7 @@ def test_file_pickle(server): def test_methods(server): h = fsspec.filesystem("http") - url = server + "/index/realfile" + url = server.realfile assert h.exists(url) assert h.cat(url) == data @@ -219,7 +219,7 @@ def test_methods(server): ) def test_random_access(server, headers): h = fsspec.filesystem("http", headers=headers) - url = server + "/index/realfile" + url = server.realfile with h.open(url, "rb") as f: if headers: assert f.size == len(data) @@ -237,18 +237,22 @@ def test_random_access(server, headers): @pytest.mark.parametrize( "headers", [ - {"ignore_range": "true", "head_ok": "true", "head_give_length": "true"}, + # HTTPFile seeks, response headers lack size, assumed no range support + {"head_ok": "true", "head_give_length": "true"}, + # HTTPFile seeks, response is not a range {"ignore_range": "true", "give_length": "true"}, 
{"ignore_range": "true", "give_range": "true"}, + # HTTPStreamFile does not seek (past 0) + {"accept_range": "none", "head_ok": "true", "give_length": "true"}, ], ) def test_no_range_support(server, headers): h = fsspec.filesystem("http", headers=headers) - url = server + "/index/realfile" + url = server.realfile with h.open(url, "rb") as f: # Random access is not possible if the server doesn't respect Range - f.seek(5) with pytest.raises(ValueError): + f.seek(5) f.read(10) # Reading from the beginning should still work @@ -258,7 +262,7 @@ def test_no_range_support(server, headers): def test_stream_seek(server): h = fsspec.filesystem("http") - url = server + "/index/realfile" + url = server.realfile with h.open(url, "rb") as f: f.seek(0) # is OK data1 = f.read(5) @@ -271,11 +275,11 @@ def test_stream_seek(server): def test_mapper_url(server): h = fsspec.filesystem("http") - mapper = h.get_mapper(server + "/index/") + mapper = h.get_mapper(server.address + "/index/") assert mapper.root.startswith("http:") assert list(mapper) - mapper2 = fsspec.get_mapper(server + "/index/") + mapper2 = fsspec.get_mapper(server.address + "/index/") assert mapper2.root.startswith("http:") assert list(mapper) == list(mapper2) @@ -284,7 +288,7 @@ def test_content_length_zero(server): h = fsspec.filesystem( "http", headers={"give_length": "true", "zero_length": "true"} ) - url = server + "/index/realfile" + url = server.realfile with h.open(url, "rb") as f: assert f.read() == data @@ -294,7 +298,7 @@ def test_content_encoding_gzip(server): h = fsspec.filesystem( "http", headers={"give_length": "true", "gzip_encoding": "true"} ) - url = server + "/index/realfile" + url = server.realfile with h.open(url, "rb") as f: assert isinstance(f, HTTPStreamFile) @@ -304,7 +308,7 @@ def test_content_encoding_gzip(server): def test_download(server, tmpdir): h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "}) - url = server + "/index/realfile" + url = server.realfile fn = os.path.join(tmpdir, "afile") h.get(url, fn) assert open(fn, "rb").read() == data @@ -312,8 +316,8 @@ def test_download(server, tmpdir): def test_multi_download(server, tmpdir): h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "}) - urla = server + "/index/realfile" - urlb = server + "/index/otherfile" + urla = server.realfile + urlb = server.address + "/index/otherfile" fna = os.path.join(tmpdir, "afile") fnb = os.path.join(tmpdir, "bfile") h.get([urla, urlb], [fna, fnb]) @@ -323,25 +327,25 @@ def test_multi_download(server, tmpdir): def test_ls(server): h = fsspec.filesystem("http") - l = h.ls(server + "/data/20020401/", detail=False) - nc = server + "/data/20020401/GRACEDADM_CLSM0125US_7D.A20020401.030.nc4" + l = h.ls(server.address + "/data/20020401/", detail=False) + nc = server.address + "/data/20020401/GRACEDADM_CLSM0125US_7D.A20020401.030.nc4" assert nc in l assert len(l) == 11 - assert all(u["type"] == "file" for u in h.ls(server + "/data/20020401/")) - assert h.glob(server + "/data/20020401/*.nc4") == [nc] + assert all(u["type"] == "file" for u in h.ls(server.address + "/data/20020401/")) + assert h.glob(server.address + "/data/20020401/*.nc4") == [nc] def test_mcat(server): h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "}) - urla = server + "/index/realfile" - urlb = server + "/index/otherfile" + urla = server.realfile + urlb = server.address + "/index/otherfile" out = h.cat([urla, urlb]) assert out == {urla: data, urlb: data} def 
test_cat_file_range(server): h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "}) - urla = server + "/index/realfile" + urla = server.realfile assert h.cat(urla, start=1, end=10) == data[1:10] assert h.cat(urla, start=1) == data[1:] @@ -354,37 +358,37 @@ def test_cat_file_range(server): def test_cat_file_range_numpy(server): np = pytest.importorskip("numpy") h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "}) - urla = server + "/index/realfile" + urla = server.realfile assert h.cat(urla, start=np.int8(1), end=np.int8(10)) == data[1:10] out = h.cat_ranges([urla, urla], starts=np.array([1, 5]), ends=np.array([10, 15])) assert out == [data[1:10], data[5:15]] def test_mcat_cache(server): - urla = server + "/index/realfile" - urlb = server + "/index/otherfile" + urla = server.realfile + urlb = server.address + "/index/otherfile" fs = fsspec.filesystem("simplecache", target_protocol="http") assert fs.cat([urla, urlb]) == {urla: data, urlb: data} def test_mcat_expand(server): h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "}) - out = h.cat(server + "/index/*") - assert out == {server + "/index/realfile": data} + out = h.cat(server.address + "/index/*") + assert out == {server.realfile: data} def test_info(server): fs = fsspec.filesystem("http", headers={"give_etag": "true", "head_ok": "true"}) - info = fs.info(server + "/index/realfile") + info = fs.info(server.realfile) assert info["ETag"] == "xxx" fs = fsspec.filesystem("http", headers={"give_mimetype": "true"}) - info = fs.info(server + "/index/realfile") + info = fs.info(server.realfile) assert info["mimetype"] == "text/html" fs = fsspec.filesystem("http", headers={"redirect": "true"}) - info = fs.info(server + "/redirectme") - assert info["url"] == server + "/index/realfile" + info = fs.info(server.address + "/redirectme") + assert info["url"] == server.realfile @pytest.mark.parametrize("method", ["POST", "PUT"]) @@ -396,21 +400,21 @@ def test_put_file(server, tmp_path, method, reset_files): fs = fsspec.filesystem("http", headers={"head_ok": "true", "give_length": "true"}) with pytest.raises(FileNotFoundError): - fs.info(server + "/hey") + fs.info(server.address + "/hey") - fs.put_file(src_file, server + "/hey", method=method) - assert fs.info(server + "/hey")["size"] == len(data) + fs.put_file(src_file, server.address + "/hey", method=method) + assert fs.info(server.address + "/hey")["size"] == len(data) - fs.get_file(server + "/hey", dwl_file) + fs.get_file(server.address + "/hey", dwl_file) assert dwl_file.read_bytes() == data src_file.write_bytes(b"xxx") with open(src_file, "rb") as stream: - fs.put_file(stream, server + "/hey_2", method=method) - assert fs.cat(server + "/hey_2") == b"xxx" + fs.put_file(stream, server.address + "/hey_2", method=method) + assert fs.cat(server.address + "/hey_2") == b"xxx" - fs.put_file(io.BytesIO(b"yyy"), server + "/hey_3", method=method) - assert fs.cat(server + "/hey_3") == b"yyy" + fs.put_file(io.BytesIO(b"yyy"), server.address + "/hey_3", method=method) + assert fs.cat(server.address + "/hey_3") == b"yyy" async def get_aiohttp(): @@ -446,7 +450,7 @@ def test_async_other_thread(server): th.start() fs = fsspec.filesystem("http", asynchronous=True, loop=loop) asyncio.run_coroutine_threadsafe(fs.set_session(), loop=loop).result() - url = server + "/index/realfile" + url = server.realfile cor = fs._cat([url]) fut = asyncio.run_coroutine_threadsafe(cor, loop=loop) assert fut.result() == {url: data} @@ -459,7 +463,7 
@@ async def _(): session = await fs.set_session() # creates client - url = server + "/index/realfile" + url = server.realfile with pytest.raises((NotImplementedError, RuntimeError)): fs.cat([url]) out = await fs._cat([url]) @@ -489,7 +493,7 @@ def test_processes(server, method): if win and method != "spawn": pytest.skip("Windows can only spawn") ctx = mp.get_context(method) - fn = server + "/index/realfile" + fn = server.realfile fs = fsspec.filesystem("http") q = ctx.Queue() @@ -509,7 +513,7 @@ def test_close(get_client): @pytest.mark.asyncio async def test_async_file(server): fs = fsspec.filesystem("http", asynchronous=True, skip_instance_cache=True) - fn = server + "/index/realfile" + fn = server.realfile of = await fs.open_async(fn) async with of as f: out1 = await f.read(10) @@ -521,19 +525,21 @@ async def test_async_file(server): def test_encoded(server): fs = fsspec.filesystem("http", encoded=True) - out = fs.cat(server + "/Hello%3A%20G%C3%BCnter", headers={"give_path": "true"}) + out = fs.cat( + server.address + "/Hello%3A%20G%C3%BCnter", headers={"give_path": "true"} + ) assert json.loads(out)["path"] == "/Hello%3A%20G%C3%BCnter" with pytest.raises(aiohttp.client_exceptions.ClientError): - fs.cat(server + "/Hello: Günter", headers={"give_path": "true"}) + fs.cat(server.address + "/Hello: Günter", headers={"give_path": "true"}) fs = fsspec.filesystem("http", encoded=False) - out = fs.cat(server + "/Hello: Günter", headers={"give_path": "true"}) + out = fs.cat(server.address + "/Hello: Günter", headers={"give_path": "true"}) assert json.loads(out)["path"] == "/Hello:%20G%C3%BCnter" def test_with_cache(server): fs = fsspec.filesystem("http", headers={"head_ok": "true", "give_length": "true"}) - fn = server + "/index/realfile" + fn = server.realfile fs1 = fsspec.filesystem("blockcache", fs=fs) with fs1.open(fn, "rb") as f: out = f.read() @@ -545,16 +551,18 @@ async def test_async_expand_path(server): fs = fsspec.filesystem("http", asynchronous=True, skip_instance_cache=True) # maxdepth=1 - assert await fs._expand_path(server + "/index", recursive=True, maxdepth=1) == [ - server + "/index", - server + "/index/realfile", + assert await fs._expand_path( + server.address + "/index", recursive=True, maxdepth=1 + ) == [ + server.address + "/index", + server.address + "/index/realfile", ] # maxdepth=0 with pytest.raises(ValueError): - await fs._expand_path(server + "/index", maxdepth=0) + await fs._expand_path(server.address + "/index", maxdepth=0) with pytest.raises(ValueError): - await fs._expand_path(server + "/index", recursive=True, maxdepth=0) + await fs._expand_path(server.address + "/index", recursive=True, maxdepth=0) await fs._session.close() @@ -564,12 +572,12 @@ async def test_async_walk(server): fs = fsspec.filesystem("http", asynchronous=True, skip_instance_cache=True) # No maxdepth - res = [a async for a in fs._walk(server + "/index")] - assert res == [(server + "/index", [], ["realfile"])] + res = [a async for a in fs._walk(server.address + "/index")] + assert res == [(server.address + "/index", [], ["realfile"])] # maxdepth=0 with pytest.raises(ValueError): - async for a in fs._walk(server + "/index", maxdepth=0): + async for a in fs._walk(server.address + "/index", maxdepth=0): pass await fs._session.close() diff --git a/fsspec/implementations/tests/test_reference.py b/fsspec/implementations/tests/test_reference.py index 09c55d41c..fefdd6024 100644 --- a/fsspec/implementations/tests/test_reference.py +++ b/fsspec/implementations/tests/test_reference.py @@ -10,18 +10,18 @@ 
ReferenceFileSystem, ReferenceNotReachable, ) -from fsspec.tests.conftest import data, realfile, reset_files, server, win # noqa: F401 +from fsspec.tests.conftest import data, reset_files, server, win # noqa: F401 -def test_simple(server): # noqa: F811 +def test_simple(server): # The dictionary in refs may be dumped with a different separator # depending on whether json or ujson is imported from fsspec.implementations.reference import json as json_impl refs = { "a": b"data", - "b": (realfile, 0, 5), - "c": (realfile, 1, 5), + "b": (server.realfile, 0, 5), + "c": (server.realfile, 1, 5), "d": b"base64:aGVsbG8=", "e": {"key": "value"}, } @@ -37,7 +37,7 @@ def test_simple(server): # noqa: F811 assert f.read(2) == "he" -def test_simple_ver1(server): # noqa: F811 +def test_simple_ver1(server): # The dictionary in refs may be dumped with a different separator # depending on whether json or ujson is imported from fsspec.implementations.reference import json as json_impl @@ -46,8 +46,8 @@ def test_simple_ver1(server): # noqa: F811 "version": 1, "refs": { "a": b"data", - "b": (realfile, 0, 5), - "c": (realfile, 1, 5), + "b": (server.realfile, 0, 5), + "c": (server.realfile, 1, 5), "d": b"base64:aGVsbG8=", "e": {"key": "value"}, }, @@ -75,8 +75,8 @@ def test_target_options(m): assert fs.cat("a") == b"hello" -def test_ls(server): # noqa: F811 - refs = {"a": b"data", "b": (realfile, 0, 5), "c/d": (realfile, 1, 6)} +def test_ls(server): + refs = {"a": b"data", "b": (server.realfile, 0, 5), "c/d": (server.realfile, 1, 6)} h = fsspec.filesystem("http") fs = fsspec.filesystem("reference", fo=refs, fs=h) @@ -99,12 +99,12 @@ def test_nested_dirs_ls(): assert {e["name"] for e in fs.ls("B")} == {"B/C", "B/_"} -def test_info(server): # noqa: F811 +def test_info(server): refs = { "a": b"data", - "b": (realfile, 0, 5), - "c/d": (realfile, 1, 6), - "e": (realfile,), + "b": (server.realfile, 0, 5), + "c/d": (server.realfile, 1, 6), + "e": (server.realfile,), } h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"}) fs = fsspec.filesystem("reference", fo=refs, fs=h) @@ -117,9 +117,9 @@ def test_info(server): # noqa: F811 def test_mutable(server, m): refs = { "a": b"data", - "b": (realfile, 0, 5), - "c/d": (realfile, 1, 6), - "e": (realfile,), + "b": (server.realfile, 0, 5), + "c/d": (server.realfile, 1, 6), + "e": (server.realfile,), } h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"}) fs = fsspec.filesystem("reference", fo=refs, fs=h) @@ -173,13 +173,13 @@ def test_put_get_single(tmpdir): assert fs.cat("hi") == b"data" -def test_defaults(server): # noqa: F811 +def test_defaults(server): refs = {"a": b"data", "b": (None, 0, 5)} fs = fsspec.filesystem( "reference", fo=refs, target_protocol="http", - target=realfile, + target=server.realfile, remote_protocol="http", ) @@ -761,17 +761,25 @@ def test_append_parquet(lazy_refs, m): assert lazy2["data/1"] == b"Adata" -def test_deep_parq(m): +@pytest.mark.parametrize("engine", ["fastparquet", "pyarrow"]) +def test_deep_parq(m, engine): + pytest.importorskip("kerchunk") zarr = pytest.importorskip("zarr") + lz = fsspec.implementations.reference.LazyReferenceMapper.create( - "memory://out.parq", fs=m + "memory://out.parq", + fs=m, + engine=engine, ) g = zarr.open_group(lz, mode="w") + g2 = g.create_group("instant") g2.create_dataset(name="one", data=[1, 2, 3]) lz.flush() - lz = fsspec.implementations.reference.LazyReferenceMapper("memory://out.parq", fs=m) + lz = fsspec.implementations.reference.LazyReferenceMapper( + 
"memory://out.parq", fs=m, engine=engine + ) g = zarr.open_group(lz) assert g.instant.one[:].tolist() == [1, 2, 3] assert sorted(_["name"] for _ in lz.ls("")) == [".zgroup", ".zmetadata", "instant"] @@ -779,6 +787,7 @@ def test_deep_parq(m): "instant/.zgroup", "instant/one", ] + assert sorted(_["name"] for _ in lz.ls("instant/one")) == [ "instant/one/.zarray", "instant/one/0", diff --git a/fsspec/implementations/tests/test_smb.py b/fsspec/implementations/tests/test_smb.py index 68b595725..a83e3cc91 100644 --- a/fsspec/implementations/tests/test_smb.py +++ b/fsspec/implementations/tests/test_smb.py @@ -50,7 +50,7 @@ def smb_params(request): cfg = "-p -u 'testuser;testpass' -s 'home;/share;no;no;no;testuser'" port = request.param if request.param is not None else default_port img = ( - f"docker run --name {container} --detach -p 139:139 -p {port}:445 dperson/samba" # noqa: E231 E501 + f"docker run --name {container} --detach -p 139:139 -p {port}:445 dperson/samba" ) cmd = f"{img} {cfg}" try: diff --git a/fsspec/implementations/tests/test_zip.py b/fsspec/implementations/tests/test_zip.py index ec30c8778..14d00086e 100644 --- a/fsspec/implementations/tests/test_zip.py +++ b/fsspec/implementations/tests/test_zip.py @@ -1,10 +1,13 @@ import collections.abc import os.path +from pathlib import Path +from shutil import make_archive import pytest import fsspec from fsspec.implementations.tests.test_archive import archive_data, tempzip +from fsspec.implementations.zip import ZipFileSystem def test_info(): @@ -132,3 +135,348 @@ def test_append(m, tmpdir): fs.close() assert len(fsspec.open_files("zip://*::memory://out.zip")) == 2 + + +@pytest.fixture(name="zip_file") +def zip_file_fixture(tmp_path): + data_dir = tmp_path / "data/" + data_dir.mkdir() + file1 = data_dir / "file1.txt" + file1.write_text("Hello, World!") + file2 = data_dir / "file2.txt" + file2.write_text("Lorem ipsum dolor sit amet") + + empty_dir = data_dir / "dir1" + empty_dir.mkdir() + + dir_with_files = data_dir / "dir2" + dir_with_files.mkdir() + file3 = dir_with_files / "file3.txt" + file3.write_text("Hello!") + + potential_mix_up_path = data_dir / "dir2startwithsamename.txt" + potential_mix_up_path.write_text("Hello again!") + + zip_file = tmp_path / "test" + return Path(make_archive(zip_file, "zip", data_dir)) + + +def _assert_all_except_context_dependent_variables(result, expected_result): + for path in expected_result.keys(): + assert result[path] + fields = [ + "orig_filename", + "filename", + "compress_type", + "comment", + "extra", + "CRC", + "compress_size", + "file_size", + "name", + "size", + "type", + ] + + result_without_date_time = {k: result[path][k] for k in fields} + + expected_result_without_date_time = { + k: expected_result[path][k] for k in fields + } + assert result_without_date_time == expected_result_without_date_time + + +def test_find_returns_expected_result_detail_true(zip_file): + zip_file_system = ZipFileSystem(zip_file) + + result = zip_file_system.find("/", detail=True) + + expected_result = { + "dir2/file3.txt": { + "orig_filename": "dir2/file3.txt", + "filename": "dir2/file3.txt", + "date_time": (2024, 8, 16, 10, 46, 18), + "compress_type": 8, + "_compresslevel": None, + "comment": b"", + "extra": b"", + "create_system": 3, + "create_version": 20, + "extract_version": 20, + "reserved": 0, + "flag_bits": 0, + "volume": 0, + "internal_attr": 0, + "external_attr": 2175008768, + "header_offset": 260, + "CRC": 2636827734, + "compress_size": 8, + "file_size": 6, + "_raw_time": 21961, + "_end_offset": 312, + 
"name": "dir2/file3.txt", + "size": 6, + "type": "file", + }, + "file1.txt": { + "orig_filename": "file1.txt", + "filename": "file1.txt", + "date_time": (2024, 8, 16, 10, 46, 18), + "compress_type": 8, + "_compresslevel": None, + "comment": b"", + "extra": b"", + "create_system": 3, + "create_version": 20, + "extract_version": 20, + "reserved": 0, + "flag_bits": 0, + "volume": 0, + "internal_attr": 0, + "external_attr": 2175008768, + "header_offset": 139, + "CRC": 3964322768, + "compress_size": 15, + "file_size": 13, + "_raw_time": 21961, + "_end_offset": 193, + "name": "file1.txt", + "size": 13, + "type": "file", + }, + "file2.txt": { + "orig_filename": "file2.txt", + "filename": "file2.txt", + "date_time": (2024, 8, 16, 10, 46, 18), + "compress_type": 8, + "_compresslevel": None, + "comment": b"", + "extra": b"", + "create_system": 3, + "create_version": 20, + "extract_version": 20, + "reserved": 0, + "flag_bits": 0, + "volume": 0, + "internal_attr": 0, + "external_attr": 2175008768, + "header_offset": 193, + "CRC": 1596576865, + "compress_size": 28, + "file_size": 26, + "_raw_time": 21961, + "_end_offset": 260, + "name": "file2.txt", + "size": 26, + "type": "file", + }, + } + + _assert_all_except_context_dependent_variables(result, expected_result) + + +def test_find_returns_expected_result_detail_false(zip_file): + zip_file_system = ZipFileSystem(zip_file) + + result = zip_file_system.find("/", detail=False) + expected_result = [ + "dir2/file3.txt", + "dir2startwithsamename.txt", + "file1.txt", + "file2.txt", + ] + + assert result == expected_result + + +def test_find_returns_expected_result_detail_true_include_dirs(zip_file): + zip_file_system = ZipFileSystem(zip_file) + + result = zip_file_system.find("/", detail=True, withdirs=True) + expected_result = { + "dir1": { + "orig_filename": "dir1/", + "filename": "dir1/", + "date_time": (2024, 8, 16, 10, 54, 24), + "compress_type": 0, + "_compresslevel": None, + "comment": b"", + "extra": b"", + "create_system": 3, + "create_version": 20, + "extract_version": 20, + "reserved": 0, + "flag_bits": 0, + "volume": 0, + "internal_attr": 0, + "external_attr": 1106051088, + "header_offset": 0, + "CRC": 0, + "compress_size": 0, + "file_size": 0, + "_raw_time": 22220, + "_end_offset": 35, + "name": "dir1", + "size": 0, + "type": "directory", + }, + "dir2": { + "orig_filename": "dir2/", + "filename": "dir2/", + "date_time": (2024, 8, 16, 10, 54, 24), + "compress_type": 0, + "_compresslevel": None, + "comment": b"", + "extra": b"", + "create_system": 3, + "create_version": 20, + "extract_version": 20, + "reserved": 0, + "flag_bits": 0, + "volume": 0, + "internal_attr": 0, + "external_attr": 1106051088, + "header_offset": 35, + "CRC": 0, + "compress_size": 0, + "file_size": 0, + "_raw_time": 22220, + "_end_offset": 70, + "name": "dir2", + "size": 0, + "type": "directory", + }, + "dir2/file3.txt": { + "orig_filename": "dir2/file3.txt", + "filename": "dir2/file3.txt", + "date_time": (2024, 8, 16, 10, 54, 24), + "compress_type": 8, + "_compresslevel": None, + "comment": b"", + "extra": b"", + "create_system": 3, + "create_version": 20, + "extract_version": 20, + "reserved": 0, + "flag_bits": 0, + "volume": 0, + "internal_attr": 0, + "external_attr": 2175008768, + "header_offset": 260, + "CRC": 2636827734, + "compress_size": 8, + "file_size": 6, + "_raw_time": 22220, + "_end_offset": 312, + "name": "dir2/file3.txt", + "size": 6, + "type": "file", + }, + "file1.txt": { + "orig_filename": "file1.txt", + "filename": "file1.txt", + "date_time": (2024, 8, 16, 
10, 54, 24), + "compress_type": 8, + "_compresslevel": None, + "comment": b"", + "extra": b"", + "create_system": 3, + "create_version": 20, + "extract_version": 20, + "reserved": 0, + "flag_bits": 0, + "volume": 0, + "internal_attr": 0, + "external_attr": 2175008768, + "header_offset": 139, + "CRC": 3964322768, + "compress_size": 15, + "file_size": 13, + "_raw_time": 22220, + "_end_offset": 193, + "name": "file1.txt", + "size": 13, + "type": "file", + }, + "file2.txt": { + "orig_filename": "file2.txt", + "filename": "file2.txt", + "date_time": (2024, 8, 16, 10, 54, 24), + "compress_type": 8, + "_compresslevel": None, + "comment": b"", + "extra": b"", + "create_system": 3, + "create_version": 20, + "extract_version": 20, + "reserved": 0, + "flag_bits": 0, + "volume": 0, + "internal_attr": 0, + "external_attr": 2175008768, + "header_offset": 193, + "CRC": 1596576865, + "compress_size": 28, + "file_size": 26, + "_raw_time": 22220, + "_end_offset": 260, + "name": "file2.txt", + "size": 26, + "type": "file", + }, + } + + _assert_all_except_context_dependent_variables(result, expected_result) + + +def test_find_returns_expected_result_detail_false_include_dirs(zip_file): + zip_file_system = ZipFileSystem(zip_file) + + result = zip_file_system.find("/", detail=False, withdirs=True) + expected_result = [ + "dir1", + "dir2", + "dir2/file3.txt", + "dir2startwithsamename.txt", + "file1.txt", + "file2.txt", + ] + + assert result == expected_result + + +def test_find_returns_expected_result_path_set(zip_file): + zip_file_system = ZipFileSystem(zip_file) + + result = zip_file_system.find("/dir2") + expected_result = ["dir2/file3.txt"] + + assert result == expected_result + + +def test_find_with_and_without_slash_should_return_same_result(zip_file): + zip_file_system = ZipFileSystem(zip_file) + + assert zip_file_system.find("/dir2/") == zip_file_system.find("/dir2") + + +def test_find_should_return_file_if_exact_match(zip_file): + zip_file_system = ZipFileSystem(zip_file) + + result = zip_file_system.find("/dir2startwithsamename.txt", detail=False) + expected_result = ["dir2startwithsamename.txt"] + + assert result == expected_result + + +def test_find_returns_expected_result_recursion_depth_set(zip_file): + zip_file_system = ZipFileSystem(zip_file) + result = zip_file_system.find("/", maxdepth=1) + + expected_result = [ + "dir2startwithsamename.txt", + "file1.txt", + "file2.txt", + ] + + assert result == expected_result diff --git a/fsspec/implementations/webhdfs.py b/fsspec/implementations/webhdfs.py index 4bac5d51a..300bb9cdf 100644 --- a/fsspec/implementations/webhdfs.py +++ b/fsspec/implementations/webhdfs.py @@ -102,7 +102,7 @@ def __init__( if self._cached: return super().__init__(**kwargs) - self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1" # noqa + self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1" self.kerb = kerberos self.kerb_kwargs = kerb_kwargs or {} self.pars = {} @@ -393,7 +393,7 @@ def cp_file(self, lpath, rpath, **kwargs): with self.open(tmp_fname, "wb") as rstream: shutil.copyfileobj(lstream, rstream) self.mv(tmp_fname, rpath) - except BaseException: # noqa + except BaseException: with suppress(FileNotFoundError): self.rm(tmp_fname) raise diff --git a/fsspec/implementations/zip.py b/fsspec/implementations/zip.py index 9d9c046bf..6db3ae278 100644 --- a/fsspec/implementations/zip.py +++ b/fsspec/implementations/zip.py @@ -1,3 +1,4 @@ +import os import zipfile import fsspec @@ -48,7 +49,7 @@ def __init__( if mode not in set("rwa"): raise 
ValueError(f"mode '{mode}' no understood") self.mode = mode - if isinstance(fo, str): + if isinstance(fo, (str, os.PathLike)): if mode == "a": m = "r+b" else: @@ -132,3 +133,45 @@ def _open( out.size = info["size"] out.name = info["name"] return out + + def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs): + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + + # Remove the leading slash, as the zip file paths are always + # given without a leading slash + path = path.lstrip("/") + path_parts = list(filter(lambda s: bool(s), path.split("/"))) + + def _matching_starts(file_path): + file_parts = filter(lambda s: bool(s), file_path.split("/")) + return all(a == b for a, b in zip(path_parts, file_parts)) + + self._get_dirs() + + result = {} + # To match posix find, if an exact file name is given, we should + # return only that file + if path in self.dir_cache and self.dir_cache[path]["type"] == "file": + result[path] = self.dir_cache[path] + return result if detail else [path] + + for file_path, file_info in self.dir_cache.items(): + if not (path == "" or _matching_starts(file_path)): + continue + + if file_info["type"] == "directory": + if withdirs: + if file_path not in result: + result[file_path.strip("/")] = file_info + continue + + if file_path not in result: + result[file_path] = file_info if detail else None + + if maxdepth: + path_depth = path.count("/") + result = { + k: v for k, v in result.items() if k.count("/") - path_depth < maxdepth + } + return result if detail else sorted(result) diff --git a/fsspec/spec.py b/fsspec/spec.py index 1463a4499..8284366ba 100644 --- a/fsspec/spec.py +++ b/fsspec/spec.py @@ -428,11 +428,9 @@ def walk(self, path, maxdepth=None, topdown=True, on_error="omit", **kwargs): except (FileNotFoundError, OSError) as e: if on_error == "raise": raise - elif callable(on_error): + if callable(on_error): on_error(e) - if detail: - return path, {}, {} - return path, [], [] + return for info in listing: # each info name must be at least [path]/part , but here @@ -650,7 +648,7 @@ def info(self, path, **kwargs): Returns a single dictionary, with exactly the same information as ``ls`` would with ``detail=True``. - The default implementation should calls ls and could be overridden by a + The default implementation calls ls and could be overridden by a shortcut. kwargs are passed on to ```ls()``. 
diff --git a/fsspec/spec.py b/fsspec/spec.py
index 1463a4499..8284366ba 100644
--- a/fsspec/spec.py
+++ b/fsspec/spec.py
@@ -428,11 +428,9 @@ def walk(self, path, maxdepth=None, topdown=True, on_error="omit", **kwargs):
         except (FileNotFoundError, OSError) as e:
             if on_error == "raise":
                 raise
-            elif callable(on_error):
+            if callable(on_error):
                 on_error(e)
-            if detail:
-                return path, {}, {}
-            return path, [], []
+            return
 
         for info in listing:
             # each info name must be at least [path]/part , but here
@@ -650,7 +648,7 @@ def info(self, path, **kwargs):
         Returns a single dictionary, with exactly the same information as ``ls``
         would with ``detail=True``.
 
-        The default implementation should calls ls and could be overridden by a
+        The default implementation calls ls and could be overridden by a
         shortcut. kwargs are passed on to ``ls()``.
 
         Some file systems might not be able to measure the file's size, in
@@ -1892,7 +1890,7 @@ def flush(self, force=False):
             self.offset = 0
         try:
             self._initiate_upload()
-        except:  # noqa: E722
+        except:
             self.closed = True
             raise
diff --git a/fsspec/tests/abstract/__init__.py b/fsspec/tests/abstract/__init__.py
index 45d081921..44181420f 100644
--- a/fsspec/tests/abstract/__init__.py
+++ b/fsspec/tests/abstract/__init__.py
@@ -4,9 +4,9 @@
 import pytest
 
 from fsspec.implementations.local import LocalFileSystem
-from fsspec.tests.abstract.copy import AbstractCopyTests  # noqa
-from fsspec.tests.abstract.get import AbstractGetTests  # noqa
-from fsspec.tests.abstract.put import AbstractPutTests  # noqa
+from fsspec.tests.abstract.copy import AbstractCopyTests  # noqa: F401
+from fsspec.tests.abstract.get import AbstractGetTests  # noqa: F401
+from fsspec.tests.abstract.put import AbstractPutTests  # noqa: F401
 
 
 class BaseAbstractFixtures:
diff --git a/fsspec/tests/conftest.py b/fsspec/tests/conftest.py
index fb1efb041..31d99faf2 100644
--- a/fsspec/tests/conftest.py
+++ b/fsspec/tests/conftest.py
@@ -5,25 +5,34 @@
 import threading
 from collections import ChainMap
 from http.server import BaseHTTPRequestHandler, HTTPServer
+from types import SimpleNamespace
 
 import pytest
 
 requests = pytest.importorskip("requests")
 
-port = 9898
 data = b"\n".join([b"some test data"] * 1000)
-realfile = f"http://127.0.0.1:{port}/index/realfile"
-index = b'<a href="%s">Link</a>' % realfile.encode()
 listing = open(
     os.path.join(os.path.dirname(__file__), "data", "listing.html"), "rb"
 ).read()
 win = os.name == "nt"
 
 
+def _make_realfile(baseurl):
+    return f"{baseurl}/index/realfile"
+
+
+def _make_index_listing(baseurl):
+    realfile = _make_realfile(baseurl)
+    return b'<a href="%s">Link</a>' % realfile.encode()
+
+
 def _make_listing(*paths):
-    return "\n".join(
-        f'<a href="http://127.0.0.1:{port}{f}">Link_{i}</a>'
-        for i, f in enumerate(paths)
-    ).encode()
+    def _make_listing_port(baseurl):
+        return "\n".join(
+            f'<a href="{baseurl}{f}">Link_{i}</a>' for i, f in enumerate(paths)
+        ).encode()
+
+    return _make_listing_port
 
 
 @pytest.fixture
@@ -39,7 +48,7 @@ class HTTPTestHandler(BaseHTTPRequestHandler):
     static_files = {
         "/index/realfile": data,
         "/index/otherfile": data,
-        "/index": index,
+        "/index": _make_index_listing,
         "/data/20020401": listing,
         "/simple/": _make_listing("/simple/file", "/simple/dir/"),
         "/simple/file": data,
@@ -64,14 +73,17 @@ def _respond(self, code=200, headers=None, data=b""):
         self.wfile.write(data)
 
     def do_GET(self):
+        baseurl = f"http://{self.server.server_name}:{self.server.server_port}"
         file_path = self.path
         if file_path.endswith("/") and file_path.rstrip("/") in self.files:
             file_path = file_path.rstrip("/")
         file_data = self.files.get(file_path)
+        if callable(file_data):
+            file_data = file_data(baseurl)
         if "give_path" in self.headers:
             return self._respond(200, data=json.dumps({"path": self.path}).encode())
         if "redirect" in self.headers and file_path != "/index/realfile":
-            new_url = f"http://127.0.0.1:{port}/index/realfile"
+            new_url = _make_realfile(baseurl)
             return self._respond(301, {"Location": new_url})
         if file_data is None:
             return self._respond(404)
@@ -135,10 +147,10 @@ def read_chunks(self):
         self.rfile.readline()
 
     def do_HEAD(self):
+        r_headers = {}
         if "head_not_auth" in self.headers:
-            return self._respond(
-                403, {"Content-Length": 123}, b"not authorized for HEAD request"
-            )
+            r_headers["Content-Length"] = 123
+            return self._respond(403, r_headers, b"not authorized for HEAD request")
         elif "head_ok" not in self.headers:
             return self._respond(405)
 
@@ -148,34 +160,34 @@ def do_HEAD(self):
             return self._respond(404)
         if ("give_length" in self.headers) or ("head_give_length" in self.headers):
-            response_headers = {"Content-Length": len(file_data)}
             if "zero_length" in self.headers:
-                response_headers["Content-Length"] = 0
+                r_headers["Content-Length"] = 0
             elif "gzip_encoding" in self.headers:
                 file_data = gzip.compress(file_data)
-                response_headers["Content-Encoding"] = "gzip"
-                response_headers["Content-Length"] = len(file_data)
-
-            self._respond(200, response_headers)
+                r_headers["Content-Encoding"] = "gzip"
+                r_headers["Content-Length"] = len(file_data)
+            else:
+                r_headers["Content-Length"] = len(file_data)
         elif "give_range" in self.headers:
-            self._respond(
-                200, {"Content-Range": f"0-{len(file_data) - 1}/{len(file_data)}"}
-            )
+            r_headers["Content-Range"] = f"0-{len(file_data) - 1}/{len(file_data)}"
         elif "give_etag" in self.headers:
-            self._respond(200, {"ETag": "xxx"})
-        else:
-            self._respond(200)  # OK response, but no useful info
+            r_headers["ETag"] = "xxx"
+
+        if self.headers.get("accept_range") == "none":
+            r_headers["Accept-Ranges"] = "none"
+
+        self._respond(200, r_headers)
 
 
 @contextlib.contextmanager
 def serve():
-    server_address = ("", port)
+    server_address = ("", 0)
     httpd = HTTPServer(server_address, HTTPTestHandler)
     th = threading.Thread(target=httpd.serve_forever)
     th.daemon = True
     th.start()
     try:
-        yield f"http://127.0.0.1:{port}"
+        yield f"http://{httpd.server_name}:{httpd.server_port}"
     finally:
         httpd.socket.close()
         httpd.shutdown()
@@ -185,4 +197,5 @@ def serve():
 @pytest.fixture(scope="module")
 def server():
     with serve() as s:
-        yield s
+        server = SimpleNamespace(address=s, realfile=_make_realfile(s))
+        yield server
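
With the conftest rewrite above, the test HTTP server now binds port 0, so the OS picks a free port and parallel test runs cannot collide, and the `server` fixture yields a namespace instead of a bare URL string. A hypothetical test illustrating the updated fixture contract (`.address` is the base URL, `.realfile` a known file served under it):

    import fsspec

    def test_fetch_realfile(server):
        # `server` is a SimpleNamespace: code that used to build
        # `server + "/index/realfile"` now reads `server.realfile`.
        fs = fsspec.filesystem(
            "http", headers={"give_length": "true", "head_ok": "true"}
        )
        assert server.realfile.startswith(server.address)
        assert fs.cat(server.realfile).startswith(b"some test data")
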
diff --git a/fsspec/tests/test_async.py b/fsspec/tests/test_async.py
index e1a29420f..aa3c9bd4f 100644
--- a/fsspec/tests/test_async.py
+++ b/fsspec/tests/test_async.py
@@ -203,9 +203,6 @@ async def _upload_chunk(self, final=False):
     async def get_data(self):
         return self.temp_buffer.getbuffer().tobytes()
 
-    async def get_data(self):
-        return self.temp_buffer.getbuffer().tobytes()
-
 
 @pytest.mark.asyncio
 async def test_async_streamed_file_write():
diff --git a/fsspec/tests/test_caches.py b/fsspec/tests/test_caches.py
index 5bde713b8..6176d8001 100644
--- a/fsspec/tests/test_caches.py
+++ b/fsspec/tests/test_caches.py
@@ -215,7 +215,7 @@ def test_background(server, monkeypatch):
     import fsspec
 
     head = {"head_ok": "true", "head_give_length": "true"}
-    urla = server + "/index/realfile"
+    urla = server.realfile
     h = fsspec.filesystem("http", headers=head)
     thread_ids = {threading.current_thread().ident}
     f = h.open(urla, block_size=5, cache_type="background")
diff --git a/fsspec/tests/test_core.py b/fsspec/tests/test_core.py
index 53592aff9..1cdeec90c 100644
--- a/fsspec/tests/test_core.py
+++ b/fsspec/tests/test_core.py
@@ -465,3 +465,21 @@ def test_chained_url(ftp_writable):
 def test_automkdir_local():
     fs, _ = fsspec.core.url_to_fs("file://", auto_mkdir=True)
     assert fs.auto_mkdir is True
+
+
+def test_repeated_argument():
+    pytest.importorskip("adlfs")
+    from fsspec.core import url_to_fs
+
+    fs, url = url_to_fs(
+        "az://DIR@ACCOUNT.blob.core.windows.net/DATA",
+        anon=False,
+        account_name="ACCOUNT",
+    )
+    assert fs.storage_options == {"account_name": "ACCOUNT", "anon": False}
+    with pytest.raises(TypeError):
+        url_to_fs(
+            "az://DIR@ACCOUNT.blob.core.windows.net/DATA",
+            anon=False,
+            account_name="OTHER",
+        )
diff --git a/fsspec/tests/test_downstream.py b/fsspec/tests/test_downstream.py
index 172b2a7a7..1f0a0bc0e 100644
--- a/fsspec/tests/test_downstream.py
+++ 
b/fsspec/tests/test_downstream.py @@ -4,7 +4,7 @@ pytest.importorskip("moto") try: - from s3fs.tests.test_s3fs import ( # noqa: E402,F401 + from s3fs.tests.test_s3fs import ( # noqa: F401 endpoint_uri, s3, s3_base, diff --git a/fsspec/tests/test_fuse.py b/fsspec/tests/test_fuse.py index db627ffc9..5222d2fd9 100644 --- a/fsspec/tests/test_fuse.py +++ b/fsspec/tests/test_fuse.py @@ -6,10 +6,10 @@ import pytest try: - pytest.importorskip("fuse") # noqa: E402 + pytest.importorskip("fuse") except OSError: # can succeed in importing fuse, but fail to load so - pytest.importorskip("nonexistent") # noqa: E402 + pytest.importorskip("nonexistent") from fsspec.fuse import main, run from fsspec.implementations.memory import MemoryFileSystem diff --git a/fsspec/tests/test_generic.py b/fsspec/tests/test_generic.py index fc4c8bf01..bd9745f0c 100644 --- a/fsspec/tests/test_generic.py +++ b/fsspec/tests/test_generic.py @@ -7,10 +7,10 @@ def test_remote_async_ops(server): fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"}) fs = fsspec.filesystem("generic", default_method="current") - out = fs.info(server + "/index/realfile") + out = fs.info(server.realfile) assert out["size"] == len(data) assert out["type"] == "file" - assert fs.isfile(server + "/index/realfile") # this method from superclass + assert fs.isfile(server.realfile) # this method from superclass def test_touch_rm(m): @@ -29,7 +29,7 @@ def test_touch_rm(m): def test_cp_async_to_sync(server, m): fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"}) fs = fsspec.filesystem("generic", default_method="current") - fs.cp([server + "/index/realfile"], ["memory://realfile"]) + fs.cp([server.realfile], ["memory://realfile"]) assert m.cat("realfile") == data fs.rm("memory://realfile") @@ -45,7 +45,7 @@ def test_pipe_cat_sync(m): def test_cat_async(server): fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"}) fs = fsspec.filesystem("generic", default_method="current") - assert fs.cat(server + "/index/realfile") == data + assert fs.cat(server.realfile) == data def test_rsync(tmpdir, m): diff --git a/fsspec/tests/test_registry.py b/fsspec/tests/test_registry.py index 0664912a1..fae72368f 100644 --- a/fsspec/tests/test_registry.py +++ b/fsspec/tests/test_registry.py @@ -105,7 +105,7 @@ def test_entry_points_registered_on_import(clear_registry, clean_imports): import_location = "importlib.metadata.entry_points" with patch(import_location, return_value={"fsspec.specs": [mock_ep]}): assert "test" not in registry - import fsspec # noqa + import fsspec # noqa: F401 get_filesystem_class("test") assert "test" in registry @@ -117,7 +117,7 @@ def test_filesystem_warning_arrow_hdfs_deprecated(clear_registry, clean_imports) mock_ep.value = "fsspec.spec.AbstractFileSystem" import_location = "importlib.metadata.entry_points" with patch(import_location, return_value={"fsspec.specs": [mock_ep]}): - import fsspec # noqa + import fsspec # noqa: F401 with pytest.warns(DeprecationWarning): filesystem("arrow_hdfs") diff --git a/fsspec/tests/test_spec.py b/fsspec/tests/test_spec.py index e11b7abdd..3927c6550 100644 --- a/fsspec/tests/test_spec.py +++ b/fsspec/tests/test_spec.py @@ -1276,7 +1276,7 @@ def glob_files_folder(tmp_path): local_fake_dir = str(tmp_path) for path_info in PATHS_FOR_GLOB_TESTS: if path_info["type"] == "file": - local_fs.touch(path=f"{str(tmp_path)}/{path_info['name']}") + local_fs.touch(path=f"{tmp_path}/{path_info['name']}") return local_fake_dir diff --git 
a/fsspec/tests/test_utils.py b/fsspec/tests/test_utils.py index b9167b5d2..1eeee912b 100644 --- a/fsspec/tests/test_utils.py +++ b/fsspec/tests/test_utils.py @@ -261,7 +261,6 @@ def test_common_prefix(paths, out): ( (["/path1"], "/path2", False, ["/path2"]), (["/path1"], "/path2", True, ["/path2/path1"]), - (["/path1"], "/path2", False, ["/path2"]), (["/path1"], "/path2/", True, ["/path2/path1"]), (["/path1"], ["/path2"], False, ["/path2"]), (["/path1"], ["/path2"], True, ["/path2"]), @@ -279,18 +278,6 @@ def test_common_prefix(paths, out): True, ["/path2/more/path1", "/path2/more/path2"], ), - ( - ["/more/path1", "/more/path2"], - "/path2", - False, - ["/path2/path1", "/path2/path2"], - ), - ( - ["/more/path1", "/more/path2"], - "/path2", - True, - ["/path2/more/path1", "/path2/more/path2"], - ), ( ["/more/path1", "/more/path2"], "/path2/", diff --git a/fsspec/utils.py b/fsspec/utils.py index 703d55f4e..faa63937f 100644 --- a/fsspec/utils.py +++ b/fsspec/utils.py @@ -427,10 +427,7 @@ def is_exception(obj: Any) -> bool: def isfilelike(f: Any) -> TypeGuard[IO[bytes]]: - for attr in ["read", "close", "tell"]: - if not hasattr(f, attr): - return False - return True + return all(hasattr(f, attr) for attr in ["read", "close", "tell"]) def get_protocol(url: str) -> str: diff --git a/pyproject.toml b/pyproject.toml index 945ecdabd..48368711f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -192,9 +192,6 @@ ignore = [ "B026", # No explicit `stacklevel` keyword argument found "B028", - # Within an `except` clause, raise exceptions with `raise ... from err` or - # `raise ... from None` to distinguish them from errors in exception handling - "B904", # Assigning lambda expression "E731", # Ambiguous variable names