From 7793ab89a56d70814de6d123c257e81f8e5bd52f Mon Sep 17 00:00:00 2001 From: Johan Dahlberg Date: Thu, 22 Aug 2024 21:58:39 +0200 Subject: [PATCH] Improve performance find zip archive (#1664) Co-authored-by: Martin Durant --- fsspec/implementations/tests/test_zip.py | 340 +++++++++++++++++++++++ fsspec/implementations/zip.py | 42 +++ 2 files changed, 382 insertions(+) diff --git a/fsspec/implementations/tests/test_zip.py b/fsspec/implementations/tests/test_zip.py index ec30c8778..ecd082f3f 100644 --- a/fsspec/implementations/tests/test_zip.py +++ b/fsspec/implementations/tests/test_zip.py @@ -1,10 +1,13 @@ import collections.abc import os.path +from pathlib import Path +from shutil import make_archive import pytest import fsspec from fsspec.implementations.tests.test_archive import archive_data, tempzip +from fsspec.implementations.zip import ZipFileSystem def test_info(): @@ -132,3 +135,340 @@ def test_append(m, tmpdir): fs.close() assert len(fsspec.open_files("zip://*::memory://out.zip")) == 2 + + +@pytest.fixture(name="zip_file") +def zip_file_fixture(tmp_path): + data_dir = tmp_path / "data/" + data_dir.mkdir() + file1 = data_dir / "file1.txt" + file1.write_text("Hello, World!") + file2 = data_dir / "file2.txt" + file2.write_text("Lorem ipsum dolor sit amet") + + empty_dir = data_dir / "dir1" + empty_dir.mkdir() + + dir_with_files = data_dir / "dir2" + dir_with_files.mkdir() + file3 = dir_with_files / "file3.txt" + file3.write_text("Hello!") + + potential_mix_up_path = data_dir / "dir2startwithsamename.txt" + potential_mix_up_path.write_text("Hello again!") + + zip_file = tmp_path / "test" + return Path(make_archive(zip_file, "zip", data_dir)) + + +def _assert_all_except_context_dependent_variables(result, expected_result): + for path in expected_result.keys(): + assert result[path] + result_without_date_time = result[path].copy() + result_without_date_time.pop("date_time") + result_without_date_time.pop("_raw_time") + result_without_date_time.pop("external_attr") + result_without_date_time.pop("create_system") + + expected_result_without_date_time = expected_result[path].copy() + expected_result_without_date_time.pop("date_time") + expected_result_without_date_time.pop("_raw_time") + expected_result_without_date_time.pop("external_attr") + expected_result_without_date_time.pop("create_system") + assert result_without_date_time == expected_result_without_date_time + + +def test_find_returns_expected_result_detail_true(zip_file): + zip_file_system = ZipFileSystem(zip_file) + + result = zip_file_system.find("/", detail=True) + + expected_result = { + "dir2/file3.txt": { + "orig_filename": "dir2/file3.txt", + "filename": "dir2/file3.txt", + "date_time": (2024, 8, 16, 10, 46, 18), + "compress_type": 8, + "_compresslevel": None, + "comment": b"", + "extra": b"", + "create_system": 3, + "create_version": 20, + "extract_version": 20, + "reserved": 0, + "flag_bits": 0, + "volume": 0, + "internal_attr": 0, + "external_attr": 2175008768, + "header_offset": 260, + "CRC": 2636827734, + "compress_size": 8, + "file_size": 6, + "_raw_time": 21961, + "_end_offset": 312, + "name": "dir2/file3.txt", + "size": 6, + "type": "file", + }, + "file1.txt": { + "orig_filename": "file1.txt", + "filename": "file1.txt", + "date_time": (2024, 8, 16, 10, 46, 18), + "compress_type": 8, + "_compresslevel": None, + "comment": b"", + "extra": b"", + "create_system": 3, + "create_version": 20, + "extract_version": 20, + "reserved": 0, + "flag_bits": 0, + "volume": 0, + "internal_attr": 0, + "external_attr": 2175008768, + "header_offset": 139, + "CRC": 3964322768, + "compress_size": 15, + "file_size": 13, + "_raw_time": 21961, + "_end_offset": 193, + "name": "file1.txt", + "size": 13, + "type": "file", + }, + "file2.txt": { + "orig_filename": "file2.txt", + "filename": "file2.txt", + "date_time": (2024, 8, 16, 10, 46, 18), + "compress_type": 8, + "_compresslevel": None, + "comment": b"", + "extra": b"", + "create_system": 3, + "create_version": 20, + "extract_version": 20, + "reserved": 0, + "flag_bits": 0, + "volume": 0, + "internal_attr": 0, + "external_attr": 2175008768, + "header_offset": 193, + "CRC": 1596576865, + "compress_size": 28, + "file_size": 26, + "_raw_time": 21961, + "_end_offset": 260, + "name": "file2.txt", + "size": 26, + "type": "file", + }, + } + + _assert_all_except_context_dependent_variables(result, expected_result) + + +def test_find_returns_expected_result_detail_false(zip_file): + zip_file_system = ZipFileSystem(zip_file) + + result = zip_file_system.find("/", detail=False) + expected_result = [ + "dir2/file3.txt", + "dir2startwithsamename.txt", + "file1.txt", + "file2.txt", + ] + + assert result == expected_result + + +def test_find_returns_expected_result_detail_true_include_dirs(zip_file): + zip_file_system = ZipFileSystem(zip_file) + + result = zip_file_system.find("/", detail=True, withdirs=True) + expected_result = { + "dir1": { + "orig_filename": "dir1/", + "filename": "dir1/", + "date_time": (2024, 8, 16, 10, 54, 24), + "compress_type": 0, + "_compresslevel": None, + "comment": b"", + "extra": b"", + "create_system": 3, + "create_version": 20, + "extract_version": 20, + "reserved": 0, + "flag_bits": 0, + "volume": 0, + "internal_attr": 0, + "external_attr": 1106051088, + "header_offset": 0, + "CRC": 0, + "compress_size": 0, + "file_size": 0, + "_raw_time": 22220, + "_end_offset": 35, + "name": "dir1", + "size": 0, + "type": "directory", + }, + "dir2": { + "orig_filename": "dir2/", + "filename": "dir2/", + "date_time": (2024, 8, 16, 10, 54, 24), + "compress_type": 0, + "_compresslevel": None, + "comment": b"", + "extra": b"", + "create_system": 3, + "create_version": 20, + "extract_version": 20, + "reserved": 0, + "flag_bits": 0, + "volume": 0, + "internal_attr": 0, + "external_attr": 1106051088, + "header_offset": 35, + "CRC": 0, + "compress_size": 0, + "file_size": 0, + "_raw_time": 22220, + "_end_offset": 70, + "name": "dir2", + "size": 0, + "type": "directory", + }, + "dir2/file3.txt": { + "orig_filename": "dir2/file3.txt", + "filename": "dir2/file3.txt", + "date_time": (2024, 8, 16, 10, 54, 24), + "compress_type": 8, + "_compresslevel": None, + "comment": b"", + "extra": b"", + "create_system": 3, + "create_version": 20, + "extract_version": 20, + "reserved": 0, + "flag_bits": 0, + "volume": 0, + "internal_attr": 0, + "external_attr": 2175008768, + "header_offset": 260, + "CRC": 2636827734, + "compress_size": 8, + "file_size": 6, + "_raw_time": 22220, + "_end_offset": 312, + "name": "dir2/file3.txt", + "size": 6, + "type": "file", + }, + "file1.txt": { + "orig_filename": "file1.txt", + "filename": "file1.txt", + "date_time": (2024, 8, 16, 10, 54, 24), + "compress_type": 8, + "_compresslevel": None, + "comment": b"", + "extra": b"", + "create_system": 3, + "create_version": 20, + "extract_version": 20, + "reserved": 0, + "flag_bits": 0, + "volume": 0, + "internal_attr": 0, + "external_attr": 2175008768, + "header_offset": 139, + "CRC": 3964322768, + "compress_size": 15, + "file_size": 13, + "_raw_time": 22220, + "_end_offset": 193, + "name": "file1.txt", + "size": 13, + "type": "file", + }, + "file2.txt": { + "orig_filename": "file2.txt", + "filename": "file2.txt", + "date_time": (2024, 8, 16, 10, 54, 24), + "compress_type": 8, + "_compresslevel": None, + "comment": b"", + "extra": b"", + "create_system": 3, + "create_version": 20, + "extract_version": 20, + "reserved": 0, + "flag_bits": 0, + "volume": 0, + "internal_attr": 0, + "external_attr": 2175008768, + "header_offset": 193, + "CRC": 1596576865, + "compress_size": 28, + "file_size": 26, + "_raw_time": 22220, + "_end_offset": 260, + "name": "file2.txt", + "size": 26, + "type": "file", + }, + } + + _assert_all_except_context_dependent_variables(result, expected_result) + + +def test_find_returns_expected_result_detail_false_include_dirs(zip_file): + zip_file_system = ZipFileSystem(zip_file) + + result = zip_file_system.find("/", detail=False, withdirs=True) + expected_result = [ + "dir1", + "dir2", + "dir2/file3.txt", + "dir2startwithsamename.txt", + "file1.txt", + "file2.txt", + ] + + assert result == expected_result + + +def test_find_returns_expected_result_path_set(zip_file): + zip_file_system = ZipFileSystem(zip_file) + + result = zip_file_system.find("/dir2") + expected_result = ["dir2/file3.txt"] + + assert result == expected_result + + +def test_find_with_and_without_slash_should_return_same_result(zip_file): + zip_file_system = ZipFileSystem(zip_file) + + assert zip_file_system.find("/dir2/") == zip_file_system.find("/dir2") + + +def test_find_should_return_file_if_exact_match(zip_file): + zip_file_system = ZipFileSystem(zip_file) + + result = zip_file_system.find("/dir2startwithsamename.txt", detail=False) + expected_result = ["dir2startwithsamename.txt"] + + assert result == expected_result + + +def test_find_returns_expected_result_recursion_depth_set(zip_file): + zip_file_system = ZipFileSystem(zip_file) + result = zip_file_system.find("/", maxdepth=1) + + expected_result = [ + "dir2startwithsamename.txt", + "file1.txt", + "file2.txt", + ] + + assert result == expected_result diff --git a/fsspec/implementations/zip.py b/fsspec/implementations/zip.py index 9d9c046bf..aa6a57842 100644 --- a/fsspec/implementations/zip.py +++ b/fsspec/implementations/zip.py @@ -132,3 +132,45 @@ def _open( out.size = info["size"] out.name = info["name"] return out + + def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs): + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + + # Remove the leading slash, as the zip file paths are always + # given without a leading slash + path = path.lstrip("/") + path_parts = list(filter(lambda s: bool(s), path.split("/"))) + + def _matching_starts(file_path): + file_parts = filter(lambda s: bool(s), file_path.split("/")) + return all(a == b for a, b in zip(path_parts, file_parts)) + + self._get_dirs() + + result = {} + # To match posix find, if an exact file name is given, we should + # return only that file + if path in self.dir_cache and self.dir_cache[path]["type"] == "file": + result[path] = self.dir_cache[path] + return result if detail else [path] + + for file_path, file_info in self.dir_cache.items(): + if not (path == "" or _matching_starts(file_path)): + continue + + if file_info["type"] == "directory": + if withdirs: + if file_path not in result: + result[file_path.strip("/")] = file_info + continue + + if file_path not in result: + result[file_path] = file_info if detail else None + + if maxdepth: + path_depth = path.count("/") + result = { + k: v for k, v in result.items() if k.count("/") - path_depth < maxdepth + } + return result if detail else sorted(result)