Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance find zip archive #1664

340 changes: 340 additions & 0 deletions fsspec/implementations/tests/test_zip.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import collections.abc
import os.path
from pathlib import Path
from shutil import make_archive

import pytest

import fsspec
from fsspec.implementations.tests.test_archive import archive_data, tempzip
from fsspec.implementations.zip import ZipFileSystem


def test_info():
Expand Down Expand Up @@ -132,3 +135,340 @@ def test_append(m, tmpdir):
fs.close()

assert len(fsspec.open_files("zip://*::memory://out.zip")) == 2


@pytest.fixture(name="zip_file")
def zip_file_fixture(tmp_path):
    """Build a small zip archive under ``tmp_path`` and return its path.

    The archived tree contains two top-level text files, an empty directory
    (``dir1``), a directory holding one file (``dir2/file3.txt``) and a
    top-level file whose name starts with ``dir2`` so that prefix handling
    can be exercised.
    """
    source_root = tmp_path / "data/"
    source_root.mkdir()

    (source_root / "file1.txt").write_text("Hello, World!")
    (source_root / "file2.txt").write_text("Lorem ipsum dolor sit amet")

    # dir1 is deliberately left without any content.
    (source_root / "dir1").mkdir()

    populated_dir = source_root / "dir2"
    populated_dir.mkdir()
    (populated_dir / "file3.txt").write_text("Hello!")

    # Shares the "dir2" prefix with the directory created above.
    (source_root / "dir2startwithsamename.txt").write_text("Hello again!")

    archive_path = make_archive(tmp_path / "test", "zip", source_root)
    return Path(archive_path)


def _assert_all_except_context_dependent_variables(result, expected_result):
for path in expected_result.keys():
assert result[path]
result_without_date_time = result[path].copy()
result_without_date_time.pop("date_time")
result_without_date_time.pop("_raw_time")
result_without_date_time.pop("external_attr")
result_without_date_time.pop("create_system")

expected_result_without_date_time = expected_result[path].copy()
expected_result_without_date_time.pop("date_time")
expected_result_without_date_time.pop("_raw_time")
expected_result_without_date_time.pop("external_attr")
expected_result_without_date_time.pop("create_system")
assert result_without_date_time == expected_result_without_date_time


def test_find_returns_expected_result_detail_true(zip_file):
    """``find("/", detail=True)`` yields an info dict for every file in the archive."""
    zip_file_system = ZipFileSystem(zip_file)

    result = zip_file_system.find("/", detail=True)

    def expected_file(name, *, header_offset, end_offset, crc, compress_size, size):
        # Reference info dict for a regular file; only the arguments differ
        # between the entries listed below.
        return {
            "orig_filename": name,
            "filename": name,
            "date_time": (2024, 8, 16, 10, 46, 18),
            "compress_type": 8,
            "_compresslevel": None,
            "comment": b"",
            "extra": b"",
            "create_system": 3,
            "create_version": 20,
            "extract_version": 20,
            "reserved": 0,
            "flag_bits": 0,
            "volume": 0,
            "internal_attr": 0,
            "external_attr": 2175008768,
            "header_offset": header_offset,
            "CRC": crc,
            "compress_size": compress_size,
            "file_size": size,
            "_raw_time": 21961,
            "_end_offset": end_offset,
            "name": name,
            "size": size,
            "type": "file",
        }

    expected_result = {
        "dir2/file3.txt": expected_file(
            "dir2/file3.txt",
            header_offset=260,
            end_offset=312,
            crc=2636827734,
            compress_size=8,
            size=6,
        ),
        "file1.txt": expected_file(
            "file1.txt",
            header_offset=139,
            end_offset=193,
            crc=3964322768,
            compress_size=15,
            size=13,
        ),
        "file2.txt": expected_file(
            "file2.txt",
            header_offset=193,
            end_offset=260,
            crc=1596576865,
            compress_size=28,
            size=26,
        ),
    }

    # The per-entry comparison below only walks the expected paths, so the
    # set of returned paths is pinned separately. This also covers
    # "dir2startwithsamename.txt", for which no reference info dict is listed.
    assert sorted(result) == [
        "dir2/file3.txt",
        "dir2startwithsamename.txt",
        "file1.txt",
        "file2.txt",
    ]

    _assert_all_except_context_dependent_variables(result, expected_result)


def test_find_returns_expected_result_detail_false(zip_file):
    """``find("/", detail=False)`` returns a flat list of file paths only."""
    fs = ZipFileSystem(zip_file)

    found = fs.find("/", detail=False)

    # No directory entries are expected in this mode.
    assert found == [
        "dir2/file3.txt",
        "dir2startwithsamename.txt",
        "file1.txt",
        "file2.txt",
    ]


def test_find_returns_expected_result_detail_true_include_dirs(zip_file):
    """``find("/", detail=True, withdirs=True)`` also reports directory entries."""
    zip_file_system = ZipFileSystem(zip_file)

    result = zip_file_system.find("/", detail=True, withdirs=True)

    # Fields that carry the same reference value for every entry.
    shared = {
        "date_time": (2024, 8, 16, 10, 54, 24),
        "_compresslevel": None,
        "comment": b"",
        "extra": b"",
        "create_system": 3,
        "create_version": 20,
        "extract_version": 20,
        "reserved": 0,
        "flag_bits": 0,
        "volume": 0,
        "internal_attr": 0,
        "_raw_time": 22220,
    }

    def expected_directory(name, *, header_offset, end_offset):
        # Reference info dict for a directory: the stored filename keeps its
        # trailing slash while the reported ``name`` does not.
        return {
            **shared,
            "orig_filename": f"{name}/",
            "filename": f"{name}/",
            "compress_type": 0,
            "external_attr": 1106051088,
            "header_offset": header_offset,
            "CRC": 0,
            "compress_size": 0,
            "file_size": 0,
            "_end_offset": end_offset,
            "name": name,
            "size": 0,
            "type": "directory",
        }

    def expected_file(name, *, header_offset, end_offset, crc, compress_size, size):
        # Reference info dict for a regular file.
        return {
            **shared,
            "orig_filename": name,
            "filename": name,
            "compress_type": 8,
            "external_attr": 2175008768,
            "header_offset": header_offset,
            "CRC": crc,
            "compress_size": compress_size,
            "file_size": size,
            "_end_offset": end_offset,
            "name": name,
            "size": size,
            "type": "file",
        }

    expected_result = {
        "dir1": expected_directory("dir1", header_offset=0, end_offset=35),
        "dir2": expected_directory("dir2", header_offset=35, end_offset=70),
        "dir2/file3.txt": expected_file(
            "dir2/file3.txt",
            header_offset=260,
            end_offset=312,
            crc=2636827734,
            compress_size=8,
            size=6,
        ),
        "file1.txt": expected_file(
            "file1.txt",
            header_offset=139,
            end_offset=193,
            crc=3964322768,
            compress_size=15,
            size=13,
        ),
        "file2.txt": expected_file(
            "file2.txt",
            header_offset=193,
            end_offset=260,
            crc=1596576865,
            compress_size=28,
            size=26,
        ),
    }

    # The per-entry comparison below only walks the expected paths, so the
    # set of returned paths is pinned separately. This also covers
    # "dir2startwithsamename.txt", for which no reference info dict is listed.
    assert sorted(result) == [
        "dir1",
        "dir2",
        "dir2/file3.txt",
        "dir2startwithsamename.txt",
        "file1.txt",
        "file2.txt",
    ]

    _assert_all_except_context_dependent_variables(result, expected_result)


def test_find_returns_expected_result_detail_false_include_dirs(zip_file):
    """``find("/", detail=False, withdirs=True)`` lists directories next to files."""
    fs = ZipFileSystem(zip_file)

    found = fs.find("/", detail=False, withdirs=True)

    # Both the empty directory and the populated one are expected.
    assert found == [
        "dir1",
        "dir2",
        "dir2/file3.txt",
        "dir2startwithsamename.txt",
        "file1.txt",
        "file2.txt",
    ]


def test_find_returns_expected_result_path_set(zip_file):
    """``find("/dir2")`` returns only the file stored inside ``dir2``."""
    fs = ZipFileSystem(zip_file)

    found = fs.find("/dir2")

    # "dir2startwithsamename.txt" shares the prefix but must not be listed.
    assert found == ["dir2/file3.txt"]


def test_find_with_and_without_slash_should_return_same_result(zip_file):
    """A trailing slash on the search path must not change what ``find`` returns."""
    fs = ZipFileSystem(zip_file)

    with_trailing_slash = fs.find("/dir2/")
    without_trailing_slash = fs.find("/dir2")

    assert with_trailing_slash == without_trailing_slash


def test_find_should_return_file_if_exact_match(zip_file):
    """Searching for the exact path of a file returns just that file."""
    fs = ZipFileSystem(zip_file)

    found = fs.find("/dir2startwithsamename.txt", detail=False)

    assert found == ["dir2startwithsamename.txt"]


def test_find_returns_expected_result_recursion_depth_set(zip_file):
    """``find("/", maxdepth=1)`` returns the top-level files only."""
    fs = ZipFileSystem(zip_file)

    found = fs.find("/", maxdepth=1)

    # "dir2/file3.txt" sits one level deeper and is therefore not expected.
    assert found == [
        "dir2startwithsamename.txt",
        "file1.txt",
        "file2.txt",
    ]
Loading
Loading