You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I noticed that calling find on ZipFileSystem can be very slow for large archives, taking upwards to two minutes on zip-files containing ~5000 files. I have created an alternative find implementation (see below), that relies on the infolist in the zip file, which is considerable faster (~0.2 seconds).
I'm wondering if you would be interested in bringing these changes in to fsspec? For our particular use-case (reading hive-style partitioned parquet files from zip-archives) this is absolutely necessary, but I am not sure for how many others this is an issue, so I thought I'd ask.
class _CustomZipFileSystem(ZipFileSystem):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
if maxdepth is not None and maxdepth < 1:
raise ValueError("maxdepth must be at least 1")
result = {}
def _below_max_recursion_depth(path):
if not maxdepth:
return True
depth = len(path.split("/"))
return depth <= maxdepth
for zip_info in self.zip.infolist():
file_name = zip_info.filename
if not file_name.startswith(path.lstrip("/")):
continue
# zip files can contain explicit or implicit directories
# hence the need to either add them directly or infer them
# from the file paths
if zip_info.is_dir():
if withdirs:
if not result.get(file_name) and _below_max_recursion_depth(
file_name
):
result[file_name.strip("/")] = (
self.info(file_name) if detail else None
)
continue
else:
continue # Skip along to the next entry if we don't want to add the dirs
if not result.get(file_name):
if _below_max_recursion_depth(file_name):
result[file_name] = self.info(file_name) if detail else None
# Here we handle the case of implicitly adding the
# directories if they have been requested
if withdirs:
directories = file_name.split("/")
for i in range(1, len(directories)):
dir_path = "/".join(directories[:i]).strip(
"/"
) # remove the trailing slash, as this is not expected
if not result.get(dir_path) and _below_max_recursion_depth(
dir_path
):
result[dir_path] = {
"name": dir_path,
"size": 0,
"type": "directory",
}
return result if detail else sorted(list(result.keys()))
And some tests for this:
class TestCustomZipFileSystem:
@pytest.fixture(name="zip_file")
def zip_file_fixture(self, tmp_path):
data_dir = tmp_path / "data/"
data_dir.mkdir()
file1 = data_dir / "file1.txt"
file1.write_text("Hello, World!")
file2 = data_dir / "file2.txt"
file2.write_text("Lorem ipsum dolor sit amet")
empty_dir = data_dir / "dir1"
empty_dir.mkdir()
dir_with_files = data_dir / "dir2"
dir_with_files.mkdir()
file3 = dir_with_files / "file3.txt"
file3.write_text("Hello!")
zip_file = tmp_path / "test"
return Path(make_archive(zip_file, "zip", data_dir))
@pytest.mark.parametrize("detail", [True, False])
@pytest.mark.parametrize("withdirs", [True, False])
@pytest.mark.parametrize("max_depth", [None, 1, 2])
def test_ensure_parity(self, zip_file, detail, withdirs, max_depth):
custom_zip_file_system = _CustomZipFileSystem(zip_file)
zip_file_system = ZipFileSystem(zip_file)
result = custom_zip_file_system.find(
"/", detail=detail, withdirs=withdirs, max_depth=max_depth
)
expected_result = zip_file_system.find(
"/", detail=detail, withdirs=withdirs, max_depth=max_depth
)
assert result
assert result == expected_result
def test_find_returns_expected_result_detail_true(self, zip_file):
custom_zip_file_system = _CustomZipFileSystem(zip_file)
zip_file_system = ZipFileSystem(zip_file)
result = custom_zip_file_system.find("/", detail=True)
expected_result = zip_file_system.find("/", detail=True)
assert result
assert result == expected_result
def test_find_returns_expected_result_detail_false(self, zip_file):
custom_zip_file_system = _CustomZipFileSystem(zip_file)
zip_file_system = ZipFileSystem(zip_file)
result = custom_zip_file_system.find("/", detail=False)
expected_result = zip_file_system.find("/", detail=False)
assert result
assert result == expected_result
def test_find_returns_expected_result_detail_true_include_dirs(self, zip_file):
custom_zip_file_system = _CustomZipFileSystem(zip_file)
zip_file_system = ZipFileSystem(zip_file)
result = custom_zip_file_system.find("/", detail=True, withdirs=True)
expected_result = zip_file_system.find("/", detail=True, withdirs=True)
assert result
assert result == expected_result
def test_find_returns_expected_result_detail_false_include_dirs(self, zip_file):
custom_zip_file_system = _CustomZipFileSystem(zip_file)
zip_file_system = ZipFileSystem(zip_file)
result = custom_zip_file_system.find("/", detail=False, withdirs=True)
expected_result = zip_file_system.find("/", detail=False, withdirs=True)
assert result
assert result == expected_result
def test_find_returns_expected_result_recursion_depth_set(self, zip_file):
custom_zip_file_system = _CustomZipFileSystem(zip_file)
zip_file_system = ZipFileSystem(zip_file)
result = custom_zip_file_system.find("/", maxdepth=1)
expected_result = zip_file_system.find("/", maxdepth=1)
assert result
assert result == expected_result
def test_find_returns_expected_result_path_set(self, zip_file):
custom_zip_file_system = _CustomZipFileSystem(zip_file)
zip_file_system = ZipFileSystem(zip_file)
result = custom_zip_file_system.find("/dir2")
expected_result = zip_file_system.find("/dir2")
assert result
assert result == expected_result
The text was updated successfully, but these errors were encountered:
I noticed that calling
find
onZipFileSystem
can be very slow for large archives, taking upwards to two minutes on zip-files containing ~5000 files. I have created an alternativefind
implementation (see below), that relies on theinfolist
in the zip file, which is considerable faster (~0.2 seconds).I'm wondering if you would be interested in bringing these changes in to
fsspec
? For our particular use-case (reading hive-style partitioned parquet files from zip-archives) this is absolutely necessary, but I am not sure for how many others this is an issue, so I thought I'd ask.And some tests for this:
The text was updated successfully, but these errors were encountered: