Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Slow find on ZipFileSystem #1662

Closed
johandahlberg opened this issue Aug 14, 2024 · 2 comments
Closed

Slow find on ZipFileSystem #1662

johandahlberg opened this issue Aug 14, 2024 · 2 comments

Comments

@johandahlberg
Copy link
Contributor

I noticed that calling find on ZipFileSystem can be very slow for large archives, taking upwards to two minutes on zip-files containing ~5000 files. I have created an alternative find implementation (see below), that relies on the infolist in the zip file, which is considerable faster (~0.2 seconds).

I'm wondering if you would be interested in bringing these changes in to fsspec? For our particular use-case (reading hive-style partitioned parquet files from zip-archives) this is absolutely necessary, but I am not sure for how many others this is an issue, so I thought I'd ask.

class _CustomZipFileSystem(ZipFileSystem):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        result = {}

        def _below_max_recursion_depth(path):
            if not maxdepth:
                return True

            depth = len(path.split("/"))
            return depth <= maxdepth

        for zip_info in self.zip.infolist():
            file_name = zip_info.filename
            if not file_name.startswith(path.lstrip("/")):
                continue

            # zip files can contain explicit or implicit directories
            # hence the need to either add them directly or infer them
            # from the file paths
            if zip_info.is_dir():
                if withdirs:
                    if not result.get(file_name) and _below_max_recursion_depth(
                        file_name
                    ):
                        result[file_name.strip("/")] = (
                            self.info(file_name) if detail else None
                        )
                    continue
                else:
                    continue  # Skip along to the next entry if we don't want to add the dirs

            if not result.get(file_name):
                if _below_max_recursion_depth(file_name):
                    result[file_name] = self.info(file_name) if detail else None

                # Here we handle the case of implicitly adding the
                # directories if they have been requested
                if withdirs:
                    directories = file_name.split("/")
                    for i in range(1, len(directories)):
                        dir_path = "/".join(directories[:i]).strip(
                            "/"
                        )  # remove the trailing slash, as this is not expected
                        if not result.get(dir_path) and _below_max_recursion_depth(
                            dir_path
                        ):
                            result[dir_path] = {
                                "name": dir_path,
                                "size": 0,
                                "type": "directory",
                            }

        return result if detail else sorted(list(result.keys()))

And some tests for this:

class TestCustomZipFileSystem:
    @pytest.fixture(name="zip_file")
    def zip_file_fixture(self, tmp_path):
        data_dir = tmp_path / "data/"
        data_dir.mkdir()
        file1 = data_dir / "file1.txt"
        file1.write_text("Hello, World!")
        file2 = data_dir / "file2.txt"
        file2.write_text("Lorem ipsum dolor sit amet")

        empty_dir = data_dir / "dir1"
        empty_dir.mkdir()

        dir_with_files = data_dir / "dir2"
        dir_with_files.mkdir()
        file3 = dir_with_files / "file3.txt"
        file3.write_text("Hello!")

        zip_file = tmp_path / "test"
        return Path(make_archive(zip_file, "zip", data_dir))

    @pytest.mark.parametrize("detail", [True, False])
    @pytest.mark.parametrize("withdirs", [True, False])
    @pytest.mark.parametrize("max_depth", [None, 1, 2])
    def test_ensure_parity(self, zip_file, detail, withdirs, max_depth):
        custom_zip_file_system = _CustomZipFileSystem(zip_file)
        zip_file_system = ZipFileSystem(zip_file)
        result = custom_zip_file_system.find(
            "/", detail=detail, withdirs=withdirs, max_depth=max_depth
        )
        expected_result = zip_file_system.find(
            "/", detail=detail, withdirs=withdirs, max_depth=max_depth
        )
        assert result
        assert result == expected_result

    def test_find_returns_expected_result_detail_true(self, zip_file):
        custom_zip_file_system = _CustomZipFileSystem(zip_file)
        zip_file_system = ZipFileSystem(zip_file)

        result = custom_zip_file_system.find("/", detail=True)
        expected_result = zip_file_system.find("/", detail=True)

        assert result
        assert result == expected_result

    def test_find_returns_expected_result_detail_false(self, zip_file):
        custom_zip_file_system = _CustomZipFileSystem(zip_file)
        zip_file_system = ZipFileSystem(zip_file)

        result = custom_zip_file_system.find("/", detail=False)
        expected_result = zip_file_system.find("/", detail=False)

        assert result
        assert result == expected_result

    def test_find_returns_expected_result_detail_true_include_dirs(self, zip_file):
        custom_zip_file_system = _CustomZipFileSystem(zip_file)
        zip_file_system = ZipFileSystem(zip_file)

        result = custom_zip_file_system.find("/", detail=True, withdirs=True)
        expected_result = zip_file_system.find("/", detail=True, withdirs=True)

        assert result
        assert result == expected_result

    def test_find_returns_expected_result_detail_false_include_dirs(self, zip_file):
        custom_zip_file_system = _CustomZipFileSystem(zip_file)
        zip_file_system = ZipFileSystem(zip_file)

        result = custom_zip_file_system.find("/", detail=False, withdirs=True)
        expected_result = zip_file_system.find("/", detail=False, withdirs=True)

        assert result
        assert result == expected_result

    def test_find_returns_expected_result_recursion_depth_set(self, zip_file):
        custom_zip_file_system = _CustomZipFileSystem(zip_file)
        zip_file_system = ZipFileSystem(zip_file)

        result = custom_zip_file_system.find("/", maxdepth=1)
        expected_result = zip_file_system.find("/", maxdepth=1)

        assert result
        assert result == expected_result

    def test_find_returns_expected_result_path_set(self, zip_file):
        custom_zip_file_system = _CustomZipFileSystem(zip_file)
        zip_file_system = ZipFileSystem(zip_file)

        result = custom_zip_file_system.find("/dir2")
        expected_result = zip_file_system.find("/dir2")

        assert result
        assert result == expected_result
@martindurant
Copy link
Member

Absolutely, happy to see improved performance that doesn't break anything - please make a PR, and I can review.

@johandahlberg
Copy link
Contributor Author

Super. I'll get to it as soon as I can.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants