PBENCH-1014 Using Tarball.extract in Inventory API for extracting files from tarball (#3105)

* PBENCH-1014

Using Tarball.extract in Inventory API to extract file content from tarball.
riya-17 authored Jun 26, 2023
1 parent fa225e2 commit 999f797
Showing 4 changed files with 185 additions and 119 deletions.
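
The change drops the requirement that a dataset be unpacked on disk before its files can be served: the Inventory API now asks the cache manager for a byte stream read directly out of the tarball. As a usage sketch (not part of the commit), a client could fetch a single file from a dataset like this, assuming a reachable server and a valid resource ID (both hypothetical here):

    import requests  # assumed HTTP client; any would do

    SERVER = "https://pbench.example.com"  # hypothetical server
    DATASET = "0123456789abcdef0123456789abcdef"  # hypothetical resource ID

    # Naming a target streams that one file; omitting the target streams
    # the whole tarball as an attachment (see the API diff below).
    url = f"{SERVER}/api/v1/datasets/{DATASET}/inventory/metadata.log"
    with requests.get(url, stream=True) as resp:
        resp.raise_for_status()
        with open("metadata.log", "wb") as out:
            for chunk in resp.iter_content(chunk_size=8192):
                out.write(chunk)
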
33 changes: 11 additions & 22 deletions lib/pbench/server/api/resources/datasets_inventory.py
@@ -17,7 +17,7 @@
     ParamType,
     Schema,
 )
-from pbench.server.cache_manager import CacheManager, TarballNotFound
+from pbench.server.cache_manager import CacheManager, CacheType, TarballNotFound
 
 
 class DatasetsInventory(ApiBase):
@@ -54,27 +54,18 @@ def _get(
             APIAbort, reporting either "NOT_FOUND" or "UNSUPPORTED_MEDIA_TYPE"
         GET /api/v1/datasets/inventory/{dataset}/{target}
         GET /api/v1/datasets/{dataset}/inventory/{target}
         """
 
         dataset = params.uri["dataset"]
         target = params.uri.get("target")
 
         cache_m = CacheManager(self.config, current_app.logger)
         try:
-            tarball = cache_m.find_dataset(dataset.resource_id)
+            file_info = cache_m.filestream(dataset, target)
         except TarballNotFound as e:
             raise APIAbort(HTTPStatus.NOT_FOUND, str(e))
 
-        if target is None:
-            file_path = tarball.tarball_path
-        else:
-            dataset_location = tarball.unpacked
-            if dataset_location is None:
-                raise APIAbort(HTTPStatus.NOT_FOUND, "The dataset is not unpacked")
-            file_path = dataset_location / target
-
-        if file_path.is_file():
+        if file_info["type"] == CacheType.FILE:
             # Tell send_file to set `Content-Disposition` to "attachment" if
             # targeting the large binary tarball. Otherwise we'll recommend the
             # default "inline": only `is_file()` paths are allowed here, and
@@ -84,17 +75,15 @@ def _get(
             #
             # NOTE: we could choose to be "smarter" based on file size, file
             # type codes (e.g., .csv, .json), and so forth.
-            return send_file(
-                file_path,
+            resp = send_file(
+                file_info["stream"],
                 as_attachment=target is None,
-                download_name=file_path.name,
-            )
-        elif file_path.exists():
-            raise APIAbort(
-                HTTPStatus.UNSUPPORTED_MEDIA_TYPE,
-                "The specified path does not refer to a regular file",
+                download_name=file_info["name"],
             )
+            file_info["stream"].close()
+            return resp
         else:
             raise APIAbort(
-                HTTPStatus.NOT_FOUND, "The specified path does not refer to a file"
+                HTTPStatus.BAD_REQUEST,
+                "The specified path does not refer to a regular file",
             )
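
Since `send_file` is now handed an open binary stream rather than a filesystem path, Flask can no longer derive the download name from the path, which is why the hunk passes `download_name` explicitly and lets `as_attachment` pick the `Content-Disposition`. A minimal standalone sketch of that Flask pattern (the demo route and payload are invented, not pbench code):

    import io

    from flask import Flask, send_file

    app = Flask(__name__)


    @app.route("/demo")
    def demo():
        # Stand-in for the byte stream Tarball.extract() hands back.
        stream = io.BytesIO(b"file contents pulled from a tarball member\n")
        # With a file object instead of a path, Flask cannot infer the
        # filename, so download_name must be supplied; as_attachment=True
        # sets Content-Disposition to "attachment" instead of "inline".
        return send_file(stream, as_attachment=True, download_name="f1.json")

Flask documents that it also closes a file object handed to `send_file` once the response has been served.
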
84 changes: 59 additions & 25 deletions lib/pbench/server/cache_manager.py
@@ -7,7 +7,7 @@
 import shutil
 import subprocess
 import tarfile
-from typing import Optional, Union
+from typing import IO, Optional, Union
 
 from pbench.common import MetadataLog, selinux
 from pbench.server import JSONOBJECT, PbenchServerConfig
@@ -373,7 +373,6 @@ def traverse_cmap(path: Path, cachemap: dict) -> dict[str, dict]:
                 raise BadDirpath(
                     f"Found a file {file_l!r} where a directory was expected in path {str(path)!r}"
                 )
-
             return f_entries[path.name]
         except KeyError as exc:
             raise BadDirpath(
@@ -430,29 +429,57 @@ def get_info(self, path: Path) -> JSONOBJECT:
         return fd_info
 
     @staticmethod
-    def extract(tarball_path: Path, path: str) -> Optional[str]:
-        """Extract a file from the tarball and return it as a string
-
-        Report failures by raising exceptions.
-
-        Args:
-            path: relative path within the tarball of a file
-
-        Raises:
-            MetadataError on failure opening the tarball
-            TarballUnpackError on failure to extract the named path
-
-        Returns:
-            The named file as a string
-        """
-        try:
-            tar = tarfile.open(tarball_path, "r:*")
-        except Exception as exc:
-            raise MetadataError(tarball_path, exc) from exc
-        try:
-            return tar.extractfile(path).read().decode()
-        except Exception as exc:
-            raise TarballUnpackError(tarball_path, f"Unable to extract {path}") from exc
+    def extract(tarball_path: Path, path: Path) -> Optional[IO[bytes]]:
+        """Returns the file stream which yields the contents of
+        a file at the specified path in the Tarball
+
+        Args:
+            tarball_path: absolute path of the tarball
+            path: relative path within the tarball
+
+        Raise:
+            BadDirpath on failure extracting the file from tarball
+        """
+        try:
+            return tarfile.open(tarball_path, "r:*").extractfile(path)
+        except Exception as exc:
+            raise BadDirpath(
+                f"A problem occurred processing {str(path)!r} from {str(tarball_path)!r}: {exc}"
+            )
+
+    def filestream(self, path: str) -> Optional[dict]:
+        """Returns a dictionary containing information about the target
+        file and file stream
+
+        Args:
+            path: relative path within the tarball of a file
+
+        Raises:
+            MetadataError on failure opening the tarball
+            TarballUnpackError on failure to extract the named path
+
+        Returns:
+            Dictionary with file info and file stream
+        """
+        if not path:
+            info = {
+                "name": self.name,
+                "type": CacheType.FILE,
+                "stream": self.tarball_path.open("rb"),
+            }
+        else:
+            file_path = Path(self.name) / path
+            info = self.get_info(file_path)
+            if info["type"] == CacheType.FILE:
+                try:
+                    info["stream"] = Tarball.extract(self.tarball_path, file_path)
+                except Exception as exc:
+                    raise TarballUnpackError(
+                        self.tarball_path, f"Unable to extract {str(file_path)!r}"
+                    ) from exc
+            else:
+                info["stream"] = None
+
+        return info
 
     @staticmethod
     def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]:
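
The rewritten `extract` returns the lazy file object from `tarfile.extractfile` instead of decoding the whole member into a string, so a member's bytes can be read incrementally without unpacking anything to disk. A self-contained sketch of that `tarfile` behavior (throwaway archive built in a temp directory):

    import io
    import tarfile
    import tempfile
    from pathlib import Path

    # Build a tiny tarball with one member (demo data only).
    tmp = Path(tempfile.mkdtemp())
    tarball_path = tmp / "demo.tar.xz"
    with tarfile.open(tarball_path, "w:xz") as tar:
        data = b"line one\nline two\n"
        info = tarfile.TarInfo("demo/metadata.log")
        info.size = len(data)
        tar.addfile(info, io.BytesIO(data))

    # extractfile() returns a readable stream positioned at the member's
    # data; the archive itself is never unpacked to disk.
    with tarfile.open(tarball_path, "r:*") as tar:
        member = tar.extractfile("demo/metadata.log")
        for chunk in iter(lambda: member.read(8), b""):
            print(chunk)

One subtlety: the stream from `extractfile` is only readable while the owning `TarFile` stays open, which appears to be why the new `extract` leaves the archive handle open and lets the caller close the stream when done.
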
@@ -465,15 +492,13 @@ def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]:
         name = Dataset.stem(tarball_path)
         try:
             data = Tarball.extract(tarball_path, f"{name}/metadata.log")
-        except TarballUnpackError:
-            data = None
-        if data:
-            metadata = MetadataLog()
-            metadata.read_string(data)
-            metadata = {s: dict(metadata.items(s)) for s in metadata.sections()}
-            return metadata
-        else:
+        except BadDirpath:
             return None
+        else:
+            metadata_log = MetadataLog()
+            metadata_log.read_file(e.decode() for e in data)
+            metadata = {s: dict(metadata_log.items(s)) for s in metadata_log.sections()}
+            return metadata
 
     @staticmethod
     def subprocess_run(
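
The rewritten `_get_metadata` treats a missing or unreadable metadata.log as a `BadDirpath` from `extract`, and otherwise feeds decoded lines from the byte stream to `MetadataLog.read_file`. Judging by the `sections()`/`items()` calls, `MetadataLog` reads INI-style content; a standalone sketch of the same line-decoding pattern using `configparser` directly (the INI content is invented):

    import configparser
    import io

    # Stand-in for the byte stream Tarball.extract() returns.
    stream = io.BytesIO(
        b"[pbench]\ndate = 2002-05-16T00:00:00\n[run]\ncontroller = node-a\n"
    )

    parser = configparser.ConfigParser()
    # read_file() accepts any iterable of text lines; decoding each byte
    # line mirrors `metadata_log.read_file(e.decode() for e in data)`.
    parser.read_file(line.decode() for line in stream)

    metadata = {s: dict(parser.items(s)) for s in parser.sections()}
    print(metadata)
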
@@ -946,6 +971,13 @@ def create(self, tarfile: Path) -> Tarball:
             controller: associated controller name
             tarfile: dataset tarball path
 
+        Raises
+            BadDirpath: Failure on extracting the file from tarball
+            MetadataError: Failure on getting metadata from metadata.log file
+                in the tarball
+            BadFilename: A bad tarball path was given
+            DuplicateTarball: A duplicate tarball name was detected
+
         Returns
             Tarball object
         """
@@ -955,8 +987,6 @@ def create(self, tarfile: Path) -> Tarball:
                 controller_name = metadata["run"]["controller"]
             else:
                 controller_name = "unknown"
-        except MetadataError:
-            raise
         except Exception as exc:
             raise MetadataError(tarfile, exc)
 
@@ -1005,6 +1035,10 @@ def get_info(self, dataset_id: str, path: Path) -> dict:
         tmap = tarball.get_info(path)
         return tmap
 
+    def filestream(self, dataset, target):
+        tarball = self.find_dataset(dataset.resource_id)
+        return tarball.filestream(target)
+
     def uncache(self, dataset_id: str):
         """Remove the unpacked tarball tree.
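
`CacheManager.filestream` is a thin delegation: it resolves the dataset's `Tarball` and forwards the target path. The returned dict transfers ownership of the open stream to the caller, which must close it (directories come back with `stream` set to None). A toy sketch of that caller contract, with a stub standing in for the real cache manager:

    import io


    def filestream_stub(target: str) -> dict:
        # Stand-in for CacheManager.filestream(dataset, target); the real
        # call returns {"name": ..., "type": CacheType.FILE, "stream": ...}.
        return {"name": target, "type": "FILE", "stream": io.BytesIO(b"{}\n")}


    info = filestream_stub("f1.json")
    try:
        body = info["stream"].read() if info["stream"] else None
    finally:
        # The caller owns the stream, mirroring the API code's close().
        if info["stream"] is not None:
            info["stream"].close()
    print(info["name"], body)
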
76 changes: 58 additions & 18 deletions lib/pbench/test/unit/server/test_cache_manager.py
@@ -1,4 +1,5 @@
 import hashlib
+import io
 from logging import Logger
 import os
 from pathlib import Path
@@ -132,12 +133,6 @@ def test_metadata(
     ):
         """Test behavior with metadata.log access errors."""
 
-        def fake_extract_file(self, path):
-            raise KeyError(f"filename {path} not found")
-
-        def fake_tarfile_open(self, path):
-            raise tarfile.TarError("Invalid Tarfile")
-
         def fake_metadata(tar_path):
             return {"pbench": {"date": "2002-05-16T00:00:00"}}
 
@@ -155,13 +150,6 @@ def fake_metadata_controller(tar_path):
         source_tarball, source_md5, md5 = tarball
         cm = CacheManager(server_config, make_logger)
 
-        expected_metaerror = f"A problem occurred processing metadata.log from {source_tarball!s}: 'Invalid Tarfile'"
-        with monkeypatch.context() as m:
-            m.setattr(tarfile, "open", fake_tarfile_open)
-            with pytest.raises(MetadataError) as exc:
-                cm.create(source_tarball)
-            assert str(exc.value) == expected_metaerror
-
         expected_metaerror = f"A problem occurred processing metadata.log from {source_tarball!s}: \"'run'\""
         with monkeypatch.context() as m:
             m.setattr(Tarball, "_get_metadata", fake_metadata)
@@ -183,11 +171,6 @@ def fake_metadata_controller(tar_path):
                 cm.create(source_tarball)
             assert str(exc.value) == expected_metaerror
 
-        with monkeypatch.context() as m:
-            m.setattr(tarfile.TarFile, "extractfile", fake_extract_file)
-            cm.create(source_tarball)
-            assert cm[md5].metadata is None
-
     def test_with_metadata(
         self, monkeypatch, selinux_disabled, server_config, make_logger, tarball
     ):
@@ -367,6 +350,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
                 f11.txt
             f12_sym -> ../../..
         f1.json
+        metadata.log
 
     Generated cache map
@@ -376,6 +360,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
             'details': <cache_manager.FileInfo object>,
             'children': {
                 'f1.json': {'details': <cache_manager.FileInfo object>},
+                'metadata.log': {'details': <cache_manager.FileInfo object>},
                 'subdir1': {
                     'details': <cache_manager.FileInfo object>,
                     'children': {
@@ -412,6 +397,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
         sub_dir = tmp_path / dir_name
         sub_dir.mkdir(parents=True, exist_ok=True)
         (sub_dir / "f1.json").touch()
+        (sub_dir / "metadata.log").touch()
         for i in range(1, 4):
             (sub_dir / "subdir1" / f"subdir1{i}").mkdir(parents=True, exist_ok=True)
             (sub_dir / "subdir1" / "f11.txt").touch()
@@ -891,6 +877,60 @@ def test_cache_map_get_info_cmap(
         file_info = tb.get_info(Path(file_path))
         assert file_info == expected_msg
 
+    @pytest.mark.parametrize(
+        "file_path, exp_file_type, exp_stream",
+        [
+            ("", CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")),
+            (None, CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")),
+            ("f1.json", CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")),
+            ("subdir1/subdir12", CacheType.DIRECTORY, None),
+        ],
+    )
+    def test_filestream(
+        self, monkeypatch, tmp_path, file_path, exp_file_type, exp_stream
+    ):
+        """Test to extract file contents/stream from a file"""
+        tar = Path("/mock/dir_name.tar.xz")
+        cache = Path("/mock/.cache")
+
+        with monkeypatch.context() as m:
+            m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__)
+            m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__)
+            m.setattr(Tarball, "extract", lambda _t, _p: exp_stream)
+            m.setattr(Path, "open", lambda _s, _m="rb": exp_stream)
+            tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None))
+            tar_dir = TestCacheManager.MockController.generate_test_result_tree(
+                tmp_path, "dir_name"
+            )
+            tb.cache_map(tar_dir)
+            file_info = tb.filestream(file_path)
+            assert file_info["type"] == exp_file_type
+            assert file_info["stream"] == exp_stream
+
+    def test_filestream_tarfile_open(self, monkeypatch, tmp_path):
+        """Test to check non-existent file or tarfile unpack issue"""
+        tar = Path("/mock/dir_name.tar.xz")
+        cache = Path("/mock/.cache")
+
+        def fake_tarfile_open(self, path):
+            raise tarfile.TarError("Invalid Tarfile")
+
+        with monkeypatch.context() as m:
+            m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__)
+            m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__)
+            m.setattr(tarfile, "open", fake_tarfile_open)
+            tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None))
+            tar_dir = TestCacheManager.MockController.generate_test_result_tree(
+                tmp_path, "dir_name"
+            )
+            tb.cache_map(tar_dir)
+
+            path = Path(tb.name) / "subdir1/f11.txt"
+            expected_error_msg = f"An error occurred while unpacking {tb.tarball_path}: Unable to extract {str(path)!r}"
+            with pytest.raises(TarballUnpackError) as exc:
+                tb.filestream("subdir1/f11.txt")
+            assert str(exc.value) == expected_error_msg
+
     def test_find(
         self, selinux_enabled, server_config, make_logger, tarball, monkeypatch
     ):
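
The new tests never touch a real archive: `monkeypatch` swaps `Tarball.extract` and `Path.open` for lambdas that return a canned stream, so `filestream` can be exercised against the synthetic result tree. A self-contained illustration of that substitution pattern (toy class, not the pbench fixtures; run under pytest, whose built-in monkeypatch fixture is injected by name):

    import io


    class Reader:
        """Toy stand-in for Tarball: normally reads a real archive."""

        def extract(self, path: str):
            raise FileNotFoundError(path)  # the "real" behavior we avoid


    def test_extract_is_stubbed(monkeypatch):
        # Replace the method on the class, as the diff does with
        # m.setattr(Tarball, "extract", lambda _t, _p: exp_stream).
        stream = io.BytesIO(b"stubbed contents")
        monkeypatch.setattr(Reader, "extract", lambda _self, _path: stream)
        assert Reader().extract("any/path").read() == b"stubbed contents"
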