From 999f797834157d72a6a5d1f8d6ccacd124087e9a Mon Sep 17 00:00:00 2001
From: Riya
Date: Mon, 26 Jun 2023 23:35:11 +0530
Subject: [PATCH] PBENCH-1014 Using Tarball.extract in Inventory API for
 extracting files from tarball (#3105)

* PBENCH-1014 Using Tarball.extract in Inventory API to extract file content
  from tarball.
---
 .../api/resources/datasets_inventory.py    |  35 ++--
 lib/pbench/server/cache_manager.py         |  85 +++++++++----
 .../test/unit/server/test_cache_manager.py |  76 +++++++++---
 .../unit/server/test_datasets_inventory.py | 111 +++++++++---------
 4 files changed, 187 insertions(+), 120 deletions(-)

diff --git a/lib/pbench/server/api/resources/datasets_inventory.py b/lib/pbench/server/api/resources/datasets_inventory.py
index 16aa72ab7d..ca8ce93605 100644
--- a/lib/pbench/server/api/resources/datasets_inventory.py
+++ b/lib/pbench/server/api/resources/datasets_inventory.py
@@ -17,7 +17,7 @@
     ParamType,
     Schema,
 )
-from pbench.server.cache_manager import CacheManager, TarballNotFound
+from pbench.server.cache_manager import CacheManager, CacheType, TarballNotFound
 
 
 class DatasetsInventory(ApiBase):
@@ -54,27 +54,18 @@ def _get(
-        APIAbort, reporting either "NOT_FOUND" or "UNSUPPORTED_MEDIA_TYPE"
+        APIAbort, reporting either "NOT_FOUND" or "BAD_REQUEST"
 
-        GET /api/v1/datasets/inventory/{dataset}/{target}
+        GET /api/v1/datasets/{dataset}/inventory/{target}
         """
-
         dataset = params.uri["dataset"]
         target = params.uri.get("target")
 
         cache_m = CacheManager(self.config, current_app.logger)
         try:
-            tarball = cache_m.find_dataset(dataset.resource_id)
+            file_info = cache_m.filestream(dataset, target)
         except TarballNotFound as e:
             raise APIAbort(HTTPStatus.NOT_FOUND, str(e))
-        if target is None:
-            file_path = tarball.tarball_path
-        else:
-            dataset_location = tarball.unpacked
-            if dataset_location is None:
-                raise APIAbort(HTTPStatus.NOT_FOUND, "The dataset is not unpacked")
-            file_path = dataset_location / target
-
-        if file_path.is_file():
+        if file_info["type"] == CacheType.FILE:
             # Tell send_file to set `Content-Disposition` to "attachment" if
             # targeting the large binary tarball. Otherwise we'll recommend the
             # default "inline": only `is_file()` paths are allowed here, and
             #
             # NOTE: we could choose to be "smarter" based on file size, file
             # type codes (e.g., .csv, .json), and so forth.
-            return send_file(
-                file_path,
+            resp = send_file(
+                file_info["stream"],
                 as_attachment=target is None,
-                download_name=file_path.name,
-            )
-        elif file_path.exists():
-            raise APIAbort(
-                HTTPStatus.UNSUPPORTED_MEDIA_TYPE,
-                "The specified path does not refer to a regular file",
+                download_name=file_info["name"],
             )
+            file_info["stream"].close()
+            return resp
         else:
             raise APIAbort(
-                HTTPStatus.NOT_FOUND, "The specified path does not refer to a file"
+                HTTPStatus.BAD_REQUEST,
+                "The specified path does not refer to a regular file",
             )

diff --git a/lib/pbench/server/cache_manager.py b/lib/pbench/server/cache_manager.py
index d594a49efd..21b52f5ea0 100644
--- a/lib/pbench/server/cache_manager.py
+++ b/lib/pbench/server/cache_manager.py
@@ -7,7 +7,7 @@
 import shutil
 import subprocess
 import tarfile
-from typing import Optional, Union
+from typing import IO, Optional, Union
 
 from pbench.common import MetadataLog, selinux
 from pbench.server import JSONOBJECT, PbenchServerConfig
@@ -373,7 +373,6 @@ def traverse_cmap(path: Path, cachemap: dict) -> dict[str, dict]:
                 raise BadDirpath(
                     f"Found a file {file_l!r} where a directory was expected in path {str(path)!r}"
                 )
-
             return f_entries[path.name]
         except KeyError as exc:
             raise BadDirpath(
@@ -430,29 +429,57 @@ def get_info(self, path: Path) -> JSONOBJECT:
         return fd_info
 
     @staticmethod
-    def extract(tarball_path: Path, path: str) -> Optional[str]:
-        """Extract a file from the tarball and return it as a string
+    def extract(tarball_path: Path, path: Path) -> Optional[IO[bytes]]:
+        """Returns a file stream that yields the contents of the file
+        at the specified path within the tarball
+
+        Args:
+            tarball_path: absolute path of the tarball
+            path: relative path within the tarball
+
+        Raises:
+            BadDirpath on failure extracting the file from the tarball
+        """
+        try:
+            return tarfile.open(tarball_path, "r:*").extractfile(path)
+        except Exception as exc:
+            raise BadDirpath(
+                f"A problem occurred processing {str(path)!r} from {str(tarball_path)!r}: {exc}"
+            )
 
-        Report failures by raising exceptions.
+    def filestream(self, path: str) -> Optional[dict]:
+        """Returns a dictionary containing information about the target
+        file along with a byte stream of its contents
 
         Args:
             path: relative path within the tarball of a file
 
         Raises:
-            MetadataError on failure opening the tarball
             TarballUnpackError on failure to extract the named path
 
         Returns:
-            The named file as a string
+            Dictionary with the file info and its byte stream
         """
-        try:
-            tar = tarfile.open(tarball_path, "r:*")
-        except Exception as exc:
-            raise MetadataError(tarball_path, exc) from exc
-        try:
-            return tar.extractfile(path).read().decode()
-        except Exception as exc:
-            raise TarballUnpackError(tarball_path, f"Unable to extract {path}") from exc
+        if not path:
+            info = {
+                "name": self.name,
+                "type": CacheType.FILE,
+                "stream": self.tarball_path.open("rb"),
+            }
+        else:
+            file_path = Path(self.name) / path
+            info = self.get_info(file_path)
+            if info["type"] == CacheType.FILE:
+                try:
+                    info["stream"] = Tarball.extract(self.tarball_path, file_path)
+                except Exception as exc:
+                    raise TarballUnpackError(
+                        self.tarball_path, f"Unable to extract {str(file_path)!r}"
+                    ) from exc
+            else:
+                info["stream"] = None
+
+        return info
 
     @staticmethod
     def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]:
@@ -465,15 +492,13 @@ def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]:
         name = Dataset.stem(tarball_path)
         try:
             data = Tarball.extract(tarball_path, f"{name}/metadata.log")
-        except TarballUnpackError:
-            data = None
-        if data:
-            metadata = MetadataLog()
-            metadata.read_string(data)
-            metadata = {s: dict(metadata.items(s)) for s in metadata.sections()}
-            return metadata
-        else:
+        except BadDirpath:
             return None
+        else:
+            metadata_log = MetadataLog()
+            metadata_log.read_file(e.decode() for e in data)
+            metadata = {s: dict(metadata_log.items(s)) for s in metadata_log.sections()}
+            return metadata
 
     @staticmethod
     def subprocess_run(
@@ -946,6 +971,13 @@ def create(self, tarfile: Path) -> Tarball:
             controller: associated controller name
             tarfile: dataset tarball path
 
+        Raises
+            BadDirpath: Failure extracting the file from the tarball
+            MetadataError: Failure reading metadata from the metadata.log
+                file in the tarball
+            BadFilename: A bad tarball path was given
+            DuplicateTarball: A duplicate tarball name was detected
+
         Returns
             Tarball object
         """
@@ -955,8 +987,6 @@ def create(self, tarfile: Path) -> Tarball:
                 controller_name = metadata["run"]["controller"]
             else:
                 controller_name = "unknown"
-        except MetadataError:
-            raise
         except Exception as exc:
             raise MetadataError(tarfile, exc)
 
@@ -1005,6 +1035,11 @@ def get_info(self, dataset_id: str, path: Path) -> dict:
         tmap = tarball.get_info(path)
         return tmap
 
+    def filestream(self, dataset: Dataset, target: Optional[str]) -> dict:
+        """Return the file info and byte stream for a target in the dataset."""
+        tarball = self.find_dataset(dataset.resource_id)
+        return tarball.filestream(target)
+
     def uncache(self, dataset_id: str):
         """Remove the unpacked tarball tree.
diff --git a/lib/pbench/test/unit/server/test_cache_manager.py b/lib/pbench/test/unit/server/test_cache_manager.py index 93333e1ea6..5f4f065da3 100644 --- a/lib/pbench/test/unit/server/test_cache_manager.py +++ b/lib/pbench/test/unit/server/test_cache_manager.py @@ -1,4 +1,5 @@ import hashlib +import io from logging import Logger import os from pathlib import Path @@ -132,12 +133,6 @@ def test_metadata( ): """Test behavior with metadata.log access errors.""" - def fake_extract_file(self, path): - raise KeyError(f"filename {path} not found") - - def fake_tarfile_open(self, path): - raise tarfile.TarError("Invalid Tarfile") - def fake_metadata(tar_path): return {"pbench": {"date": "2002-05-16T00:00:00"}} @@ -155,13 +150,6 @@ def fake_metadata_controller(tar_path): source_tarball, source_md5, md5 = tarball cm = CacheManager(server_config, make_logger) - expected_metaerror = f"A problem occurred processing metadata.log from {source_tarball!s}: 'Invalid Tarfile'" - with monkeypatch.context() as m: - m.setattr(tarfile, "open", fake_tarfile_open) - with pytest.raises(MetadataError) as exc: - cm.create(source_tarball) - assert str(exc.value) == expected_metaerror - expected_metaerror = f"A problem occurred processing metadata.log from {source_tarball!s}: \"'run'\"" with monkeypatch.context() as m: m.setattr(Tarball, "_get_metadata", fake_metadata) @@ -183,11 +171,6 @@ def fake_metadata_controller(tar_path): cm.create(source_tarball) assert str(exc.value) == expected_metaerror - with monkeypatch.context() as m: - m.setattr(tarfile.TarFile, "extractfile", fake_extract_file) - cm.create(source_tarball) - assert cm[md5].metadata is None - def test_with_metadata( self, monkeypatch, selinux_disabled, server_config, make_logger, tarball ): @@ -367,6 +350,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path: f11.txt f12_sym -> ../../.. 
             f1.json
+            metadata.log
 
         Generated cache map
 
         'dir_name': {
             'details': ,
             'children': {
                 'f1.json': {'details': },
+                'metadata.log': {'details': },
                 'subdir1': {
                     'details': ,
                     'children': {
@@ -412,6 +397,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
         sub_dir = tmp_path / dir_name
         sub_dir.mkdir(parents=True, exist_ok=True)
         (sub_dir / "f1.json").touch()
+        (sub_dir / "metadata.log").touch()
         for i in range(1, 4):
             (sub_dir / "subdir1" / f"subdir1{i}").mkdir(parents=True, exist_ok=True)
         (sub_dir / "subdir1" / "f11.txt").touch()
@@ -891,6 +877,60 @@ def test_cache_map_get_info_cmap(
             file_info = tb.get_info(Path(file_path))
             assert file_info == expected_msg
 
+    @pytest.mark.parametrize(
+        "file_path, exp_file_type, exp_stream",
+        [
+            ("", CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")),
+            (None, CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")),
+            ("f1.json", CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")),
+            ("subdir1/subdir12", CacheType.DIRECTORY, None),
+        ],
+    )
+    def test_filestream(
+        self, monkeypatch, tmp_path, file_path, exp_file_type, exp_stream
+    ):
+        """Test extracting a file's contents as a byte stream"""
+        tar = Path("/mock/dir_name.tar.xz")
+        cache = Path("/mock/.cache")
+
+        with monkeypatch.context() as m:
+            m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__)
+            m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__)
+            m.setattr(Tarball, "extract", lambda _t, _p: exp_stream)
+            m.setattr(Path, "open", lambda _s, _m="rb": exp_stream)
+            tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None))
+            tar_dir = TestCacheManager.MockController.generate_test_result_tree(
+                tmp_path, "dir_name"
+            )
+            tb.cache_map(tar_dir)
+            file_info = tb.filestream(file_path)
+            assert file_info["type"] == exp_file_type
+            assert file_info["stream"] == exp_stream
+
+    def test_filestream_tarfile_open(self, monkeypatch, tmp_path):
+        """Test handling of a non-existent file or a tarfile unpacking failure"""
+        tar = Path("/mock/dir_name.tar.xz")
+        cache = Path("/mock/.cache")
+
+        def fake_tarfile_open(self, path):
+            raise tarfile.TarError("Invalid Tarfile")
+
+        with monkeypatch.context() as m:
+            m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__)
+            m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__)
+            m.setattr(tarfile, "open", fake_tarfile_open)
+            tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None))
+            tar_dir = TestCacheManager.MockController.generate_test_result_tree(
+                tmp_path, "dir_name"
+            )
+            tb.cache_map(tar_dir)
+
+            path = Path(tb.name) / "subdir1/f11.txt"
+            expected_error_msg = f"An error occurred while unpacking {tb.tarball_path}: Unable to extract {str(path)!r}"
+            with pytest.raises(TarballUnpackError) as exc:
+                tb.filestream("subdir1/f11.txt")
+            assert str(exc.value) == expected_error_msg
+
     def test_find(
         self, selinux_enabled, server_config, make_logger, tarball, monkeypatch
     ):
diff --git a/lib/pbench/test/unit/server/test_datasets_inventory.py b/lib/pbench/test/unit/server/test_datasets_inventory.py
index 8f046b2c0d..6ee6f7e688 100644
--- a/lib/pbench/test/unit/server/test_datasets_inventory.py
+++ b/lib/pbench/test/unit/server/test_datasets_inventory.py
@@ -1,11 +1,12 @@
 from http import HTTPStatus
+import io
 from pathlib import Path
 from typing import Any, Optional
 
 import pytest
 import requests
 
-from pbench.server.cache_manager import CacheManager
+from pbench.server.cache_manager import CacheManager, CacheType, Controller
 from pbench.server.database.models.datasets import Dataset, DatasetNotFound
 
 
@@ -31,9 +32,8 @@ def query_api(
             except DatasetNotFound:
                 dataset_id = dataset  # Allow passing deliberately bad value
             headers = {"authorization": f"bearer {pbench_drb_token}"}
-            k = "" if target is None else f"/{target}"
             response = client.get(
-                f"{server_config.rest_uri}/datasets/{dataset_id}/inventory{k}",
+                f"{server_config.rest_uri}/datasets/{dataset_id}/inventory/{target}",
                 headers=headers,
             )
             assert response.status_code == expected_status
@@ -41,14 +41,24 @@ def query_api(
 
         return query_api
 
-    def mock_find_dataset(self, dataset):
-        class Tarball(object):
-            unpacked = Path("/dataset/")
-            tarball_path = Path("/dataset/tarball.tar.xz")
-
+    class MockTarball:
+        def __init__(self, path: Path, controller: Controller):
+            self.name = "dir_name.tar.xz"
+            self.tarball_path = path
+            self.unpacked = None
+
+        def filestream(self, target):
+            info = {
+                "name": "f1.json",
+                "type": CacheType.FILE,
+                "stream": io.BytesIO(b"file_as_a_byte_stream"),
+            }
+            return info
+
+    def mock_find_dataset(self, dataset) -> MockTarball:
         # Validate the resource_id
         Dataset.query(resource_id=dataset)
-        return Tarball
+        return self.MockTarball(Path("/mock/dir_name.tar.xz"), "abc")
 
     def test_get_no_dataset(self, query_get_as):
         response = query_get_as(
@@ -68,80 +78,73 @@ def test_unauthorized_access(self, query_get_as):
             "message": "User drb is not authorized to READ a resource owned by test with private access"
         }
 
-    def test_dataset_is_not_unpacked(self, query_get_as, monkeypatch):
-        def mock_find_not_unpacked(self, dataset):
-            class Tarball(object):
-                unpacked = None
-
-            # Validate the resource_id
-            Dataset.query(resource_id=dataset)
-            return Tarball
-
-        monkeypatch.setattr(CacheManager, "find_dataset", mock_find_not_unpacked)
-
-        response = query_get_as("fio_2", "1-default", HTTPStatus.NOT_FOUND)
-        assert response.json == {"message": "The dataset is not unpacked"}
-
-    def test_path_is_directory(self, query_get_as, monkeypatch):
+    @pytest.mark.parametrize("key", (None, "", "subdir1"))
+    def test_path_is_directory(self, query_get_as, monkeypatch, key):
+        filestream_dict = {"type": CacheType.DIRECTORY, "stream": None}
         monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
+        monkeypatch.setattr(
+            self.MockTarball, "filestream", lambda _s, _t: filestream_dict
+        )
         monkeypatch.setattr(Path, "is_file", lambda self: False)
         monkeypatch.setattr(Path, "exists", lambda self: True)
 
-        response = query_get_as("fio_2", "1-default", HTTPStatus.UNSUPPORTED_MEDIA_TYPE)
+        response = query_get_as("fio_2", key, HTTPStatus.BAD_REQUEST)
         assert response.json == {
             "message": "The specified path does not refer to a regular file"
         }
 
     def test_not_a_file(self, query_get_as, monkeypatch):
+        filestream_dict = {"type": CacheType.SYMLINK, "stream": None}
         monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
+        monkeypatch.setattr(
+            self.MockTarball, "filestream", lambda _s, _t: filestream_dict
+        )
         monkeypatch.setattr(Path, "is_file", lambda self: False)
         monkeypatch.setattr(Path, "exists", lambda self: False)
 
-        response = query_get_as("fio_2", "1-default", HTTPStatus.NOT_FOUND)
+        response = query_get_as("fio_2", "subdir1/f1_sym", HTTPStatus.BAD_REQUEST)
         assert response.json == {
-            "message": "The specified path does not refer to a file"
+            "message": "The specified path does not refer to a regular file"
         }
 
     def test_dataset_in_given_path(self, query_get_as, monkeypatch):
-        mock_args: Optional[tuple[Path, dict[str, Any]]] = None
-
-        def mock_send_file(path_or_file, *args, **kwargs):
-            nonlocal mock_args
-            mock_args = (path_or_file, kwargs)
-            return {"status": "OK"}
-
-        monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
-        monkeypatch.setattr(Path, "is_file", lambda self: True)
-        monkeypatch.setattr(
-            "pbench.server.api.resources.datasets_inventory.send_file", mock_send_file
-        )
+        mock_close = False
 
-        response = query_get_as("fio_2", "1-default/default.csv", HTTPStatus.OK)
-        assert response.status_code == HTTPStatus.OK
+        class MockBytesIO(io.BytesIO):
+            def close(self):
+                nonlocal mock_close
+                mock_close = True
+                super().close()
 
-        path, args = mock_args
-        assert str(path) == "/dataset/1-default/default.csv"
-        assert args["as_attachment"] is False
-        assert args["download_name"] == "default.csv"
-
-    @pytest.mark.parametrize("key", (None, ""))
-    def test_get_result_tarball(self, query_get_as, monkeypatch, key):
         mock_args: Optional[tuple[Path, dict[str, Any]]] = None
+        exp_stream = MockBytesIO(b"file_as_a_byte_stream")
+        filestream_dict = {
+            "name": "f1.json",
+            "type": CacheType.FILE,
+            "stream": exp_stream,
+        }
 
         def mock_send_file(path_or_file, *args, **kwargs):
             nonlocal mock_args
+            assert path_or_file == exp_stream
             mock_args = (path_or_file, kwargs)
             return {"status": "OK"}
 
         monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
-        monkeypatch.setattr(Path, "is_file", lambda self: True)
+        monkeypatch.setattr(
+            self.MockTarball, "filestream", lambda _s, _t: filestream_dict
+        )
         monkeypatch.setattr(
             "pbench.server.api.resources.datasets_inventory.send_file", mock_send_file
         )
 
-        response = query_get_as("fio_2", key, HTTPStatus.OK)
+        response = query_get_as("fio_2", "f1.json", HTTPStatus.OK)
         assert response.status_code == HTTPStatus.OK
-        path, args = mock_args
-        assert str(path) == "/dataset/tarball.tar.xz"
-        assert args["as_attachment"] is True
-        assert args["download_name"] == "tarball.tar.xz"
+
+        file_content, args = mock_args
+
+        assert isinstance(file_content, io.BytesIO)
+        assert file_content == exp_stream
+        assert args["as_attachment"] is False
+        assert args["download_name"] == "f1.json"
+        assert mock_close
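
The core of this change is that Tarball.extract now hands back the lazily-read
stream produced by tarfile.extractfile instead of decoding the whole member
into a string. A minimal stand-alone sketch of that pattern (the helper name
`open_member` is invented for illustration and is not part of this patch):

    import tarfile
    from typing import IO, Optional

    def open_member(tarball_path: str, member: str) -> Optional[IO[bytes]]:
        """Return a binary stream for one member without unpacking to disk.

        "r:*" asks tarfile to auto-detect the compression (xz, gz, bz2, ...).
        extractfile() returns None for members that are not regular files or
        links, and raises KeyError for names missing from the archive, which
        is why Tarball.extract wraps the call and re-raises BadDirpath.
        """
        archive = tarfile.open(tarball_path, "r:*")
        return archive.extractfile(member)

In CPython the returned reader keeps the underlying archive file object
referenced, which is what lets the one-liner in Tarball.extract drop its
TarFile handle and still return a readable stream.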
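_get_metadata then feeds that stream straight to the parser: MetadataLog
exposes the ConfigParser read_file/sections/items interface, and read_file
accepts any iterable of text lines, so a generator that decodes the binary
stream line by line avoids materializing the whole file. A stand-alone
approximation using only the standard library (the sample payload is
invented):

    import configparser
    import io

    # Pretend this binary stream came from Tarball.extract(tar, "name/metadata.log").
    stream = io.BytesIO(b"[pbench]\ndate = 2002-05-16T00:00:00\n")

    parser = configparser.ConfigParser()
    # read_file() consumes an iterable of text lines, so decode lazily,
    # line by line, instead of holding a second full copy in memory.
    parser.read_file(line.decode() for line in stream)

    metadata = {s: dict(parser.items(s)) for s in parser.sections()}
    print(metadata)  # {'pbench': {'date': '2002-05-16T00:00:00'}}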
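The dictionary contract that Tarball.filestream returns and the endpoint
consumes has three keys: "name", "type" (a CacheType member), and "stream"
(None for anything but a regular file). A toy version of that contract and the
endpoint's branch on it (the CacheType values are re-declared here only to
keep the sketch self-contained; the real enum lives in cache_manager.py):

    from enum import Enum, auto
    import io
    from typing import IO, Optional, TypedDict

    class CacheType(Enum):
        FILE = auto()
        DIRECTORY = auto()
        SYMLINK = auto()

    class FileInfo(TypedDict):
        name: str
        type: CacheType
        stream: Optional[IO[bytes]]

    def read_target(info: FileInfo) -> bytes:
        # Mirrors the endpoint: anything that is not a regular file is a
        # client error, now reported as 400 BAD_REQUEST rather than 404.
        if info["type"] is not CacheType.FILE:
            raise ValueError("The specified path does not refer to a regular file")
        assert info["stream"] is not None  # guaranteed for FILE by the contract
        return info["stream"].read()

    info: FileInfo = {
        "name": "f1.json",
        "type": CacheType.FILE,
        "stream": io.BytesIO(b"file_as_a_byte_stream"),
    }
    print(read_target(info))  # b'file_as_a_byte_stream'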
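Finally, the endpoint change leans on the fact that flask.send_file accepts an
open binary file object, not just a filesystem path. A hedged usage sketch
(the /inventory-demo route and payload are invented; as_attachment and
download_name are the same Flask parameters the patch passes):

    import io

    from flask import Flask, send_file

    app = Flask(__name__)

    @app.route("/inventory-demo")
    def inventory_demo():
        # Any binary file-like object positioned at offset 0 works here; in
        # the patch it is the stream returned by CacheManager.filestream().
        stream = io.BytesIO(b"file_as_a_byte_stream")
        return send_file(
            stream,
            as_attachment=False,      # "inline", as for named targets above
            download_name="f1.json",  # also lets Flask guess the MIME type
        )

The tests pin down one more behavioral detail: the endpoint closes the stream
itself after send_file returns, and test_dataset_in_given_path verifies that
close through its MockBytesIO subclass.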