From 999f797834157d72a6a5d1f8d6ccacd124087e9a Mon Sep 17 00:00:00 2001
From: Riya
Date: Mon, 26 Jun 2023 23:35:11 +0530
Subject: [PATCH] PBENCH-1014 Using Tarball.extract in Inventory API for
 extracting files from tarball (#3105)

* PBENCH-1014 Using Tarball.extract in Inventory API to extract file content
  from tarball.
---
 .../api/resources/datasets_inventory.py    |  35 ++--
 lib/pbench/server/cache_manager.py         |  85 +++++++++----
 .../test/unit/server/test_cache_manager.py |  76 +++++++++---
 .../unit/server/test_datasets_inventory.py | 111 +++++++++---------
 4 files changed, 187 insertions(+), 120 deletions(-)

diff --git a/lib/pbench/server/api/resources/datasets_inventory.py b/lib/pbench/server/api/resources/datasets_inventory.py
index 16aa72ab7d..ca8ce93605 100644
--- a/lib/pbench/server/api/resources/datasets_inventory.py
+++ b/lib/pbench/server/api/resources/datasets_inventory.py
@@ -17,7 +17,7 @@
     ParamType,
     Schema,
 )
-from pbench.server.cache_manager import CacheManager, TarballNotFound
+from pbench.server.cache_manager import CacheManager, CacheType, TarballNotFound
 
 
 class DatasetsInventory(ApiBase):
@@ -54,27 +54,18 @@ def _get(
-        APIAbort, reporting either "NOT_FOUND" or "UNSUPPORTED_MEDIA_TYPE"
+        APIAbort, reporting either "NOT_FOUND" or "BAD_REQUEST"
 
-        GET /api/v1/datasets/inventory/{dataset}/{target}
+        GET /api/v1/datasets/{dataset}/inventory/{target}
         """
-
         dataset = params.uri["dataset"]
         target = params.uri.get("target")
 
         cache_m = CacheManager(self.config, current_app.logger)
         try:
-            tarball = cache_m.find_dataset(dataset.resource_id)
+            file_info = cache_m.filestream(dataset, target)
         except TarballNotFound as e:
             raise APIAbort(HTTPStatus.NOT_FOUND, str(e))
-        if target is None:
-            file_path = tarball.tarball_path
-        else:
-            dataset_location = tarball.unpacked
-            if dataset_location is None:
-                raise APIAbort(HTTPStatus.NOT_FOUND, "The dataset is not unpacked")
-            file_path = dataset_location / target
-
-        if file_path.is_file():
+        if file_info["type"] == CacheType.FILE:
             # Tell send_file to set `Content-Disposition` to "attachment" if
             # targeting the large binary tarball. Otherwise we'll recommend the
             # default "inline": only `is_file()` paths are allowed here, and
             #
             # NOTE: we could choose to be "smarter" based on file size, file
             # type codes (e.g., .csv, .json), and so forth.
-            return send_file(
-                file_path,
+            resp = send_file(
+                file_info["stream"],
                 as_attachment=target is None,
-                download_name=file_path.name,
-            )
-        elif file_path.exists():
-            raise APIAbort(
-                HTTPStatus.UNSUPPORTED_MEDIA_TYPE,
-                "The specified path does not refer to a regular file",
+                download_name=file_info["name"],
             )
+            file_info["stream"].close()
+            return resp
         else:
             raise APIAbort(
-                HTTPStatus.NOT_FOUND, "The specified path does not refer to a file"
+                HTTPStatus.BAD_REQUEST,
+                "The specified path does not refer to a regular file",
             )

diff --git a/lib/pbench/server/cache_manager.py b/lib/pbench/server/cache_manager.py
index d594a49efd..21b52f5ea0 100644
--- a/lib/pbench/server/cache_manager.py
+++ b/lib/pbench/server/cache_manager.py
@@ -7,7 +7,7 @@
 import shutil
 import subprocess
 import tarfile
-from typing import Optional, Union
+from typing import IO, Optional, Union
 
 from pbench.common import MetadataLog, selinux
 from pbench.server import JSONOBJECT, PbenchServerConfig
@@ -373,7 +373,6 @@ def traverse_cmap(path: Path, cachemap: dict) -> dict[str, dict]:
                 raise BadDirpath(
                     f"Found a file {file_l!r} where a directory was expected in path {str(path)!r}"
                 )
-
             return f_entries[path.name]
         except KeyError as exc:
             raise BadDirpath(
@@ -430,29 +429,57 @@ def get_info(self, path: Path) -> JSONOBJECT:
         return fd_info
 
     @staticmethod
-    def extract(tarball_path: Path, path: str) -> Optional[str]:
-        """Extract a file from the tarball and return it as a string
+    def extract(tarball_path: Path, path: Path) -> Optional[IO[bytes]]:
+        """Returns a file stream that yields the contents of the file
+        at the specified path within the tarball
+
+        Args:
+            tarball_path: absolute path of the tarball
+            path: relative path within the tarball
+
+        Raises:
+            BadDirpath on failure extracting the file from the tarball
+        """
+        try:
+            return tarfile.open(tarball_path, "r:*").extractfile(path)
+        except Exception as exc:
+            raise BadDirpath(
+                f"A problem occurred processing {str(path)!r} from {str(tarball_path)!r}: {exc}"
+            )
 
-        Report failures by raising exceptions.
+    def filestream(self, path: str) -> Optional[dict]:
+        """Returns a dictionary containing information about the target
+        file along with a byte stream of its contents
 
         Args:
             path: relative path within the tarball of a file
 
         Raises:
-            MetadataError on failure opening the tarball
             TarballUnpackError on failure to extract the named path
 
         Returns:
-            The named file as a string
+            Dictionary with the file info and its byte stream
         """
-        try:
-            tar = tarfile.open(tarball_path, "r:*")
-        except Exception as exc:
-            raise MetadataError(tarball_path, exc) from exc
-        try:
-            return tar.extractfile(path).read().decode()
-        except Exception as exc:
-            raise TarballUnpackError(tarball_path, f"Unable to extract {path}") from exc
+        if not path:
+            info = {
+                "name": self.name,
+                "type": CacheType.FILE,
+                "stream": self.tarball_path.open("rb"),
+            }
+        else:
+            file_path = Path(self.name) / path
+            info = self.get_info(file_path)
+            if info["type"] == CacheType.FILE:
+                try:
+                    info["stream"] = Tarball.extract(self.tarball_path, file_path)
+                except Exception as exc:
+                    raise TarballUnpackError(
+                        self.tarball_path, f"Unable to extract {str(file_path)!r}"
+                    ) from exc
+            else:
+                info["stream"] = None
+
+        return info
 
     @staticmethod
     def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]:
@@ -465,15 +492,13 @@ def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]:
         name = Dataset.stem(tarball_path)
         try:
             data = Tarball.extract(tarball_path, f"{name}/metadata.log")
-        except TarballUnpackError:
-            data = None
-        if data:
-            metadata = MetadataLog()
-            metadata.read_string(data)
-            metadata = {s: dict(metadata.items(s)) for s in metadata.sections()}
-            return metadata
-        else:
+        except BadDirpath:
             return None
+        else:
+            metadata_log = MetadataLog()
+            metadata_log.read_file(e.decode() for e in data)
+            metadata = {s: dict(metadata_log.items(s)) for s in metadata_log.sections()}
+            return metadata
 
     @staticmethod
     def subprocess_run(
@@ -946,6 +971,13 @@ def create(self, tarfile: Path) -> Tarball:
             controller: associated controller name
             tarfile: dataset tarball path
 
+        Raises
+            BadDirpath: Failure extracting the file from the tarball
+            MetadataError: Failure reading metadata from the metadata.log
+                file in the tarball
+            BadFilename: A bad tarball path was given
+            DuplicateTarball: A duplicate tarball name was detected
+
         Returns
             Tarball object
         """
@@ -955,8 +987,6 @@ def create(self, tarfile: Path) -> Tarball:
                 controller_name = metadata["run"]["controller"]
             else:
                 controller_name = "unknown"
-        except MetadataError:
-            raise
         except Exception as exc:
             raise MetadataError(tarfile, exc)
 
@@ -1005,6 +1035,11 @@ def get_info(self, dataset_id: str, path: Path) -> dict:
         tmap = tarball.get_info(path)
         return tmap
 
+    def filestream(self, dataset: Dataset, target: Optional[str]) -> dict:
+        """Return the file info and byte stream for a target in the dataset."""
+        tarball = self.find_dataset(dataset.resource_id)
+        return tarball.filestream(target)
+
     def uncache(self, dataset_id: str):
         """Remove the unpacked tarball tree.
diff --git a/lib/pbench/test/unit/server/test_cache_manager.py b/lib/pbench/test/unit/server/test_cache_manager.py index 93333e1ea6..5f4f065da3 100644 --- a/lib/pbench/test/unit/server/test_cache_manager.py +++ b/lib/pbench/test/unit/server/test_cache_manager.py @@ -1,4 +1,5 @@ import hashlib +import io from logging import Logger import os from pathlib import Path @@ -132,12 +133,6 @@ def test_metadata( ): """Test behavior with metadata.log access errors.""" - def fake_extract_file(self, path): - raise KeyError(f"filename {path} not found") - - def fake_tarfile_open(self, path): - raise tarfile.TarError("Invalid Tarfile") - def fake_metadata(tar_path): return {"pbench": {"date": "2002-05-16T00:00:00"}} @@ -155,13 +150,6 @@ def fake_metadata_controller(tar_path): source_tarball, source_md5, md5 = tarball cm = CacheManager(server_config, make_logger) - expected_metaerror = f"A problem occurred processing metadata.log from {source_tarball!s}: 'Invalid Tarfile'" - with monkeypatch.context() as m: - m.setattr(tarfile, "open", fake_tarfile_open) - with pytest.raises(MetadataError) as exc: - cm.create(source_tarball) - assert str(exc.value) == expected_metaerror - expected_metaerror = f"A problem occurred processing metadata.log from {source_tarball!s}: \"'run'\"" with monkeypatch.context() as m: m.setattr(Tarball, "_get_metadata", fake_metadata) @@ -183,11 +171,6 @@ def fake_metadata_controller(tar_path): cm.create(source_tarball) assert str(exc.value) == expected_metaerror - with monkeypatch.context() as m: - m.setattr(tarfile.TarFile, "extractfile", fake_extract_file) - cm.create(source_tarball) - assert cm[md5].metadata is None - def test_with_metadata( self, monkeypatch, selinux_disabled, server_config, make_logger, tarball ): @@ -367,6 +350,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path: f11.txt f12_sym -> ../../.. 
             f1.json
+            metadata.log
 
         Generated cache map
 
         'dir_name': {
             'details': ,
             'children': {
                 'f1.json': {'details': },
+                'metadata.log': {'details': },
                 'subdir1': {
                     'details': ,
                     'children': {
@@ -412,6 +397,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
         sub_dir = tmp_path / dir_name
         sub_dir.mkdir(parents=True, exist_ok=True)
         (sub_dir / "f1.json").touch()
+        (sub_dir / "metadata.log").touch()
         for i in range(1, 4):
             (sub_dir / "subdir1" / f"subdir1{i}").mkdir(parents=True, exist_ok=True)
         (sub_dir / "subdir1" / "f11.txt").touch()
@@ -891,6 +877,60 @@ def test_cache_map_get_info_cmap(
             file_info = tb.get_info(Path(file_path))
             assert file_info == expected_msg
 
+    @pytest.mark.parametrize(
+        "file_path, exp_file_type, exp_stream",
+        [
+            ("", CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")),
+            (None, CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")),
+            ("f1.json", CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")),
+            ("subdir1/subdir12", CacheType.DIRECTORY, None),
+        ],
+    )
+    def test_filestream(
+        self, monkeypatch, tmp_path, file_path, exp_file_type, exp_stream
+    ):
+        """Test extracting a file's contents as a byte stream"""
+        tar = Path("/mock/dir_name.tar.xz")
+        cache = Path("/mock/.cache")
+
+        with monkeypatch.context() as m:
+            m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__)
+            m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__)
+            m.setattr(Tarball, "extract", lambda _t, _p: exp_stream)
+            m.setattr(Path, "open", lambda _s, _m="rb": exp_stream)
+            tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None))
+            tar_dir = TestCacheManager.MockController.generate_test_result_tree(
+                tmp_path, "dir_name"
+            )
+            tb.cache_map(tar_dir)
+            file_info = tb.filestream(file_path)
+            assert file_info["type"] == exp_file_type
+            assert file_info["stream"] == exp_stream
+
+    def test_filestream_tarfile_open(self, monkeypatch, tmp_path):
+        """Test handling of a non-existent file or a tarfile unpacking failure"""
+        tar = Path("/mock/dir_name.tar.xz")
+        cache = Path("/mock/.cache")
+
+        def fake_tarfile_open(self, path):
+            raise tarfile.TarError("Invalid Tarfile")
+
+        with monkeypatch.context() as m:
+            m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__)
+            m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__)
+            m.setattr(tarfile, "open", fake_tarfile_open)
+            tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None))
+            tar_dir = TestCacheManager.MockController.generate_test_result_tree(
+                tmp_path, "dir_name"
+            )
+            tb.cache_map(tar_dir)
+
+            path = Path(tb.name) / "subdir1/f11.txt"
+            expected_error_msg = f"An error occurred while unpacking {tb.tarball_path}: Unable to extract {str(path)!r}"
+            with pytest.raises(TarballUnpackError) as exc:
+                tb.filestream("subdir1/f11.txt")
+            assert str(exc.value) == expected_error_msg
+
     def test_find(
         self, selinux_enabled, server_config, make_logger, tarball, monkeypatch
     ):
diff --git a/lib/pbench/test/unit/server/test_datasets_inventory.py b/lib/pbench/test/unit/server/test_datasets_inventory.py
index 8f046b2c0d..6ee6f7e688 100644
--- a/lib/pbench/test/unit/server/test_datasets_inventory.py
+++ b/lib/pbench/test/unit/server/test_datasets_inventory.py
@@ -1,11 +1,12 @@
 from http import HTTPStatus
+import io
 from pathlib import Path
 from typing import Any, Optional
 
 import pytest
 import requests
 
-from pbench.server.cache_manager import CacheManager
+from pbench.server.cache_manager import CacheManager, CacheType, Controller
 from pbench.server.database.models.datasets import Dataset, DatasetNotFound
 
 
@@ -31,9 +32,8 @@ def query_api(
             except DatasetNotFound:
                 dataset_id = dataset  # Allow passing deliberately bad value
             headers = {"authorization": f"bearer {pbench_drb_token}"}
-            k = "" if target is None else f"/{target}"
             response = client.get(
-                f"{server_config.rest_uri}/datasets/{dataset_id}/inventory{k}",
+                f"{server_config.rest_uri}/datasets/{dataset_id}/inventory/{target}",
                 headers=headers,
             )
             assert response.status_code == expected_status
@@ -41,14 +41,24 @@ def query_api(
 
         return query_api
 
-    def mock_find_dataset(self, dataset):
-        class Tarball(object):
-            unpacked = Path("/dataset/")
-            tarball_path = Path("/dataset/tarball.tar.xz")
-
+    class MockTarball:
+        def __init__(self, path: Path, controller: Controller):
+            self.name = "dir_name.tar.xz"
+            self.tarball_path = path
+            self.unpacked = None
+
+        def filestream(self, target):
+            info = {
+                "name": "f1.json",
+                "type": CacheType.FILE,
+                "stream": io.BytesIO(b"file_as_a_byte_stream"),
+            }
+            return info
+
+    def mock_find_dataset(self, dataset) -> MockTarball:
         # Validate the resource_id
         Dataset.query(resource_id=dataset)
-        return Tarball
+        return self.MockTarball(Path("/mock/dir_name.tar.xz"), "abc")
 
     def test_get_no_dataset(self, query_get_as):
         response = query_get_as(
@@ -68,80 +78,73 @@ def test_unauthorized_access(self, query_get_as):
             "message": "User drb is not authorized to READ a resource owned by test with private access"
         }
 
-    def test_dataset_is_not_unpacked(self, query_get_as, monkeypatch):
-        def mock_find_not_unpacked(self, dataset):
-            class Tarball(object):
-                unpacked = None
-
-            # Validate the resource_id
-            Dataset.query(resource_id=dataset)
-            return Tarball
-
-        monkeypatch.setattr(CacheManager, "find_dataset", mock_find_not_unpacked)
-
-        response = query_get_as("fio_2", "1-default", HTTPStatus.NOT_FOUND)
-        assert response.json == {"message": "The dataset is not unpacked"}
-
-    def test_path_is_directory(self, query_get_as, monkeypatch):
+    @pytest.mark.parametrize("key", (None, "", "subdir1"))
+    def test_path_is_directory(self, query_get_as, monkeypatch, key):
+        filestream_dict = {"type": CacheType.DIRECTORY, "stream": None}
         monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
+        monkeypatch.setattr(
+            self.MockTarball, "filestream", lambda _s, _t: filestream_dict
+        )
         monkeypatch.setattr(Path, "is_file", lambda self: False)
         monkeypatch.setattr(Path, "exists", lambda self: True)
 
-        response = query_get_as("fio_2", "1-default", HTTPStatus.UNSUPPORTED_MEDIA_TYPE)
+        response = query_get_as("fio_2", key, HTTPStatus.BAD_REQUEST)
         assert response.json == {
             "message": "The specified path does not refer to a regular file"
         }
 
     def test_not_a_file(self, query_get_as, monkeypatch):
+        filestream_dict = {"type": CacheType.SYMLINK, "stream": None}
         monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
+        monkeypatch.setattr(
+            self.MockTarball, "filestream", lambda _s, _t: filestream_dict
+        )
         monkeypatch.setattr(Path, "is_file", lambda self: False)
         monkeypatch.setattr(Path, "exists", lambda self: False)
 
-        response = query_get_as("fio_2", "1-default", HTTPStatus.NOT_FOUND)
+        response = query_get_as("fio_2", "subdir1/f1_sym", HTTPStatus.BAD_REQUEST)
         assert response.json == {
-            "message": "The specified path does not refer to a file"
+            "message": "The specified path does not refer to a regular file"
         }
 
     def test_dataset_in_given_path(self, query_get_as, monkeypatch):
-        mock_args: Optional[tuple[Path, dict[str, Any]]] = None
-
-        def mock_send_file(path_or_file, *args, **kwargs):
-            nonlocal mock_args
-            mock_args = (path_or_file, kwargs)
-            return {"status": "OK"}
-
-        monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
-        monkeypatch.setattr(Path, "is_file", lambda self: True)
-        monkeypatch.setattr(
-            "pbench.server.api.resources.datasets_inventory.send_file", mock_send_file
-        )
+        mock_close = False
 
-        response = query_get_as("fio_2", "1-default/default.csv", HTTPStatus.OK)
-        assert response.status_code == HTTPStatus.OK
+        class MockBytesIO(io.BytesIO):
+            def close(self):
+                nonlocal mock_close
+                mock_close = True
+                super().close()
 
-        path, args = mock_args
-        assert str(path) == "/dataset/1-default/default.csv"
-        assert args["as_attachment"] is False
-        assert args["download_name"] == "default.csv"
-
-    @pytest.mark.parametrize("key", (None, ""))
-    def test_get_result_tarball(self, query_get_as, monkeypatch, key):
         mock_args: Optional[tuple[Path, dict[str, Any]]] = None
+        exp_stream = MockBytesIO(b"file_as_a_byte_stream")
+        filestream_dict = {
+            "name": "f1.json",
+            "type": CacheType.FILE,
+            "stream": exp_stream,
+        }
 
         def mock_send_file(path_or_file, *args, **kwargs):
             nonlocal mock_args
+            assert path_or_file == exp_stream
             mock_args = (path_or_file, kwargs)
             return {"status": "OK"}
 
         monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
-        monkeypatch.setattr(Path, "is_file", lambda self: True)
+        monkeypatch.setattr(
+            self.MockTarball, "filestream", lambda _s, _t: filestream_dict
+        )
         monkeypatch.setattr(
             "pbench.server.api.resources.datasets_inventory.send_file", mock_send_file
         )
 
-        response = query_get_as("fio_2", key, HTTPStatus.OK)
+        response = query_get_as("fio_2", "f1.json", HTTPStatus.OK)
         assert response.status_code == HTTPStatus.OK
-        path, args = mock_args
-        assert str(path) == "/dataset/tarball.tar.xz"
-        assert args["as_attachment"] is True
-        assert args["download_name"] == "tarball.tar.xz"
+
+        file_content, args = mock_args
+
+        assert isinstance(file_content, io.BytesIO)
+        assert file_content == exp_stream
+        assert args["as_attachment"] is False
+        assert args["download_name"] == "f1.json"
+        assert mock_close
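
The core of this change is that Tarball.extract now hands back the lazily-read
stream produced by tarfile.extractfile instead of decoding the whole member
into a string. A minimal stand-alone sketch of that pattern (the helper name
`open_member` is invented for illustration and is not part of this patch):

    import tarfile
    from typing import IO, Optional

    def open_member(tarball_path: str, member: str) -> Optional[IO[bytes]]:
        """Return a binary stream for one member without unpacking to disk.

        "r:*" asks tarfile to auto-detect the compression (xz, gz, bz2, ...).
        extractfile() returns None for members that are not regular files or
        links, and raises KeyError for names missing from the archive, which
        is why Tarball.extract wraps the call and re-raises BadDirpath.
        """
        archive = tarfile.open(tarball_path, "r:*")
        return archive.extractfile(member)

In CPython the returned reader keeps the underlying archive file object
referenced, which is what lets the one-liner in Tarball.extract drop its
TarFile handle and still return a readable stream.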
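_get_metadata then feeds that stream straight to the parser: MetadataLog
exposes the ConfigParser read_file/sections/items interface, and read_file
accepts any iterable of text lines, so a generator that decodes the binary
stream line by line avoids materializing the whole file. A stand-alone
approximation using only the standard library (the sample payload is
invented):

    import configparser
    import io

    # Pretend this binary stream came from Tarball.extract(tar, "name/metadata.log").
    stream = io.BytesIO(b"[pbench]\ndate = 2002-05-16T00:00:00\n")

    parser = configparser.ConfigParser()
    # read_file() consumes an iterable of text lines, so decode lazily,
    # line by line, instead of holding a second full copy in memory.
    parser.read_file(line.decode() for line in stream)

    metadata = {s: dict(parser.items(s)) for s in parser.sections()}
    print(metadata)  # {'pbench': {'date': '2002-05-16T00:00:00'}}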
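The dictionary contract that Tarball.filestream returns and the endpoint
consumes has three keys: "name", "type" (a CacheType member), and "stream"
(None for anything but a regular file). A toy version of that contract and the
endpoint's branch on it (the CacheType values are re-declared here only to
keep the sketch self-contained; the real enum lives in cache_manager.py):

    from enum import Enum, auto
    import io
    from typing import IO, Optional, TypedDict

    class CacheType(Enum):
        FILE = auto()
        DIRECTORY = auto()
        SYMLINK = auto()

    class FileInfo(TypedDict):
        name: str
        type: CacheType
        stream: Optional[IO[bytes]]

    def read_target(info: FileInfo) -> bytes:
        # Mirrors the endpoint: anything that is not a regular file is a
        # client error, now reported as 400 BAD_REQUEST rather than 404.
        if info["type"] is not CacheType.FILE:
            raise ValueError("The specified path does not refer to a regular file")
        assert info["stream"] is not None  # guaranteed for FILE by the contract
        return info["stream"].read()

    info: FileInfo = {
        "name": "f1.json",
        "type": CacheType.FILE,
        "stream": io.BytesIO(b"file_as_a_byte_stream"),
    }
    print(read_target(info))  # b'file_as_a_byte_stream'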
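Finally, the endpoint change leans on the fact that flask.send_file accepts an
open binary file object, not just a filesystem path. A hedged usage sketch
(the /inventory-demo route and payload are invented; as_attachment and
download_name are the same Flask parameters the patch passes):

    import io

    from flask import Flask, send_file

    app = Flask(__name__)

    @app.route("/inventory-demo")
    def inventory_demo():
        # Any binary file-like object positioned at offset 0 works here; in
        # the patch it is the stream returned by CacheManager.filestream().
        stream = io.BytesIO(b"file_as_a_byte_stream")
        return send_file(
            stream,
            as_attachment=False,      # "inline", as for named targets above
            download_name="f1.json",  # also lets Flask guess the MIME type
        )

The tests pin down one more behavioral detail: the endpoint closes the stream
itself after send_file returns, and test_dataset_in_given_path verifies that
close through its MockBytesIO subclass.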