From 61be9dfb4a1a9e9a8e7bff774b962734a0971db4 Mon Sep 17 00:00:00 2001 From: David Butenhof Date: Thu, 6 Jul 2023 07:23:07 -0400 Subject: [PATCH] Harmonize extract with API logic (#3473) * Harmonize "inventory" APIs PBENCH-1194 Rework Tarball inventory access to avoid the nascent cache map so that we can rely on this inside the Pbench Server process where tarballs haven't been unpacked. This also fixes some incompatibilities in the Quisby APIs which relied on the older extract semantics. In order to avoid stale streams on the `inventory` API I encapsulated the `extractfile` stream into an object that can behave as a stream but will close both the stream and tarball when done: this is deferred until Flask tears down the Response object. Finally, I've added functional tests validating the `contents`, `inventory`, `visualize`, and `compare` APIs so we'll catch any further infrastructure "drift" that might affect them. --- .../server/api/resources/datasets_compare.py | 32 ++-- .../api/resources/datasets_inventory.py | 49 +++-- .../api/resources/datasets_visualize.py | 32 +--- lib/pbench/server/cache_manager.py | 154 +++++++++++---- .../server/{test_put.py => test_datasets.py} | 128 ++++++++++++- .../test/unit/server/test_cache_manager.py | 175 +++++++++++++++--- .../test/unit/server/test_datasets_compare.py | 64 +++---- .../unit/server/test_datasets_inventory.py | 112 +++++------ .../unit/server/test_datasets_visualize.py | 98 +++++----- 9 files changed, 591 insertions(+), 253 deletions(-) rename lib/pbench/test/functional/server/{test_put.py => test_datasets.py} (81%) diff --git a/lib/pbench/server/api/resources/datasets_compare.py b/lib/pbench/server/api/resources/datasets_compare.py index 5f79b03083..dcb17cbafb 100644 --- a/lib/pbench/server/api/resources/datasets_compare.py +++ b/lib/pbench/server/api/resources/datasets_compare.py @@ -20,11 +20,7 @@ ParamType, Schema, ) -from pbench.server.cache_manager import ( - CacheManager, - TarballNotFound, - TarballUnpackError, -) +from pbench.server.cache_manager import CacheManager from pbench.server.database.models.datasets import Metadata @@ -93,28 +89,26 @@ def _get( dataset.access, ) ) + + benchmark_type = BenchmarkName.__members__.get(benchmark.upper()) + if not benchmark_type: + raise APIAbort( + HTTPStatus.BAD_REQUEST, f"Unsupported Benchmark: {benchmark}" + ) + cache_m = CacheManager(self.config, current_app.logger) stream_file = {} for dataset in datasets: try: - tarball = cache_m.find_dataset(dataset.resource_id) - except TarballNotFound as e: + info = cache_m.get_inventory(dataset.resource_id, "result.csv") + file = info["stream"].read().decode("utf-8") + info["stream"].close() + except Exception as e: raise APIInternalError( - f"Expected dataset with ID '{dataset.resource_id}' is missing from the cache manager." 
+ f"{dataset.name} is missing 'result.csv' file" ) from e - try: - file = tarball.extract( - tarball.tarball_path, f"{tarball.name}/result.csv" - ) - except TarballUnpackError as e: - raise APIInternalError(str(e)) from e stream_file[dataset.name] = file - benchmark_type = BenchmarkName.__members__.get(benchmark.upper()) - if not benchmark_type: - raise APIAbort( - HTTPStatus.UNSUPPORTED_MEDIA_TYPE, f"Unsupported Benchmark: {benchmark}" - ) get_quisby_data = QuisbyProcessing().compare_csv_to_json( benchmark_type, InputType.STREAM, stream_file ) diff --git a/lib/pbench/server/api/resources/datasets_inventory.py b/lib/pbench/server/api/resources/datasets_inventory.py index ca8ce93605..ae7e9d7aee 100644 --- a/lib/pbench/server/api/resources/datasets_inventory.py +++ b/lib/pbench/server/api/resources/datasets_inventory.py @@ -10,6 +10,7 @@ ApiAuthorizationType, ApiBase, ApiContext, + APIInternalError, ApiMethod, ApiParams, ApiSchema, @@ -53,7 +54,6 @@ def _get( Raises: APIAbort, reporting either "NOT_FOUND" or "UNSUPPORTED_MEDIA_TYPE" - GET /api/v1/datasets/{dataset}/inventory/{target} """ dataset = params.uri["dataset"] @@ -61,29 +61,38 @@ def _get( cache_m = CacheManager(self.config, current_app.logger) try: - file_info = cache_m.filestream(dataset, target) + file_info = cache_m.get_inventory(dataset.resource_id, target) except TarballNotFound as e: raise APIAbort(HTTPStatus.NOT_FOUND, str(e)) - if file_info["type"] == CacheType.FILE: - # Tell send_file to set `Content-Disposition` to "attachment" if - # targeting the large binary tarball. Otherwise we'll recommend the - # default "inline": only `is_file()` paths are allowed here, and - # most are likely "displayable". While `send_file` will guess the - # download_name from the path, setting it explicitly does no harm - # and supports a unit test mock with no real file. - # - # NOTE: we could choose to be "smarter" based on file size, file - # type codes (e.g., .csv, .json), and so forth. - resp = send_file( - file_info["stream"], - as_attachment=target is None, - download_name=file_info["name"], - ) - file_info["stream"].close() - return resp - else: + if file_info["type"] != CacheType.FILE: raise APIAbort( HTTPStatus.BAD_REQUEST, "The specified path does not refer to a regular file", ) + + # Tell send_file to set `Content-Disposition` to "attachment" if + # targeting the large binary tarball. Otherwise we'll recommend the + # default "inline": only `is_file()` paths are allowed here, and + # most are likely "displayable". While `send_file` will guess the + # download_name from the path, setting it explicitly does no harm + # and supports a unit test mock with no real file. + # + # Werkzeug will set the mime type and content encoding based on the + # download_name suffix. + # + # We link a callback to close the file stream and TarFile object on + # completion. 
+ stream = file_info["stream"] + try: + resp = send_file( + stream, as_attachment=target is None, download_name=file_info["name"] + ) + except Exception as e: + if stream: + stream.close() + raise APIInternalError( + f"Problem sending {dataset}:{target} stream {stream}: {str(e)!r}" + ) + resp.call_on_close(stream.close) + return resp diff --git a/lib/pbench/server/api/resources/datasets_visualize.py b/lib/pbench/server/api/resources/datasets_visualize.py index 87673ada79..e7e505a55d 100644 --- a/lib/pbench/server/api/resources/datasets_visualize.py +++ b/lib/pbench/server/api/resources/datasets_visualize.py @@ -19,12 +19,8 @@ ParamType, Schema, ) -from pbench.server.cache_manager import ( - CacheManager, - TarballNotFound, - TarballUnpackError, -) -from pbench.server.database import Dataset +from pbench.server.cache_manager import CacheManager +from pbench.server.database.models.datasets import Metadata class DatasetsVisualize(ApiBase): @@ -63,29 +59,21 @@ def _get( """ dataset = params.uri["dataset"] - cache_m = CacheManager(self.config, current_app.logger) - try: - tarball = cache_m.find_dataset(dataset.resource_id) - except TarballNotFound as e: - raise APIAbort( - HTTPStatus.NOT_FOUND, f"No dataset with ID '{e.tarball}' found" - ) from e - - metadata = self._get_dataset_metadata( - dataset, ["dataset.metalog.pbench.script"] - ) - benchmark = metadata["dataset.metalog.pbench.script"].upper() + metadata = Metadata.getvalue(dataset, "dataset.metalog.pbench.script") + benchmark = metadata.upper() benchmark_type = BenchmarkName.__members__.get(benchmark) if not benchmark_type: raise APIAbort( - HTTPStatus.UNSUPPORTED_MEDIA_TYPE, f"Unsupported Benchmark: {benchmark}" + HTTPStatus.BAD_REQUEST, f"Unsupported Benchmark: {benchmark}" ) - name = Dataset.stem(tarball.tarball_path) + cache_m = CacheManager(self.config, current_app.logger) try: - file = tarball.extract(tarball.tarball_path, f"{name}/result.csv") - except TarballUnpackError as e: + info = cache_m.get_inventory(dataset.resource_id, "result.csv") + file = info["stream"].read().decode("utf-8") + info["stream"].close() + except Exception as e: raise APIInternalError(str(e)) from e get_quisby_data = QuisbyProcessing().extract_data( diff --git a/lib/pbench/server/cache_manager.py b/lib/pbench/server/cache_manager.py index 21b52f5ea0..7d0b424e12 100644 --- a/lib/pbench/server/cache_manager.py +++ b/lib/pbench/server/cache_manager.py @@ -7,7 +7,7 @@ import shutil import subprocess import tarfile -from typing import IO, Optional, Union +from typing import Any, IO, Optional, Union from pbench.common import MetadataLog, selinux from pbench.server import JSONOBJECT, PbenchServerConfig @@ -42,6 +42,17 @@ def __str__(self) -> str: return f"The file path {self.path!r} is not a tarball" +class CacheExtractBadPath(CacheManagerError): + """Request to extract a path that's bad or not a file""" + + def __init__(self, tar_name: Path, path: Union[str, Path]): + self.name = tar_name.name + self.path = str(path) + + def __str__(self) -> str: + return f"Unable to extract {self.path} from {self.name}" + + class TarballNotFound(CacheManagerError): """The dataset was not found in the ARCHIVE tree.""" @@ -173,6 +184,66 @@ def make_cache_object(dir_path: Path, path: Path) -> CacheObject: ) +class Inventory: + """Encapsulate the tarfile TarFile object and file stream + + This encapsulation allows cleaner downstream handling, so that we can close + both the extractfile file stream and the tarball object itself when we're + done. 
This eliminates interference with later operations.
+    """
+
+    def __init__(self, stream: IO[bytes], tarfile: Optional[tarfile.TarFile] = None):
+        """Construct an instance to track extracted inventory
+
+        This encapsulates many byte stream operations so that it can be used
+        as if it were a byte stream.
+
+        Args:
+            stream: the data stream of a specific tarball member
+            tarfile: the TarFile object
+        """
+        self.tarfile = tarfile
+        self.stream = stream
+
+    def close(self):
+        """Close both the byte stream and, if open, a tarfile object"""
+        self.stream.close()
+        if self.tarfile:
+            self.tarfile.close()
+
+    def getbuffer(self):
+        """Return the underlying byte buffer (used by send_file)"""
+        return self.stream.getbuffer()
+
+    def read(self, *args, **kwargs) -> bytes:
+        """Encapsulate a read operation"""
+        return self.stream.read(*args, **kwargs)
+
+    def readable(self) -> bool:
+        """Return the readable state of the stream"""
+        return self.stream.readable()
+
+    def seek(self, *args, **kwargs) -> int:
+        """Allow setting the relative position in the stream"""
+        return self.stream.seek(*args, **kwargs)
+
+    def __repr__(self) -> str:
+        """Return a string representation"""
+        return f"<Stream {self.stream} from {self.tarfile}>"
+
+    def __iter__(self):
+        """Allow iterating through lines in the buffer"""
+        return self
+
+    def __next__(self):
+        """Iterate through lines in the buffer"""
+        line = self.stream.readline()
+        if line:
+            return line
+        else:
+            raise StopIteration()
+
+
 class Tarball:
     """Representation of an on-disk tarball.
 
@@ -382,6 +453,12 @@ def traverse_cmap(path: Path, cachemap: dict) -> dict[str, dict]:
     def get_info(self, path: Path) -> JSONOBJECT:
         """Returns the details of the given file/directory in dict format
 
+        NOTE: This requires a call to the cache_map method to build a map that
+        can be traversed. Currently this is done only on unpack, and isn't
+        useful except within the `pbench-index` process. This map needs to
+        either be built dynamically (potentially expensive) or persisted in
+        SQL or perhaps Redis.
+
         Args:
             path: path of the file/sub-directory
 
@@ -429,34 +506,41 @@ def get_info(self, path: Path) -> JSONOBJECT:
             return fd_info
 
     @staticmethod
-    def extract(tarball_path: Path, path: Path) -> Optional[IO[bytes]]:
-        """Returns the file stream which yields the contents of
-        a file at the specified path in the Tarball
+    def extract(tarball_path: Path, path: Path) -> Inventory:
+        """Returns a file stream for a file within a tarball
 
         Args:
             tarball_path: absolute path of the tarball
             path: relative path within the tarball
 
+        Returns:
+            An inventory object that mimics an IO[bytes] object while also
+            maintaining a reference to the tarfile TarFile object to be
+            closed later.
+
+        Raises:
+            TarballNotFound on failure opening the tarball
+            CacheExtractBadPath if the target cannot be extracted
         """
         try:
-            return tarfile.open(tarball_path, "r:*").extractfile(path)
+            tar = tarfile.open(tarball_path, "r:*")
         except Exception as exc:
-            raise BadDirpath(
-                f"A problem occurred processing {str(path)!r} from {str(tarball_path)!r}: {exc}"
-            )
+            raise TarballNotFound(str(tarball_path)) from exc
+        try:
+            stream = tar.extractfile(str(path))
+        except Exception as exc:
+            raise CacheExtractBadPath(tarball_path, path) from exc
+        else:
+            if not stream:
+                raise CacheExtractBadPath(tarball_path, path)
+            return Inventory(stream, tar)
 
-    def filestream(self, path: str) -> Optional[dict]:
-        """Returns a dictionary containing information about the target
-        file and file stream
+    def get_inventory(self, path: str) -> Optional[JSONOBJECT]:
+        """Access the file stream of a tarball member file.
 
         Args:
             path: relative path within the tarball of a file
 
-        Raises:
-            TarballUnpackError on failure to extract the named path
-
         Returns:
             Dictionary with file info and file stream
         """
         if path is None:
             info = {
                 "name": self.name,
                 "type": CacheType.FILE,
-                "stream": self.tarball_path.open("rb"),
+                "stream": Inventory(self.tarball_path.open("rb"), None),
             }
         else:
             file_path = Path(self.name) / path
-            info = self.get_info(file_path)
-            if info["type"] == CacheType.FILE:
-                try:
-                    info["stream"] = Tarball.extract(self.tarball_path, file_path)
-                except Exception as exc:
-                    raise TarballUnpackError(
-                        self.tarball_path, f"Unable to extract {str(file_path)!r}"
-                    ) from exc
-            else:
-                info["stream"] = None
+            stream = Tarball.extract(self.tarball_path, file_path)
+            info = {"name": file_path.name, "type": CacheType.FILE, "stream": stream}
 
         return info
 
@@ -492,11 +568,12 @@ def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]:
         name = Dataset.stem(tarball_path)
         try:
             data = Tarball.extract(tarball_path, f"{name}/metadata.log")
-        except BadDirpath:
+        except CacheExtractBadPath:
             return None
         else:
             metadata_log = MetadataLog()
             metadata_log.read_file(e.decode() for e in data)
+            data.close()
             metadata = {s: dict(metadata_log.items(s)) for s in metadata_log.sections()}
             return metadata
 
@@ -1021,7 +1098,7 @@ def unpack(self, dataset_id: str) -> Tarball:
         tarball.controller.unpack(dataset_id)
         return tarball
 
-    def get_info(self, dataset_id: str, path: Path) -> dict:
+    def get_info(self, dataset_id: str, path: Path) -> dict[str, Any]:
         """Get information about dataset files from the cache map
 
         Args:
@@ -1035,9 +1112,24 @@ def get_info(self, dataset_id: str, path: Path) -> dict:
         tmap = tarball.get_info(path)
         return tmap
 
-    def filestream(self, dataset, target):
-        tarball = self.find_dataset(dataset.resource_id)
-        return tarball.filestream(target)
+    def get_inventory(self, dataset_id: str, target: str) -> Optional[JSONOBJECT]:
+        """Return filestream data for a file within a dataset tarball
+
+        {
+            "name": "filename",
+            "type": CacheType.FILE,
+            "stream": <byte stream>
+        }
+
+        Args:
+            dataset_id: Dataset resource ID
+            target: relative file path within the tarball
+
+        Returns:
+            File info including a byte stream for a regular file
+        """
+        tarball = self.find_dataset(dataset_id)
+        return tarball.get_inventory(target)
 
     def uncache(self, dataset_id: str):
         """Remove the unpacked tarball tree.
diff --git a/lib/pbench/test/functional/server/test_put.py b/lib/pbench/test/functional/server/test_datasets.py similarity index 81% rename from lib/pbench/test/functional/server/test_put.py rename to lib/pbench/test/functional/server/test_datasets.py index 0e1d4e9ff0..5865cf9ae3 100644 --- a/lib/pbench/test/functional/server/test_put.py +++ b/lib/pbench/test/functional/server/test_datasets.py @@ -1,3 +1,4 @@ +from configparser import ConfigParser from dataclasses import dataclass from datetime import datetime, timedelta, timezone from http import HTTPStatus @@ -8,10 +9,11 @@ import dateutil.parser import pytest +from requests import Response from requests.exceptions import HTTPError from pbench.client import API, PbenchServerClient -from pbench.client.types import Dataset +from pbench.client.types import Dataset, JSONOBJECT TARBALL_DIR = Path("lib/pbench/test/functional/server/tarballs") SPECIAL_DIR = TARBALL_DIR / "special" @@ -497,6 +499,130 @@ def test_list_filter_typed(self, server_client: PbenchServerClient, login_user): ), f"Filter failed to return {m['dataset.name']}, with expiration in range ({deletion:%Y-%m-%d})" +class TestInventory: + """Validate APIs involving tarball inventory""" + + @pytest.mark.dependency(name="contents", depends=["index"], scope="session") + def test_contents(self, server_client: PbenchServerClient, login_user): + """Check that we can retrieve the root directory TOC + + NOTE: the TOC API currently uses the Elasticsearch run-toc index, so + the datasets must have gotten through indexing. + """ + datasets = server_client.get_list( + owner="tester", + metadata=["server.archiveonly"], + ) + + for dataset in datasets: + response = server_client.get( + API.DATASETS_CONTENTS, {"dataset": dataset.resource_id, "target": ""} + ) + assert ( + response.ok + ), f"CONTENTS {dataset.name} failed {response.status_code}:{response.json()['message']}" + json = response.json() + + # assert that we have directories and/or files: an empty root + # directory is technically possible, but not legal unless it's a + # trivial "archiveonly" dataset. NOTE: this will also fail if + # either the "directories" or "files" JSON keys are missing. + archive = dataset.metadata["server.archiveonly"] + assert json["directories"] or json["files"] or archive + + # Unless archiveonly, we need a metadata.log + assert archive or "metadata.log" in (f["name"] for f in json["files"]) + + @pytest.mark.dependency(name="visualize", depends=["upload"], scope="session") + def test_visualize(self, server_client: PbenchServerClient, login_user): + """Check that we can generate visualization data from a dataset + + Identify all "uperf" runs (pbench-uperf wrapper script) as that's all + we can currently support. + """ + datasets = server_client.get_list( + owner="tester", + filter=["dataset.metalog.pbench.script:uperf"], + ) + + for dataset in datasets: + response = server_client.get( + API.DATASETS_VISUALIZE, {"dataset": dataset.resource_id} + ) + assert ( + response.ok + ), f"VISUALIZE {dataset.name} failed {response.status_code}:{response.json()['message']}" + json = response.json() + assert json["status"] == "success" + assert "csv_data" in json + assert json["json_data"]["dataset_name"] == dataset.name + assert isinstance(json["json_data"]["data"], list) + + @pytest.mark.dependency(name="compare", depends=["upload"], scope="session") + def test_compare(self, server_client: PbenchServerClient, login_user): + """Check that we can compare two datasets. 
+ + Identify all "uperf" runs (pbench-uperf wrapper script) as that's all + we can currently support. + """ + datasets = server_client.get_list( + owner="tester", + filter=["dataset.metalog.pbench.script:uperf"], + ) + + candidates = [dataset.resource_id for dataset in datasets] + + # We need at least two "uperf" datasets to compare, but they can be the + # same ... so if we only have one (the normal case), duplicate it. + if len(candidates) == 1: + candidates.append(candidates[0]) + + # In the unlikely event we find multiple uperf datasets, compare only + # the first two. + response = server_client.get( + API.DATASETS_COMPARE, params={"datasets": candidates[:2]} + ) + json = response.json() + assert ( + response.ok + ), f"COMPARE {candidates[:2]} failed {response.status_code}:{json['message']}" + assert json["status"] == "success" + assert isinstance(json["json_data"]["data"], list) + + @pytest.mark.dependency(name="inventory", depends=["upload"], scope="session") + def test_inventory(self, server_client: PbenchServerClient, login_user): + """Check that we can retrieve inventory files from a tarball + + The most universal tarball artifact is "metadata.log", which is + mandatory for all "non-archive-only" datasets. So find them all and + ensure that each yields the same metadata.log that the server read + during upload. + """ + + def read_metadata(response: Response) -> JSONOBJECT: + metadata_log = ConfigParser(interpolation=None) + metadata_log.read_string(response.text) + metadata = {s: dict(metadata_log.items(s)) for s in metadata_log.sections()} + return metadata + + datasets = server_client.get_list( + owner="tester", + metadata=["dataset.metalog"], + filter=["server.archiveonly:false:bool"], + ) + + for dataset in datasets: + response = server_client.get( + API.DATASETS_INVENTORY, + {"dataset": dataset.resource_id, "target": "metadata.log"}, + ) + assert ( + response.ok + ), f"INVENTORY {dataset.name} failed {response.status_code}:{response.json()['message']}" + meta = read_metadata(response) + assert meta == dataset.metadata["dataset.metalog"] + + class TestUpdate: @pytest.mark.dependency(name="publish", depends=["index"], scope="session") @pytest.mark.parametrize("access", ("public", "private")) diff --git a/lib/pbench/test/unit/server/test_cache_manager.py b/lib/pbench/test/unit/server/test_cache_manager.py index 5f4f065da3..2bd59f4c2b 100644 --- a/lib/pbench/test/unit/server/test_cache_manager.py +++ b/lib/pbench/test/unit/server/test_cache_manager.py @@ -7,16 +7,20 @@ import shutil import subprocess import tarfile +from typing import IO import pytest +from pbench.server import JSONOBJECT from pbench.server.cache_manager import ( BadDirpath, BadFilename, + CacheExtractBadPath, CacheManager, CacheType, Controller, DuplicateTarball, + Inventory, MetadataError, Tarball, TarballModeChangeError, @@ -882,11 +886,11 @@ def test_cache_map_get_info_cmap( [ ("", CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")), (None, CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")), - ("f1.json", CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")), - ("subdir1/subdir12", CacheType.DIRECTORY, None), + ("f1.json", CacheType.FILE, io.BytesIO(b"file_as_a_byte_stream")), + ("subdir1/f12_sym", CacheType.FILE, io.BytesIO(b"file1_as_a_byte_stream")), ], ) - def test_filestream( + def test_get_inventory( self, monkeypatch, tmp_path, file_path, exp_file_type, exp_stream ): """Test to extract file contents/stream from a file""" @@ -896,41 +900,170 @@ def test_filestream( with 
monkeypatch.context() as m: m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__) m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__) - m.setattr(Tarball, "extract", lambda _t, _p: exp_stream) + m.setattr(Tarball, "extract", lambda _t, _p: Inventory(exp_stream)) m.setattr(Path, "open", lambda _s, _m="rb": exp_stream) tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None)) tar_dir = TestCacheManager.MockController.generate_test_result_tree( tmp_path, "dir_name" ) tb.cache_map(tar_dir) - file_info = tb.filestream(file_path) + file_info = tb.get_inventory(file_path) assert file_info["type"] == exp_file_type - assert file_info["stream"] == exp_stream + assert file_info["stream"].stream == exp_stream - def test_filestream_tarfile_open(self, monkeypatch, tmp_path): - """Test to check non-existent file or tarfile unpack issue""" - tar = Path("/mock/dir_name.tar.xz") - cache = Path("/mock/.cache") + def test_cm_inventory(self, monkeypatch, server_config, make_logger): + """Verify the happy path of the high level get_inventory""" + id = None + + class MockTarball: + def get_inventory(self, target: str) -> JSONOBJECT: + return { + "name": target, + "type": CacheType.FILE, + "stream": Inventory(io.BytesIO(b"success")), + } + + def mock_find_dataset(self, dataset: str) -> MockTarball: + nonlocal id + id = dataset + + return MockTarball() + + with monkeypatch.context() as m: + m.setattr(CacheManager, "find_dataset", mock_find_dataset) + cm = CacheManager(server_config, make_logger) + inventory = cm.get_inventory("dataset", "target") + assert id == "dataset" + assert inventory["name"] == "target" + assert inventory["stream"].read() == b"success" + + def test_tarfile_extract(self, monkeypatch, tmp_path): + """Test to check Tarball.extract success""" + tar = Path("/mock/result.tar.xz") + contents = b"[test]\nfoo=bar\n" + + class MockTarFile: + def extractfile(self, path: str) -> IO[bytes]: + if path == "metadata.log": + return io.BytesIO(contents) + raise Exception("you can't handle exceptions") + + def fake_tarfile_open(tarfile: str, *args): + if str(tarfile) == str(tar): + return MockTarFile() + raise Exception("You didn't see this coming") + + with monkeypatch.context() as m: + m.setattr(tarfile, "open", fake_tarfile_open) + got = Tarball.extract(tar, Path("metadata.log")) + assert isinstance(got, Inventory) + assert got.read() == contents + + def test_tarfile_open_fails(self, monkeypatch, tmp_path): + """Test to check non-existent tarfile""" + tar = Path("/mock/result.tar.xz") def fake_tarfile_open(self, path): raise tarfile.TarError("Invalid Tarfile") with monkeypatch.context() as m: - m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__) - m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__) m.setattr(tarfile, "open", fake_tarfile_open) - tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None)) - tar_dir = TestCacheManager.MockController.generate_test_result_tree( - tmp_path, "dir_name" - ) - tb.cache_map(tar_dir) - path = Path(tb.name) / "subdir1/f11.txt" - expected_error_msg = f"An error occurred while unpacking {tb.tarball_path}: Unable to extract {str(path)!r}" - with pytest.raises(TarballUnpackError) as exc: - tb.filestream("subdir1/f11.txt") + expected_error_msg = f"The dataset tarball named '{tar}' is not found" + with pytest.raises(TarballNotFound) as exc: + Tarball.extract(tar, Path("subdir1/f11.txt")) assert str(exc.value) == expected_error_msg + def test_tarfile_extractfile_fails(self, 
monkeypatch, tmp_path):
+        """Test to check non-existent path in tarfile"""
+        tar = Path("/mock/result.tar.xz")
+        path = Path("subdir/f11.txt")
+
+        class MockTarFile:
+            def extractfile(self, path):
+                raise Exception("Mr Robot refuses trivial human command")
+
+        def fake_tarfile_open(self, path):
+            return MockTarFile()
+
+        with monkeypatch.context() as m:
+            m.setattr(tarfile, "open", fake_tarfile_open)
+            expected_error_msg = f"Unable to extract {path} from {tar.name}"
+            with pytest.raises(CacheExtractBadPath) as exc:
+                Tarball.extract(tar, path)
+            assert str(exc.value) == expected_error_msg
+
+    def test_tarfile_extractfile_notfile(self, monkeypatch, tmp_path):
+        """Test to check target that's not a file"""
+        tar = Path("/mock/result.tar.xz")
+        path = Path("subdir/f11.txt")
+
+        class MockTarFile:
+            def extractfile(self, path):
+                return None
+
+        def fake_tarfile_open(self, path):
+            return MockTarFile()
+
+        with monkeypatch.context() as m:
+            m.setattr(tarfile, "open", fake_tarfile_open)
+            expected_error_msg = f"Unable to extract {path} from {tar.name}"
+            with pytest.raises(CacheExtractBadPath) as exc:
+                Tarball.extract(tar, path)
+            assert str(exc.value) == expected_error_msg
+
+    @pytest.mark.parametrize(
+        "tarball,stream", (("hasmetalog.tar.xz", True), ("nometalog.tar.xz", False))
+    )
+    def test_get_metadata(self, monkeypatch, tarball, stream):
+        """Verify access and processing of `metadata.log`"""
+
+        @staticmethod
+        def fake_extract(t: Path, f: Path):
+            if str(t) == tarball and str(f) == f"{Dataset.stem(t)}/metadata.log":
+                if stream:
+                    return Inventory(io.BytesIO(b"[test]\nfoo = bar\n"))
+                raise CacheExtractBadPath(t, f)
+            raise Exception(f"Unexpected mock exception with stream:{stream}: {t}, {f}")
+
+        with monkeypatch.context() as m:
+            m.setattr(Tarball, "extract", fake_extract)
+            metadata = Tarball._get_metadata(Path(tarball))
+
+        if stream:
+            assert metadata == {"test": {"foo": "bar"}}
+        else:
+            assert metadata is None
+
+    def test_inventory(self):
+        closed = False
+
+        class MockTarFile:
+            def close(self):
+                nonlocal closed
+                closed = True
+
+            def __repr__(self) -> str:
+                return "<MockTarFile>"
+
+        raw = b"abcde\nfghij\n"
+        stream = Inventory(io.BytesIO(raw), MockTarFile())
+        assert re.match(
+            r"^<Stream <_io\.BytesIO object at 0x[a-z0-9]+> from <MockTarFile>>$",
+            str(stream),
+        )
+
+        assert stream.getbuffer() == raw
+        assert stream.readable()
+        assert stream.read(5) == b"abcde"
+        assert stream.read() == b"\nfghij\n"
+        assert stream.seek(0) == 0
+        assert [b for b in stream] == [b"abcde\n", b"fghij\n"]
+        stream.close()
+        assert closed
+        with pytest.raises(ValueError):
+            stream.read()
+
     def test_find(
         self, selinux_enabled, server_config, make_logger, tarball, monkeypatch
     ):
diff --git a/lib/pbench/test/unit/server/test_datasets_compare.py b/lib/pbench/test/unit/server/test_datasets_compare.py
index 0f6dd2ba9d..fb6cda9458 100644
--- a/lib/pbench/test/unit/server/test_datasets_compare.py
+++ b/lib/pbench/test/unit/server/test_datasets_compare.py
@@ -1,13 +1,13 @@
 from http import HTTPStatus
+from io import BytesIO
 from pathlib import Path
-from typing import Optional
+from typing import Any, Optional
 
-from pquisby.lib.post_processing import QuisbyProcessing
 import pytest
 import requests
 
 from pbench.server import JSON
-from pbench.server.cache_manager import CacheManager, TarballUnpackError
+from pbench.server.cache_manager import CacheExtractBadPath, CacheManager, Inventory
 from pbench.server.database.models.datasets import Dataset, DatasetNotFound, Metadata
 from pbench.server.database.models.users import User
 
@@ -55,51 +55,31 @@ def query_api(
         return
query_api - class MockTarball: - tarball_path = Path("/dataset/tarball.tar.xz") - name = "tarball" - - @staticmethod - def extract(_tarball_path: Path, _path: str) -> str: - return "CSV_file_as_a_string" - - def mock_find_dataset(self, dataset) -> MockTarball: - # Validate the resource_id - Dataset.query(resource_id=dataset) - return self.MockTarball() - def test_dataset_not_present(self, query_get_as, monkeypatch): monkeypatch.setattr(Metadata, "getvalue", mock_get_value) query_get_as(["fio_2"], "drb", HTTPStatus.INTERNAL_SERVER_ERROR) def test_unsuccessful_get_with_incorrect_data(self, query_get_as, monkeypatch): - @staticmethod - def mock_extract(_tarball_path: Path, _path: str) -> str: - return "IncorrectData" + def mock_get_inventory(_self, _dataset: str, _path: str) -> dict[str, Any]: + return {"stream": Inventory(BytesIO(b"IncorrectData"))} - def mock_compare_csv_to_json( - self, benchmark_name, input_type, data_stream - ) -> JSON: - return {"status": "failed", "exception": "Unsupported Media Type"} + class MockQuisby: + def compare_csv_to_json(self, _b, _i, _d) -> JSON: + return {"status": "failed", "exception": "Unsupported Media Type"} - monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset) - monkeypatch.setattr(self.MockTarball, "extract", mock_extract) + monkeypatch.setattr(CacheManager, "get_inventory", mock_get_inventory) monkeypatch.setattr(Metadata, "getvalue", mock_get_value) monkeypatch.setattr( - QuisbyProcessing, "compare_csv_to_json", mock_compare_csv_to_json + "pbench.server.api.resources.datasets_compare.QuisbyProcessing", MockQuisby ) query_get_as(["uperf_1", "uperf_2"], "test", HTTPStatus.INTERNAL_SERVER_ERROR) - def test_tarball_unpack_exception(self, query_get_as, monkeypatch): - @staticmethod - def mock_extract(_tarball_path: Path, _path: str): - raise TarballUnpackError( - _tarball_path, f"Testing unpack exception for path {_path}" - ) + def test_get_inventory_exception(self, query_get_as, monkeypatch): + def mock_get_inventory(_self, _dataset: str, _path: str) -> dict[str, Any]: + raise CacheExtractBadPath(Path("tarball"), _path) - monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset) - monkeypatch.setattr(self.MockTarball, "extract", mock_extract) + monkeypatch.setattr(CacheManager, "get_inventory", mock_get_inventory) monkeypatch.setattr(Metadata, "getvalue", mock_get_value) query_get_as(["uperf_1", "uperf_2"], "test", HTTPStatus.INTERNAL_SERVER_ERROR) @@ -139,7 +119,7 @@ def mock_extract(_tarball_path: Path, _path: str): ( "test", ["uperf_3", "uperf_4"], - HTTPStatus.UNSUPPORTED_MEDIA_TYPE, + HTTPStatus.BAD_REQUEST, "Unsupported Benchmark: hammerDB", ), ), @@ -147,15 +127,17 @@ def mock_extract(_tarball_path: Path, _path: str): def test_datasets_with_different_benchmark( self, user, datasets, exp_status, exp_message, query_get_as, monkeypatch ): - def mock_compare_csv_to_json( - self, benchmark_name, input_type, data_stream - ) -> JSON: - return {"status": "success", "json_data": "quisby_data"} + class MockQuisby: + def compare_csv_to_json(self, _b, _i, _d) -> JSON: + return {"status": "success", "json_data": "quisby_data"} + + def mock_get_inventory(_self, _dataset: str, _path: str) -> dict[str, Any]: + return {"stream": Inventory(BytesIO(b"IncorrectData"))} - monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset) + monkeypatch.setattr(CacheManager, "get_inventory", mock_get_inventory) monkeypatch.setattr(Metadata, "getvalue", mock_get_value) monkeypatch.setattr( - QuisbyProcessing, 
"compare_csv_to_json", mock_compare_csv_to_json + "pbench.server.api.resources.datasets_compare.QuisbyProcessing", MockQuisby ) response = query_get_as(datasets, user, exp_status) diff --git a/lib/pbench/test/unit/server/test_datasets_inventory.py b/lib/pbench/test/unit/server/test_datasets_inventory.py index 6ee6f7e688..cab31fbb46 100644 --- a/lib/pbench/test/unit/server/test_datasets_inventory.py +++ b/lib/pbench/test/unit/server/test_datasets_inventory.py @@ -1,12 +1,14 @@ from http import HTTPStatus import io from pathlib import Path -from typing import Any, Optional +from typing import Callable, Optional +from flask import Response import pytest import requests -from pbench.server.cache_manager import CacheManager, CacheType, Controller +from pbench.server import JSONOBJECT +from pbench.server.cache_manager import CacheManager, CacheType, Inventory from pbench.server.database.models.datasets import Dataset, DatasetNotFound @@ -41,25 +43,6 @@ def query_api( return query_api - class MockTarball: - def __init__(self, path: Path, controller: Controller): - self.name = "dir_name.tar.xz" - self.tarball_path = path - self.unpacked = None - - def filestream(self, target): - info = { - "name": "f1.json", - "type": CacheType.FILE, - "stream": io.BytesIO(b"file_as_a_byte_stream"), - } - return info - - def mock_find_dataset(self, dataset) -> MockTarball: - # Validate the resource_id - Dataset.query(resource_id=dataset) - return self.MockTarball(Path("/mock/dir_name.tar.xz"), "abc") - def test_get_no_dataset(self, query_get_as): response = query_get_as( "nonexistent-dataset", "metadata.log", HTTPStatus.NOT_FOUND @@ -80,11 +63,10 @@ def test_unauthorized_access(self, query_get_as): @pytest.mark.parametrize("key", (None, "", "subdir1")) def test_path_is_directory(self, query_get_as, monkeypatch, key): - filestream_dict = {"type": CacheType.DIRECTORY, "stream": None} - monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset) - monkeypatch.setattr( - self.MockTarball, "filestream", lambda _s, _t: filestream_dict - ) + def mock_get_inventory(_s, _d: str, _t: str): + return {"type": CacheType.DIRECTORY, "stream": None} + + monkeypatch.setattr(CacheManager, "get_inventory", mock_get_inventory) monkeypatch.setattr(Path, "is_file", lambda self: False) monkeypatch.setattr(Path, "exists", lambda self: True) @@ -94,11 +76,10 @@ def test_path_is_directory(self, query_get_as, monkeypatch, key): } def test_not_a_file(self, query_get_as, monkeypatch): - filestream_dict = {"type": CacheType.SYMLINK, "stream": None} - monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset) - monkeypatch.setattr( - self.MockTarball, "filestream", lambda _s, _t: filestream_dict - ) + def mock_get_inventory(_s, _d: str, _t: str): + return {"type": CacheType.SYMLINK, "stream": None} + + monkeypatch.setattr(CacheManager, "get_inventory", mock_get_inventory) monkeypatch.setattr(Path, "is_file", lambda self: False) monkeypatch.setattr(Path, "exists", lambda self: False) @@ -110,30 +91,29 @@ def test_not_a_file(self, query_get_as, monkeypatch): def test_dataset_in_given_path(self, query_get_as, monkeypatch): mock_close = False - class MockBytesIO(io.BytesIO): - def close(self): - nonlocal mock_close - mock_close = True - super().close() - - mock_args: Optional[tuple[Path, dict[str, Any]]] = None - exp_stream = MockBytesIO(b"file_as_a_byte_stream") - filestream_dict = { - "name": "f1.json", - "type": CacheType.FILE, - "stream": exp_stream, - } + def call_on_close(_c: Callable): + nonlocal mock_close + 
mock_close = True + + mock_args: Optional[tuple[Path, JSONOBJECT]] = None + exp_stream = io.BytesIO(b"file_as_a_byte_stream") + + def mock_get_inventory(_s, _d: str, _t: str): + return { + "name": "f1.json", + "type": CacheType.FILE, + "stream": Inventory(exp_stream, None), + } + + response = Response() + monkeypatch.setattr(response, "call_on_close", call_on_close) def mock_send_file(path_or_file, *args, **kwargs): nonlocal mock_args - assert path_or_file == exp_stream mock_args = (path_or_file, kwargs) - return {"status": "OK"} + return response - monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset) - monkeypatch.setattr( - self.MockTarball, "filestream", lambda _s, _t: filestream_dict - ) + monkeypatch.setattr(CacheManager, "get_inventory", mock_get_inventory) monkeypatch.setattr( "pbench.server.api.resources.datasets_inventory.send_file", mock_send_file ) @@ -143,8 +123,36 @@ def mock_send_file(path_or_file, *args, **kwargs): file_content, args = mock_args - assert isinstance(file_content, io.BytesIO) - assert file_content == exp_stream + assert isinstance(file_content, Inventory) + assert file_content.stream == exp_stream assert args["as_attachment"] is False assert args["download_name"] == "f1.json" assert mock_close + + def test_send_fail(self, query_get_as, monkeypatch): + mock_close = False + + def mock_close(_s): + nonlocal mock_close + mock_close = True + + exp_stream = io.BytesIO(b"file_as_a_byte_stream") + + def mock_get_inventory(_s, _d: str, _t: str): + return { + "name": "f1.json", + "type": CacheType.FILE, + "stream": Inventory(exp_stream, None), + } + + def mock_send_file(path_or_file, *args, **kwargs): + raise Exception("I'm failing to succeed") + + monkeypatch.setattr(exp_stream, "close", mock_close) + monkeypatch.setattr(CacheManager, "get_inventory", mock_get_inventory) + monkeypatch.setattr( + "pbench.server.api.resources.datasets_inventory.send_file", mock_send_file + ) + + query_get_as("fio_2", "f1.json", HTTPStatus.INTERNAL_SERVER_ERROR) + assert mock_close diff --git a/lib/pbench/test/unit/server/test_datasets_visualize.py b/lib/pbench/test/unit/server/test_datasets_visualize.py index 92e14d1a9d..4e5554d647 100644 --- a/lib/pbench/test/unit/server/test_datasets_visualize.py +++ b/lib/pbench/test/unit/server/test_datasets_visualize.py @@ -1,14 +1,21 @@ from http import HTTPStatus +from io import BytesIO from pathlib import Path +from typing import Optional from pquisby.lib.post_processing import QuisbyProcessing import pytest import requests -from pbench.server import JSON -from pbench.server.api.resources import ApiBase -from pbench.server.cache_manager import CacheManager, Tarball -from pbench.server.database.models.datasets import Dataset, DatasetNotFound +from pbench.server import JSON, JSONOBJECT +from pbench.server.cache_manager import ( + CacheManager, + CacheType, + Inventory, + TarballNotFound, +) +from pbench.server.database.models.datasets import Dataset, DatasetNotFound, Metadata +from pbench.server.database.models.users import User class TestVisualize: @@ -42,42 +49,49 @@ def query_api( return query_api - def mock_find_dataset(self, _dataset: str) -> Tarball: - class Tarball(object): - tarball_path = Path("/dataset/tarball.tar.xz") - - def extract(_tarball_path: Path, _path: str) -> str: - return "CSV_file_as_a_byte_stream" - - return Tarball + def mock_get_inventory(self, _dataset: str, target: str): + return { + "name": Path(target).name, + "type": CacheType.FILE, + "stream": Inventory(BytesIO(b"CSV_file_as_a_byte_stream"), 
None), + } - def mock_get_dataset_metadata(self, _dataset, _key) -> JSON: - return {"dataset.metalog.pbench.script": "uperf"} + @staticmethod + def mock_getvalue(_d: Dataset, _k: str, _u: Optional[User] = None) -> JSON: + return "uperf" def test_get_no_dataset(self, query_get_as): + """The dataset ID doesn't exist""" + response = query_get_as("nonexistent-dataset", "drb", HTTPStatus.NOT_FOUND) assert response.json == {"message": "Dataset 'nonexistent-dataset' not found"} - def test_dataset_not_present(self, query_get_as): - response = query_get_as("fio_2", "drb", HTTPStatus.NOT_FOUND) - assert response.json == { - "message": "No dataset with ID 'random_md5_string4' found" - } + def test_dataset_not_cached(self, monkeypatch, query_get_as): + """The dataset exists, but isn't in the cache manager""" + + def mock_inventory_not_found(self, d: str, _t: str) -> JSONOBJECT: + raise TarballNotFound(d) + + monkeypatch.setattr(Metadata, "getvalue", self.mock_getvalue) + monkeypatch.setattr(CacheManager, "get_inventory", mock_inventory_not_found) + query_get_as("fio_2", "drb", HTTPStatus.INTERNAL_SERVER_ERROR) def test_unauthorized_access(self, query_get_as): + """The dataset exists but isn't READABLE""" + response = query_get_as("test", "drb", HTTPStatus.FORBIDDEN) assert response.json == { "message": "User drb is not authorized to READ a resource owned by test with private access" } def test_successful_get(self, query_get_as, monkeypatch): + """Quisby processing succeeds""" + def mock_extract_data(self, test_name, dataset_name, input_type, data) -> JSON: return {"status": "success", "json_data": "quisby_data"} - monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset) - monkeypatch.setattr( - ApiBase, "_get_dataset_metadata", self.mock_get_dataset_metadata - ) + monkeypatch.setattr(CacheManager, "get_inventory", self.mock_get_inventory) + monkeypatch.setattr(Metadata, "getvalue", self.mock_getvalue) monkeypatch.setattr(QuisbyProcessing, "extract_data", mock_extract_data) response = query_get_as("uperf_1", "test", HTTPStatus.OK) @@ -85,24 +99,13 @@ def mock_extract_data(self, test_name, dataset_name, input_type, data) -> JSON: assert response.json["json_data"] == "quisby_data" def test_unsuccessful_get_with_incorrect_data(self, query_get_as, monkeypatch): - def mock_find_dataset_with_incorrect_data(self, dataset) -> Tarball: - class Tarball(object): - tarball_path = Path("/dataset/tarball.tar.xz") - - def extract(tarball_path, path) -> str: - return "IncorrectData" - - return Tarball + """Quisby processing fails""" def mock_extract_data(self, test_name, dataset_name, input_type, data) -> JSON: return {"status": "failed", "exception": "Unsupported Media Type"} - monkeypatch.setattr( - CacheManager, "find_dataset", mock_find_dataset_with_incorrect_data - ) - monkeypatch.setattr( - ApiBase, "_get_dataset_metadata", self.mock_get_dataset_metadata - ) + monkeypatch.setattr(CacheManager, "get_inventory", self.mock_get_inventory) + monkeypatch.setattr(Metadata, "getvalue", self.mock_getvalue) monkeypatch.setattr(QuisbyProcessing, "extract_data", mock_extract_data) response = query_get_as("uperf_1", "test", HTTPStatus.INTERNAL_SERVER_ERROR) assert response.json["message"].startswith( @@ -110,18 +113,21 @@ def mock_extract_data(self, test_name, dataset_name, input_type, data) -> JSON: ) def test_unsupported_benchmark(self, query_get_as, monkeypatch): - flag = True + """The benchmark name isn't supported""" + + extract_not_called = True def mock_extract_data(*args, **kwargs) -> JSON: - 
nonlocal flag - flag = False + nonlocal extract_not_called + extract_not_called = False - def mock_get_metadata(self, dataset, key) -> JSON: - return {"dataset.metalog.pbench.script": "hammerDB"} + @staticmethod + def mock_get_metadata(_d: Dataset, _k: str, _u: Optional[User] = None) -> JSON: + return "hammerDB" - monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset) - monkeypatch.setattr(ApiBase, "_get_dataset_metadata", mock_get_metadata) + monkeypatch.setattr(CacheManager, "get_inventory", self.mock_get_inventory) + monkeypatch.setattr(Metadata, "getvalue", mock_get_metadata) monkeypatch.setattr(QuisbyProcessing, "extract_data", mock_extract_data) - response = query_get_as("uperf_1", "test", HTTPStatus.UNSUPPORTED_MEDIA_TYPE) + response = query_get_as("uperf_1", "test", HTTPStatus.BAD_REQUEST) assert response.json["message"] == "Unsupported Benchmark: HAMMERDB" - assert flag is True + assert extract_not_called
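
A minimal usage sketch of the Inventory wrapper that Tarball.extract now
returns (illustration only, not part of the patch; the tarball path and member
name below are invented for the example):

    from pathlib import Path

    from pbench.server.cache_manager import Tarball

    # Open one member of a tarball without unpacking the whole tree. The
    # returned Inventory behaves like an IO[bytes] but also keeps a reference
    # to the enclosing TarFile so that both can be closed together.
    inventory = Tarball.extract(
        Path("/srv/pbench/archive/example/uperf_1.tar.xz"),  # hypothetical path
        Path("uperf_1/result.csv"),  # hypothetical member
    )
    try:
        csv_text = inventory.read().decode("utf-8")
    finally:
        # Closing the Inventory closes both the member stream and the TarFile;
        # the inventory API instead defers this to Flask response teardown via
        # resp.call_on_close(stream.close).
        inventory.close()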