Compare datasets - Integrate Quisby into Pbench Server API (#3470)
PBENCH-1189
---------

Co-authored-by: siddardh <sira@redhat27!>
siddardh-ra and siddardh authored Jun 26, 2023
1 parent 999f797 commit 1eebe0f
Showing 5 changed files with 300 additions and 0 deletions.
1 change: 1 addition & 0 deletions lib/pbench/client/__init__.py
@@ -39,6 +39,7 @@ class API(Enum):
"""

DATASETS = "datasets"
DATASETS_COMPARE = "datasets_compare"
DATASETS_CONTENTS = "datasets_contents"
DATASETS_DETAIL = "datasets_detail"
DATASETS_INVENTORY = "datasets_inventory"
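
For reference, the new enum member is the symbolic name client code uses for the endpoint; the server's endpoint configuration (see the test_endpoint_configure change at the end of this diff) maps it to the "{uri}/compare" URI template. A minimal sketch, for illustration only:

    from pbench.client import API

    # The enum value is the endpoint name the server advertises; the server
    # resolves it to the "<rest_uri>/compare" URI template.
    assert API.DATASETS_COMPARE.value == "datasets_compare"
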
7 changes: 7 additions & 0 deletions lib/pbench/server/api/__init__.py
@@ -14,6 +14,7 @@
from pbench.common.logger import get_pbench_logger
from pbench.server import PbenchServerConfig
from pbench.server.api.resources.api_key import APIKeyManage
from pbench.server.api.resources.datasets_compare import DatasetsCompare
from pbench.server.api.resources.datasets_inventory import DatasetsInventory
from pbench.server.api.resources.datasets_list import DatasetsList
from pbench.server.api.resources.datasets_metadata import DatasetsMetadata
@@ -63,6 +64,12 @@ def register_endpoints(api: Api, app: Flask, config: PbenchServerConfig):
endpoint="datasets",
resource_class_args=(config,),
)
api.add_resource(
DatasetsCompare,
f"{base_uri}/compare",
endpoint="datasets_compare",
resource_class_args=(config,),
)
api.add_resource(
DatasetsContents,
f"{base_uri}/datasets/<string:dataset>/contents/",
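
With the resource registered, the endpoint answers GET requests at "{base_uri}/compare". A minimal sketch of a direct query using requests; the host name, API key, and dataset resource IDs are placeholders, not part of this commit:

    import requests

    # Compare two datasets by resource ID; private datasets require a bearer token.
    response = requests.get(
        "https://pbench.example.com/api/v1/compare",
        params={"datasets": "d1,d2"},
        headers={"Authorization": "Bearer <api-key>"},
    )
    response.raise_for_status()
    payload = response.json()
    assert payload["status"] == "success"
    comparison = payload["json_data"]  # visualization-ready Quisby output
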
125 changes: 125 additions & 0 deletions lib/pbench/server/api/resources/datasets_compare.py
@@ -0,0 +1,125 @@
from http import HTTPStatus
from urllib.request import Request

from flask import current_app, jsonify
from flask.wrappers import Response
from pquisby.lib.post_processing import BenchmarkName, InputType, QuisbyProcessing

from pbench.server import OperationCode, PbenchServerConfig
from pbench.server.api.resources import (
    APIAbort,
    ApiAuthorization,
    ApiAuthorizationType,
    ApiBase,
    ApiContext,
    APIInternalError,
    ApiMethod,
    ApiParams,
    ApiSchema,
    Parameter,
    ParamType,
    Schema,
)
from pbench.server.cache_manager import (
    CacheManager,
    TarballNotFound,
    TarballUnpackError,
)
from pbench.server.database.models.datasets import Metadata


class DatasetsCompare(ApiBase):
    """This class implements the Server API used to retrieve comparison data
    for visualization.
    """

    def __init__(self, config: PbenchServerConfig):
        super().__init__(
            config,
            ApiSchema(
                ApiMethod.GET,
                OperationCode.READ,
                query_schema=Schema(
                    Parameter(
                        "datasets",
                        ParamType.LIST,
                        element_type=ParamType.DATASET,
                        string_list=",",
                        required=True,
                    ),
                ),
                authorization=ApiAuthorizationType.NONE,
            ),
        )

    def _get(
        self, params: ApiParams, request: Request, context: ApiContext
    ) -> Response:
        """Use Quisby to compare the selected datasets into a form that
        supports visualization.

        GET /api/v1/compare?datasets=d1,d2,d3

        Args:
            params: includes the URI parameters, which provide the list of datasets
            request: the original incoming Request object
            context: the API context dictionary

        Raises:
            UnauthorizedAccess: the user isn't authorized for the requested access
            APIAbort, reporting BAD_REQUEST or UNSUPPORTED_MEDIA_TYPE
            APIInternalError, reporting the failure message
        """
        datasets = params.query.get("datasets")
        benchmark_choice = None
        for dataset in datasets:
            benchmark = Metadata.getvalue(dataset, "dataset.metalog.pbench.script")
            # Validate that all the selected datasets are of the same benchmark
            if not benchmark_choice:
                benchmark_choice = benchmark
            elif benchmark != benchmark_choice:
                raise APIAbort(
                    HTTPStatus.BAD_REQUEST,
                    f"Selected dataset benchmarks must match: {benchmark_choice} and {benchmark} cannot be compared.",
                )

            # Validate that the user is authorized to access the selected datasets
            self._check_authorization(
                ApiAuthorization(
                    ApiAuthorizationType.USER_ACCESS,
                    OperationCode.READ,
                    dataset.owner_id,
                    dataset.access,
                )
            )
        cache_m = CacheManager(self.config, current_app.logger)
        stream_file = {}
        for dataset in datasets:
            try:
                tarball = cache_m.find_dataset(dataset.resource_id)
            except TarballNotFound as e:
                raise APIInternalError(
                    f"Expected dataset with ID '{dataset.resource_id}' is missing from the cache manager."
                ) from e
            try:
                file = tarball.extract(
                    tarball.tarball_path, f"{tarball.name}/result.csv"
                )
            except TarballUnpackError as e:
                raise APIInternalError(str(e)) from e
            stream_file[dataset.name] = file

        benchmark_type = BenchmarkName.__members__.get(benchmark.upper())
        if not benchmark_type:
            raise APIAbort(
                HTTPStatus.UNSUPPORTED_MEDIA_TYPE, f"Unsupported Benchmark: {benchmark}"
            )
        get_quisby_data = QuisbyProcessing().compare_csv_to_json(
            benchmark_type, InputType.STREAM, stream_file
        )
        if get_quisby_data["status"] != "success":
            raise APIInternalError(
                f"Quisby processing failure. Exception: {get_quisby_data['exception']}"
            )
        return jsonify(get_quisby_data)
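
The handler's core transformation is the compare_csv_to_json() call: a mapping of dataset names to result.csv contents goes in, and a JSON document suitable for visualization comes out. A standalone sketch of that call, assuming BenchmarkName.UPERF exists for the uperf benchmark (the handler derives it via BenchmarkName.__members__.get(benchmark.upper())); the CSV strings here are placeholders:

    from pquisby.lib.post_processing import BenchmarkName, InputType, QuisbyProcessing

    # Map each dataset name to the text of its extracted result.csv file,
    # mirroring the stream_file dict built by DatasetsCompare._get().
    stream_file = {
        "uperf_1": "<contents of uperf_1/result.csv>",
        "uperf_2": "<contents of uperf_2/result.csv>",
    }
    result = QuisbyProcessing().compare_csv_to_json(
        BenchmarkName.UPERF, InputType.STREAM, stream_file
    )
    if result["status"] == "success":
        data = result["json_data"]  # returned to the caller via jsonify()
    else:
        error = result["exception"]  # surfaced as an APIInternalError
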
166 changes: 166 additions & 0 deletions lib/pbench/test/unit/server/test_datasets_compare.py
@@ -0,0 +1,166 @@
from http import HTTPStatus
from pathlib import Path
from typing import Optional

from pquisby.lib.post_processing import QuisbyProcessing
import pytest
import requests

from pbench.server import JSON
from pbench.server.cache_manager import CacheManager, TarballUnpackError
from pbench.server.database.models.datasets import Dataset, DatasetNotFound, Metadata
from pbench.server.database.models.users import User


def mock_get_value(dataset: Dataset, key: str, user: Optional[User] = None) -> str:
    if dataset.name == "uperf_3" or dataset.name == "uperf_4":
        return "hammerDB"
    return "uperf"


class TestCompareDatasets:
    @pytest.fixture()
    def query_get_as(self, client, server_config, more_datasets, get_token_func):
        """Helper fixture to perform the API query and validate an expected
        return status.

        Args:
            client: Flask test API client fixture
            server_config: Pbench config fixture
            more_datasets: Dataset construction fixture
            get_token_func: Pbench token fixture
        """

        def query_api(
            datasets: list, user: str, expected_status: HTTPStatus
        ) -> requests.Response:
            ds_list = []
            for dataset in datasets:
                try:
                    dataset_id = Dataset.query(name=dataset).resource_id
                    ds_list.append(dataset_id)
                except DatasetNotFound:
                    ds_list.append(dataset)  # Allow passing a deliberately bad value
            headers = None
            if user:
                headers = {"authorization": f"bearer {get_token_func(user)}"}
            response = client.get(
                f"{server_config.rest_uri}/compare",
                query_string={"datasets": ds_list},
                headers=headers,
            )
            assert response.status_code == expected_status
            return response

        return query_api

    class MockTarball:
        tarball_path = Path("/dataset/tarball.tar.xz")
        name = "tarball"

        @staticmethod
        def extract(_tarball_path: Path, _path: str) -> str:
            return "CSV_file_as_a_string"

    def mock_find_dataset(self, dataset) -> MockTarball:
        # Validate the resource_id
        Dataset.query(resource_id=dataset)
        return self.MockTarball()

    def test_dataset_not_present(self, query_get_as, monkeypatch):
        monkeypatch.setattr(Metadata, "getvalue", mock_get_value)

        query_get_as(["fio_2"], "drb", HTTPStatus.INTERNAL_SERVER_ERROR)

    def test_unsuccessful_get_with_incorrect_data(self, query_get_as, monkeypatch):
        @staticmethod
        def mock_extract(_tarball_path: Path, _path: str) -> str:
            return "IncorrectData"

        def mock_compare_csv_to_json(
            self, benchmark_name, input_type, data_stream
        ) -> JSON:
            return {"status": "failed", "exception": "Unsupported Media Type"}

        monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
        monkeypatch.setattr(self.MockTarball, "extract", mock_extract)
        monkeypatch.setattr(Metadata, "getvalue", mock_get_value)
        monkeypatch.setattr(
            QuisbyProcessing, "compare_csv_to_json", mock_compare_csv_to_json
        )
        query_get_as(["uperf_1", "uperf_2"], "test", HTTPStatus.INTERNAL_SERVER_ERROR)

    def test_tarball_unpack_exception(self, query_get_as, monkeypatch):
        @staticmethod
        def mock_extract(_tarball_path: Path, _path: str):
            raise TarballUnpackError(
                _tarball_path, f"Testing unpack exception for path {_path}"
            )

        monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
        monkeypatch.setattr(self.MockTarball, "extract", mock_extract)
        monkeypatch.setattr(Metadata, "getvalue", mock_get_value)
        query_get_as(["uperf_1", "uperf_2"], "test", HTTPStatus.INTERNAL_SERVER_ERROR)

    @pytest.mark.parametrize(
        "user,datasets,exp_status,exp_message",
        (
            (
                "drb",
                ["uperf_1", "nonexistent-dataset"],
                HTTPStatus.BAD_REQUEST,
                "Unrecognized list value ['nonexistent-dataset'] given for parameter datasets; expected Dataset",
            ),
            (
                "drb",
                ["uperf_1", "uperf_2"],
                HTTPStatus.FORBIDDEN,
                "User drb is not authorized to READ a resource owned by test with private access",
            ),
            (
                "test",
                ["uperf_1", "uperf_2"],
                HTTPStatus.OK,
                None,
            ),
            (
                None,
                ["fio_1", "fio_2"],
                HTTPStatus.OK,
                None,
            ),
            (
                "test",
                ["fio_1", "uperf_3"],
                HTTPStatus.BAD_REQUEST,
                "Selected dataset benchmarks must match: uperf and hammerDB cannot be compared.",
            ),
            (
                "test",
                ["uperf_3", "uperf_4"],
                HTTPStatus.UNSUPPORTED_MEDIA_TYPE,
                "Unsupported Benchmark: hammerDB",
            ),
        ),
    )
    def test_datasets_with_different_benchmark(
        self, user, datasets, exp_status, exp_message, query_get_as, monkeypatch
    ):
        def mock_compare_csv_to_json(
            self, benchmark_name, input_type, data_stream
        ) -> JSON:
            return {"status": "success", "json_data": "quisby_data"}

        monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
        monkeypatch.setattr(Metadata, "getvalue", mock_get_value)
        monkeypatch.setattr(
            QuisbyProcessing, "compare_csv_to_json", mock_compare_csv_to_json
        )

        response = query_get_as(datasets, user, exp_status)
        if exp_status == HTTPStatus.OK:
            assert response.json["status"] == "success"
            assert response.json["json_data"] == "quisby_data"
        else:
            assert response.json["message"] == exp_message
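
For completeness, the new module can be exercised on its own; a minimal sketch assuming it is run from the repository root with the unit-test dependencies installed:

    import pytest

    # Run only the new compare-API tests; equivalent to invoking
    # "pytest -v lib/pbench/test/unit/server/test_datasets_compare.py" from a shell.
    pytest.main(["-v", "lib/pbench/test/unit/server/test_datasets_compare.py"])
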
1 change: 1 addition & 0 deletions lib/pbench/test/unit/server/test_endpoint_configure.py
@@ -59,6 +59,7 @@ def check_config(self, client, server_config, host, my_headers={}):
"template": f"{uri}/datasets/{{dataset}}",
"params": {"dataset": {"type": "string"}},
},
"datasets_compare": {"template": f"{uri}/compare", "params": {}},
"datasets_contents": {
"template": f"{uri}/datasets/{{dataset}}/contents/{{target}}",
"params": {
