Harmonize extract with API logic (#3473)
* Harmonize "inventory" APIs

PBENCH-1194

Rework Tarball inventory access to avoid the nascent cache map so that we can
rely on it inside the Pbench Server process, where tarballs haven't been
unpacked.
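
For reference, the harmonized access pattern used by the changed APIs looks
roughly like this minimal sketch. The `get_inventory` signature and the
"stream"/"name"/"type" keys are taken from the calls visible in the hunks
below, not from the full cache manager source, and `read_result_csv` is a
hypothetical helper:

    from flask import current_app

    from pbench.server.cache_manager import CacheManager

    def read_result_csv(server_config, dataset) -> str:
        # Pull one file out of a dataset's tarball without requiring
        # that the tarball has been unpacked first. get_inventory
        # returns a dict with (at least) "name", "type", and an open
        # "stream", per the calls in the hunks below.
        cache_m = CacheManager(server_config, current_app.logger)
        info = cache_m.get_inventory(dataset.resource_id, "result.csv")
        try:
            return info["stream"].read().decode("utf-8")
        finally:
            info["stream"].close()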

This also fixes some incompatibilities in the Quisby APIs, which relied on the
older `extract` semantics.

To avoid stale streams on the `inventory` API, I encapsulated the
`extractfile` stream in an object that behaves as a stream but closes both the
stream and the tarball when done; this close is deferred until Flask tears
down the Response object.
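
A minimal sketch of that wrapper (hypothetical names; only the
close-both-resources behavior is taken from the description above):

    import tarfile
    from typing import IO

    class ClosingStream:
        """Present a tarfile.extractfile() stream as a readable object
        whose close() also closes the enclosing TarFile."""

        def __init__(self, tar: tarfile.TarFile, stream: IO[bytes]):
            self.tar = tar
            self.stream = stream

        def read(self, size: int = -1) -> bytes:
            return self.stream.read(size)

        def close(self):
            # Close the member stream first, then the tarball, so that
            # neither is left open after Flask tears down the Response.
            try:
                self.stream.close()
            finally:
                self.tar.close()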

Finally, I've added functional tests validating the `contents`, `inventory`,
`visualize`, and `compare` APIs so we'll catch any further infrastructure
"drift" that might affect them.
dbutenhof authored Jul 6, 2023
1 parent b6d63ed commit 61be9df
Showing 9 changed files with 591 additions and 253 deletions.
32 changes: 13 additions & 19 deletions lib/pbench/server/api/resources/datasets_compare.py
@@ -20,11 +20,7 @@
     ParamType,
     Schema,
 )
-from pbench.server.cache_manager import (
-    CacheManager,
-    TarballNotFound,
-    TarballUnpackError,
-)
+from pbench.server.cache_manager import CacheManager
 from pbench.server.database.models.datasets import Metadata


@@ -93,28 +89,26 @@ def _get(
                     dataset.access,
                 )
             )
 
+        benchmark_type = BenchmarkName.__members__.get(benchmark.upper())
+        if not benchmark_type:
+            raise APIAbort(
+                HTTPStatus.BAD_REQUEST, f"Unsupported Benchmark: {benchmark}"
+            )
+
         cache_m = CacheManager(self.config, current_app.logger)
         stream_file = {}
         for dataset in datasets:
             try:
-                tarball = cache_m.find_dataset(dataset.resource_id)
-            except TarballNotFound as e:
+                info = cache_m.get_inventory(dataset.resource_id, "result.csv")
+                file = info["stream"].read().decode("utf-8")
+                info["stream"].close()
+            except Exception as e:
                 raise APIInternalError(
-                    f"Expected dataset with ID '{dataset.resource_id}' is missing from the cache manager."
+                    f"{dataset.name} is missing 'result.csv' file"
                 ) from e
-            try:
-                file = tarball.extract(
-                    tarball.tarball_path, f"{tarball.name}/result.csv"
-                )
-            except TarballUnpackError as e:
-                raise APIInternalError(str(e)) from e
             stream_file[dataset.name] = file
 
-        benchmark_type = BenchmarkName.__members__.get(benchmark.upper())
-        if not benchmark_type:
-            raise APIAbort(
-                HTTPStatus.UNSUPPORTED_MEDIA_TYPE, f"Unsupported Benchmark: {benchmark}"
-            )
         get_quisby_data = QuisbyProcessing().compare_csv_to_json(
             benchmark_type, InputType.STREAM, stream_file
         )
49 changes: 29 additions & 20 deletions lib/pbench/server/api/resources/datasets_inventory.py
@@ -10,6 +10,7 @@
     ApiAuthorizationType,
     ApiBase,
     ApiContext,
+    APIInternalError,
     ApiMethod,
     ApiParams,
     ApiSchema,
@@ -53,37 +54,45 @@ def _get(
         Raises:
             APIAbort, reporting either "NOT_FOUND" or "UNSUPPORTED_MEDIA_TYPE"
 
+        GET /api/v1/datasets/{dataset}/inventory/{target}
         """
         dataset = params.uri["dataset"]
         target = params.uri.get("target")
 
         cache_m = CacheManager(self.config, current_app.logger)
         try:
-            file_info = cache_m.filestream(dataset, target)
+            file_info = cache_m.get_inventory(dataset.resource_id, target)
         except TarballNotFound as e:
             raise APIAbort(HTTPStatus.NOT_FOUND, str(e))
 
-        if file_info["type"] == CacheType.FILE:
-            # Tell send_file to set `Content-Disposition` to "attachment" if
-            # targeting the large binary tarball. Otherwise we'll recommend the
-            # default "inline": only `is_file()` paths are allowed here, and
-            # most are likely "displayable". While `send_file` will guess the
-            # download_name from the path, setting it explicitly does no harm
-            # and supports a unit test mock with no real file.
-            #
-            # NOTE: we could choose to be "smarter" based on file size, file
-            # type codes (e.g., .csv, .json), and so forth.
-            resp = send_file(
-                file_info["stream"],
-                as_attachment=target is None,
-                download_name=file_info["name"],
-            )
-            file_info["stream"].close()
-            return resp
-        else:
+        if file_info["type"] != CacheType.FILE:
             raise APIAbort(
                 HTTPStatus.BAD_REQUEST,
                 "The specified path does not refer to a regular file",
             )
+
+        # Tell send_file to set `Content-Disposition` to "attachment" if
+        # targeting the large binary tarball. Otherwise we'll recommend the
+        # default "inline": only `is_file()` paths are allowed here, and
+        # most are likely "displayable". While `send_file` will guess the
+        # download_name from the path, setting it explicitly does no harm
+        # and supports a unit test mock with no real file.
+        #
+        # Werkzeug will set the mime type and content encoding based on the
+        # download_name suffix.
+        #
+        # We link a callback to close the file stream and TarFile object on
+        # completion.
+        stream = file_info["stream"]
+        try:
+            resp = send_file(
+                stream, as_attachment=target is None, download_name=file_info["name"]
+            )
+        except Exception as e:
+            if stream:
+                stream.close()
+            raise APIInternalError(
+                f"Problem sending {dataset}:{target} stream {stream}: {str(e)!r}"
+            )
+        resp.call_on_close(stream.close)
+        return resp
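
The pattern in this hunk (open the stream, hand it to `send_file`, and
register `call_on_close` so cleanup happens only after the body is
transmitted) generalizes to any Flask endpoint streaming from a resource that
needs explicit teardown. A standalone sketch under that assumption
(hypothetical endpoint and path, not Pbench code):

    from flask import Flask, send_file

    app = Flask(__name__)

    @app.route("/download/<name>")
    def download(name: str):
        # Open the resource ourselves so we control when it's released.
        stream = open(f"/srv/files/{name}", "rb")  # hypothetical location
        try:
            resp = send_file(stream, download_name=name)
        except Exception:
            stream.close()
            raise
        # Flask invokes this callback when the response is closed, i.e.
        # after the stream contents have actually been sent.
        resp.call_on_close(stream.close)
        return resp
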
32 changes: 10 additions & 22 deletions lib/pbench/server/api/resources/datasets_visualize.py
@@ -19,12 +19,8 @@
     ParamType,
     Schema,
 )
-from pbench.server.cache_manager import (
-    CacheManager,
-    TarballNotFound,
-    TarballUnpackError,
-)
-from pbench.server.database import Dataset
+from pbench.server.cache_manager import CacheManager
+from pbench.server.database.models.datasets import Metadata
 
 
 class DatasetsVisualize(ApiBase):
@@ -63,29 +59,21 @@ def _get(
         """
 
         dataset = params.uri["dataset"]
-        cache_m = CacheManager(self.config, current_app.logger)
-
-        try:
-            tarball = cache_m.find_dataset(dataset.resource_id)
-        except TarballNotFound as e:
-            raise APIAbort(
-                HTTPStatus.NOT_FOUND, f"No dataset with ID '{e.tarball}' found"
-            ) from e
-
-        metadata = self._get_dataset_metadata(
-            dataset, ["dataset.metalog.pbench.script"]
-        )
-        benchmark = metadata["dataset.metalog.pbench.script"].upper()
+        metadata = Metadata.getvalue(dataset, "dataset.metalog.pbench.script")
+        benchmark = metadata.upper()
         benchmark_type = BenchmarkName.__members__.get(benchmark)
         if not benchmark_type:
             raise APIAbort(
-                HTTPStatus.UNSUPPORTED_MEDIA_TYPE, f"Unsupported Benchmark: {benchmark}"
+                HTTPStatus.BAD_REQUEST, f"Unsupported Benchmark: {benchmark}"
             )
 
-        name = Dataset.stem(tarball.tarball_path)
+        cache_m = CacheManager(self.config, current_app.logger)
         try:
-            file = tarball.extract(tarball.tarball_path, f"{name}/result.csv")
-        except TarballUnpackError as e:
+            info = cache_m.get_inventory(dataset.resource_id, "result.csv")
+            file = info["stream"].read().decode("utf-8")
+            info["stream"].close()
+        except Exception as e:
             raise APIInternalError(str(e)) from e
 
         get_quisby_data = QuisbyProcessing().extract_data(
