Harmonize extract with API logic (#3473)
* Harmonize "inventory" APIs

PBENCH-1194

Rework Tarball inventory access to avoid the nascent cache map so that we can
rely on it inside the Pbench Server process, where tarballs haven't been
unpacked.
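
For reference, the harmonized access pattern used by the changed APIs looks
roughly like this minimal sketch. The `get_inventory` signature and the
"stream"/"name"/"type" keys are taken from the calls visible in the hunks
below, not from the full cache manager source, and `read_result_csv` is a
hypothetical helper:

    from flask import current_app

    from pbench.server.cache_manager import CacheManager

    def read_result_csv(server_config, dataset) -> str:
        # Pull one file out of a dataset's tarball without requiring
        # that the tarball has been unpacked first. get_inventory
        # returns a dict with (at least) "name", "type", and an open
        # "stream", per the calls in the hunks below.
        cache_m = CacheManager(server_config, current_app.logger)
        info = cache_m.get_inventory(dataset.resource_id, "result.csv")
        try:
            return info["stream"].read().decode("utf-8")
        finally:
            info["stream"].close()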

This also fixes some incompatibilities in the Quisby APIs, which relied on the
older `extract` semantics.

To avoid stale streams on the `inventory` API, I encapsulated the
`extractfile` stream in an object that behaves as a stream but closes both the
stream and the tarball when done; this close is deferred until Flask tears
down the Response object.
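
A minimal sketch of that wrapper (hypothetical names; only the
close-both-resources behavior is taken from the description above):

    import tarfile
    from typing import IO

    class ClosingStream:
        """Present a tarfile.extractfile() stream as a readable object
        whose close() also closes the enclosing TarFile."""

        def __init__(self, tar: tarfile.TarFile, stream: IO[bytes]):
            self.tar = tar
            self.stream = stream

        def read(self, size: int = -1) -> bytes:
            return self.stream.read(size)

        def close(self):
            # Close the member stream first, then the tarball, so that
            # neither is left open after Flask tears down the Response.
            try:
                self.stream.close()
            finally:
                self.tar.close()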

Finally, I've added functional tests validating the `contents`, `inventory`,
`visualize`, and `compare` APIs so we'll catch any further infrastructure
"drift" that might affect them.
dbutenhof authored Jul 6, 2023
1 parent b6d63ed commit 61be9df
Showing 9 changed files with 591 additions and 253 deletions.
32 changes: 13 additions & 19 deletions lib/pbench/server/api/resources/datasets_compare.py
@@ -20,11 +20,7 @@
     ParamType,
     Schema,
 )
-from pbench.server.cache_manager import (
-    CacheManager,
-    TarballNotFound,
-    TarballUnpackError,
-)
+from pbench.server.cache_manager import CacheManager
 from pbench.server.database.models.datasets import Metadata


@@ -93,28 +89,26 @@ def _get(
                     dataset.access,
                 )
             )
 
+        benchmark_type = BenchmarkName.__members__.get(benchmark.upper())
+        if not benchmark_type:
+            raise APIAbort(
+                HTTPStatus.BAD_REQUEST, f"Unsupported Benchmark: {benchmark}"
+            )
+
         cache_m = CacheManager(self.config, current_app.logger)
         stream_file = {}
         for dataset in datasets:
             try:
-                tarball = cache_m.find_dataset(dataset.resource_id)
-            except TarballNotFound as e:
+                info = cache_m.get_inventory(dataset.resource_id, "result.csv")
+                file = info["stream"].read().decode("utf-8")
+                info["stream"].close()
+            except Exception as e:
                 raise APIInternalError(
-                    f"Expected dataset with ID '{dataset.resource_id}' is missing from the cache manager."
+                    f"{dataset.name} is missing 'result.csv' file"
                 ) from e
-            try:
-                file = tarball.extract(
-                    tarball.tarball_path, f"{tarball.name}/result.csv"
-                )
-            except TarballUnpackError as e:
-                raise APIInternalError(str(e)) from e
             stream_file[dataset.name] = file
 
-        benchmark_type = BenchmarkName.__members__.get(benchmark.upper())
-        if not benchmark_type:
-            raise APIAbort(
-                HTTPStatus.UNSUPPORTED_MEDIA_TYPE, f"Unsupported Benchmark: {benchmark}"
-            )
         get_quisby_data = QuisbyProcessing().compare_csv_to_json(
             benchmark_type, InputType.STREAM, stream_file
         )
49 changes: 29 additions & 20 deletions lib/pbench/server/api/resources/datasets_inventory.py
@@ -10,6 +10,7 @@
     ApiAuthorizationType,
     ApiBase,
     ApiContext,
+    APIInternalError,
     ApiMethod,
     ApiParams,
     ApiSchema,
@@ -53,37 +54,45 @@ def _get(
         Raises:
             APIAbort, reporting either "NOT_FOUND" or "UNSUPPORTED_MEDIA_TYPE"
 
+        GET /api/v1/datasets/{dataset}/inventory/{target}
         """
         dataset = params.uri["dataset"]
         target = params.uri.get("target")
 
         cache_m = CacheManager(self.config, current_app.logger)
         try:
-            file_info = cache_m.filestream(dataset, target)
+            file_info = cache_m.get_inventory(dataset.resource_id, target)
         except TarballNotFound as e:
             raise APIAbort(HTTPStatus.NOT_FOUND, str(e))
 
-        if file_info["type"] == CacheType.FILE:
-            # Tell send_file to set `Content-Disposition` to "attachment" if
-            # targeting the large binary tarball. Otherwise we'll recommend the
-            # default "inline": only `is_file()` paths are allowed here, and
-            # most are likely "displayable". While `send_file` will guess the
-            # download_name from the path, setting it explicitly does no harm
-            # and supports a unit test mock with no real file.
-            #
-            # NOTE: we could choose to be "smarter" based on file size, file
-            # type codes (e.g., .csv, .json), and so forth.
-            resp = send_file(
-                file_info["stream"],
-                as_attachment=target is None,
-                download_name=file_info["name"],
-            )
-            file_info["stream"].close()
-            return resp
-        else:
+        if file_info["type"] != CacheType.FILE:
             raise APIAbort(
                 HTTPStatus.BAD_REQUEST,
                 "The specified path does not refer to a regular file",
             )
+
+        # Tell send_file to set `Content-Disposition` to "attachment" if
+        # targeting the large binary tarball. Otherwise we'll recommend the
+        # default "inline": only `is_file()` paths are allowed here, and
+        # most are likely "displayable". While `send_file` will guess the
+        # download_name from the path, setting it explicitly does no harm
+        # and supports a unit test mock with no real file.
+        #
+        # Werkzeug will set the mime type and content encoding based on the
+        # download_name suffix.
+        #
+        # We link a callback to close the file stream and TarFile object on
+        # completion.
+        stream = file_info["stream"]
+        try:
+            resp = send_file(
+                stream, as_attachment=target is None, download_name=file_info["name"]
+            )
+        except Exception as e:
+            if stream:
+                stream.close()
+            raise APIInternalError(
+                f"Problem sending {dataset}:{target} stream {stream}: {str(e)!r}"
+            )
+        resp.call_on_close(stream.close)
+        return resp
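
The pattern in this hunk (open the stream, hand it to `send_file`, and
register `call_on_close` so cleanup happens only after the body is
transmitted) generalizes to any Flask endpoint streaming from a resource that
needs explicit teardown. A standalone sketch under that assumption
(hypothetical endpoint and path, not Pbench code):

    from flask import Flask, send_file

    app = Flask(__name__)

    @app.route("/download/<name>")
    def download(name: str):
        # Open the resource ourselves so we control when it's released.
        stream = open(f"/srv/files/{name}", "rb")  # hypothetical location
        try:
            resp = send_file(stream, download_name=name)
        except Exception:
            stream.close()
            raise
        # Flask invokes this callback when the response is closed, i.e.
        # after the stream contents have actually been sent.
        resp.call_on_close(stream.close)
        return resp
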
32 changes: 10 additions & 22 deletions lib/pbench/server/api/resources/datasets_visualize.py
@@ -19,12 +19,8 @@
     ParamType,
     Schema,
 )
-from pbench.server.cache_manager import (
-    CacheManager,
-    TarballNotFound,
-    TarballUnpackError,
-)
-from pbench.server.database import Dataset
+from pbench.server.cache_manager import CacheManager
+from pbench.server.database.models.datasets import Metadata
 
 
 class DatasetsVisualize(ApiBase):
@@ -63,29 +59,21 @@ def _get(
         """
 
         dataset = params.uri["dataset"]
-        cache_m = CacheManager(self.config, current_app.logger)
-
-        try:
-            tarball = cache_m.find_dataset(dataset.resource_id)
-        except TarballNotFound as e:
-            raise APIAbort(
-                HTTPStatus.NOT_FOUND, f"No dataset with ID '{e.tarball}' found"
-            ) from e
-
-        metadata = self._get_dataset_metadata(
-            dataset, ["dataset.metalog.pbench.script"]
-        )
-        benchmark = metadata["dataset.metalog.pbench.script"].upper()
+        metadata = Metadata.getvalue(dataset, "dataset.metalog.pbench.script")
+        benchmark = metadata.upper()
         benchmark_type = BenchmarkName.__members__.get(benchmark)
         if not benchmark_type:
             raise APIAbort(
-                HTTPStatus.UNSUPPORTED_MEDIA_TYPE, f"Unsupported Benchmark: {benchmark}"
+                HTTPStatus.BAD_REQUEST, f"Unsupported Benchmark: {benchmark}"
             )
 
-        name = Dataset.stem(tarball.tarball_path)
+        cache_m = CacheManager(self.config, current_app.logger)
         try:
-            file = tarball.extract(tarball.tarball_path, f"{name}/result.csv")
-        except TarballUnpackError as e:
+            info = cache_m.get_inventory(dataset.resource_id, "result.csv")
+            file = info["stream"].read().decode("utf-8")
+            info["stream"].close()
+        except Exception as e:
             raise APIInternalError(str(e)) from e
 
         get_quisby_data = QuisbyProcessing().extract_data(
