PBENCH-1014 Using Tarball.extract in Inventory API for extracting files from tarball (#3105)

* PBENCH-1014

Using Tarball.extract in Inventory API to extract file content from tarball.
riya-17 authored Jun 26, 2023
1 parent fa225e2 commit 999f797
Showing 4 changed files with 185 additions and 119 deletions.
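
The change drops the requirement that a dataset be unpacked on disk before its files can be served: the Inventory API now asks the cache manager for a byte stream read directly out of the tarball. As a usage sketch (not part of the commit), a client could fetch a single file from a dataset like this, assuming a reachable server and a valid resource ID (both hypothetical here):

    import requests  # assumed HTTP client; any would do

    SERVER = "https://pbench.example.com"  # hypothetical server
    DATASET = "0123456789abcdef0123456789abcdef"  # hypothetical resource ID

    # Naming a target streams that one file; omitting the target streams
    # the whole tarball as an attachment (see the API diff below).
    url = f"{SERVER}/api/v1/datasets/{DATASET}/inventory/metadata.log"
    with requests.get(url, stream=True) as resp:
        resp.raise_for_status()
        with open("metadata.log", "wb") as out:
            for chunk in resp.iter_content(chunk_size=8192):
                out.write(chunk)
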
33 changes: 11 additions & 22 deletions lib/pbench/server/api/resources/datasets_inventory.py
@@ -17,7 +17,7 @@
     ParamType,
     Schema,
 )
-from pbench.server.cache_manager import CacheManager, TarballNotFound
+from pbench.server.cache_manager import CacheManager, CacheType, TarballNotFound
 
 
 class DatasetsInventory(ApiBase):
@@ -54,27 +54,18 @@ def _get(
             APIAbort, reporting either "NOT_FOUND" or "UNSUPPORTED_MEDIA_TYPE"
         GET /api/v1/datasets/inventory/{dataset}/{target}
         GET /api/v1/datasets/{dataset}/inventory/{target}
         """
 
         dataset = params.uri["dataset"]
         target = params.uri.get("target")
 
         cache_m = CacheManager(self.config, current_app.logger)
         try:
-            tarball = cache_m.find_dataset(dataset.resource_id)
+            file_info = cache_m.filestream(dataset, target)
         except TarballNotFound as e:
             raise APIAbort(HTTPStatus.NOT_FOUND, str(e))
 
-        if target is None:
-            file_path = tarball.tarball_path
-        else:
-            dataset_location = tarball.unpacked
-            if dataset_location is None:
-                raise APIAbort(HTTPStatus.NOT_FOUND, "The dataset is not unpacked")
-            file_path = dataset_location / target
-
-        if file_path.is_file():
+        if file_info["type"] == CacheType.FILE:
             # Tell send_file to set `Content-Disposition` to "attachment" if
             # targeting the large binary tarball. Otherwise we'll recommend the
             # default "inline": only `is_file()` paths are allowed here, and
@@ -84,17 +75,15 @@ def _get(
             #
             # NOTE: we could choose to be "smarter" based on file size, file
             # type codes (e.g., .csv, .json), and so forth.
-            return send_file(
-                file_path,
+            resp = send_file(
+                file_info["stream"],
                 as_attachment=target is None,
-                download_name=file_path.name,
-            )
-        elif file_path.exists():
-            raise APIAbort(
-                HTTPStatus.UNSUPPORTED_MEDIA_TYPE,
-                "The specified path does not refer to a regular file",
+                download_name=file_info["name"],
             )
+            file_info["stream"].close()
+            return resp
         else:
             raise APIAbort(
-                HTTPStatus.NOT_FOUND, "The specified path does not refer to a file"
+                HTTPStatus.BAD_REQUEST,
+                "The specified path does not refer to a regular file",
             )
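
Since `send_file` is now handed an open binary stream rather than a filesystem path, Flask can no longer derive the download name from the path, which is why the hunk passes `download_name` explicitly and lets `as_attachment` pick the `Content-Disposition`. A minimal standalone sketch of that Flask pattern (the demo route and payload are invented, not pbench code):

    import io

    from flask import Flask, send_file

    app = Flask(__name__)


    @app.route("/demo")
    def demo():
        # Stand-in for the byte stream Tarball.extract() hands back.
        stream = io.BytesIO(b"file contents pulled from a tarball member\n")
        # With a file object instead of a path, Flask cannot infer the
        # filename, so download_name must be supplied; as_attachment=True
        # sets Content-Disposition to "attachment" instead of "inline".
        return send_file(stream, as_attachment=True, download_name="f1.json")

Flask documents that it also closes a file object handed to `send_file` once the response has been served.
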
84 changes: 59 additions & 25 deletions lib/pbench/server/cache_manager.py
@@ -7,7 +7,7 @@
 import shutil
 import subprocess
 import tarfile
-from typing import Optional, Union
+from typing import IO, Optional, Union
 
 from pbench.common import MetadataLog, selinux
 from pbench.server import JSONOBJECT, PbenchServerConfig
@@ -373,7 +373,6 @@ def traverse_cmap(path: Path, cachemap: dict) -> dict[str, dict]:
                 raise BadDirpath(
                     f"Found a file {file_l!r} where a directory was expected in path {str(path)!r}"
                 )
-
             return f_entries[path.name]
         except KeyError as exc:
             raise BadDirpath(
@@ -430,29 +429,57 @@ def get_info(self, path: Path) -> JSONOBJECT:
         return fd_info
 
     @staticmethod
-    def extract(tarball_path: Path, path: str) -> Optional[str]:
-        """Extract a file from the tarball and return it as a string
-
-        Report failures by raising exceptions.
-
-        Args:
-            path: relative path within the tarball of a file
-
-        Raises:
-            MetadataError on failure opening the tarball
-            TarballUnpackError on failure to extract the named path
-
-        Returns:
-            The named file as a string
-        """
-        try:
-            tar = tarfile.open(tarball_path, "r:*")
-        except Exception as exc:
-            raise MetadataError(tarball_path, exc) from exc
-        try:
-            return tar.extractfile(path).read().decode()
-        except Exception as exc:
-            raise TarballUnpackError(tarball_path, f"Unable to extract {path}") from exc
+    def extract(tarball_path: Path, path: Path) -> Optional[IO[bytes]]:
+        """Returns the file stream which yields the contents of
+        a file at the specified path in the Tarball
+
+        Args:
+            tarball_path: absolute path of the tarball
+            path: relative path within the tarball
+
+        Raise:
+            BadDirpath on failure extracting the file from tarball
+        """
+        try:
+            return tarfile.open(tarball_path, "r:*").extractfile(path)
+        except Exception as exc:
+            raise BadDirpath(
+                f"A problem occurred processing {str(path)!r} from {str(tarball_path)!r}: {exc}"
+            )
+
+    def filestream(self, path: str) -> Optional[dict]:
+        """Returns a dictionary containing information about the target
+        file and file stream
+
+        Args:
+            path: relative path within the tarball of a file
+
+        Raises:
+            MetadataError on failure opening the tarball
+            TarballUnpackError on failure to extract the named path
+
+        Returns:
+            Dictionary with file info and file stream
+        """
+        if not path:
+            info = {
+                "name": self.name,
+                "type": CacheType.FILE,
+                "stream": self.tarball_path.open("rb"),
+            }
+        else:
+            file_path = Path(self.name) / path
+            info = self.get_info(file_path)
+            if info["type"] == CacheType.FILE:
+                try:
+                    info["stream"] = Tarball.extract(self.tarball_path, file_path)
+                except Exception as exc:
+                    raise TarballUnpackError(
+                        self.tarball_path, f"Unable to extract {str(file_path)!r}"
+                    ) from exc
+            else:
+                info["stream"] = None
+
+        return info
 
     @staticmethod
     def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]:
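
The rewritten `extract` returns the lazy file object from `tarfile.extractfile` instead of decoding the whole member into a string, so a member's bytes can be read incrementally without unpacking anything to disk. A self-contained sketch of that `tarfile` behavior (throwaway archive built in a temp directory):

    import io
    import tarfile
    import tempfile
    from pathlib import Path

    # Build a tiny tarball with one member (demo data only).
    tmp = Path(tempfile.mkdtemp())
    tarball_path = tmp / "demo.tar.xz"
    with tarfile.open(tarball_path, "w:xz") as tar:
        data = b"line one\nline two\n"
        info = tarfile.TarInfo("demo/metadata.log")
        info.size = len(data)
        tar.addfile(info, io.BytesIO(data))

    # extractfile() returns a readable stream positioned at the member's
    # data; the archive itself is never unpacked to disk.
    with tarfile.open(tarball_path, "r:*") as tar:
        member = tar.extractfile("demo/metadata.log")
        for chunk in iter(lambda: member.read(8), b""):
            print(chunk)

One subtlety: the stream from `extractfile` is only readable while the owning `TarFile` stays open, which appears to be why the new `extract` leaves the archive handle open and lets the caller close the stream when done.
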
@@ -465,15 +492,13 @@ def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]:
         name = Dataset.stem(tarball_path)
         try:
             data = Tarball.extract(tarball_path, f"{name}/metadata.log")
-        except TarballUnpackError:
-            data = None
-        if data:
-            metadata = MetadataLog()
-            metadata.read_string(data)
-            metadata = {s: dict(metadata.items(s)) for s in metadata.sections()}
-            return metadata
-        else:
+        except BadDirpath:
             return None
+        else:
+            metadata_log = MetadataLog()
+            metadata_log.read_file(e.decode() for e in data)
+            metadata = {s: dict(metadata_log.items(s)) for s in metadata_log.sections()}
+            return metadata
 
     @staticmethod
     def subprocess_run(
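
The rewritten `_get_metadata` treats a missing or unreadable metadata.log as a `BadDirpath` from `extract`, and otherwise feeds decoded lines from the byte stream to `MetadataLog.read_file`. Judging by the `sections()`/`items()` calls, `MetadataLog` reads INI-style content; a standalone sketch of the same line-decoding pattern using `configparser` directly (the INI content is invented):

    import configparser
    import io

    # Stand-in for the byte stream Tarball.extract() returns.
    stream = io.BytesIO(
        b"[pbench]\ndate = 2002-05-16T00:00:00\n[run]\ncontroller = node-a\n"
    )

    parser = configparser.ConfigParser()
    # read_file() accepts any iterable of text lines; decoding each byte
    # line mirrors `metadata_log.read_file(e.decode() for e in data)`.
    parser.read_file(line.decode() for line in stream)

    metadata = {s: dict(parser.items(s)) for s in parser.sections()}
    print(metadata)
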
@@ -946,6 +971,13 @@ def create(self, tarfile: Path) -> Tarball:
             controller: associated controller name
             tarfile: dataset tarball path
 
+        Raises
+            BadDirpath: Failure on extracting the file from tarball
+            MetadataError: Failure on getting metadata from metadata.log file
+                in the tarball
+            BadFilename: A bad tarball path was given
+            DuplicateTarball: A duplicate tarball name was detected
+
         Returns
             Tarball object
         """
@@ -955,8 +987,6 @@ def create(self, tarfile: Path) -> Tarball:
                 controller_name = metadata["run"]["controller"]
             else:
                 controller_name = "unknown"
-        except MetadataError:
-            raise
         except Exception as exc:
             raise MetadataError(tarfile, exc)
 
@@ -1005,6 +1035,10 @@ def get_info(self, dataset_id: str, path: Path) -> dict:
         tmap = tarball.get_info(path)
         return tmap
 
+    def filestream(self, dataset, target):
+        tarball = self.find_dataset(dataset.resource_id)
+        return tarball.filestream(target)
+
     def uncache(self, dataset_id: str):
         """Remove the unpacked tarball tree.
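
`CacheManager.filestream` is a thin delegation: it resolves the dataset's `Tarball` and forwards the target path. The returned dict transfers ownership of the open stream to the caller, which must close it (directories come back with `stream` set to None). A toy sketch of that caller contract, with a stub standing in for the real cache manager:

    import io


    def filestream_stub(target: str) -> dict:
        # Stand-in for CacheManager.filestream(dataset, target); the real
        # call returns {"name": ..., "type": CacheType.FILE, "stream": ...}.
        return {"name": target, "type": "FILE", "stream": io.BytesIO(b"{}\n")}


    info = filestream_stub("f1.json")
    try:
        body = info["stream"].read() if info["stream"] else None
    finally:
        # The caller owns the stream, mirroring the API code's close().
        if info["stream"] is not None:
            info["stream"].close()
    print(info["name"], body)
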
76 changes: 58 additions & 18 deletions lib/pbench/test/unit/server/test_cache_manager.py
@@ -1,4 +1,5 @@
 import hashlib
+import io
 from logging import Logger
 import os
 from pathlib import Path
@@ -132,12 +133,6 @@ def test_metadata(
     ):
         """Test behavior with metadata.log access errors."""
 
-        def fake_extract_file(self, path):
-            raise KeyError(f"filename {path} not found")
-
-        def fake_tarfile_open(self, path):
-            raise tarfile.TarError("Invalid Tarfile")
-
         def fake_metadata(tar_path):
             return {"pbench": {"date": "2002-05-16T00:00:00"}}
 
@@ -155,13 +150,6 @@ def fake_metadata_controller(tar_path):
         source_tarball, source_md5, md5 = tarball
         cm = CacheManager(server_config, make_logger)
 
-        expected_metaerror = f"A problem occurred processing metadata.log from {source_tarball!s}: 'Invalid Tarfile'"
-        with monkeypatch.context() as m:
-            m.setattr(tarfile, "open", fake_tarfile_open)
-            with pytest.raises(MetadataError) as exc:
-                cm.create(source_tarball)
-            assert str(exc.value) == expected_metaerror
-
         expected_metaerror = f"A problem occurred processing metadata.log from {source_tarball!s}: \"'run'\""
         with monkeypatch.context() as m:
             m.setattr(Tarball, "_get_metadata", fake_metadata)
@@ -183,11 +171,6 @@ def fake_metadata_controller(tar_path):
                 cm.create(source_tarball)
             assert str(exc.value) == expected_metaerror
 
-        with monkeypatch.context() as m:
-            m.setattr(tarfile.TarFile, "extractfile", fake_extract_file)
-            cm.create(source_tarball)
-            assert cm[md5].metadata is None
-
     def test_with_metadata(
         self, monkeypatch, selinux_disabled, server_config, make_logger, tarball
     ):
@@ -367,6 +350,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
                 f11.txt
             f12_sym -> ../../..
         f1.json
+        metadata.log
 
     Generated cache map
@@ -376,6 +360,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
             'details': <cache_manager.FileInfo object>,
             'children': {
                 'f1.json': {'details': <cache_manager.FileInfo object>},
+                'metadata.log': {'details': <cache_manager.FileInfo object>},
                 'subdir1': {
                     'details': <cache_manager.FileInfo object>,
                     'children': {
@@ -412,6 +397,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
         sub_dir = tmp_path / dir_name
         sub_dir.mkdir(parents=True, exist_ok=True)
         (sub_dir / "f1.json").touch()
+        (sub_dir / "metadata.log").touch()
         for i in range(1, 4):
             (sub_dir / "subdir1" / f"subdir1{i}").mkdir(parents=True, exist_ok=True)
             (sub_dir / "subdir1" / "f11.txt").touch()
@@ -891,6 +877,60 @@ def test_cache_map_get_info_cmap(
         file_info = tb.get_info(Path(file_path))
         assert file_info == expected_msg
 
+    @pytest.mark.parametrize(
+        "file_path, exp_file_type, exp_stream",
+        [
+            ("", CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")),
+            (None, CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")),
+            ("f1.json", CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")),
+            ("subdir1/subdir12", CacheType.DIRECTORY, None),
+        ],
+    )
+    def test_filestream(
+        self, monkeypatch, tmp_path, file_path, exp_file_type, exp_stream
+    ):
+        """Test to extract file contents/stream from a file"""
+        tar = Path("/mock/dir_name.tar.xz")
+        cache = Path("/mock/.cache")
+
+        with monkeypatch.context() as m:
+            m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__)
+            m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__)
+            m.setattr(Tarball, "extract", lambda _t, _p: exp_stream)
+            m.setattr(Path, "open", lambda _s, _m="rb": exp_stream)
+            tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None))
+            tar_dir = TestCacheManager.MockController.generate_test_result_tree(
+                tmp_path, "dir_name"
+            )
+            tb.cache_map(tar_dir)
+            file_info = tb.filestream(file_path)
+            assert file_info["type"] == exp_file_type
+            assert file_info["stream"] == exp_stream
+
+    def test_filestream_tarfile_open(self, monkeypatch, tmp_path):
+        """Test to check non-existent file or tarfile unpack issue"""
+        tar = Path("/mock/dir_name.tar.xz")
+        cache = Path("/mock/.cache")
+
+        def fake_tarfile_open(self, path):
+            raise tarfile.TarError("Invalid Tarfile")
+
+        with monkeypatch.context() as m:
+            m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__)
+            m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__)
+            m.setattr(tarfile, "open", fake_tarfile_open)
+            tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None))
+            tar_dir = TestCacheManager.MockController.generate_test_result_tree(
+                tmp_path, "dir_name"
+            )
+            tb.cache_map(tar_dir)
+
+            path = Path(tb.name) / "subdir1/f11.txt"
+            expected_error_msg = f"An error occurred while unpacking {tb.tarball_path}: Unable to extract {str(path)!r}"
+            with pytest.raises(TarballUnpackError) as exc:
+                tb.filestream("subdir1/f11.txt")
+            assert str(exc.value) == expected_error_msg
+
     def test_find(
         self, selinux_enabled, server_config, make_logger, tarball, monkeypatch
     ):
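
The new tests never touch a real archive: `monkeypatch` swaps `Tarball.extract` and `Path.open` for lambdas that return a canned stream, so `filestream` can be exercised against the synthetic result tree. A self-contained illustration of that substitution pattern (toy class, not the pbench fixtures; run under pytest, whose built-in monkeypatch fixture is injected by name):

    import io


    class Reader:
        """Toy stand-in for Tarball: normally reads a real archive."""

        def extract(self, path: str):
            raise FileNotFoundError(path)  # the "real" behavior we avoid


    def test_extract_is_stubbed(monkeypatch):
        # Replace the method on the class, as the diff does with
        # m.setattr(Tarball, "extract", lambda _t, _p: exp_stream).
        stream = io.BytesIO(b"stubbed contents")
        monkeypatch.setattr(Reader, "extract", lambda _self, _path: stream)
        assert Reader().extract("any/path").read() == b"stubbed contents"
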