Skip to content

Commit

Permalink
Add python package size delta report feature for SMD images
Browse files Browse the repository at this point in the history
  • Loading branch information
aws-tianquaw committed Apr 4, 2024
1 parent b09be00 commit 54bea35
Show file tree
Hide file tree
Showing 9 changed files with 273 additions and 5 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ repos:
hooks:
- id: autoflake
args: ['--in-place', '--expand-star-imports', '--ignore-init-module-imports', '--remove-all-unused-imports']
additional_dependencies: [setuptools]
- repo: https://github.com/psf/black
rev: 23.3.0
hooks:
Expand Down
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,16 @@ VERSION=<Insert SageMaker Distribution version in semver format here. example: 0
python ./src/main.py generate-staleness-report --target-patch-version $VERSION
```

### Package Size Delta Report

To generate and view the package size delta report for a given
SageMaker Distribution image version compared with a base image version, run the following command:

```
BASE_PATCH_VERSION=<Insert SageMaker Distribution version of the base image in semver format here. example: 1.6.1>
VERSION=<Insert SageMaker Distribution version of the target image in semver format here. example: 1.6.2>
python ./src/main.py generate-size-report --base-patch-version $BASE_PATCH_VERSION --target-patch-version $VERSION
```


## Example use cases
Expand Down
31 changes: 31 additions & 0 deletions src/collect_pkg_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""
Read installed Conda packages with sorted size.
"""

import json
import os
import pathlib


def dump_conda_package_metadata():
    """Print version and size metadata of every installed Conda package as JSON.

    Reads each ``*.json`` record under ``$CONDA_PREFIX/conda-meta`` and prints a
    single JSON object to stdout that maps the package name to
    ``{"version": ..., "size": ...}``, ordered from the largest package to the
    smallest.

    Raises:
        KeyError: If ``CONDA_PREFIX`` is not set, or a record lacks the
            ``version`` or ``size`` field.
    """
    prefix = os.environ["CONDA_PREFIX"]
    meta_data_path = pathlib.Path(prefix) / "conda-meta"

    meta_data = {}
    for meta_data_file in meta_data_path.glob("*.json"):
        with open(meta_data_file, "r", encoding="utf-8") as f:
            record = json.load(f)
        # Record files are named "<name>-<version>-<build>.json" and package names
        # may themselves contain hyphens (e.g. "python-dateutil"), so cutting the
        # file name at the first "-" would merge distinct packages into one entry.
        # Prefer the name stored in the record; otherwise strip the two trailing
        # "-<version>-<build>" components from the file name.
        name = record.get("name") or meta_data_file.stem.rsplit("-", 2)[0]
        meta_data[name] = {"version": record["version"], "size": record["size"]}

    # Sort the package sizes in decreasing order.
    meta_data = dict(sorted(meta_data.items(), key=lambda item: item[1]["size"], reverse=True))

    print(json.dumps(meta_data))


if __name__ == "__main__":
    dump_conda_package_metadata()
2 changes: 2 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"additional_packages_env_in_file": "gpu.additional_packages_env.in",
"image_tag_generator": "{image_version}-gpu",
"env_out_filename": "gpu.env.out",
"package_metadata_filename": "gpu_pkg_metadata.json",
"pytest_flags": ["--use-gpu"],
"image_type": "gpu",
},
Expand All @@ -20,6 +21,7 @@
"additional_packages_env_in_file": "cpu.additional_packages_env.in",
"image_tag_generator": "{image_version}-cpu",
"env_out_filename": "cpu.env.out",
"package_metadata_filename": "cpu_pkg_metadata.json",
"pytest_flags": [],
"image_type": "cpu",
},
Expand Down
43 changes: 42 additions & 1 deletion src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@
_PATCH,
_get_dependency_upper_bound_for_runtime_upgrade,
)
from package_staleness import generate_package_staleness_report
from package_report import (
generate_package_size_report,
generate_package_staleness_report,
)
from release_notes_generator import generate_release_notes
from utils import (
get_dir_for_version,
Expand Down Expand Up @@ -237,6 +240,9 @@ def _build_local_images(
target_version: Version, target_ecr_repo_list: list[str], force: bool, skip_tests=False
) -> (list[str], list[dict[str, str]]):
target_version_dir = get_dir_for_version(target_version)
# Copy the script for collecting python package metadata to target version directory.
# This is a temporary solution, should be modified once we separate the SMD tool and the build artifacts.
shutil.copy("src/collect_pkg_metadata.py", target_version_dir)

generated_image_ids = []
generated_image_versions = []
Expand Down Expand Up @@ -267,6 +273,9 @@ def _build_local_images(
with open(f'{target_version_dir}/{config["env_out_filename"]}', "wb") as f:
f.write(container_logs)

# Generate Python package metadata for the image built.
_generate_python_package_metadata(target_version_dir, image, config)

# Generate change logs. Use the original image generator config which contains the name
# of the actual env.in file instead of the 'config'.
generate_change_log(target_version, image_generator_config)
Expand All @@ -285,6 +294,10 @@ def _build_local_images(
"localhost/sagemaker-distribution", config["image_tag_generator"].format(image_version=str(target_version))
)

# Clean up the script for collecting python package metadata from build artifacts.
if os.path.exists(f"{target_version_dir}/collect_pkg_metadata.py"):
os.remove(f"{target_version_dir}/collect_pkg_metadata.py")

return generated_image_ids, generated_image_versions


Expand Down Expand Up @@ -328,6 +341,19 @@ def _get_ecr_credentials(region, repository: str) -> (str, str):
return base64.b64decode(_authorization_data["authorizationToken"]).decode().split(":")


def _generate_python_package_metadata(target_version_dir, image, image_config):
    """Collect package metadata from a built image and store it with the build artifacts.

    Runs ``python /tmp/collect_pkg_metadata.py`` in a container created from
    ``image`` and writes the bytes returned by the run, unmodified, to
    ``<target_version_dir>/<image_config["package_metadata_filename"]>``.

    Args:
        target_version_dir: Directory that receives the metadata file.
        image: Built image object; only its ``id`` attribute is used.
        image_config: Image generator config providing ``package_metadata_filename``.

    Raises:
        ContainerError: Re-raised unchanged when the container run fails.
    """
    try:
        pkg_metadata = _docker_client.containers.run(
            image=image.id, detach=False, auto_remove=True, command="python /tmp/collect_pkg_metadata.py"
        )
    except ContainerError as e:
        # Printing the logs is best effort: the container is started with
        # auto_remove=True, so it may already be gone by the time we ask for its
        # logs. A failure here must not replace the original ContainerError.
        try:
            print(e.container.logs().decode("utf-8"))
        except Exception as log_error:
            print(f"Unable to retrieve container logs: {log_error}")
        # Propagate the original failure to the caller.
        raise

    with open(f'{target_version_dir}/{image_config["package_metadata_filename"]}', "wb") as f:
        f.write(pkg_metadata)


def get_arg_parser():
parser = argparse.ArgumentParser(
description="A command line utility to create new versions of Amazon SageMaker Distribution"
Expand Down Expand Up @@ -399,6 +425,21 @@ def get_arg_parser():
required=True,
help="Specify the base patch version for which the package staleness report needs to be " "generated.",
)
package_size_parser = subparsers.add_parser(
"generate-size-report",
help="Generates package size report for each of the packages in the given " "image version.",
)
package_size_parser.set_defaults(func=generate_package_size_report)
package_size_parser.add_argument(
"--base-patch-version",
required=True,
help="Specify the base patch version for which the package size report needs to be " "generated.",
)
package_size_parser.add_argument(
"--target-patch-version",
required=True,
help="Specify the target patch version for which the package size report needs to be " "generated.",
)
return parser


Expand Down
105 changes: 102 additions & 3 deletions src/package_staleness.py → src/package_report.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import json
import os

import conda.cli.python_api
from conda.models.match_spec import MatchSpec

from config import _image_generator_configs
from dependency_upgrader import _dependency_metadata
from utils import get_dir_for_version, get_match_specs, get_semver
from utils import get_dir_for_version, get_match_specs, get_semver, sizeof_fmt


def _get_package_versions_in_upstream(target_packages_match_spec_out, target_version) -> dict[str, str]:
Expand Down Expand Up @@ -48,7 +49,9 @@ def _get_package_versions_in_upstream(target_packages_match_spec_out, target_ver
return package_to_version_mapping


def _generate_report(package_versions_in_upstream, target_packages_match_spec_out, image_config, version):
def _generate_staleness_report_per_image(
package_versions_in_upstream, target_packages_match_spec_out, image_config, version
):
print("\n# Staleness Report: " + str(version) + "(" + image_config["image_type"] + ")\n")
print("Package | Current Version in the Distribution image | Latest Relevant Version in " "Upstream")
print("---|---|---")
Expand Down Expand Up @@ -89,6 +92,90 @@ def _get_installed_package_versions_and_conda_versions(
return target_packages_match_spec_out, latest_package_versions_in_upstream


def _load_pkg_metadata(metadata_file):
    """Return the parsed package metadata JSON stored at ``metadata_file``."""
    with open(metadata_file) as jsonfile:
        return json.load(jsonfile)


def _format_percentage(ratio):
    """Format a relative change as a percentage, or "-" when it is undefined (``None``)."""
    return "-" if ratio is None else str(round(ratio * 100, 2))


def _print_total_size_delta(target_pkg_metadata, base_pkg_metadata):
    """Print the table comparing the summed package size of the target and base image."""
    target_total_size = sum(d["size"] for d in target_pkg_metadata.values())
    base_total_size = None
    total_size_delta_val = None
    if base_pkg_metadata is not None:
        base_total_size = sum(d["size"] for d in base_pkg_metadata.values())
        total_size_delta_val = target_total_size - base_total_size
    # The relative change is undefined without a base, or when the base total is zero.
    total_size_delta_rel = (total_size_delta_val / base_total_size) if base_total_size else None

    print("\n## Python Packages Total Size Delta\n")
    print("Current Version Total Size | Base Version Total Size | Size Change (abs) | Size Change (%)")
    print("---|---|---|---")
    print(
        "|".join(
            [
                sizeof_fmt(target_total_size),
                "-" if base_total_size is None else sizeof_fmt(base_total_size),
                # Compare against None so that a genuine zero delta is still reported.
                "-" if total_size_delta_val is None else sizeof_fmt(total_size_delta_val),
                _format_percentage(total_size_delta_rel),
            ]
        )
    )


def _print_largest_packages(target_pkg_metadata):
    """Print the 20 largest packages of the target image, largest first."""
    print("\n## Top-20 Largest Python Packages\n")
    print("Package | Current Version in the Distribution image | Size")
    print("---|---|---")
    # Sort explicitly rather than relying on the order in which the file was written.
    largest = sorted(target_pkg_metadata.items(), key=lambda item: item[1]["size"], reverse=True)[:20]
    for name, details in largest:
        print(name + "|" + details["version"] + "|" + sizeof_fmt(details["size"]))


def _print_package_size_delta(target_pkg_metadata, base_pkg_metadata):
    """Print the size change of every package that is new or whose version changed."""
    print("\n## Python Package Size Delta\n")
    print(
        "Package | Current Version in the Distribution image | Base Version in the Distribution image"
        " | Size Change (abs) | Size Change (%)"
    )
    print("---|---|---|---|---")

    rows = []
    for name, target in target_pkg_metadata.items():
        base = base_pkg_metadata.get(name)
        if base is not None and base["version"] == target["version"]:
            # Unchanged package: nothing to report.
            continue
        # A package that is absent from the base image counts as growing from zero.
        base_pkg_size = base["size"] if base is not None else 0
        size_delta_abs = target["size"] - base_pkg_size
        rows.append(
            {
                "name": name,
                "current_version": target["version"],
                "base_version": base["version"] if base is not None else "-",
                "size_delta_abs": size_delta_abs,
                "size_delta_rel": (size_delta_abs / base_pkg_size) if base_pkg_size else None,
            }
        )

    # Sort the package size delta based on absolute size diff in descending order.
    rows.sort(key=lambda row: row["size_delta_abs"], reverse=True)
    for row in rows:
        print(
            "|".join(
                [
                    row["name"],
                    row["current_version"],
                    row["base_version"],
                    sizeof_fmt(row["size_delta_abs"]),
                    _format_percentage(row["size_delta_rel"]),
                ]
            )
        )


def _generate_python_package_size_report_per_image(base_version_dir, target_version_dir, image_config, target_version):
    """Print the Markdown package size report of one image type to stdout.

    The report has up to three sections: the total size of all packages compared
    with the base version, the 20 largest packages of the target version, and the
    per-package size change of every package that is new or has a different
    version than in the base version. Sections that need the base metadata are
    reduced or omitted when the base metadata file does not exist.

    Args:
        base_version_dir: Directory holding the base version's metadata file.
        target_version_dir: Directory holding the target version's metadata file.
        image_config: Image generator config; ``image_type`` and
            ``package_metadata_filename`` are read from it.
        target_version: Target version, used in the report heading.

    Raises:
        Exception: If the target version's metadata file does not exist.
    """
    print("\n# Python Package Size Report: " + str(target_version) + "(" + image_config["image_type"] + ")\n")
    target_pkg_metadata_file = f'{target_version_dir}/{image_config["package_metadata_filename"]}'
    base_pkg_metadata_file = f'{base_version_dir}/{image_config["package_metadata_filename"]}'
    if not os.path.exists(target_pkg_metadata_file):
        raise Exception("No Python package metadata file found for target version, please try re-build the image.")
    target_pkg_metadata = _load_pkg_metadata(target_pkg_metadata_file)

    base_pkg_metadata = None
    if os.path.exists(base_pkg_metadata_file):
        base_pkg_metadata = _load_pkg_metadata(base_pkg_metadata_file)
    else:
        print("WARNING: No Python package metadata file found for base version, only partial results will be shown.")

    # Print out the total size change of all Python packages in the image.
    _print_total_size_delta(target_pkg_metadata, base_pkg_metadata)

    # Print out the largest 20 Python packages in the image, sorted descending by size.
    _print_largest_packages(target_pkg_metadata)

    # Print out the size delta for each changed/new package in the image.
    if base_pkg_metadata is not None:
        _print_package_size_delta(target_pkg_metadata, base_pkg_metadata)


def generate_package_staleness_report(args):
target_version = get_semver(args.target_patch_version)
target_version_dir = get_dir_for_version(target_version)
Expand All @@ -97,6 +184,18 @@ def generate_package_staleness_report(args):
target_packages_match_spec_out,
latest_package_versions_in_upstream,
) = _get_installed_package_versions_and_conda_versions(image_config, target_version_dir, target_version)
_generate_report(
_generate_staleness_report_per_image(
latest_package_versions_in_upstream, target_packages_match_spec_out, image_config, target_version
)


def generate_package_size_report(args):
    """Print the Python package size report for every configured image type.

    Args:
        args: Parsed command line arguments; ``target_patch_version`` and
            ``base_patch_version`` are read from it.
    """
    # Resolve the target first, then the base, before producing any report.
    target_semver = get_semver(args.target_patch_version)
    target_dir = get_dir_for_version(target_semver)
    base_dir = get_dir_for_version(get_semver(args.base_patch_version))

    for config in _image_generator_configs:
        _generate_python_package_size_report_per_image(base_dir, target_dir, config, target_semver)
9 changes: 9 additions & 0 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,12 @@ def get_match_specs(file_path) -> dict[str, MatchSpec]:
assert "conda" in requirement_spec.environment.dependencies

return {MatchSpec(i).get("name"): MatchSpec(i) for i in requirement_spec.environment.dependencies["conda"]}


def sizeof_fmt(num):
    """Render a byte count as a human-readable string with two decimals.

    The value is divided by 1024 until its magnitude drops below 1024, moving
    through the suffixes B, KB, MB and GB; anything larger is reported in TB.
    """
    step = 1024.0
    scaled = num
    for suffix in ("B", "KB", "MB", "GB"):
        if abs(scaled) < step:
            return f"{scaled:3.2f}{suffix}"
        scaled = scaled / step
    # Still at least 1024 GB in magnitude: report in terabytes.
    return f"{scaled:.2f}TB"
3 changes: 3 additions & 0 deletions template/v1/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ RUN mkdir -p /etc/code-editor/extensions && \
USER $MAMBA_USER
COPY --chown=$MAMBA_USER:$MAMBA_USER $ENV_IN_FILENAME *.in /tmp/

# Util tool for collecting python package metadata
COPY --chown=$MAMBA_USER:$MAMBA_USER ./collect_pkg_metadata.p[y] /tmp/

# Make sure that $ENV_IN_FILENAME has a newline at the end before the `tee` command runs. Otherwise, nasty things
# will happen.
RUN if [[ -z $ARG_BASED_ENV_IN_FILENAME ]] ; \
Expand Down
Loading

0 comments on commit 54bea35

Please sign in to comment.