Expect scale-configs to all live in test-infra now (#5788)
Part of the effort to move the LF scale-config.yml files from pytorch/pytorch to test-infra (details in #5767).

This updates the validation script so that it no longer expects to update the pytorch/pytorch versions of these scale configs, which also made many other parts of the file simpler.
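
With the old path arguments gone, a local run needs only the new --generate flag (a usage sketch based on the diff below; the script now resolves the test-infra repo root from its own location, so no repository paths are passed):

    # Validate scale-config.yml only, as the CI job does
    python .github/scripts/validate_scale_config.py

    # Also regenerate the LF and LF-canary scale configs from the source config
    python .github/scripts/validate_scale_config.py --generate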
ZainRizvi authored Oct 21, 2024
1 parent d731856 commit 774c10f
Showing 1 changed file with 33 additions and 88 deletions.
121 changes: 33 additions & 88 deletions .github/scripts/validate_scale_config.py
@@ -1,18 +1,17 @@
 # Takes the scale-config.yml file in test-infra/.github/scale-config.yml and runs the following
 # validations against it:
-# 1. Internal validation: Ensure that every linux runner type listed has the corresponding Amazon 2023 variant
+# 1. Internal validation: Runs a custom set of sanity checks against the runner types defined in the file
 # 2. External validation: Ensure that every runner type listed (linux & windows) have corresponding runner types in
-#    pytorch/pytorch's .github/lf-scale-config.yml and .github/lf-canary-scale-config.yml that have the "lf."
-#    "lf.c." prefixes added correspondingly
-# This script assumes that it is being run from the root of the test-infra repository
+#    the Linux Foundation fleet's scale config files (.github/lf-scale-config.yml and .github/lf-canary-scale-config.yml).
+#    Those files are expected to have the "lf." and "lf.c." prefixes added to each runner type
 
 import argparse
 import copy
 import json
 import os
-import tempfile
 
 import urllib.request
+from pathlib import Path
 
 from typing import Any, cast, Dict, List, NamedTuple
 
@@ -29,8 +28,6 @@
 
 RUNNER_TYPE_CONFIG_KEY = "runner_types"
 
-GITHUB_PYTORCH_REPO_RAW_URL = "https://raw.githubusercontent.com/pytorch/pytorch/main/"
-
 PREFIX_META = ""
 PREFIX_LF = "lf."
 PREFIX_LF_CANARY = "lf.c."
@@ -71,23 +68,19 @@ def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Validate scale-config.yml file")
 
     parser.add_argument(
-        "--test-infra-repo-root",
-        type=str,
-        required=False,
-        default=".",
-        help="Path to the root of the local test-infra repository. Default is the current directory",
-    )
-    parser.add_argument(
-        "--pytorch-repo-root",
-        type=str,
-        required=False,
-        help="Path to the root of the local pytorch repository. If omitted, uses the "
-        "main branch from github pytorch/pytorch",
+        "--generate",
+        "-g",
+        action="store_true",
+        help="Update the generated scale configs based on the source scale config",
     )
 
     return parser.parse_args()
 
 
+def get_repo_root() -> Path:
+    return Path(__file__).resolve().parent.parent.parent
+
+
 def runner_types_are_equivalent(
     runner1_type: str,
     runner1_config: Dict[str, str],
@@ -146,8 +139,8 @@ def runner_types_are_equivalent(
     return are_same
 
 
-def is_config_consistent_internally(runner_types: Dict[str, Dict[str, str]]) -> bool:
-    f"""
+def is_config_valid_internally(runner_types: Dict[str, Dict[str, str]]) -> bool:
+    """
     Ensure that for every linux runner type in the config:
     1 - they match RunnerTypeScaleConfig https://github.com/pytorch/test-infra/blob/f3c58fea68ec149391570d15a4d0a03bc26fbe4f/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts#L50
@@ -218,8 +211,6 @@ def generate_repo_scale_config(
     Generate the new scale config file with the same layout as the original file,
     but with the expected_prefix added to the runner types
     """
-
-    print(f"Generating updated {dest_config_file}")
     source_config = load_yaml_file(source_config_file)
     base_runner_types = set(source_config[RUNNER_TYPE_CONFIG_KEY].keys())
 
@@ -245,9 +236,9 @@
             f.write(line)
 
 
-def load_yaml_file(scale_config_path: str) -> Dict[str, Any]:
+def load_yaml_file(scale_config_path: Path) -> Dict[str, Any]:
     # Verify file exists
-    if not os.path.exists(scale_config_path):
+    if not scale_config_path.exists():
         print(
             f"Could not find file {scale_config_path}. Please verify the path given on the command line."
         )
@@ -268,97 +259,51 @@ def download_file(url: str, local_filename: str) -> None:
         f.write(content)
 
 
-def pull_temp_config_from_github_repo(config_path: str) -> str:
-    config_url = GITHUB_PYTORCH_REPO_RAW_URL + config_path
-
-    temp_dir = tempfile.mkdtemp()
-    config_path = os.path.join(temp_dir, config_path)
-    download_file(config_url, config_path)
-
-    return config_path
-
-
 class ScaleConfigInfo(NamedTuple):
-    path: str  # full path to scale config file
+    path: Path  # full path to scale config file
     prefix: str  # prefix this fleet's runner types should have
 
 
 def main() -> None:
+    repo_root = get_repo_root()
+
     args = parse_args()
 
     source_scale_config_info = ScaleConfigInfo(
-        path=os.path.join(args.test_infra_repo_root, META_SCALE_CONFIG_PATH),
+        path=repo_root / META_SCALE_CONFIG_PATH,
         prefix=PREFIX_META,
     )
 
     # Contains scale configs that are generated from the source scale config
     generated_scale_config_infos: List[ScaleConfigInfo] = [
         ScaleConfigInfo(
-            path=os.path.join(args.test_infra_repo_root, LF_SCALE_CONFIG_PATH),
+            path=repo_root / LF_SCALE_CONFIG_PATH,
             prefix=PREFIX_LF,
         ),
         ScaleConfigInfo(
-            path=os.path.join(args.test_infra_repo_root, LF_CANARY_SCALE_CONFIG_PATH),
+            path=repo_root / LF_CANARY_SCALE_CONFIG_PATH,
             prefix=PREFIX_LF_CANARY,
         ),
     ]
 
-    generate_files = True
-    if args.pytorch_repo_root is None:
-        # This is expected during a CI run
-        generate_files = False
-        print(
-            "Using github's pytorch/pytorch repository as the source for the pytorch scale config files"
-        )
-
-        generated_scale_config_infos.append(
-            ScaleConfigInfo(
-                path=pull_temp_config_from_github_repo(LF_SCALE_CONFIG_PATH),
-                prefix=PREFIX_LF,
-            )
-        )
-        generated_scale_config_infos.append(
-            ScaleConfigInfo(
-                path=pull_temp_config_from_github_repo(LF_CANARY_SCALE_CONFIG_PATH),
-                prefix=PREFIX_LF_CANARY,
-            )
-        )
-    else:
-        # This is expected during a local run
-        generated_scale_config_infos.append(
-            ScaleConfigInfo(
-                path=os.path.join(args.pytorch_repo_root, LF_SCALE_CONFIG_PATH),
-                prefix=PREFIX_LF,
-            )
-        )
-        generated_scale_config_infos.append(
-            ScaleConfigInfo(
-                path=os.path.join(args.pytorch_repo_root, LF_CANARY_SCALE_CONFIG_PATH),
-                prefix=PREFIX_LF_CANARY,
-            )
-        )
-
     source_scale_config = load_yaml_file(source_scale_config_info.path)
-    validation_success = True
 
-    if not is_config_consistent_internally(source_scale_config[RUNNER_TYPE_CONFIG_KEY]):
-        validation_success = False
-        print("scale-config.yml is not internally consistent\n")
-    else:
-        print("scale-config.yml is internally consistent\n")
+    validation_success = is_config_valid_internally(
+        source_scale_config[RUNNER_TYPE_CONFIG_KEY]
+    )
+    print(f"scale-config.yml is {'valid' if validation_success else 'invalid'}\n")
 
     def validate_config(generated_config_info: ScaleConfigInfo) -> bool:
-        if generate_files:
+        if args.generate:
+            print(f"Generating updated {generated_config_info.path}")
+
             generate_repo_scale_config(
                 source_scale_config_info.path,
                 generated_config_info.path,
                 generated_config_info.prefix,
             )
 
-            print(
-                f"Generated updated pytorch/pytorch scale config file at {generated_config_info.path}\n"
-            )
-
         cloned_scale_config = load_yaml_file(generated_config_info.path)
 
         if not is_consistent_across_configs(
@@ -367,7 +312,7 @@ def validate_config(generated_config_info: ScaleConfigInfo) -> bool:
             generated_config_info.prefix,
         ):
             print(
-                f"Consistency validation failed between {source_scale_config.path} and {generated_config_info.path}\n"
+                f"Consistency validation failed between {source_scale_config_info.path} and {generated_config_info.path}\n"
            )
             return False
         else:
@@ -380,9 +325,9 @@ def validate_config(generated_config_info: ScaleConfigInfo) -> bool:
     if not validation_success:
         print(
             "Validation failed\n\n"
-            "Please run `python .github/scripts/validate_scale_config.py --test-infra-repo-root [path] "
-            "--pytorch-repo-root [path]` locally to validate the scale-config.yml file and generate the "
-            "updated pytorch/pytorch scale config files.\n\n"
+            "Please run `python .github/scripts/validate_scale_config.py --generate` "
+            "locally to validate the scale-config.yml file and generate the updated "
+            "variant scale config files.\n\n"
             "Note: You still need to fix internal consistency errors yourself.\n\n"
             "If this script passes locally and you already have a PR open on pytorch/pytorch with the "
             " relevant changes, you can merge that pytorch/pytorch PR first to make this job pass."
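For reference, the cross-config validation expects every runner type in the source scale-config.yml to reappear in each generated file under a fleet prefix. A minimal Python sketch of that mapping (an editor's illustration, not part of the commit; the runner type name is hypothetical, and the PREFIX values mirror the script's constants):

    # Expected prefix mapping between the source config and the generated configs
    PREFIX_META = ""  # source config in test-infra uses bare runner type names
    PREFIX_LF = "lf."  # Linux Foundation fleet config
    PREFIX_LF_CANARY = "lf.c."  # LF canary fleet config

    runner_type = "linux.4xlarge"  # hypothetical runner type name
    for prefix in (PREFIX_META, PREFIX_LF, PREFIX_LF_CANARY):
        print(prefix + runner_type)
    # -> linux.4xlarge, lf.linux.4xlarge, lf.c.linux.4xlarge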
