From ef7694f26afa5222254269d0c78bdb269cf0a12c Mon Sep 17 00:00:00 2001 From: zahiqbal Date: Wed, 17 Jan 2024 22:48:09 +0000 Subject: [PATCH] [ROCM]: Generating pytest html logs from unit-tests. --- build/rocm/Dockerfile.ms | 2 +- build/rocm/run_multi_gpu.sh | 47 +++++++++++++++++++++++++----------- build/rocm/run_single_gpu.py | 24 +++++++++++++++++- 3 files changed, 57 insertions(+), 16 deletions(-) diff --git a/build/rocm/Dockerfile.ms b/build/rocm/Dockerfile.ms index 6da873d9a68a..25251a9deebb 100644 --- a/build/rocm/Dockerfile.ms +++ b/build/rocm/Dockerfile.ms @@ -32,6 +32,6 @@ RUN git clone https://github.com/pyenv/pyenv.git /pyenv ENV PYENV_ROOT /pyenv ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH RUN pyenv install $PYTHON_VERSION -RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-rerunfailures matplotlib absl-py flatbuffers hypothesis +RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-html pytest_html_merger pytest-rerunfailures matplotlib absl-py flatbuffers hypothesis diff --git a/build/rocm/run_multi_gpu.sh b/build/rocm/run_multi_gpu.sh index 290e22f609de..ff186db76062 100755 --- a/build/rocm/run_multi_gpu.sh +++ b/build/rocm/run_multi_gpu.sh @@ -13,20 +13,39 @@ # See the License for the specific language governing permissions and # limitations under the License. -set -eux -# run test module with multi-gpu requirements. We currently do not have a way to filter tests. -# this issue is also tracked in https://github.com/google/jax/issues/7323 -cmd=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l) -echo $cmd +set -eu -if [[ $cmd -gt 8 ]]; then - export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && python3 -m pytest --reruns 3 -x tests/pmap_test.py -elif [[ $cmd -gt 4 ]]; then - export HIP_VISIBLE_DEVICES=0,1,2,3 && python3 -m pytest --reruns 3 -x tests/pmap_test.py -elif [[ $cmd -gt 2 ]]; then - export HIP_VISIBLE_DEVICES=0,1 && python3 -m pytest --reruns 3 -x tests/pmap_test.py -else - export HIP_VISIBLE_DEVICES=0 && python3 -m pytest --reruns 3 -x tests/pmap_test.py +# Function to run tests with specified GPUs +run_tests() { + local base_dir=./logs + local gpu_devices="$1" + export HIP_VISIBLE_DEVICES=$gpu_devices + python3 -m pytest --html=$base_dir/multi_gpu_pmap_test_log.html --reruns 3 -x tests/pmap_test.py + python3 -m pytest --html=$base_dir/multi_gpu_multi_device_test_log.html --reruns 3 -x tests/multi_device_test.py + python3 -m pytest_html_merger -i $base_dir/ -o $base_dir/final_compiled_report.html +} + +# Check for required commands +if ! command -v lspci &> /dev/null; then + echo "lspci command not found, aborting." + exit 1 +fi + +if ! command -v python3 &> /dev/null; then + echo "Python3 is not available, aborting." + exit 1 fi -python3 -m pytest --reruns 3 -x tests/multi_device_test.py +# GPU detection and test execution +gpu_count=$(lspci | grep -c 'controller.*AMD/ATI') +echo "Number of AMD/ATI GPUs detected: $gpu_count" + +if [[ $gpu_count -gt 8 ]]; then + run_tests "0,1,2,3,4,5,6,7" +elif [[ $gpu_count -gt 4 ]]; then + run_tests "0,1,2,3" +elif [[ $gpu_count -gt 2 ]]; then + run_tests "0,1" +else + run_tests "0" +fi diff --git a/build/rocm/run_single_gpu.py b/build/rocm/run_single_gpu.py index ab38224e1113..bf37a49ee61e 100755 --- a/build/rocm/run_single_gpu.py +++ b/build/rocm/run_single_gpu.py @@ -22,6 +22,26 @@ GPU_LOCK = threading.Lock() LAST_CODE = 0 +base_dir="./logs" + +def extract_filename(path): + base_name = os.path.basename(path) + file_name, _ = os.path.splitext(base_name) + return file_name + +def generate_final_report(shell=False, env_vars={}): + env = os.environ + env = {**env, **env_vars} + cmd = ["pytest_html_merger", "-i", '{}'.format(base_dir), "-o", '{}/final_compiled_report.html'.format(base_dir)] + result = subprocess.run(cmd, + shell=shell, + capture_output=True, + env=env) + if result.returncode != 0: + print("FAILED - {}".format(" ".join(cmd))) + print(result.stderr.decode()) + # sys.exit(result.returncode) + return result.returncode, result.stderr.decode(), result.stdout.decode() def run_shell_command(cmd, shell=False, env_vars={}): @@ -69,7 +89,8 @@ def run_test(testmodule, gpu_tokens): "HIP_VISIBLE_DEVICES": str(target_gpu), "XLA_PYTHON_CLIENT_ALLOCATOR": "default", } - cmd = ["python3", "-m", "pytest", "--reruns", "3", "-x", testmodule] + testfile = extract_filename(testmodule) + cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-x", testmodule] return_code, stderr, stdout = run_shell_command(cmd, env_vars=env_vars) with GPU_LOCK: gpu_tokens.append(target_gpu) @@ -102,6 +123,7 @@ def find_num_gpus(): def main(args): all_testmodules = collect_testmodules() run_parallel(all_testmodules, args.parallel) + generate_final_report() exit(LAST_CODE)