Skip to content

Commit

Permalink
Merge pull request #19497 from ROCmSoftwarePlatform:rocm-log-pytest-h…
Browse files Browse the repository at this point in the history
…tml-report

PiperOrigin-RevId: 601188997
  • Loading branch information
jax authors committed Jan 24, 2024
2 parents b83f2b2 + ef7694f commit 831c25f
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 16 deletions.
2 changes: 1 addition & 1 deletion build/rocm/Dockerfile.ms
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,6 @@ RUN git clone https://github.com/pyenv/pyenv.git /pyenv
# Point pyenv at the /pyenv checkout and put its shims and binaries first on PATH.
ENV PYENV_ROOT /pyenv
ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
# Build the interpreter version named by PYTHON_VERSION.
RUN pyenv install $PYTHON_VERSION
# Select that interpreter for this directory, force-reinstall setuptools/pip,
# then install the build and test dependencies.
RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-rerunfailures matplotlib absl-py flatbuffers hypothesis
# Same step with pytest-html and pytest_html_merger added to the package list.
RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-html pytest_html_merger pytest-rerunfailures matplotlib absl-py flatbuffers hypothesis


47 changes: 33 additions & 14 deletions build/rocm/run_multi_gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,39 @@
# See the License for the specific language governing permissions and
# limitations under the License.

set -eux
# Run the test modules with multi-GPU requirements. We currently do not have a way to filter tests;
# this issue is tracked in https://github.com/google/jax/issues/7323
cmd=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
echo $cmd
set -eu

if [[ $cmd -gt 8 ]]; then
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && python3 -m pytest --reruns 3 -x tests/pmap_test.py
elif [[ $cmd -gt 4 ]]; then
export HIP_VISIBLE_DEVICES=0,1,2,3 && python3 -m pytest --reruns 3 -x tests/pmap_test.py
elif [[ $cmd -gt 2 ]]; then
export HIP_VISIBLE_DEVICES=0,1 && python3 -m pytest --reruns 3 -x tests/pmap_test.py
else
export HIP_VISIBLE_DEVICES=0 && python3 -m pytest --reruns 3 -x tests/pmap_test.py
# Run the multi-GPU test modules on the given devices and merge their HTML
# reports.
#
# Arguments:
#   $1 - comma-separated device list exported as HIP_VISIBLE_DEVICES.
# Returns:
#   0 when both test modules and the report merge succeed; otherwise the
#   status of the last failing test module (or of the merge step when the
#   tests themselves passed).
run_tests() {
    local base_dir=./logs
    local gpu_devices="$1"
    local status=0
    local merge_status=0

    export HIP_VISIBLE_DEVICES="$gpu_devices"
    mkdir -p "$base_dir"

    # Under `set -e` a bare failing command would abort the script before the
    # remaining module runs and before the reports are merged, so each exit
    # status is captured instead and returned at the end.
    python3 -m pytest --html="$base_dir/multi_gpu_pmap_test_log.html" --reruns 3 -x tests/pmap_test.py || status=$?
    python3 -m pytest --html="$base_dir/multi_gpu_multi_device_test_log.html" --reruns 3 -x tests/multi_device_test.py || status=$?

    # Always merge, so a failing run still leaves a combined report behind.
    python3 -m pytest_html_merger -i "$base_dir/" -o "$base_dir/final_compiled_report.html" || merge_status=$?

    # A test failure takes precedence over a merge failure in the result.
    if [[ $status -eq 0 ]]; then
        status=$merge_status
    fi
    return "$status"
}

# Bail out early when a tool this script depends on is not on PATH.
command -v lspci > /dev/null 2>&1 || {
    echo "lspci command not found, aborting."
    exit 1
}

command -v python3 > /dev/null 2>&1 || {
    echo "Python3 is not available, aborting."
    exit 1
}

python3 -m pytest --reruns 3 -x tests/multi_device_test.py
# GPU detection and test execution.
# `grep -c` prints 0 but exits with status 1 when nothing matches; under
# `set -e` that status would abort the script at this assignment, before the
# count is printed and before the fallback branch below can run. `|| true`
# keeps the printed count and discards the status.
gpu_count=$(lspci | grep -c 'controller.*AMD/ATI' || true)
echo "Number of AMD/ATI GPUs detected: $gpu_count"

# NOTE(review): the thresholds are strict (`-gt`), so e.g. exactly 8 matches
# selects 4 devices — presumably because lspci lists more matching
# controllers than usable GPUs; confirm before changing.
if [[ $gpu_count -gt 8 ]]; then
    run_tests "0,1,2,3,4,5,6,7"
elif [[ $gpu_count -gt 4 ]]; then
    run_tests "0,1,2,3"
elif [[ $gpu_count -gt 2 ]]; then
    run_tests "0,1"
else
    run_tests "0"
fi
24 changes: 23 additions & 1 deletion build/rocm/run_single_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,26 @@

# Held by run_test while it returns its GPU id to the shared gpu_tokens list.
GPU_LOCK = threading.Lock()
# Exit status that main() hands to exit().
# NOTE(review): presumably overwritten when a test module fails — the write is
# not visible in this part of the file; confirm.
LAST_CODE = 0
# Directory that receives the per-module pytest HTML reports and the merged
# final_compiled_report.html.
base_dir="./logs"

def extract_filename(path):
    """Return the last component of *path* with its final extension removed."""
    _directory, leaf = os.path.split(path)
    return os.path.splitext(leaf)[0]

def generate_final_report(shell=False, env_vars=None):
    """Merge the per-module pytest HTML reports into one combined report.

    Runs ``pytest_html_merger`` with ``base_dir`` as its input directory and
    ``<base_dir>/final_compiled_report.html`` as its output file. Failures are
    printed and reported through the return value instead of being raised, so
    the caller can still exit with the status of the test run itself.

    Args:
      shell: Forwarded to ``subprocess.run``. When true, the command is
        rendered as a single quoted string, because a list combined with
        ``shell=True`` would drop every argument after the program name on
        POSIX.
      env_vars: Optional mapping of extra environment variables layered on top
        of a copy of the current process environment.

    Returns:
      A ``(returncode, stderr, stdout)`` tuple. When the merger cannot be
      started at all, ``returncode`` is 127, ``stderr`` holds the error text
      and ``stdout`` is empty.
    """
    import shlex

    # Work on a copy so the parent process environment is never modified.
    env = dict(os.environ)
    if env_vars:
        env.update(env_vars)

    cmd = ["pytest_html_merger", "-i", '{}'.format(base_dir),
           "-o", '{}/final_compiled_report.html'.format(base_dir)]
    run_args = " ".join(shlex.quote(part) for part in cmd) if shell else cmd

    try:
        result = subprocess.run(run_args,
                                shell=shell,
                                capture_output=True,
                                env=env)
    except OSError as err:
        # Typically the merger executable is not installed; report generation
        # is best-effort and must not mask the test results.
        print("FAILED - {}".format(" ".join(cmd)))
        print(err)
        return 127, str(err), ""

    # The tool's output is only logged, so undecodable bytes are replaced
    # rather than allowed to raise.
    stderr = result.stderr.decode(errors="replace")
    stdout = result.stdout.decode(errors="replace")
    if result.returncode != 0:
        print("FAILED - {}".format(" ".join(cmd)))
        print(stderr)
    return result.returncode, stderr, stdout


def run_shell_command(cmd, shell=False, env_vars={}):
Expand Down Expand Up @@ -69,7 +89,8 @@ def run_test(testmodule, gpu_tokens):
"HIP_VISIBLE_DEVICES": str(target_gpu),
"XLA_PYTHON_CLIENT_ALLOCATOR": "default",
}
cmd = ["python3", "-m", "pytest", "--reruns", "3", "-x", testmodule]
testfile = extract_filename(testmodule)
cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-x", testmodule]
return_code, stderr, stdout = run_shell_command(cmd, env_vars=env_vars)
with GPU_LOCK:
gpu_tokens.append(target_gpu)
Expand Down Expand Up @@ -102,6 +123,7 @@ def find_num_gpus():
def main(args):
    """Run all collected test modules, merge their reports, and exit.

    Args:
      args: Parsed command-line namespace; only ``args.parallel`` is read, and
        it is passed straight through to ``run_parallel``.

    Raises:
      SystemExit: Always, carrying the module-level ``LAST_CODE`` as the
        process exit status.
    """
    all_testmodules = collect_testmodules()
    run_parallel(all_testmodules, args.parallel)
    # Report merging is best-effort: its result is deliberately not allowed to
    # change the exit status of the test run.
    generate_final_report()
    # ``exit`` is a helper injected by the ``site`` module for interactive
    # sessions and is absent under ``python -S``; raising SystemExit directly
    # has the same effect without that dependency.
    raise SystemExit(LAST_CODE)


Expand Down

0 comments on commit 831c25f

Please sign in to comment.