Skip to content

Commit

Permalink
Merge pull request mlcommons#800 from mlcommons/dev
Browse files Browse the repository at this point in the history
Dev -> main
  • Loading branch information
priyakasimbeg authored Oct 22, 2024
2 parents ce1003e + 5ce9e5a commit c2aa9e1
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 412 deletions.
14 changes: 14 additions & 0 deletions docker/scripts/startup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,10 @@ while [ "$1" != "" ]; do
shift
TEST=$1
;;
--additional_requirements_path)
shift
ADDITIONAL_REQUIREMENTS_PATH=$1
;;
*)
usage
exit 1
Expand All @@ -140,6 +144,16 @@ while [ "$1" != "" ]; do
shift
done


# Optionally install additional dependencies.
# ADDITIONAL_REQUIREMENTS_PATH is only set when --additional_requirements_path
# was passed; the path is resolved relative to the algorithmic-efficiency
# checkout because pip is invoked from inside that directory.
if [[ -n ${ADDITIONAL_REQUIREMENTS_PATH+x} ]]; then
    echo "Installing additional requirements..."
    # Run in a subshell: `eval` executes in the current shell, so a bare `cd`
    # here would leak into the rest of the script and break the later
    # `cd algorithmic-efficiency` calls, which expect the original directory.
    COMMAND="(cd algorithmic-efficiency && pip install -r ${ADDITIONAL_REQUIREMENTS_PATH})"
    echo "$COMMAND"
    eval "$COMMAND"
fi


if [[ ${TEST} == "true" ]]; then
cd algorithmic-efficiency
COMMAND="python3 tests/test_traindiffs.py"
Expand Down
97 changes: 80 additions & 17 deletions scoring/run_workloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
--tuning_search_space <path_to_tuning_search_space_json>
"""

import datetime
import json
import os
import struct
import subprocess
import time

from absl import app
Expand All @@ -26,9 +28,11 @@
'docker_image_url',
'us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev',
'URL to docker image')
flags.DEFINE_integer('run_percentage',
100,
'Percentage of max num steps to run for.')
# NOTE: adjacent string literals are concatenated verbatim, so each fragment
# of the help text must carry its own separating space.
flags.DEFINE_integer(
    'run_percentage',
    100,
    'Percentage of max num steps to run for. '
    'Must set the flag enable_step_budget to True for this to take effect.')
flags.DEFINE_string('experiment_name',
'my_experiment',
'Name of top sub directory in experiment dir.')
Expand Down Expand Up @@ -83,10 +87,24 @@
'If your algorithm has a smaller per step time than our baselines '
'you may want to increase the number of steps per workload.')
flags.DEFINE_string(
'workload',
'workloads',
None,
'String representing a comma separated list of workload names.'
'If not None, only run this workload, else run all workloads in workload_metadata_path.'
)
# Optional requirements file forwarded to the container startup script.
flags.DEFINE_string('additional_requirements_path',
                    None,
                    'Path to requirements.txt if any.')
# Absolute step budget; only read when enable_step_budget is set.
# NOTE: adjacent string literals are concatenated verbatim, so the first
# fragment ends with a space to keep the sentences separated.
flags.DEFINE_integer(
    'max_steps',
    None,
    'Maximum number of steps to run. Must set flag enable_step_budget. '
    'This flag takes precedence over the run_percentage flag.')
# Opt-in switch that makes run_percentage / max_steps take effect.
flags.DEFINE_bool(
    'enable_step_budget',
    False,
    'Flag that has to be explicitly set to override time budgets to step budget percentage.'
)

FLAGS = flags.FLAGS

Expand All @@ -106,15 +124,40 @@ def container_running():
return True


def kill_containers(message=None):
  """Kill every container returned by the local Docker client's listing.

  Args:
    message: Optional text printed before the containers are killed. Accepted
      so that callers which pass a reason string do not fail with a TypeError;
      omit it to kill silently.
  """
  if message:
    print(message)
  docker_client = docker.from_env()
  for container in docker_client.containers.list():
    container.kill()


def gpu_is_active():
  """Return True if `nvidia-smi` reports non-zero utilization on any GPU.

  Runs `nvidia-smi` with a bare CSV query for `utilization.gpu` and parses
  each output line as an integer.

  Raises:
    subprocess.CalledProcessError: if `nvidia-smi` exits with a non-zero code.
    ValueError: if an output line cannot be parsed as an integer.
  """
  query = [
      'nvidia-smi',
      '--query-gpu=utilization.gpu',
      '--format=csv,noheader,nounits',
  ]
  raw_report = subprocess.check_output(query)
  for utilization in raw_report.decode().splitlines():
    # Stop at the first busy GPU; later lines are not inspected.
    if int(utilization) > 0:
      return True
  return False


def wait_until_container_not_running(sleep_interval=5 * 60):
  """Block until `container_running()` is False, killing GPU-idle containers.

  The loop polls once per `sleep_interval`. On every poll it samples GPU
  utilization with `gpu_is_active()`; if no activity has been observed for
  more than 45 minutes, the running containers are killed with
  `kill_containers()` so that a hung run cannot stall the loop forever.

  Args:
    sleep_interval: Seconds to sleep between polls. Defaults to 5 minutes.
  """
  max_gpu_idle_seconds = 45 * 60
  # A monotonic clock is used for elapsed-time measurement so that wall-clock
  # adjustments cannot trigger (or suppress) the idle timeout.
  gpu_last_active = time.monotonic()

  while container_running():
    if gpu_is_active():
      gpu_last_active = time.monotonic()
    elif (time.monotonic() - gpu_last_active) > max_gpu_idle_seconds:
      # Print here and call kill_containers() without arguments: passing the
      # message positionally fails against a zero-argument kill_containers.
      print('Killing container: GPUs have been inactive > 45 minutes...')
      kill_containers()
    time.sleep(sleep_interval)
  return


def main(_):
framework = FLAGS.framework
run_fraction = FLAGS.run_percentage / 100.
experiment_name = FLAGS.experiment_name
docker_image_url = FLAGS.docker_image_url
submission_path = FLAGS.submission_path
Expand All @@ -132,7 +175,13 @@ def main(_):
study_end_index = FLAGS.study_end_index
else:
study_end_index = num_studies - 1

additional_requirements_path_flag = ''
if FLAGS.additional_requirements_path:
additional_requirements_path_flag = f'--additional_requirements_path {FLAGS.additional_requirements_path} '

submission_id = FLAGS.submission_id

rng_seed = FLAGS.seed

if not rng_seed:
Expand All @@ -144,17 +193,22 @@ def main(_):
with open(FLAGS.workload_metadata_path) as f:
workload_metadata = json.load(f)

# Get list of all possible workloads
workloads = [w for w in workload_metadata.keys()]

# Read held-out workloads
# Read heldout workloads
if FLAGS.held_out_workloads_config_path:
held_out_workloads = read_held_out_workloads(
FLAGS.held_out_workloads_config_path)
workloads = workloads + held_out_workloads

# Filter for single workload
if FLAGS.workload and (FLAGS.workload in workloads):
workloads = [FLAGS.workload]
# Filter workloads if explicit workloads specified
if FLAGS.workloads is not None:
workloads = list(
filter(lambda x: x in FLAGS.workloads.split(','), workloads))
if len(workloads) != len(FLAGS.workloads.split(',')):
unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads)
raise ValueError(f'Invalid workload name {unmatched_workloads}')

rng_subkeys = prng.split(rng_key, num_studies)

Expand All @@ -174,14 +228,22 @@ def main(_):
"sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'") # clear caches
print('=' * 100)
dataset = workload_metadata[base_workload_name]['dataset']
max_steps = int(workload_metadata[base_workload_name]['max_steps'] *
run_fraction)
max_steps_flag = ''
if FLAGS.enable_step_budget:
run_fraction = FLAGS.run_percentage / 100.
if FLAGS.max_steps is None:
max_steps = int(workload_metadata[base_workload_name]['max_steps'] *
run_fraction)
else:
max_steps = FLAGS.max_steps
max_steps_flag = f'-m {max_steps}'

mount_repo_flag = ''
if FLAGS.local:
mount_repo_flag = '-v $HOME/algorithmic-efficiency:/algorithmic-efficiency '
command = ('docker run -t -d -v $HOME/data/:/data/ '
'-v $HOME/experiment_runs/:/experiment_runs '
'-v $HOME/experiment_runs/logs:/logs '
mount_repo_flag = '-v /home/kasimbeg/algorithmic-efficiency:/algorithmic-efficiency '
command = ('docker run -t -d -v /home/kasimbeg/data/:/data/ '
'-v /home/kasimbeg/experiment_runs/:/experiment_runs '
'-v /home/kasimbeg/experiment_runs/logs:/logs '
f'{mount_repo_flag}'
'--gpus all --ipc=host '
f'{docker_image_url} '
Expand All @@ -190,9 +252,10 @@ def main(_):
f'-s {submission_path} '
f'-w {workload} '
f'-e {study_dir} '
f'-m {max_steps} '
f'{max_steps_flag} '
f'--num_tuning_trials {num_tuning_trials} '
f'--rng_seed {run_seed} '
f'{additional_requirements_path_flag}'
'-c false '
'-o true '
'-i true ')
Expand Down Expand Up @@ -235,4 +298,4 @@ def main(_):

if __name__ == '__main__':
flags.mark_flag_as_required('workload_metadata_path')
app.run(main)
app.run(main)
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ jax_core_deps =
chex==0.1.7
ml_dtypes==0.2.0
protobuf==4.25.3
scipy==1.11.4


# JAX CPU
jax_cpu =
Expand Down
Loading

0 comments on commit c2aa9e1

Please sign in to comment.