From 2562c4db2a43217b0afc9306b5599e174a246fc5 Mon Sep 17 00:00:00 2001
From: Juhan Bae <pomonam15@gmail.com>
Date: Fri, 5 Apr 2024 15:40:50 -0400
Subject: [PATCH 01/19] Fix DDP error when experiment directory exists

---
 algorithmic_efficiency/logger_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/algorithmic_efficiency/logger_utils.py b/algorithmic_efficiency/logger_utils.py
index b7bde226a..b62d91dba 100644
--- a/algorithmic_efficiency/logger_utils.py
+++ b/algorithmic_efficiency/logger_utils.py
@@ -16,6 +16,7 @@
 import GPUtil
 import pandas as pd
 import psutil
+import torch.distributed as dist
 
 from algorithmic_efficiency import spec
 from algorithmic_efficiency.pytorch_utils import pytorch_setup
@@ -43,9 +44,6 @@ def get_log_dir(
     resume_last_run: bool,
     overwrite: bool,
 ) -> Optional[str]:
-  if RANK != 0:
-    return
-
   # Construct path to experiment workload directory.
   experiment_dir = os.path.expanduser(experiment_dir)
   workload_dir_name = f'{workload}_{framework}'
@@ -56,7 +54,7 @@ def get_log_dir(
                                    experiment_name,
                                    workload_dir_name)
 
-  if os.path.exists(experiment_path):
+  if os.path.exists(experiment_path) and RANK == 0:
     if overwrite:
       logging.info(
           f'Removing existing experiment directory {experiment_path} because '
@@ -73,6 +71,8 @@ def get_log_dir(
       if resume.lower() != 'y':
         sys.exit()
 
+  if USE_PYTORCH_DDP:
+    dist.barrier()
   logging.info(f'Creating experiment directory at {experiment_path}.')
   makedir(experiment_path)
   return experiment_path

From 957157bae3325960c2541c2fd4c3df615ddc585f Mon Sep 17 00:00:00 2001
From: Juhan Bae <pomonam15@gmail.com>
Date: Fri, 5 Apr 2024 15:52:00 -0400
Subject: [PATCH 02/19] Add log_dir path

---
 submission_runner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/submission_runner.py b/submission_runner.py
index a6f8c05a3..69b407ae1 100644
--- a/submission_runner.py
+++ b/submission_runner.py
@@ -677,6 +677,7 @@ def main(_):
                                               FLAGS.resume_last_run,
                                               FLAGS.overwrite)
 
+  print(f"{RANK}; {logging_dir_path}")
   score = score_submission_on_workload(
       workload=workload,
       workload_name=FLAGS.workload,

From 6ab1aea311c2897145faecdbc7cf7fe88155146c Mon Sep 17 00:00:00 2001
From: Juhan Bae <pomonam15@gmail.com>
Date: Fri, 5 Apr 2024 17:21:58 -0400
Subject: [PATCH 03/19] Only save at rank0

---
 submission_runner.py | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/submission_runner.py b/submission_runner.py
index 69b407ae1..8778b8350 100644
--- a/submission_runner.py
+++ b/submission_runner.py
@@ -316,10 +316,12 @@ def train_once(
     flag_file_name = os.path.join(log_dir, f'flags_{preemption_count}.json')
     logging.info(f'Saving flags to {flag_file_name}.')
     logger_utils.write_json(flag_file_name, flags.FLAGS.flag_values_dict())
-    metrics_logger = logger_utils.set_up_loggers(log_dir,
-                                                 flags.FLAGS,
-                                                 hyperparameters)
-    workload.attach_metrics_logger(metrics_logger)
+    metrics_logger = None
+    if RANK == 0:
+      metrics_logger = logger_utils.set_up_loggers(log_dir,
+                                                     flags.FLAGS,
+                                                     hyperparameters)
+      workload.attach_metrics_logger(metrics_logger)
 
   global_start_time = get_time()
   train_state['last_step_end_time'] = global_start_time
@@ -467,14 +469,14 @@ def train_once(
 
   metrics = {'eval_results': eval_results, 'global_step': global_step}
 
-  if log_dir is not None:
-    metrics_logger.append_scalar_metrics(
-        {'score': train_state['accumulated_submission_time']},
-        global_step=global_step,
-        preemption_count=preemption_count)
-    metrics_logger.finish()
-    if save_checkpoints:
-      checkpoint_utils.save_checkpoint(
+  if log_dir is not None and RANK == 0:
+      metrics_logger.append_scalar_metrics(
+          {'score': train_state['accumulated_submission_time']},
+          global_step=global_step,
+          preemption_count=preemption_count)
+      metrics_logger.finish()
+      if save_checkpoints:
+        checkpoint_utils.save_checkpoint(
           framework=FLAGS.framework,
           optimizer_state=optimizer_state,
           model_params=model_params,

From 426a397c84d60536d0aa0b4bde7007104b092321 Mon Sep 17 00:00:00 2001
From: Juhan Bae <pomonam15@gmail.com>
Date: Fri, 5 Apr 2024 17:27:00 -0400
Subject: [PATCH 04/19] Avoid explicit DDP check

---
 algorithmic_efficiency/checkpoint_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/algorithmic_efficiency/checkpoint_utils.py b/algorithmic_efficiency/checkpoint_utils.py
index fb7449b99..5b7e29306 100644
--- a/algorithmic_efficiency/checkpoint_utils.py
+++ b/algorithmic_efficiency/checkpoint_utils.py
@@ -119,7 +119,7 @@ def maybe_restore_checkpoint(framework: str,
 
   else:
     checkpoint_state = latest_ckpt
-    if isinstance(model_params, torch.nn.DataParallel):
+    if hasattr(model_params, 'module'):
       model_params = model_params.module
     model_params.load_state_dict(checkpoint_state['model_params'])
     checkpoint_state['model_params'] = model_params
@@ -196,7 +196,7 @@ def save_checkpoint(framework: str,
     opt_state = jax.device_get(jax_utils.unreplicate(opt_state))
     model_state = jax.device_get(jax_utils.unreplicate(model_state))
   else:
-    if isinstance(model_params, torch.nn.DataParallel):
+    if hasattr(model_params, 'module'):
       model_params = model_params.module
     model_params = model_params.state_dict()
     optimizer_state_dict = {}

From 942e53b3f0f6549308d9b732c8e5c8a9d8030e03 Mon Sep 17 00:00:00 2001
From: Juhan Bae <pomonam15@gmail.com>
Date: Fri, 5 Apr 2024 17:28:51 -0400
Subject: [PATCH 05/19] Revert checks

---
 algorithmic_efficiency/checkpoint_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/algorithmic_efficiency/checkpoint_utils.py b/algorithmic_efficiency/checkpoint_utils.py
index 5b7e29306..fb7449b99 100644
--- a/algorithmic_efficiency/checkpoint_utils.py
+++ b/algorithmic_efficiency/checkpoint_utils.py
@@ -119,7 +119,7 @@ def maybe_restore_checkpoint(framework: str,
 
   else:
     checkpoint_state = latest_ckpt
-    if hasattr(model_params, 'module'):
+    if isinstance(model_params, torch.nn.DataParallel):
       model_params = model_params.module
     model_params.load_state_dict(checkpoint_state['model_params'])
     checkpoint_state['model_params'] = model_params
@@ -196,7 +196,7 @@ def save_checkpoint(framework: str,
     opt_state = jax.device_get(jax_utils.unreplicate(opt_state))
     model_state = jax.device_get(jax_utils.unreplicate(model_state))
   else:
-    if hasattr(model_params, 'module'):
+    if isinstance(model_params, torch.nn.DataParallel):
       model_params = model_params.module
     model_params = model_params.state_dict()
     optimizer_state_dict = {}

From 83078f01caa2768a2791a3567d05d9e2bcca7343 Mon Sep 17 00:00:00 2001
From: Juhan Bae <pomonam15@gmail.com>
Date: Fri, 5 Apr 2024 17:34:23 -0400
Subject: [PATCH 06/19] DDP checkpoint add

---
 algorithmic_efficiency/checkpoint_utils.py | 4 ++--
 submission_runner.py                       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/algorithmic_efficiency/checkpoint_utils.py b/algorithmic_efficiency/checkpoint_utils.py
index fb7449b99..d191df35b 100644
--- a/algorithmic_efficiency/checkpoint_utils.py
+++ b/algorithmic_efficiency/checkpoint_utils.py
@@ -119,7 +119,7 @@ def maybe_restore_checkpoint(framework: str,
 
   else:
     checkpoint_state = latest_ckpt
-    if isinstance(model_params, torch.nn.DataParallel):
+    if isinstance(model_params, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)):
       model_params = model_params.module
     model_params.load_state_dict(checkpoint_state['model_params'])
     checkpoint_state['model_params'] = model_params
@@ -196,7 +196,7 @@ def save_checkpoint(framework: str,
     opt_state = jax.device_get(jax_utils.unreplicate(opt_state))
     model_state = jax.device_get(jax_utils.unreplicate(model_state))
   else:
-    if isinstance(model_params, torch.nn.DataParallel):
+    if isinstance(model_params, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)):
       model_params = model_params.module
     model_params = model_params.state_dict()
     optimizer_state_dict = {}
diff --git a/submission_runner.py b/submission_runner.py
index 8778b8350..1945b2cda 100644
--- a/submission_runner.py
+++ b/submission_runner.py
@@ -431,7 +431,7 @@ def train_once(
 
           logging_start_time = get_time()
 
-          if log_dir is not None:
+          if log_dir is not None and RANK == 0:
             metrics_logger.append_scalar_metrics(
                 latest_eval_result,
                 global_step=global_step,

From 0fd47b1de766b8fd403325e9e7b5a2291398500b Mon Sep 17 00:00:00 2001
From: Juhan Bae <pomonam15@gmail.com>
Date: Sat, 6 Apr 2024 02:41:03 -0400
Subject: [PATCH 07/19] Add more descriptive error msg

---
 algorithmic_efficiency/logger_utils.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/algorithmic_efficiency/logger_utils.py b/algorithmic_efficiency/logger_utils.py
index b62d91dba..fd289faed 100644
--- a/algorithmic_efficiency/logger_utils.py
+++ b/algorithmic_efficiency/logger_utils.py
@@ -54,21 +54,29 @@ def get_log_dir(
                                    experiment_name,
                                    workload_dir_name)
 
-  if os.path.exists(experiment_path) and RANK == 0:
+  if os.path.exists(experiment_path):
     if overwrite:
       logging.info(
           f'Removing existing experiment directory {experiment_path} because '
           '--overwrite was set.')
-      shutil.rmtree(experiment_path)
+      if RANK == 0:
+        shutil.rmtree(experiment_path)
     elif resume_last_run:
       logging.info(
           f'Resuming from experiment directory {experiment_path} because '
           '--resume_last_run was set.')
     else:
-      resume = input(
-          'Found existing experiment dir with the same name: {}. Do you wish '
-          'to resume training from this dir? [y/N]:'.format(experiment_path))
-      if resume.lower() != 'y':
+      resume = 'n'
+      if RANK == 0:
+        resume = input(
+            'Found existing experiment dir with the same name: {}. Do you wish '
+            'to resume training from this dir? [y/N]:'.format(experiment_path))
+      if USE_PYTORCH_DDP:
+        dist.barrier()
+      try:
+        if resume.lower() != 'y':
+          sys.exit()
+      except RuntimeError:
         sys.exit()
 
   if USE_PYTORCH_DDP:

From 57aa5ff4319df02f8d28f1083c40173c81137ec0 Mon Sep 17 00:00:00 2001
From: Juhan Bae <pomonam15@gmail.com>
Date: Sat, 6 Apr 2024 02:42:41 -0400
Subject: [PATCH 08/19] Clean up barrier code

---
 algorithmic_efficiency/logger_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/algorithmic_efficiency/logger_utils.py b/algorithmic_efficiency/logger_utils.py
index fd289faed..cb3263550 100644
--- a/algorithmic_efficiency/logger_utils.py
+++ b/algorithmic_efficiency/logger_utils.py
@@ -79,8 +79,6 @@ def get_log_dir(
       except RuntimeError:
         sys.exit()
 
-  if USE_PYTORCH_DDP:
-    dist.barrier()
   logging.info(f'Creating experiment directory at {experiment_path}.')
   makedir(experiment_path)
   return experiment_path

From 3e8e5ea295ab0987ed30f42096b6dba0a39c49df Mon Sep 17 00:00:00 2001
From: Juhan Bae <pomonam15@gmail.com>
Date: Sat, 6 Apr 2024 03:12:35 -0400
Subject: [PATCH 09/19] Add log_dir barrier

---
 algorithmic_efficiency/logger_utils.py | 2 ++
 submission_runner.py                   | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/algorithmic_efficiency/logger_utils.py b/algorithmic_efficiency/logger_utils.py
index cb3263550..fd289faed 100644
--- a/algorithmic_efficiency/logger_utils.py
+++ b/algorithmic_efficiency/logger_utils.py
@@ -79,6 +79,8 @@ def get_log_dir(
       except RuntimeError:
         sys.exit()
 
+  if USE_PYTORCH_DDP:
+    dist.barrier()
   logging.info(f'Creating experiment directory at {experiment_path}.')
   makedir(experiment_path)
   return experiment_path
diff --git a/submission_runner.py b/submission_runner.py
index 1945b2cda..c12787ece 100644
--- a/submission_runner.py
+++ b/submission_runner.py
@@ -679,7 +679,6 @@ def main(_):
                                               FLAGS.resume_last_run,
                                               FLAGS.overwrite)
 
-  print(f"{RANK}; {logging_dir_path}")
   score = score_submission_on_workload(
       workload=workload,
       workload_name=FLAGS.workload,

From 9518a7080baf4120113b07e7fb2938dd19d37180 Mon Sep 17 00:00:00 2001
From: Juhan Bae <pomonam15@gmail.com>
Date: Sat, 6 Apr 2024 03:14:03 -0400
Subject: [PATCH 10/19] Add default to y

---
 algorithmic_efficiency/logger_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/algorithmic_efficiency/logger_utils.py b/algorithmic_efficiency/logger_utils.py
index fd289faed..3c17d1a22 100644
--- a/algorithmic_efficiency/logger_utils.py
+++ b/algorithmic_efficiency/logger_utils.py
@@ -66,7 +66,7 @@ def get_log_dir(
           f'Resuming from experiment directory {experiment_path} because '
           '--resume_last_run was set.')
     else:
-      resume = 'n'
+      resume = 'y'
       if RANK == 0:
         resume = input(
             'Found existing experiment dir with the same name: {}. Do you wish '

From 6d3aef3c32ee14b88c877c28a0ac3e9e0953ef07 Mon Sep 17 00:00:00 2001
From: Juhan Bae <pomonam15@gmail.com>
Date: Sat, 6 Apr 2024 03:52:41 -0400
Subject: [PATCH 11/19] Lint fix

---
 algorithmic_efficiency/checkpoint_utils.py |  8 ++++++--
 submission_runner.py                       | 18 +++++++++---------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/algorithmic_efficiency/checkpoint_utils.py b/algorithmic_efficiency/checkpoint_utils.py
index d191df35b..29c1a821e 100644
--- a/algorithmic_efficiency/checkpoint_utils.py
+++ b/algorithmic_efficiency/checkpoint_utils.py
@@ -119,7 +119,9 @@ def maybe_restore_checkpoint(framework: str,
 
   else:
     checkpoint_state = latest_ckpt
-    if isinstance(model_params, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)):
+    if isinstance(
+        model_params,
+        (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)):
       model_params = model_params.module
     model_params.load_state_dict(checkpoint_state['model_params'])
     checkpoint_state['model_params'] = model_params
@@ -196,7 +198,9 @@ def save_checkpoint(framework: str,
     opt_state = jax.device_get(jax_utils.unreplicate(opt_state))
     model_state = jax.device_get(jax_utils.unreplicate(model_state))
   else:
-    if isinstance(model_params, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)):
+    if isinstance(
+        model_params,
+        (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)):
       model_params = model_params.module
     model_params = model_params.state_dict()
     optimizer_state_dict = {}
diff --git a/submission_runner.py b/submission_runner.py
index c12787ece..551173bf5 100644
--- a/submission_runner.py
+++ b/submission_runner.py
@@ -319,8 +319,8 @@ def train_once(
     metrics_logger = None
     if RANK == 0:
       metrics_logger = logger_utils.set_up_loggers(log_dir,
-                                                     flags.FLAGS,
-                                                     hyperparameters)
+                                                   flags.FLAGS,
+                                                   hyperparameters)
       workload.attach_metrics_logger(metrics_logger)
 
   global_start_time = get_time()
@@ -470,13 +470,13 @@ def train_once(
   metrics = {'eval_results': eval_results, 'global_step': global_step}
 
   if log_dir is not None and RANK == 0:
-      metrics_logger.append_scalar_metrics(
-          {'score': train_state['accumulated_submission_time']},
-          global_step=global_step,
-          preemption_count=preemption_count)
-      metrics_logger.finish()
-      if save_checkpoints:
-        checkpoint_utils.save_checkpoint(
+    metrics_logger.append_scalar_metrics(
+        {'score': train_state['accumulated_submission_time']},
+        global_step=global_step,
+        preemption_count=preemption_count)
+    metrics_logger.finish()
+    if save_checkpoints:
+      checkpoint_utils.save_checkpoint(
           framework=FLAGS.framework,
           optimizer_state=optimizer_state,
           model_params=model_params,

From 790deaab44b09269c090c47e63d21b03cf1a597c Mon Sep 17 00:00:00 2001
From: Juhan Bae <pomonam15@gmail.com>
Date: Sat, 6 Apr 2024 04:09:05 -0400
Subject: [PATCH 12/19] Remove barrier call

---
 algorithmic_efficiency/logger_utils.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/algorithmic_efficiency/logger_utils.py b/algorithmic_efficiency/logger_utils.py
index 3c17d1a22..094f77bd6 100644
--- a/algorithmic_efficiency/logger_utils.py
+++ b/algorithmic_efficiency/logger_utils.py
@@ -73,14 +73,9 @@ def get_log_dir(
             'to resume training from this dir? [y/N]:'.format(experiment_path))
       if USE_PYTORCH_DDP:
         dist.barrier()
-      try:
-        if resume.lower() != 'y':
-          sys.exit()
-      except RuntimeError:
+      if resume.lower() != 'y':
         sys.exit()
 
-  if USE_PYTORCH_DDP:
-    dist.barrier()
   logging.info(f'Creating experiment directory at {experiment_path}.')
   makedir(experiment_path)
   return experiment_path

From e08a7591f4e541056c9d05c663fdb81a286b1e05 Mon Sep 17 00:00:00 2001
From: Juhan Bae <pomonam15@gmail.com>
Date: Sat, 6 Apr 2024 04:13:35 -0400
Subject: [PATCH 13/19] Clean up logic

---
 algorithmic_efficiency/logger_utils.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/algorithmic_efficiency/logger_utils.py b/algorithmic_efficiency/logger_utils.py
index 094f77bd6..76f7b00c9 100644
--- a/algorithmic_efficiency/logger_utils.py
+++ b/algorithmic_efficiency/logger_utils.py
@@ -71,10 +71,8 @@ def get_log_dir(
         resume = input(
             'Found existing experiment dir with the same name: {}. Do you wish '
             'to resume training from this dir? [y/N]:'.format(experiment_path))
-      if USE_PYTORCH_DDP:
-        dist.barrier()
-      if resume.lower() != 'y':
-        sys.exit()
+        if resume.lower() != 'y':
+          sys.exit()
 
   logging.info(f'Creating experiment directory at {experiment_path}.')
   makedir(experiment_path)

From 68d3ca05877ae18675849111d9308216a3aa2170 Mon Sep 17 00:00:00 2001
From: Juhan Bae <pomonam15@gmail.com>
Date: Sat, 6 Apr 2024 04:16:08 -0400
Subject: [PATCH 14/19] Add barrier

---
 algorithmic_efficiency/logger_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/algorithmic_efficiency/logger_utils.py b/algorithmic_efficiency/logger_utils.py
index 76f7b00c9..c1fe89d64 100644
--- a/algorithmic_efficiency/logger_utils.py
+++ b/algorithmic_efficiency/logger_utils.py
@@ -74,6 +74,8 @@ def get_log_dir(
         if resume.lower() != 'y':
           sys.exit()
 
+  if USE_PYTORCH_DDP:
+    dist.barrier()
   logging.info(f'Creating experiment directory at {experiment_path}.')
   makedir(experiment_path)
   return experiment_path

From 12976df2c8ae97807d7f648a86f56165ea38cdc9 Mon Sep 17 00:00:00 2001
From: Juhan Bae <pomonam15@gmail.com>
Date: Sat, 6 Apr 2024 04:17:24 -0400
Subject: [PATCH 15/19] Make FT

---
 algorithmic_efficiency/logger_utils.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/algorithmic_efficiency/logger_utils.py b/algorithmic_efficiency/logger_utils.py
index c1fe89d64..f3ab5c326 100644
--- a/algorithmic_efficiency/logger_utils.py
+++ b/algorithmic_efficiency/logger_utils.py
@@ -66,13 +66,15 @@ def get_log_dir(
           f'Resuming from experiment directory {experiment_path} because '
           '--resume_last_run was set.')
     else:
-      resume = 'y'
-      if RANK == 0:
-        resume = input(
-            'Found existing experiment dir with the same name: {}. Do you wish '
-            'to resume training from this dir? [y/N]:'.format(experiment_path))
-        if resume.lower() != 'y':
-          sys.exit()
+      try:
+        if RANK == 0:
+          resume = input(
+              'Found existing experiment dir with the same name: {}. Do you wish '
+              'to resume training from this dir? [y/N]:'.format(experiment_path))
+          if resume.lower() != 'y':
+            sys.exit()
+      except RuntimeError:
+        sys.exit()
 
   if USE_PYTORCH_DDP:
     dist.barrier()

From 80a93bf10bc5b9c899fe4047fbcac05fc6df5577 Mon Sep 17 00:00:00 2001
From: Juhan Bae <pomonam15@gmail.com>
Date: Sat, 6 Apr 2024 04:26:46 -0400
Subject: [PATCH 16/19] minor

---
 algorithmic_efficiency/logger_utils.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/algorithmic_efficiency/logger_utils.py b/algorithmic_efficiency/logger_utils.py
index f3ab5c326..609d996e6 100644
--- a/algorithmic_efficiency/logger_utils.py
+++ b/algorithmic_efficiency/logger_utils.py
@@ -66,18 +66,18 @@ def get_log_dir(
           f'Resuming from experiment directory {experiment_path} because '
           '--resume_last_run was set.')
     else:
-      try:
-        if RANK == 0:
-          resume = input(
-              'Found existing experiment dir with the same name: {}. Do you wish '
-              'to resume training from this dir? [y/N]:'.format(experiment_path))
-          if resume.lower() != 'y':
-            sys.exit()
-      except RuntimeError:
-        sys.exit()
+      if RANK == 0:
+        resume = input(
+            'Found existing experiment dir with the same name: {}. Do you wish '
+            'to resume training from this dir? [y/N]:'.format(experiment_path))
+        if resume.lower() != 'y':
+          sys.exit()
 
   if USE_PYTORCH_DDP:
-    dist.barrier()
+    try:
+      dist.barrier()
+    except RuntimeError:
+      sys.exit()
   logging.info(f'Creating experiment directory at {experiment_path}.')
   makedir(experiment_path)
   return experiment_path

From db59959259c2878620b05c0866e751921c2c12e9 Mon Sep 17 00:00:00 2001
From: Priya Kasimbeg <kasimbeg@google.com>
Date: Fri, 19 Apr 2024 17:17:23 +0000
Subject: [PATCH 17/19] remove self-reporting

---
 CALL_FOR_SUBMISSIONS.md | 1 -
 COMPETITION_RULES.md    | 3 ---
 README.md               | 5 ++---
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/CALL_FOR_SUBMISSIONS.md b/CALL_FOR_SUBMISSIONS.md
index 059040107..0e21f0e9c 100644
--- a/CALL_FOR_SUBMISSIONS.md
+++ b/CALL_FOR_SUBMISSIONS.md
@@ -17,7 +17,6 @@ Submissions can compete under two hyperparameter tuning rulesets (with separate
 - **Registration deadline to express non-binding intent to submit: February 28th, 2024**.\
 Please fill out the (mandatory but non-binding) [**registration form**](https://forms.gle/K7ty8MaYdi2AxJ4N8).
 - **Submission deadline: April 04th, 2024** *(moved by a week from the initial March 28th, 2024)*
-- **Deadline for self-reporting preliminary results: May 28th, 2024**
 - [tentative] Announcement of all results: July 15th, 2024
 
 For a detailed and up-to-date timeline see the [Competition Rules](/COMPETITION_RULES.md).
diff --git a/COMPETITION_RULES.md b/COMPETITION_RULES.md
index b18401fe8..49ad9365f 100644
--- a/COMPETITION_RULES.md
+++ b/COMPETITION_RULES.md
@@ -43,7 +43,6 @@ The Competition begins at 12:01am (ET) on November 28, 2023 and ends at 11:59pm
 
 - **Intention to Submit.** You must register your Intention to Submit no later than 11:59pm ET on February 28, 2024.
 - **Submission Period.** You must complete your Submission and enter it after the Intention to Submit deadline, but no later than 11:59pm ET on April 04, 2024.
-- **Deadline for self-reporting results.** 11:59pm ET on May 28, 2024.
 
 ## Agreement to Official Rules
 
@@ -65,8 +64,6 @@ There are four (4) steps to a successful submission ("Submission").
 
    The form is sent to the working group chairs, who will process your Submission. Failure to complete the proper Submission Forms will results in disqualification of your Submission. At the close of the Submission Period, your GitHub repository must be public.
 
-4. **Report Results.** Prior to the Deadline for self-reporting results, run your Submission on either the qualification set or the full benchmark set and report the results. You must report your scores by uploading all unmodified logs that the benchmarking codebase automatically generates in a separate `/results` directory within the `/submission` folder of your Submission's GitHub repository.
-
 ## Submission Conditions
 
 All Submissions must meet the requirements of the terms contained in these rules, including reliance on new algorithmic or mathematical ideas and concepts, and must not use software engineering approaches in order to increase primitive operations in PyTorch, JAX, their dependencies, the operating systems, or the hardware. By entering, all Team members warrant that their Submission does not infringe any third party's rights, and that Team members have obtained all necessary permissions from all relevant third parties to submit the Submission. If, in the sole discretion of Sponsor, any Submission constitutes copyright or other intellectual property infringement, the Submission will be disqualified. Team must hold all rights through license or ownership to the entire Submission. Team members agree to indemnify Sponsor against any and all claims of infringement from any third party for any use by Sponsor of a Submission. Team members may not be: 1) represented under contract that would limit or impair Sponsor's ability to use the Submission; or 2) are under any other contractual relationship, including but not limited to guild and/or union memberships, that may prohibit them from participating fully in this Competition, or from allowing Sponsor to use royalty-free, the Submission worldwide in all media in perpetuity.
diff --git a/README.md b/README.md
index 3628caede..e4fb294ed 100644
--- a/README.md
+++ b/README.md
@@ -27,9 +27,8 @@
 ---
 
 > [!IMPORTANT]
-> Upcoming Deadline:
-> Submission deadline: **April 04th, 2024** (*moved by a week*). \
-> For submission instructions please see [Packaging your Submission Code](/GETTING_STARTED.md#package-your-submission-code) section in the Getting Started document.\
+> Submitters are no longer required to self-report results. 
+> We are currently in the process of evaluating and scoring received submissions. 
 > For other key dates please see [Call for Submissions](CALL_FOR_SUBMISSIONS.md).
 
 ## Table of Contents <!-- omit from toc -->

From 944203fe2ada3d6fac03e5533eed751e7b510e6b Mon Sep 17 00:00:00 2001
From: Priya Kasimbeg <kasimbeg@google.com>
Date: Fri, 19 Apr 2024 17:20:53 +0000
Subject: [PATCH 18/19] add notes

---
 DOCUMENTATION.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md
index d6a8676ff..607f47ead 100644
--- a/DOCUMENTATION.md
+++ b/DOCUMENTATION.md
@@ -400,6 +400,8 @@ Submissions will be scored based on their performance on the [fixed workload](#f
 
 Furthermore, a less computationally expensive subset of the fixed workloads is collected with the [qualification set](#qualification-set). Submitters without enough compute resources to self-report on the full set of fixed and held-out workloads can instead self-report on this smaller qualification set. Well-performing submissions can thereby qualify for computational resources provided by sponsors of the benchmark to be scored on the full benchmark set.
 
+NOTE: Submitters are no longer required to self-report results for AlgoPerf competition v0.5.
+
 #### Fixed workloads
 
 The fixed workloads are fully specified with the call for submissions. They contain a diverse set of tasks such as image classification, machine translation, speech recognition, or other typical machine learning tasks. For a single task there might be multiple models and therefore multiple fixed workloads. The entire set of fixed workloads should have a combined runtime of roughly 100 hours on the [benchmarking hardware](#benchmarking-hardware).
@@ -429,6 +431,8 @@ Our scoring procedure uses the held-out workloads only to penalize submissions t
 
 #### Qualification set
 
+NOTE: Submitters are no longer required to self-report results for AlgoPerf competition v0.5.
+
 The qualification set is designed for submitters that may not have the compute resources to self-report on the full set of [fixed](#fixed-workloads) and [held-out workloads](#randomized-workloads). They may instead self-report numbers on this smaller qualification set. The best-performing submissions may then qualify for compute sponsorship offering a free evaluation on the full benchmark set and therefore the possibility to win [awards and prizes](/COMPETITION_RULES.md#prizes).
 
 The qualification set consists of the same [fixed workloads](#fixed-workloads) as mentioned above, except for both workloads on *ImageNet*, both workloads on *LibriSpeech*, and the *fastMRI* workload. The remaining three workloads (*WMT*, *Criteo 1TB*, and *OGBG*) form the qualification set. There are no [randomized workloads](#randomized-workloads) in the qualification set. The qualification set of workloads aims to have a combined runtime of roughly 24 hours on the [benchmarking hardware](#benchmarking-hardware).
@@ -449,6 +453,8 @@ All scored runs have to be performed on the benchmarking hardware to allow for a
 - 240 GB in RAM
 - 2 TB in storage (for datasets).
 
+NOTE: Submitters are no longer required to self-report results for AlgoPerf competition v0.5.
+
 For self-reported results, it is acceptable to perform the tuning trials on hardware different from the benchmarking hardware, as long as the same hardware is used for all tuning trials. Once the best trial, i.e. the one that reached the *validation* target the fastest, was determined, this run has to be repeated on the competition hardware. For example, submitters can tune using their locally available hardware but have to use the benchmarking hardware, e.g. via cloud providers, for the $5$ scored runs. This allows for a fair comparison to the reported results of other submitters while allowing some flexibility in the hardware.
 
 #### Defining target performance
@@ -571,10 +577,14 @@ on the benchmarking hardware. We also recommend to do a dry run using a cloud in
 
 #### Are we allowed to use our own hardware to self-report the results?
 
+NOTE: Submitters are no longer required to self-report results for AlgoPerf competition v0.5.
+
 You only have to use the benchmarking hardware for runs that are directly involved in the scoring procedure. This includes all runs for the self-tuning ruleset, but only the runs of the best hyperparameter configuration in each study for the external tuning ruleset. For example, you could use your own (different) hardware to tune your submission and identify the best hyperparameter configuration (in each study) and then only run this configuration (i.e. 5 runs, one for each study) on the benchmarking hardware.
 
 #### What can I do if running the benchmark is too expensive for me?
 
+NOTE: Submitters are no longer required to self-report results for AlgoPerf competition v0.5.
+
 Submitters unable to self-fund scoring costs can instead self-report only on the [qualification set of workloads](/COMPETITION_RULES.md#qualification-set) that excludes some of the most expensive workloads. Based on this performance on the qualification set, the working group will provide - as funding allows - compute to evaluate and score the most promising submissions. Additionally, we encourage researchers to reach out to the [working group](mailto:algorithms@mlcommons.org) to find potential collaborators with the resources to run larger, more comprehensive experiments for both developing and scoring submissions.
 
 #### Can I submit previously published training algorithms as submissions?

From 9e3d41a382b53d6fd7de549fafad65faaeb55960 Mon Sep 17 00:00:00 2001
From: Priya Kasimbeg <kasimbeg@google.com>
Date: Fri, 19 Apr 2024 17:23:46 +0000
Subject: [PATCH 19/19] add notes

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e4fb294ed..f778c94a6 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,8 @@
 
 > [!IMPORTANT]
 > Submitters are no longer required to self-report results. 
-> We are currently in the process of evaluating and scoring received submissions. 
+> We are currently in the process of evaluating and scoring received submissions.
+> We are aiming to release results by July 15th, 2024.
 > For other key dates please see [Call for Submissions](CALL_FOR_SUBMISSIONS.md).
 
 ## Table of Contents <!-- omit from toc -->