Add Goodput & Badput recording and monitoring support. #783

Open
wants to merge 8 commits into base: main
Changes from 5 commits
46 changes: 45 additions & 1 deletion axlearn/cloud/gcp/measurement.py
@@ -5,6 +5,7 @@
import jax
from absl import flags, logging
from ml_goodput_measurement import goodput
from ml_goodput_measurement import monitoring as goodput_monitoring

from axlearn.cloud.common.utils import parse_kv_flags
from axlearn.common import measurement
@@ -22,7 +23,11 @@ def from_flags(cls, fv: flags.FlagValues) -> "GoodputRecorder":
"""Converts flags to a recorder.

`fv.recorder_spec` will be interpreted as a list of `key=value` pairs; config names
corresponding to keys will be set to the corresponding values.
corresponding to keys will be set to the corresponding values. A GoodputRecorder can
additionally take in the following Tensorboard configs in the recorder_spec:
- upload_dir: The directory to write Tensorboard data to.
- upload_interval: The time interval in seconds at which to query and upload data
to Tensorboard.
"""
cfg: measurement.Recorder.Config = cls.default_config()
cfg = maybe_set_config(cfg, **parse_kv_flags(fv.recorder_spec, delimiter="="))
@@ -32,6 +37,7 @@ def __init__(self, cfg):
super().__init__(cfg)
cfg: GoodputRecorder.Config = self.config
self._recorder = None
self._monitor = None

def record(self, event: measurement.Event, *args, **kwargs):
# Lazily instantiate the recorder. This avoids invoking jax before setup is complete.
@@ -49,10 +55,48 @@ def record(self, event: measurement.Event, *args, **kwargs):
self._recorder.record_job_end_time(*args, **kwargs)
elif event == measurement.Event.START_STEP:
self._recorder.record_step_start_time(*args, **kwargs)
elif event == measurement.Event.START_ACCELERATOR_INIT:
self._recorder.record_tpu_init_start_time(*args, **kwargs)
elif event == measurement.Event.END_ACCELERATOR_INIT:
self._recorder.record_tpu_init_end_time(*args, **kwargs)
elif event == measurement.Event.START_TRAINING_PREPARATION:
self._recorder.record_training_preparation_start_time(*args, **kwargs)
elif event == measurement.Event.END_TRAINING_PREPARATION:
self._recorder.record_training_preparation_end_time(*args, **kwargs)
elif event == measurement.Event.START_DATA_LOADING:
self._recorder.record_data_loading_start_time(*args, **kwargs)
elif event == measurement.Event.END_DATA_LOADING:
self._recorder.record_data_loading_end_time(*args, **kwargs)
else:
logging.log_first_n(
logging.WARNING,
"Ignoring unknown event %s",
1,
event,
)

def start_monitoring(self, *args, **kwargs):
# Instantiate ml-goodput-measurement's GoodputMonitor
# to asynchronously calculate goodput and badput at
# the upload_interval and upload to the specified
# tensorboard directory.
if self._monitor is None:
cfg: GoodputRecorder.Config = self.config
self._monitor = goodput_monitoring.GoodputMonitor(
job_name=cfg.name,
logger_name=f"goodput_logger_{cfg.name}",
tensorboard_dir=cfg.upload_dir,
upload_interval=int(cfg.upload_interval),
monitoring_enabled=(jax.process_index() == 0),
include_badput_breakdown=True,
)

if self._monitor:
self._monitor.start_goodput_uploader(*args, **kwargs)
logging.info("Started Goodput upload to Tensorboard in the background!")
else:
logging.log_first_n(
logging.WARNING,
"Goodput upload could not be started. Please check GoodputMonitor logs.",
1,
)
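
For reference, a minimal sketch of how the new recorder_spec entries and the start_monitoring hook might be exercised directly, using only APIs that appear in this diff; the job name, upload directory, and interval below are hypothetical values:

from absl import flags

from axlearn.cloud.gcp.measurement import GoodputRecorder
from axlearn.common import measurement

fv = flags.FlagValues()
measurement.define_flags(flag_values=fv)
# upload_dir and upload_interval are the new Tensorboard-related configs parsed from recorder_spec.
fv.set_default(
    "recorder_spec",
    ["name=my-job", "upload_dir=/tmp/goodput", "upload_interval=30"],
)
fv.mark_as_parsed()

recorder = GoodputRecorder.from_flags(fv)
# Lifecycle events map onto ml-goodput-measurement record_* calls; the backing
# recorder is created lazily on the first event.
recorder.record(measurement.Event.START_ACCELERATOR_INIT)
recorder.record(measurement.Event.END_ACCELERATOR_INIT)
# Starts the background GoodputMonitor uploader (process 0 only).
recorder.start_monitoring()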
41 changes: 38 additions & 3 deletions axlearn/cloud/gcp/measurement_test.py
@@ -16,7 +16,9 @@
class GoodputRecorderTest(parameterized.TestCase):
"""Tests GoodputRecorder."""

@parameterized.parameters(None, ["name=test-name"])
@parameterized.parameters(
(None,), (["name=test-name", "upload_dir=/test/path/to/upload", "upload_interval=15"],)
)
def test_from_flags(self, spec):
fv = flags.FlagValues()
measurement.define_flags(flag_values=fv)
@@ -34,13 +36,46 @@ def test_from_flags(self, spec):
# Recorder is not instantiated until first event.
self.assertIsNone(recorder._recorder)

def test_record(self):
def test_record_and_monitor(self):
fv = flags.FlagValues()
measurement.define_flags(flag_values=fv)
fv.set_default("recorder_spec", ["name=test-name"])
fv.set_default(
"recorder_spec",
["name=test-name", "upload_dir=/test/path/to/upload", "upload_interval=15"],
)
fv.mark_as_parsed()

recorder = GoodputRecorder.from_flags(fv)
recorder._recorder = mock.MagicMock()
recorder.record(measurement.Event.START_JOB)
self.assertTrue(recorder._recorder.record_job_start_time.called)

def test_start_monitoring(self):
fv = flags.FlagValues()
measurement.define_flags(flag_values=fv)
fv.set_default(
"recorder_spec",
["name=test-name", "upload_dir=/test/path/to/upload", "upload_interval=15"],
)
fv.mark_as_parsed()

recorder = GoodputRecorder.from_flags(fv)
recorder._monitor = None # Ensure _monitor is initially None

with mock.patch("ml_goodput_measurement.monitoring.GoodputMonitor") as mock_goodput_monitor:
mock_monitor_instance = mock_goodput_monitor.return_value
recorder.start_monitoring()

# Check that GoodputMonitor was instantiated
mock_goodput_monitor.assert_called_once_with(
job_name="test-name",
logger_name="goodput_logger_test-name",
tensorboard_dir="/test/path/to/upload",
upload_interval=15,
monitoring_enabled=True,
include_badput_breakdown=True,
)

# Ensure that start_goodput_uploader is called on the monitor instance
mock_monitor_instance.start_goodput_uploader.assert_called_once()
self.assertIsNotNone(recorder._monitor)
1 change: 1 addition & 0 deletions axlearn/common/launch_trainer_main.py
@@ -13,6 +13,7 @@ def main(_):
launch.setup()
trainer_config = launch_trainer.get_trainer_config()
trainer_config.set(recorder=config_for_function(lambda: measurement.global_recorder))
measurement.start_monitoring()
launch_trainer.run_trainer(trainer_config)


33 changes: 33 additions & 0 deletions axlearn/common/measurement.py
@@ -18,11 +18,23 @@ class Event(enum.Enum):
START_JOB: Start of job.
END_JOB: End of job.
START_STEP: Start of a training step. Should be recorded with `step` as a positional arg.
START_ACCELERATOR_INIT: Start of accelerator mesh initialization.
END_ACCELERATOR_INIT: End of accelerator mesh initialization.
START_TRAINING_PREPARATION: Start of training preparation.
END_TRAINING_PREPARATION: End of training preparation.
START_DATA_LOADING: Start of data loading.
END_DATA_LOADING: End of data loading.
"""

START_JOB = "START_JOB"
END_JOB = "END_JOB"
START_STEP = "START_STEP"
START_ACCELERATOR_INIT = "START_ACCELERATOR_INIT"
END_ACCELERATOR_INIT = "END_ACCELERATOR_INIT"
START_TRAINING_PREPARATION = "START_TRAINING_PREPARATION"
END_TRAINING_PREPARATION = "END_TRAINING_PREPARATION"
START_DATA_LOADING = "START_DATA_LOADING"
END_DATA_LOADING = "END_DATA_LOADING"


class Recorder(Configurable):
@@ -34,9 +46,13 @@ class Config(Configurable.Config):

Attributes:
name: Name of the recorder.
upload_dir: Directory to store metrics for the monitor.
upload_interval: Time interval (seconds) for monitoring uploads.
"""

name: Required[str] = REQUIRED
upload_dir: Required[str] = REQUIRED
upload_interval: Required[int] = REQUIRED

@classmethod
def from_flags(cls, fv: Optional[flags.FlagValues]) -> "Recorder":
@@ -47,6 +63,10 @@ def record(self, event: Event, *args, **kwargs):
"""Records an event with the given name."""
raise NotImplementedError(type(self))

def start_monitoring(self, **kwargs):
"""Starts computing and uploading metrics at some configured interval in the background."""
raise NotImplementedError(type(self))


_recorders: dict[str, type] = {}
_T = TypeVar("_T")
@@ -120,3 +140,16 @@ def record_event(event: Event):
logging.log_first_n(logging.INFO, "No recorder configured, ignoring events.", 1)
else:
global_recorder.record(event)


def start_monitoring():
"""Begins monitoring events as per global monitor functionality."""
if global_recorder is None:
logging.log_first_n(
logging.INFO, "Since recorder is not set up, monitoring cannot be started.", 1
)
else:
global_recorder.start_monitoring()
logging.info(
"Starting monitoring of events using global recorder's monitor: %s", global_recorder
)
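
As a hedged illustration of the extended Recorder contract (not part of this PR), a subclass now has to handle both the enlarged Event set and the new start_monitoring hook. The LoggingRecorder below is hypothetical and only logs:

class LoggingRecorder(Recorder):
    """A hypothetical recorder that logs events instead of measuring goodput."""

    def record(self, event: Event, *args, **kwargs):
        # Every event defined above (job, step, accelerator init, training
        # preparation, data loading) flows through here.
        logging.info("Recorded %s with args=%s kwargs=%s", event, args, kwargs)

    def start_monitoring(self, **kwargs):
        # A real implementation would start a background task that computes and
        # uploads metrics to cfg.upload_dir every cfg.upload_interval seconds.
        cfg = self.config
        logging.info("Would upload to %s every %ss.", cfg.upload_dir, cfg.upload_interval)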
7 changes: 7 additions & 0 deletions axlearn/common/measurement_test.py
@@ -85,3 +85,10 @@ def test_initialize(self, recorder_type, expected):
with mock.patch.object(measurement.global_recorder, "record") as mock_record:
measurement.record_event(measurement.Event.START_JOB)
self.assertIn(measurement.Event.START_JOB, mock_record.call_args[0])

# Ensure that start_monitoring does not fail.
with mock.patch.object(
measurement.global_recorder, "start_monitoring"
) as mock_start_monitoring:
measurement.start_monitoring()
mock_start_monitoring.assert_called_once()
8 changes: 8 additions & 0 deletions axlearn/common/trainer.py
@@ -238,6 +238,7 @@ def __init__(
utils.validate_float_dtype(cfg.train_dtype)

# Create the device mesh.
self._maybe_record_event(measurement.Event.START_ACCELERATOR_INIT)
if devices is None:
self._step_log(
"Devices: global=%s local=%s %s",
@@ -324,6 +325,7 @@ def __init__(
model=self.model,
model_param_partition_specs=model_param_partition_specs,
)
self._maybe_record_event(measurement.Event.END_ACCELERATOR_INIT)

@property
def step(self):
@@ -810,6 +812,7 @@ def _prepare_training(self, prng_key: Tensor) -> bool:
# Attempt to restore the latest checkpoint, which may contain a saved `_input_iter`.
self.restore_checkpoint(restore_step=None)

self._maybe_record_event(measurement.Event.START_TRAINING_PREPARATION)
if self.step is None:
# If we didn't restore from checkpoint, attempt to build initial state according
# to `cfg.init_state_builder` and initialize the remaining parameters.
@@ -825,6 +828,7 @@ def _prepare_training(self, prng_key: Tensor) -> bool:
f.write(str(jax.tree_util.tree_structure(self._trainer_state)))

self._log_trainer_state_stats()
self._maybe_record_event(measurement.Event.END_TRAINING_PREPARATION)
# Log config.
self.summary_writer.log_config(cfg, step=self.step)

@@ -861,6 +865,7 @@ def restore_checkpoint(self, restore_step: Optional[int] = None) -> Optional[int
restore_input_iter = cfg.save_input_iterator
try:
# Try to restore with `input_iter`.
self._maybe_record_event(measurement.Event.START_DATA_LOADING)
step, ckpt_state = self.checkpointer.restore(
step=restore_step,
state=(
Expand All @@ -874,13 +879,15 @@ def restore_checkpoint(self, restore_step: Optional[int] = None) -> Optional[int
step,
restore_input_iter,
)
self._maybe_record_event(measurement.Event.END_DATA_LOADING)
except ValueError as e:
logging.warning(
"Attempt to restore checkpoint with restore_input_iter=%s failed: %s",
restore_input_iter,
e,
)
# Restore with a different restore_input_iter setting.
self._maybe_record_event(measurement.Event.START_DATA_LOADING)
restore_input_iter = not restore_input_iter
step, ckpt_state = self.checkpointer.restore(
step=restore_step,
Expand All @@ -895,6 +902,7 @@ def restore_checkpoint(self, restore_step: Optional[int] = None) -> Optional[int
step,
restore_input_iter,
)
self._maybe_record_event(measurement.Event.END_DATA_LOADING)
if step is not None:
self._step = step
self._trainer_state = TrainerState(
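
Note that _maybe_record_event is defined elsewhere in trainer.py and is outside this diff; a rough sketch of such a guard, assuming the trainer holds an optional recorder instance (the attribute name here is hypothetical), could look like:

def _maybe_record_event(self, event: measurement.Event, *args, **kwargs):
    # Hypothetical: forward to the configured recorder only when one is present,
    # so the trainer still works with no measurement recorder set up.
    recorder = getattr(self, "_recorder", None)
    if recorder is not None:
        recorder.record(event, *args, **kwargs)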
@@ -10,4 +10,8 @@ class DummyRecorder(measurement.Recorder):
@classmethod
def from_flags(cls, fv) -> measurement.Recorder:
del fv
return cls.default_config().set(name="dummy_recorder").instantiate()
return (
cls.default_config()
.set(name="dummy_recorder", upload_dir="/dummy/upload_dir", upload_interval=15)
.instantiate()
)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -91,7 +91,7 @@ gcp = [
"google-cloud-compute==1.19.2", # Needed for region discovery for CloudBuild API access.
"google-cloud-core==2.3.3",
"google-cloud-build==3.24.1",
"ml_goodput_measurement==0.0.2",
"ml-goodput-measurement==0.0.4",
"pika==1.3.2", # used by event queue
"pyOpenSSL>=22.1.0", # compat with cryptography version.
"tpu-info==0.2.0", # For TPU monitoring from libtpu. https://github.com/AI-Hypercomputer/cloud-accelerator-diagnostics/tree/main/tpu_info