Changing logging levels #222

Open · wants to merge 10 commits into main
11 changes: 10 additions & 1 deletion dlio_benchmark/common/enumerations.py
@@ -50,6 +50,16 @@ class StorageType(Enum):

def __str__(self):
return self.value

class LogLevel(Enum):
"""
Different levels of logging.
"""
DEBUG = "debug"
INFO = "info"
WARNING = "warning"
ERROR = "error"

def __str__(self):
return self.value

class MetadataType(Enum):
"""
@@ -107,7 +117,6 @@ class ComputationType(Enum):
SYNC = 'sync'
ASYNC = 'async'


class FormatType(Enum):
"""
Format Type supported by the benchmark.
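To illustrate the new enum's behavior, a quick standalone sketch (the asserts are illustrative, not part of the patch):

from dlio_benchmark.common.enumerations import LogLevel

assert LogLevel("warning") is LogLevel.WARNING  # by-value lookup, as used in LoadConfig
assert str(LogLevel.DEBUG) == "debug"           # __str__ returns the YAML-friendly string
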
2 changes: 0 additions & 2 deletions dlio_benchmark/main.py
@@ -146,8 +146,6 @@ def initialize(self):
- Start profiling session for Darshan and Tensorboard.
"""
self.comm.barrier()
if self.args.debug and self.args.my_rank == 0:
input("Debug mode: Press enter to start\n")

if self.args.generate_data:
if self.args.my_rank == 0:
26 changes: 18 additions & 8 deletions dlio_benchmark/utils/config.py
@@ -26,7 +26,7 @@

from dlio_benchmark.common.constants import MODULE_CONFIG
from dlio_benchmark.common.enumerations import StorageType, FormatType, Shuffle, ReadType, FileAccess, Compression, \
FrameworkType, \
FrameworkType, LogLevel, \
DataLoaderType, Profiler, DatasetType, DataLoaderSampler, CheckpointLocationType, CheckpointMechanismType
from dlio_benchmark.utils.utility import DLIOMPI, get_trace_name, utcnow
from dataclasses import dataclass
@@ -91,7 +91,7 @@ class ConfigArguments:
chunk_size: int = 0
compression: Compression = Compression.NONE
compression_level: int = 4
debug: bool = False
log_level: LogLevel = LogLevel.INFO
total_training_steps: int = -1
do_eval: bool = False
batch_size_eval: int = 1
@@ -167,18 +167,29 @@ def configure_dlio_logging(self, is_child=False):
if is_child and self.multiprocessing_context == "fork":
return
# Configure the logging library
log_level = logging.DEBUG if self.debug else logging.INFO
log_format_verbose = '[%(levelname)s] %(message)s [%(pathname)s:%(lineno)d]'
log_format_simple = '[%(levelname)s] %(message)s'
# Use the simple format unless log_level is DEBUG, which selects the verbose format
log_format = log_format_simple
if self.log_level == LogLevel.DEBUG:
log_level = logging.DEBUG
log_format = log_format_verbose
elif self.log_level == LogLevel.WARNING:
log_level = logging.WARNING
elif self.log_level == LogLevel.ERROR:
log_level = logging.ERROR
else:
log_level = logging.INFO
logging.basicConfig(
level=log_level,
force=True,
handlers=[
logging.FileHandler(self.logfile_path, mode="a", encoding='utf-8'),
logging.StreamHandler()
],
format='[%(levelname)s] %(message)s [%(pathname)s:%(lineno)d]'
format=log_format
# logging's max timestamp resolution is msecs; usecs are passed in the message itself
)

def configure_dftracer(self, is_child=False, use_pid=False):
# with "multiprocessing_context=fork" the profiler file remains open in the child process
if is_child and self.multiprocessing_context == "fork":
@@ -542,7 +553,8 @@ def LoadConfig(args, config):
args.output_folder = config['output']['folder']
if 'log_file' in config['output']:
args.log_file = config['output']['log_file']

if 'log_level' in config['output']:
args.log_level = LogLevel(config['output']['log_level'])
if args.output_folder is None:
try:
hydra_cfg = hydra.core.hydra_config.HydraConfig.get()
@@ -558,8 +570,6 @@
args.generate_only = True
else:
args.generate_only = False
if 'debug' in config['workflow']:
args.debug = config['workflow']['debug']
if 'evaluation' in config['workflow']:
args.do_eval = config['workflow']['evaluation']
if 'checkpoint' in config['workflow']:
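The if/elif chain in configure_dlio_logging maps the config enum onto the standard library's numeric levels and picks the verbose format only for DEBUG. A minimal standalone sketch of that selection logic as a lookup table (the helper name resolve_logging is hypothetical, not part of the patch):

import logging

from dlio_benchmark.common.enumerations import LogLevel

_LEVELS = {
    LogLevel.DEBUG: logging.DEBUG,
    LogLevel.INFO: logging.INFO,
    LogLevel.WARNING: logging.WARNING,
    LogLevel.ERROR: logging.ERROR,
}

def resolve_logging(level):
    # Only DEBUG gets the verbose format with file/line information.
    if level == LogLevel.DEBUG:
        fmt = '[%(levelname)s] %(message)s [%(pathname)s:%(lineno)d]'
    else:
        fmt = '[%(levelname)s] %(message)s'
    return _LEVELS.get(level, logging.INFO), fmt

Both values would then be fed into logging.basicConfig, e.g. level, fmt = resolve_logging(LogLevel.WARNING).
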
4 changes: 2 additions & 2 deletions dlio_benchmark/utils/statscounter.py
@@ -322,7 +322,7 @@ def batch_processed(self, epoch, step, block, t0, computation_time):
else:
self.output[epoch]['proc'] = [duration]
self.output[epoch]['compute']=[computation_time]
logging.info(f"{utcnow()} Rank {self.my_rank} step {step} processed {self.batch_size} samples in {duration} s")
logging.debug(f"{utcnow()} Rank {self.my_rank} step {step} processed {self.batch_size} samples in {duration} s")

def compute_metrics_train(self, epoch, block):
key = f"block{block}"
@@ -358,7 +358,7 @@ def eval_batch_processed(self, epoch, step, t0, computation_time):
duration = time() - t0
self.output[epoch]['proc']['eval'].append(duration)
self.output[epoch]['compute']['eval'].append(computation_time)
logging.info(f"{utcnow()} Rank {self.my_rank} step {step} processed {self.batch_size_eval} samples in {duration} s")
logging.debug(f"{utcnow()} Rank {self.my_rank} step {step} processed {self.batch_size_eval} samples in {duration} s")

def finalize(self):
self.summary['end'] = utcnow()

def save_data(self):
26 changes: 13 additions & 13 deletions dlio_benchmark/utils/utility.py
@@ -37,38 +37,38 @@
from dftracer.logger import dftracer as PerfTrace, dft_fn as Profile, DFTRACER_ENABLE as DFTRACER_ENABLE
except ImportError:
class Profile(object):
def __init__(self, **kwargs):
return
def log(self, **kwargs):
return
def log_init(self, **kwargs):
return
def iter(self, **kwargs):
def __init__(self, cat, name=None, epoch=None, step=None, image_idx=None, image_size=None):
return
def log(self, func):
return func
def log_init(self, func):
return func
def iter(self, func, iter_name="step"):
return func
def __enter__(self):
return
def __exit__(self, **kwargs):
def __exit__(self, type, value, traceback):
return
def update(self, **kwargs):
def update(self, epoch=None, step=None, image_idx=None, image_size=None, args={}):
return
def flush(self):
return
def reset(self):
return
def log_static(self, **kwargs):
return
def log_static(self, func):
return func
class dftracer(object):
def __init__(self,):
self.type = None
def initialize_log(self, **kwargs):
def initialize_log(self, logfile=None, data_dir=None, process_id=-1):
return
def get_time(self):
return
def enter_event(self):
return
def exit_event(self):
return
def log_event(self, **kwargs):
def log_event(self, name, cat, start_time, duration, string_args=None):
return
def finalize(self):
return
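These stubs must be drop-in replacements for the real dftracer API, so the explicit signatures matter: the previous **kwargs versions would fail when called positionally (e.g. __exit__ from a with statement) or used as decorators, since they returned None instead of the wrapped function. A short sketch of how the no-op stubs are exercised when dftracer is unavailable (read_sample is a hypothetical function):

dlp = Profile(cat="reader")

@dlp.log                      # no-op decorator: returns read_sample unchanged
def read_sample(idx):
    return idx

with Profile(cat="block"):    # __enter__/__exit__ are harmless no-ops
    read_sample(0)
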
8 changes: 8 additions & 0 deletions docs/source/config.rst
@@ -357,11 +357,19 @@ output
* - log_file
- dlio.log
- log file name
* - log_level
- "info"
- logging level to use [error|warning|info|debug]

.. note::

If ``folder`` is not set (None), the output folder will be ``hydra_log/unet3d/$DATE-$TIME``.

.. note::

``log_level=debug`` outputs detailed logging information at every step, whereas ``log_level=info`` only outputs logs at the end of each epoch.
For performance runs, we suggest ``log_level=error`` to suppress unnecessary logs.
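
For example, a minimal ``output`` section selecting a quieter level (folder and file names here are illustrative):

.. code-block:: yaml

   output:
     folder: hydra_log/unet3d
     log_file: dlio.log
     log_level: warning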

profiling
------------------
.. list-table::