Weekly Benchmarks Input Range #1708

Merged 2 commits on Feb 9, 2024

26 changes: 24 additions & 2 deletions python_benchmarks/global_params.py
@@ -4,6 +4,12 @@
from .core import DEVICE_PROPERTIES
import numpy as np
import itertools
import os

# BENCHMARK_MODE = weekly/nightly.
BENCHMARK_MODE = os.getenv("BENCHMARK_MODE")
if not BENCHMARK_MODE:
BENCHMARK_MODE = "nightly"

# Datatypes to benchmark
FLOAT_DTYPES = [torch.float32]
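For reference, a minimal self-contained sketch of the fallback behaviour this hunk introduces; the helper function below is hypothetical (the diff reads the variable once at module import time rather than through a wrapper):

```python
import os

# Hypothetical helper mirroring the logic above: an unset or empty
# BENCHMARK_MODE resolves to "nightly", any other value is passed through.
def resolve_benchmark_mode() -> str:
    mode = os.getenv("BENCHMARK_MODE")
    if not mode:
        mode = "nightly"
    return mode

os.environ.pop("BENCHMARK_MODE", None)
print(resolve_benchmark_mode())  # -> "nightly"

os.environ["BENCHMARK_MODE"] = "weekly"
print(resolve_benchmark_mode())  # -> "weekly"
```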
@@ -44,19 +50,35 @@

# Utility function to generate input sizes for benchmarks
def generate_input_sizes(dims: Union[int, List] = 2) -> List[Tuple]:
"""
The weekly vs nightly input ranges only differ for 2D inputs currently.
Nightly input range:
Batch size: [16->16384] Hidden size: [768, 4*18432] (step size = 256)
Weekly input range:
Batch size:
[16]: Latency bound state
[512, 1024]: Just filled the machine
            [16384]: Steady state (full machine)
        Hidden size: [768, 4*18432] (step size = 8)
    Note: The hidden size is restricted to 2 * 18432 for batch size 16384 to avoid OOM.
    """
    inputs = []
    if isinstance(dims, int):
        dims = [dims]

    for dim in dims:
        if dim == 2:
            input_ranges = []

            step_size = 256
            # max_batch_range: set according to max size that fits in GPU memory
            batch_range = [2**i for i in range(4, 14)]  # {16, 8192}
if BENCHMARK_MODE == "weekly":
step_size = 8
batch_range = [16, 512, 1024]

# max_hidden_size = 4 * d_model_max (max hidden size in feedforward layers)
            # NOTE: Numpy arrays are not JSON serializable so convert them to enable storing benchmark data.
            hidden_range = np.arange(
                D_MODEL_MIN, 4 * D_MODEL_MAX + 1, step_size
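To make the weekly vs. nightly difference concrete, below is a small standalone sketch of the 2D range logic added in this file. D_MODEL_MIN = 768 and D_MODEL_MAX = 18432 are assumed from the docstring, the function name is hypothetical, and the OOM cap for batch size 16384 mentioned in the docstring is not modeled because that part of the diff is truncated here:

```python
import numpy as np

D_MODEL_MIN, D_MODEL_MAX = 768, 18432  # assumed from the docstring above


def sketch_2d_input_sizes(benchmark_mode: str = "nightly"):
    # Mirrors the dim == 2 branch of generate_input_sizes().
    step_size = 256
    batch_range = [2**i for i in range(4, 14)]  # 16 .. 8192
    if benchmark_mode == "weekly":
        step_size = 8
        batch_range = [16, 512, 1024]
    # tolist() keeps the sizes JSON serializable (plain Python ints).
    hidden_range = np.arange(D_MODEL_MIN, 4 * D_MODEL_MAX + 1, step_size).tolist()
    return [(batch, hidden) for batch in batch_range for hidden in hidden_range]


print(len(sketch_2d_input_sizes("nightly")))  # coarse sweep: 10 batch sizes, step 256
print(len(sketch_2d_input_sizes("weekly")))   # fine sweep: 3 batch sizes, step 8
```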