IBM · maxdebayser · Jun 28, 2024 · Jun 20, 2024 · Jun 27, 2024 · Jun 27, 2024
diff --git a/router/src/server.rs b/router/src/server.rs
@@ -222,8 +222,8 @@ impl<'a, B: BatchType> BatchConfigValidator<'a, B> {
             self.batch_type.prefill_weight(&single_request_stats, 1);
         if max_batch_weight < single_request_prefill_weight {
             panic!(
-                "max_batch_weight ({}) not large enough for (prefill) max_sequence_length ({})",
-                max_batch_weight, max_sequence_length
+                "The provided max_sequence_length ({}) results in a prefill batch weight that exceeds the estimated capacity ({})",
+                max_sequence_length, max_batch_weight
             )
         }
 
@@ -232,8 +232,8 @@ impl<'a, B: BatchType> BatchConfigValidator<'a, B> {
             .batch_initial_weight(&single_request_stats, 1);
         if max_batch_weight < single_request_nexttoken_weight {
             panic!(
-                "max_batch_weight ({}) not large enough for (next-token) max_sequence_length ({})",
-                max_batch_weight, max_sequence_length
+                "The provided max_sequence_length ({}) results in a next-token batch weight that exceeds the estimated capacity ({})",
+                max_sequence_length, max_batch_weight
             )
         }
     }

diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py
@@ -409,13 +409,15 @@ def estimate_memory():
                 memory_scaling_model = estimate_memory()
                 compile()
 
-            max_input = memory_scaling_model.max_input_len_for_nt(1, max_sequence_length-1, sys.maxsize)
-            max_output = memory_scaling_model.max_output_len_for_nt(1, max_sequence_length-1, sys.maxsize)
-
             if local_rank == 0:
+                # For a batch of size 1 and an output of 1, get max input limited by max_sequence_length
+                max_input  = memory_scaling_model.max_input_len_for_nt(1, 1, max_sequence_length)
+                # For a batch of size 1 and an input of 1, get max output limited by max_sequence_length
+                max_output = memory_scaling_model.max_output_len_for_nt(1, 1, max_sequence_length)
+                max_theoretical_len = min(max_input, max_output) + 1
                 print(
                     "Maximum possible sequence length given available memory (for batch size 1): "
-                    f"{min(max_input, max_output)}"
+                    f"{max_theoretical_len}"
                 )
 
         elif ESTIMATE_MEMORY == "manual":