Merge branch 'master' into lekurile/update_bench_scripts
lekurile committed Mar 1, 2024
2 parents 1e51ded + ffb8a4b commit 81036e2
Showing 2 changed files with 25 additions and 1 deletion.
20 changes: 20 additions & 0 deletions benchmarks/inference/mii/run_aml.sh
@@ -0,0 +1,20 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

# Run benchmark against AML endpoint
python ./run_benchmark.py \
--model <model name> \
--deployment_name <aml deployment name> \
--aml_api_url <aml endpoint URL> \
--aml_api_key <aml API key> \
--mean_prompt_length 2600 \
--mean_max_new_tokens 60 \
--num_requests 256 \
--backend aml

### Generate the plots
python ./src/plot_th_lat.py

echo "Find figures in ./plots/ and log outputs in ./results/"
6 changes: 5 additions & 1 deletion benchmarks/inference/mii/src/client.py
@@ -163,7 +163,11 @@ def get_response(response: requests.Response) -> List[str]:
token_gen_time = []
start_time = time.time()
response = requests.post(args.aml_api_url, headers=headers, json=pload)
output = get_response(response)
# Sometimes the AML endpoint will return an error, so we send the request again
try:
    output = get_response(response)
except Exception as e:
    return call_aml(input_tokens, max_new_tokens, args)

return ResponseDetails(
    generated_tokens=output,
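The new error handling above retries by calling call_aml again whenever parsing the response raises, with no upper bound on the number of attempts. Below is an illustrative bounded-retry sketch, not the committed implementation: the parse_fn argument stands in for client.py's get_response, and the attempt count and backoff values are assumptions.

# Illustrative bounded-retry helper; not part of this commit.
# parse_fn stands in for client.py's get_response; the retry count and
# backoff are arbitrary choices for this sketch.
import time
from typing import Any, Callable, Dict, List, Optional

import requests


def post_with_retries(
    url: str,
    headers: Dict[str, str],
    payload: Dict[str, Any],
    parse_fn: Callable[[requests.Response], List[str]],
    max_attempts: int = 3,
    backoff_s: float = 1.0,
) -> List[str]:
    """POST to the endpoint, retrying a bounded number of times on failure."""
    last_exc: Optional[Exception] = None
    for attempt in range(max_attempts):
        try:
            response = requests.post(url, headers=headers, json=payload)
            return parse_fn(response)  # raises if the endpoint returned an error
        except Exception as exc:
            last_exc = exc
            time.sleep(backoff_s * (attempt + 1))  # simple linear backoff
    raise RuntimeError(f"AML request failed after {max_attempts} attempts") from last_exc

Compared with the recursive retry in the diff, this variant caps the number of attempts and surfaces the last error rather than retrying indefinitely if the endpoint keeps failing.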
