[Inference] Add autoscaling config (#214)
* add autoscaling config

* support openai autoscaling

* remove

* update config file

* address comment

* update docs

* update docs

* edit max_ongoing_requests of router actor

* edit max_ongoing_requests of router actor
KepingYan authored Jun 4, 2024
1 parent 7bd29a1 commit fb8542d
Showing 8 changed files with 122 additions and 24 deletions.
15 changes: 11 additions & 4 deletions benchmarks/benchmark_serving.py
@@ -308,11 +308,18 @@ async def send_request(
             if args.track_token_latency:
                 generate_len = len(tokenizer.encode(response_text))
             else:
-                response_content = json.loads(response_text)
-                if isinstance(response_content, list):
-                    generate_len = response_content[0]["generate_length"]
+                if vllm_engine:
+                    length_name = "num_generated_tokens"
                 else:
-                    generate_len = response_content["generate_length"]
+                    length_name = "generate_length"
+                try:
+                    response_content = json.loads(response_text)
+                    if isinstance(response_content, list):
+                        generate_len = response_content[0][length_name]
+                    else:
+                        generate_len = response_content[length_name]
+                except Exception:
+                    generate_len = None
         else:
             if args.track_token_latency:
                 response_content = chunks[-2].decode("utf-8")
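To illustrate the new fallback in isolation, here is a minimal, self-contained sketch (not the benchmark script itself) of how the length field is now resolved; the field names `num_generated_tokens` and `generate_length` come from the diff above, while the sample payloads are made up:

```python
import json


def extract_generate_len(response_text: str, vllm_engine: bool):
    # vLLM responses report "num_generated_tokens"; the default engine reports "generate_length".
    length_name = "num_generated_tokens" if vllm_engine else "generate_length"
    try:
        content = json.loads(response_text)
        if isinstance(content, list):
            return content[0][length_name]
        return content[length_name]
    except Exception:
        # Non-JSON or unexpected payloads no longer abort the benchmark run.
        return None


print(extract_generate_len('{"num_generated_tokens": 128}', vllm_engine=True))  # 128
print(extract_generate_len('[{"generate_length": 56}]', vllm_engine=False))     # 56
print(extract_generate_len("plain text response", vllm_engine=False))           # None
```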
17 changes: 16 additions & 1 deletion docs/serve.md
@@ -7,8 +7,9 @@ Please follow [setup.md](setup.md) to setup the environment first.
 
 
 ## Configure Serving Parameters
-We provide preconfigured yaml files in [inference/models](../llm_on_ray/inference/models) for popular open source models. You can customize a few configurations such as the resource used for serving.
+We provide preconfigured yaml files in [inference/models](../llm_on_ray/inference/models) for popular open source models. You can customize a few configurations for serving.
 
+### Resource
 To deploy on CPU, please make sure `device` is set to CPU and `cpus_per_worker` is set to a correct number.
 ```
 cpus_per_worker: 24
@@ -26,6 +27,20 @@ device: hpu
 ```
 LLM-on-Ray also supports serving with [Deepspeed](serve_deepspeed.md) for AutoTP and [IPEX-LLM](serve_ipex-llm.md) for INT4/FP4/INT8/FP8 to reduce latency. You can follow the corresponding documents to enable them.
 
+### Autoscaling
+LLM-on-Ray can automatically scale the number of serving replicas up and down based on the resources of the Ray cluster and the request traffic. You can adjust the autoscaling strategy through the following parameters in the configuration file. See [guides-autoscaling-config-parameters](https://docs.ray.io/en/master/serve/advanced-guides/advanced-autoscaling.html#autoscaling-config-parameters) for a more detailed explanation of these parameters.
+
+```
+max_ongoing_requests: 64
+autoscaling_config:
+  min_replicas: 1
+  initial_replicas: 1
+  max_replicas: 2
+  target_ongoing_requests: 24
+  downscale_delay_s: 30
+  upscale_delay_s: 10
+```
+
 ## Serving
 We support two methods to specify the models to be served, and they have the following priorities.
 1. Use inference configuration file if config_file is set.
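The autoscaling parameters added to serve.md above can be read as a simple calculation. Below is a rough sketch of the scaling decision, simplified from Ray Serve's documented behavior (the real autoscaler also applies the smoothing/upscaling/downscaling factors and the delay windows); the numbers mirror the example config:

```python
import math


def desired_replicas(total_ongoing_requests: int,
                     target_ongoing_requests: float = 24,
                     min_replicas: int = 1,
                     max_replicas: int = 2) -> int:
    # Scale toward enough replicas that each handles ~target_ongoing_requests,
    # clamped to the configured [min_replicas, max_replicas] range.
    desired = math.ceil(total_ongoing_requests / target_ongoing_requests)
    return min(max(desired, min_replicas), max_replicas)


print(desired_replicas(10))  # 1  -> stays at min_replicas
print(desired_replicas(40))  # ceil(40 / 24) = 2 -> scales up after upscale_delay_s
print(desired_replicas(80))  # would want 4, but is capped at max_replicas = 2
```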
25 changes: 21 additions & 4 deletions llm_on_ray/inference/api_server_openai.py
@@ -38,24 +38,41 @@
 from llm_on_ray.inference.api_openai_backend.router_app import Router, router_app
 
 
-def router_application(deployments, max_concurrent_queries):
+def router_application(deployments, model_list, max_ongoing_requests):
     """Create a Router Deployment.
     Router Deployment will point to a Serve Deployment for each specified base model,
     and have a client to query each one.
     """
     merged_client = RouterQueryClient(deployments)
 
+    # get the value of max_ongoing_requests based on configuration of all models
+    total_num_replica = 0
+    max_num_concurrent_query = 0
+    for _, infer_conf in model_list.items():
+        if infer_conf.autoscaling_config:
+            config_num_replicas = infer_conf.autoscaling_config.max_replicas
+        else:
+            config_num_replicas = infer_conf.num_replicas if infer_conf.num_replicas else 1
+        total_num_replica += config_num_replicas
+        max_num_concurrent_query = max(
+            max_num_concurrent_query,
+            infer_conf.max_ongoing_requests if infer_conf.max_ongoing_requests else 100,
+        )
+
     RouterDeployment = serve.deployment(
         route_prefix="/",
-        max_concurrent_queries=max_concurrent_queries,  # Maximum backlog for a single replica
+        max_ongoing_requests=total_num_replica
+        * (
+            (max_ongoing_requests if max_ongoing_requests else max_num_concurrent_query) + 1
+        ),  # Maximum backlog for a single replica
     )(serve.ingress(router_app)(Router))
 
     return RouterDeployment.bind(merged_client)
 
 
-def openai_serve_run(deployments, host, route_prefix, port, max_concurrent_queries):
-    router_app = router_application(deployments, max_concurrent_queries)
+def openai_serve_run(deployments, model_list, host, route_prefix, port, max_ongoing_requests):
+    router_app = router_application(deployments, model_list, max_ongoing_requests)
 
     serve.start(http_options={"host": host, "port": port})
     serve.run(
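The router's capacity computation above boils down to: sum the maximum replica count of every model (max_replicas when autoscaling, otherwise num_replicas, defaulting to 1), then multiply by the per-replica request cap plus one (the CLI value if given, otherwise the largest per-model max_ongoing_requests, defaulting to 100). A standalone sketch with plain dicts (the repo passes pydantic InferenceConfig objects instead):

```python
def router_max_ongoing_requests(model_list: dict, cli_max_ongoing_requests=None) -> int:
    total_num_replica = 0
    max_num_concurrent_query = 0
    for infer_conf in model_list.values():
        if infer_conf.get("autoscaling_config"):
            replicas = infer_conf["autoscaling_config"]["max_replicas"]
        else:
            replicas = infer_conf.get("num_replicas") or 1
        total_num_replica += replicas
        max_num_concurrent_query = max(
            max_num_concurrent_query, infer_conf.get("max_ongoing_requests") or 100
        )
    per_replica = cli_max_ongoing_requests or max_num_concurrent_query
    return total_num_replica * (per_replica + 1)


# One autoscaling model capped at 2 replicas and 64 requests each: 2 * (64 + 1) = 130.
print(router_max_ongoing_requests(
    {"llama-2-7b-chat-hf": {"autoscaling_config": {"max_replicas": 2}, "max_ongoing_requests": 64}}
))
```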
20 changes: 19 additions & 1 deletion llm_on_ray/inference/inference_config.py
@@ -56,6 +56,7 @@ def _check_precision(cls, v: str):
 
 class Vllm(BaseModel):
     enabled: bool = False
+    max_num_seqs: int = 256
     precision: str = "bf16"
     enforce_eager: bool = False
 
@@ -137,12 +138,29 @@ def _check_perftype(cls, v: str):
         return v
 
 
+class AutoscalingConfig(BaseModel):
+    min_replicas: int = 1
+    initial_replicas: int = 1
+    max_replicas: int = 1
+    target_ongoing_requests: float = 1.0
+    metrics_interval_s: float = 10.0
+    look_back_period_s: float = 30.0
+    smoothing_factor: float = 1.0
+    upscaling_factor: Union[float, None] = None
+    downscaling_factor: Union[float, None] = None
+    downscale_delay_s: float = 600.0
+    upscale_delay_s: float = 30.0
+
+
 class InferenceConfig(BaseModel):
     host: str = "0.0.0.0"
     port: int = 8000
     name: str = "default"
     route_prefix: Union[str, None] = None
-    num_replicas: int = 1
+    dynamic_max_batch_size: int = 8
+    num_replicas: Union[int, None] = None
+    max_ongoing_requests: int = 100
+    autoscaling_config: Union[AutoscalingConfig, None] = None
     cpus_per_worker: int = 24
     gpus_per_worker: int = 0
     hpus_per_worker: int = 0
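As a quick check of the new config model, the sketch below parses the autoscaling block from the documentation example into `AutoscalingConfig`. It assumes `llm_on_ray` and `pyyaml` are installed and that unspecified fields fall back to the defaults shown in the diff:

```python
import yaml

from llm_on_ray.inference.inference_config import AutoscalingConfig

yaml_snippet = """
autoscaling_config:
  min_replicas: 1
  initial_replicas: 1
  max_replicas: 2
  target_ongoing_requests: 24
  downscale_delay_s: 30
  upscale_delay_s: 10
"""

conf = yaml.safe_load(yaml_snippet)["autoscaling_config"]
autoscaling = AutoscalingConfig(**conf)
# This is the dict that serve.py hands to PredictorDeployment.options(autoscaling_config=...).
print(autoscaling.dict())
```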
1 change: 1 addition & 0 deletions llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml
@@ -2,6 +2,7 @@ port: 8000
 name: llama-2-7b-chat-hf
 route_prefix: /llama-2-7b-chat-hf
 num_replicas: 1
+dynamic_max_batch_size: 8
 cpus_per_worker: 24
 gpus_per_worker: 0
 deepspeed: false
@@ -0,0 +1,28 @@
+port: 8000
+name: llama-2-7b-chat-hf
+route_prefix: /llama-2-7b-chat-hf
+max_ongoing_requests: 64
+autoscaling_config:
+  min_replicas: 1
+  initial_replicas: 1
+  max_replicas: 2
+  target_ongoing_requests: 24
+  downscale_delay_s: 30
+  upscale_delay_s: 10
+cpus_per_worker: 24
+gpus_per_worker: 0
+deepspeed: false
+vllm:
+  enabled: true
+  max_num_seqs: 64
+  precision: bf16
+workers_per_group: 2
+device: cpu
+ipex:
+  enabled: false
+  precision: bf16
+model_description:
+  model_id_or_path: meta-llama/Llama-2-7b-chat-hf
+  tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf
+  config:
+    use_auth_token: ''
@@ -7,6 +7,7 @@ gpus_per_worker: 0
 deepspeed: false
 vllm:
   enabled: true
+  max_num_seqs: 256
   precision: bf16
 workers_per_group: 2
 device: cpu
39 changes: 25 additions & 14 deletions llm_on_ray/inference/serve.py
@@ -51,11 +51,24 @@ def get_deployed_models(args):
     deployments = {}
     for model_id, infer_conf in model_list.items():
         ray_actor_options = get_deployment_actor_options(infer_conf)
-        deployments[model_id] = PredictorDeployment.options(
-            num_replicas=infer_conf.num_replicas,
-            ray_actor_options=ray_actor_options,
-            max_concurrent_queries=args.max_concurrent_queries,
-        ).bind(infer_conf, args.vllm_max_num_seqs, args.max_batch_size)
+        depolyment_config = {
+            "ray_actor_options": ray_actor_options,
+            "max_ongoing_requests": infer_conf.max_ongoing_requests
+            if not args.max_ongoing_requests
+            else args.max_ongoing_requests,
+        }
+        if infer_conf.autoscaling_config:
+            depolyment_config["autoscaling_config"] = infer_conf.autoscaling_config.dict()
+        elif infer_conf.num_replicas:
+            depolyment_config["num_replicas"] = infer_conf.num_replicas
+        max_num_seqs = infer_conf.vllm.max_num_seqs if not args.max_num_seqs else args.max_num_seqs
+        dynamic_max_batch_size = (
+            infer_conf.dynamic_max_batch_size if not args.max_batch_size else args.max_batch_size
+        )
+        deployments[model_id] = PredictorDeployment.options(**depolyment_config).bind(
+            infer_conf, max_num_seqs, dynamic_max_batch_size
+        )
 
     return deployments, model_list
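A condensed sketch of the per-model deployment options the loop above builds (plain dicts here; the repo uses InferenceConfig objects and also passes `ray_actor_options`, omitted for brevity). A CLI `--max_ongoing_requests` value overrides the per-model setting, and autoscaling takes precedence over a fixed replica count:

```python
def build_deployment_options(infer_conf: dict, cli_max_ongoing_requests=None) -> dict:
    options = {
        "max_ongoing_requests": cli_max_ongoing_requests
        or infer_conf.get("max_ongoing_requests", 100),
    }
    if infer_conf.get("autoscaling_config"):
        # Let the Ray Serve autoscaler manage the replica count.
        options["autoscaling_config"] = infer_conf["autoscaling_config"]
    elif infer_conf.get("num_replicas"):
        # Fall back to a fixed number of replicas, as before this change.
        options["num_replicas"] = infer_conf["num_replicas"]
    return options


print(build_deployment_options(
    {"max_ongoing_requests": 64, "autoscaling_config": {"min_replicas": 1, "max_replicas": 2}}
))
print(build_deployment_options({"num_replicas": 1}))
```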


@@ -88,28 +101,26 @@ def main(argv=None):
         help="Whether to keep serve terminal.",
     )
     parser.add_argument(
-        "--max_concurrent_queries",
-        default=100,
+        "--max_ongoing_requests",
+        default=None,
         type=int,
-        help="The max concurrent requests ray serve can process.",
+        help="The max concurrent requests ray serve can process for all models.",
    )
     parser.add_argument(
         "--serve_local_only",
         action="store_true",
         help="Only support local access to url.",
     )
     parser.add_argument("--port", default=8000, type=int, help="The port of deployment address.")
 
-    # TODO: vllm_max_num_seqs and max_batch_size should be moved to InferenceConfig
     parser.add_argument(
-        "--vllm_max_num_seqs",
-        default=256,
+        "--max_num_seqs",
+        default=None,
         type=int,
         help="The batch size for vLLM. Used when vLLM is enabled.",
     )
-
     parser.add_argument(
-        "--max_batch_size", default=8, type=int, help="The max batch size for dynamic batching."
+        "--max_batch_size", default=None, type=int, help="The max batch size for dynamic batching."
     )
 
     # Print help if no arguments were provided
@@ -132,7 +143,7 @@ def main(argv=None):
     host = "127.0.0.1" if args.serve_local_only else "0.0.0.0"
     print("Service is running with deployments:" + str(deployments))
     print("Service is running models:" + str(model_list))
-    openai_serve_run(deployments, host, "/", args.port, args.max_concurrent_queries)
+    openai_serve_run(deployments, model_list, host, "/", args.port, args.max_ongoing_requests)
 
     msg = "Service is deployed successfully."
     if args.keep_serve_terminal:
