[Inference] Add autoscaling config (#214)
* add autoscaling config

* support openai autoscaling

* remove

* update config file

* address comment

* update docs

* update docs

* edit max_ongoing_requests of router actor

* edit max_ongoing_requests of router actor
KepingYan authored Jun 4, 2024
1 parent 7bd29a1 commit fb8542d
Showing 8 changed files with 122 additions and 24 deletions.
15 changes: 11 additions & 4 deletions benchmarks/benchmark_serving.py
@@ -308,11 +308,18 @@ async def send_request(
             if args.track_token_latency:
                 generate_len = len(tokenizer.encode(response_text))
             else:
-                response_content = json.loads(response_text)
-                if isinstance(response_content, list):
-                    generate_len = response_content[0]["generate_length"]
+                if vllm_engine:
+                    length_name = "num_generated_tokens"
                 else:
-                    generate_len = response_content["generate_length"]
+                    length_name = "generate_length"
+                try:
+                    response_content = json.loads(response_text)
+                    if isinstance(response_content, list):
+                        generate_len = response_content[0][length_name]
+                    else:
+                        generate_len = response_content[length_name]
+                except Exception:
+                    generate_len = None
         else:
             if args.track_token_latency:
                 response_content = chunks[-2].decode("utf-8")
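To illustrate the new fallback in isolation, here is a minimal, self-contained sketch (not the benchmark script itself) of how the length field is now resolved; the field names `num_generated_tokens` and `generate_length` come from the diff above, while the sample payloads are made up:

```python
import json


def extract_generate_len(response_text: str, vllm_engine: bool):
    # vLLM responses report "num_generated_tokens"; the default engine reports "generate_length".
    length_name = "num_generated_tokens" if vllm_engine else "generate_length"
    try:
        content = json.loads(response_text)
        if isinstance(content, list):
            return content[0][length_name]
        return content[length_name]
    except Exception:
        # Non-JSON or unexpected payloads no longer abort the benchmark run.
        return None


print(extract_generate_len('{"num_generated_tokens": 128}', vllm_engine=True))  # 128
print(extract_generate_len('[{"generate_length": 56}]', vllm_engine=False))     # 56
print(extract_generate_len("plain text response", vllm_engine=False))           # None
```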
17 changes: 16 additions & 1 deletion docs/serve.md
@@ -7,8 +7,9 @@ Please follow [setup.md](setup.md) to setup the environment first.
 
 
 ## Configure Serving Parameters
-We provide preconfigured yaml files in [inference/models](../llm_on_ray/inference/models) for popular open source models. You can customize a few configurations such as the resource used for serving.
+We provide preconfigured yaml files in [inference/models](../llm_on_ray/inference/models) for popular open source models. You can customize a few configurations for serving.
 
+### Resource
 To deploy on CPU, please make sure `device` is set to CPU and `cpus_per_worker` is set to a correct number.
 ```
 cpus_per_worker: 24
@@ -26,6 +27,20 @@ device: hpu
 ```
 LLM-on-Ray also supports serving with [Deepspeed](serve_deepspeed.md) for AutoTP and [IPEX-LLM](serve_ipex-llm.md) for INT4/FP4/INT8/FP8 to reduce latency. You can follow the corresponding documents to enable them.
 
+### Autoscaling
+LLM-on-Ray can automatically scale the number of serving replicas up and down based on the resources of the Ray cluster and the request traffic. You can adjust the autoscaling strategy through the following parameters in the configuration file. See [guides-autoscaling-config-parameters](https://docs.ray.io/en/master/serve/advanced-guides/advanced-autoscaling.html#autoscaling-config-parameters) for a more detailed explanation of these parameters.
+
+```
+max_ongoing_requests: 64
+autoscaling_config:
+  min_replicas: 1
+  initial_replicas: 1
+  max_replicas: 2
+  target_ongoing_requests: 24
+  downscale_delay_s: 30
+  upscale_delay_s: 10
+```
+
 ## Serving
 We support two methods to specify the models to be served, and they have the following priorities.
 1. Use inference configuration file if config_file is set.
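The autoscaling parameters added to serve.md above can be read as a simple calculation. Below is a rough sketch of the scaling decision, simplified from Ray Serve's documented behavior (the real autoscaler also applies the smoothing/upscaling/downscaling factors and the delay windows); the numbers mirror the example config:

```python
import math


def desired_replicas(total_ongoing_requests: int,
                     target_ongoing_requests: float = 24,
                     min_replicas: int = 1,
                     max_replicas: int = 2) -> int:
    # Scale toward enough replicas that each handles ~target_ongoing_requests,
    # clamped to the configured [min_replicas, max_replicas] range.
    desired = math.ceil(total_ongoing_requests / target_ongoing_requests)
    return min(max(desired, min_replicas), max_replicas)


print(desired_replicas(10))  # 1  -> stays at min_replicas
print(desired_replicas(40))  # ceil(40 / 24) = 2 -> scales up after upscale_delay_s
print(desired_replicas(80))  # would want 4, but is capped at max_replicas = 2
```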
25 changes: 21 additions & 4 deletions llm_on_ray/inference/api_server_openai.py
@@ -38,24 +38,41 @@
 from llm_on_ray.inference.api_openai_backend.router_app import Router, router_app
 
 
-def router_application(deployments, max_concurrent_queries):
+def router_application(deployments, model_list, max_ongoing_requests):
     """Create a Router Deployment.
     Router Deployment will point to a Serve Deployment for each specified base model,
     and have a client to query each one.
     """
     merged_client = RouterQueryClient(deployments)
 
+    # get the value of max_ongoing_requests based on configuration of all models
+    total_num_replica = 0
+    max_num_concurrent_query = 0
+    for _, infer_conf in model_list.items():
+        if infer_conf.autoscaling_config:
+            config_num_replicas = infer_conf.autoscaling_config.max_replicas
+        else:
+            config_num_replicas = infer_conf.num_replicas if infer_conf.num_replicas else 1
+        total_num_replica += config_num_replicas
+        max_num_concurrent_query = max(
+            max_num_concurrent_query,
+            infer_conf.max_ongoing_requests if infer_conf.max_ongoing_requests else 100,
+        )
+
     RouterDeployment = serve.deployment(
         route_prefix="/",
-        max_concurrent_queries=max_concurrent_queries,  # Maximum backlog for a single replica
+        max_ongoing_requests=total_num_replica
+        * (
+            (max_ongoing_requests if max_ongoing_requests else max_num_concurrent_query) + 1
+        ),  # Maximum backlog for a single replica
     )(serve.ingress(router_app)(Router))
 
     return RouterDeployment.bind(merged_client)
 
 
-def openai_serve_run(deployments, host, route_prefix, port, max_concurrent_queries):
-    router_app = router_application(deployments, max_concurrent_queries)
+def openai_serve_run(deployments, model_list, host, route_prefix, port, max_ongoing_requests):
+    router_app = router_application(deployments, model_list, max_ongoing_requests)
 
     serve.start(http_options={"host": host, "port": port})
     serve.run(
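The router's capacity computation above boils down to: sum the maximum replica count of every model (max_replicas when autoscaling, otherwise num_replicas, defaulting to 1), then multiply by the per-replica request cap plus one (the CLI value if given, otherwise the largest per-model max_ongoing_requests, defaulting to 100). A standalone sketch with plain dicts (the repo passes pydantic InferenceConfig objects instead):

```python
def router_max_ongoing_requests(model_list: dict, cli_max_ongoing_requests=None) -> int:
    total_num_replica = 0
    max_num_concurrent_query = 0
    for infer_conf in model_list.values():
        if infer_conf.get("autoscaling_config"):
            replicas = infer_conf["autoscaling_config"]["max_replicas"]
        else:
            replicas = infer_conf.get("num_replicas") or 1
        total_num_replica += replicas
        max_num_concurrent_query = max(
            max_num_concurrent_query, infer_conf.get("max_ongoing_requests") or 100
        )
    per_replica = cli_max_ongoing_requests or max_num_concurrent_query
    return total_num_replica * (per_replica + 1)


# One autoscaling model capped at 2 replicas and 64 requests each: 2 * (64 + 1) = 130.
print(router_max_ongoing_requests(
    {"llama-2-7b-chat-hf": {"autoscaling_config": {"max_replicas": 2}, "max_ongoing_requests": 64}}
))
```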
20 changes: 19 additions & 1 deletion llm_on_ray/inference/inference_config.py
@@ -56,6 +56,7 @@ def _check_precision(cls, v: str):
 
 class Vllm(BaseModel):
     enabled: bool = False
+    max_num_seqs: int = 256
     precision: str = "bf16"
     enforce_eager: bool = False
 
@@ -137,12 +138,29 @@ def _check_perftype(cls, v: str):
         return v
 
 
+class AutoscalingConfig(BaseModel):
+    min_replicas: int = 1
+    initial_replicas: int = 1
+    max_replicas: int = 1
+    target_ongoing_requests: float = 1.0
+    metrics_interval_s: float = 10.0
+    look_back_period_s: float = 30.0
+    smoothing_factor: float = 1.0
+    upscaling_factor: Union[float, None] = None
+    downscaling_factor: Union[float, None] = None
+    downscale_delay_s: float = 600.0
+    upscale_delay_s: float = 30.0
+
+
 class InferenceConfig(BaseModel):
     host: str = "0.0.0.0"
     port: int = 8000
     name: str = "default"
     route_prefix: Union[str, None] = None
-    num_replicas: int = 1
+    dynamic_max_batch_size: int = 8
+    num_replicas: Union[int, None] = None
+    max_ongoing_requests: int = 100
+    autoscaling_config: Union[AutoscalingConfig, None] = None
     cpus_per_worker: int = 24
     gpus_per_worker: int = 0
     hpus_per_worker: int = 0
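As a quick check of the new config model, the sketch below parses the autoscaling block from the documentation example into `AutoscalingConfig`. It assumes `llm_on_ray` and `pyyaml` are installed and that unspecified fields fall back to the defaults shown in the diff:

```python
import yaml

from llm_on_ray.inference.inference_config import AutoscalingConfig

yaml_snippet = """
autoscaling_config:
  min_replicas: 1
  initial_replicas: 1
  max_replicas: 2
  target_ongoing_requests: 24
  downscale_delay_s: 30
  upscale_delay_s: 10
"""

conf = yaml.safe_load(yaml_snippet)["autoscaling_config"]
autoscaling = AutoscalingConfig(**conf)
# This is the dict that serve.py hands to PredictorDeployment.options(autoscaling_config=...).
print(autoscaling.dict())
```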
1 change: 1 addition & 0 deletions llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml
@@ -2,6 +2,7 @@ port: 8000
 name: llama-2-7b-chat-hf
 route_prefix: /llama-2-7b-chat-hf
 num_replicas: 1
+dynamic_max_batch_size: 8
 cpus_per_worker: 24
 gpus_per_worker: 0
 deepspeed: false
@@ -0,0 +1,28 @@
+port: 8000
+name: llama-2-7b-chat-hf
+route_prefix: /llama-2-7b-chat-hf
+max_ongoing_requests: 64
+autoscaling_config:
+  min_replicas: 1
+  initial_replicas: 1
+  max_replicas: 2
+  target_ongoing_requests: 24
+  downscale_delay_s: 30
+  upscale_delay_s: 10
+cpus_per_worker: 24
+gpus_per_worker: 0
+deepspeed: false
+vllm:
+  enabled: true
+  max_num_seqs: 64
+  precision: bf16
+workers_per_group: 2
+device: cpu
+ipex:
+  enabled: false
+  precision: bf16
+model_description:
+  model_id_or_path: meta-llama/Llama-2-7b-chat-hf
+  tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf
+  config:
+    use_auth_token: ''
@@ -7,6 +7,7 @@ gpus_per_worker: 0
 deepspeed: false
 vllm:
   enabled: true
+  max_num_seqs: 256
   precision: bf16
 workers_per_group: 2
 device: cpu
39 changes: 25 additions & 14 deletions llm_on_ray/inference/serve.py
@@ -51,11 +51,24 @@ def get_deployed_models(args):
     deployments = {}
     for model_id, infer_conf in model_list.items():
         ray_actor_options = get_deployment_actor_options(infer_conf)
-        deployments[model_id] = PredictorDeployment.options(
-            num_replicas=infer_conf.num_replicas,
-            ray_actor_options=ray_actor_options,
-            max_concurrent_queries=args.max_concurrent_queries,
-        ).bind(infer_conf, args.vllm_max_num_seqs, args.max_batch_size)
+        depolyment_config = {
+            "ray_actor_options": ray_actor_options,
+            "max_ongoing_requests": infer_conf.max_ongoing_requests
+            if not args.max_ongoing_requests
+            else args.max_ongoing_requests,
+        }
+        if infer_conf.autoscaling_config:
+            depolyment_config["autoscaling_config"] = infer_conf.autoscaling_config.dict()
+        elif infer_conf.num_replicas:
+            depolyment_config["num_replicas"] = infer_conf.num_replicas
+        max_num_seqs = infer_conf.vllm.max_num_seqs if not args.max_num_seqs else args.max_num_seqs
+        dynamic_max_batch_size = (
+            infer_conf.dynamic_max_batch_size if not args.max_batch_size else args.max_batch_size
+        )
+        deployments[model_id] = PredictorDeployment.options(**depolyment_config).bind(
+            infer_conf, max_num_seqs, dynamic_max_batch_size
+        )
 
     return deployments, model_list
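A condensed sketch of the per-model deployment options the loop above builds (plain dicts here; the repo uses InferenceConfig objects and also passes `ray_actor_options`, omitted for brevity). A CLI `--max_ongoing_requests` value overrides the per-model setting, and autoscaling takes precedence over a fixed replica count:

```python
def build_deployment_options(infer_conf: dict, cli_max_ongoing_requests=None) -> dict:
    options = {
        "max_ongoing_requests": cli_max_ongoing_requests
        or infer_conf.get("max_ongoing_requests", 100),
    }
    if infer_conf.get("autoscaling_config"):
        # Let the Ray Serve autoscaler manage the replica count.
        options["autoscaling_config"] = infer_conf["autoscaling_config"]
    elif infer_conf.get("num_replicas"):
        # Fall back to a fixed number of replicas, as before this change.
        options["num_replicas"] = infer_conf["num_replicas"]
    return options


print(build_deployment_options(
    {"max_ongoing_requests": 64, "autoscaling_config": {"min_replicas": 1, "max_replicas": 2}}
))
print(build_deployment_options({"num_replicas": 1}))
```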


@@ -88,28 +101,26 @@ def main(argv=None):
         help="Whether to keep serve terminal.",
     )
     parser.add_argument(
-        "--max_concurrent_queries",
-        default=100,
+        "--max_ongoing_requests",
+        default=None,
         type=int,
-        help="The max concurrent requests ray serve can process.",
+        help="The max concurrent requests ray serve can process for all models.",
    )
     parser.add_argument(
         "--serve_local_only",
         action="store_true",
         help="Only support local access to url.",
     )
     parser.add_argument("--port", default=8000, type=int, help="The port of deployment address.")
 
-    # TODO: vllm_max_num_seqs and max_batch_size should be moved to InferenceConfig
     parser.add_argument(
-        "--vllm_max_num_seqs",
-        default=256,
+        "--max_num_seqs",
+        default=None,
         type=int,
         help="The batch size for vLLM. Used when vLLM is enabled.",
     )
-
     parser.add_argument(
-        "--max_batch_size", default=8, type=int, help="The max batch size for dynamic batching."
+        "--max_batch_size", default=None, type=int, help="The max batch size for dynamic batching."
     )
 
     # Print help if no arguments were provided
@@ -132,7 +143,7 @@ def main(argv=None):
     host = "127.0.0.1" if args.serve_local_only else "0.0.0.0"
     print("Service is running with deployments:" + str(deployments))
     print("Service is running models:" + str(model_list))
-    openai_serve_run(deployments, host, "/", args.port, args.max_concurrent_queries)
+    openai_serve_run(deployments, model_list, host, "/", args.port, args.max_ongoing_requests)
 
     msg = "Service is deployed successfully."
     if args.keep_serve_terminal:
