[batch infer] Update batch inference template to use RayLLMBatch (#346)

Update the current batch LLM inference template to use RayLLM-Batch.

Co-authored-by: rickyx <[email protected]>
Co-authored-by: Huaiwei Sun <[email protected]>
anyscale · Oct 22, 2024 · 7aec451 · 7aec451
1 parent ade2205
commit 7aec451
Show file tree

Hide file tree

Showing 10 changed files with 314 additions and 959 deletions.
diff --git a/configs/batch-llm/aws.yaml b/configs/batch-llm/aws.yaml
@@ -1,84 +1,4 @@
 head_node_type:
-  name: head-node
-  instance_type: m5.2xlarge
-  resources:
-    cpu: 0
-worker_node_types:
-  - name: worker-g5-xlarge-nvidia-a10-1
-    instance_type: g5.xlarge
-    resources:
-      custom_resources:
-        "accelerator_type:A10G": 1
-    min_workers: 0
-    max_workers: 4
-    use_spot: true
-    fallback_to_ondemand: true
-  - name: worker-g5-2xlarge-nvidia-a10-1
-    instance_type: g5.2xlarge
-    resources:
-      custom_resources:
-        "accelerator_type:A10G": 1
-    min_workers: 0
-    max_workers: 4
-    use_spot: true
-    fallback_to_ondemand: true
-  - name: worker-g5-4xlarge-nvidia-a10-1
-    instance_type: g5.4xlarge
-    resources:
-      custom_resources:
-        "accelerator_type:A10G": 1
-    min_workers: 0
-    max_workers: 4
-    use_spot: true
-    fallback_to_ondemand: true
-  - name: worker-g5-8xlarge-nvidia-a10-1
-    instance_type: g5.8xlarge
-    resources:
-      custom_resources:
-        "accelerator_type:A10G": 1
-    min_workers: 0
-    max_workers: 4
-    use_spot: true
-    fallback_to_ondemand: true
-  - name: worker-g5-12xlarge-nvidia-a10-4
-    instance_type: g5.12xlarge
-    resources:
-      custom_resources:
-        "accelerator_type:A10G": 1
-    min_workers: 0
-    max_workers: 1
-    use_spot: true
-    fallback_to_ondemand: true
-  - name: worker-g5-16xlarge-nvidia-a10-1
-    instance_type: g5.16xlarge
-    resources:
-      custom_resources:
-        "accelerator_type:A10G": 1
-    min_workers: 0
-    max_workers: 4
-    use_spot: true
-    fallback_to_ondemand: true
-  - name: worker-g5-24xlarge-nvidia-a10-4
-    instance_type: g5.24xlarge
-    resources:
-      custom_resources:
-        "accelerator_type:A10G": 1
-    min_workers: 0
-    max_workers: 1
-    use_spot: true
-    fallback_to_ondemand: true
-  - name: worker-g5-48xlarge-nvidia-a10-8
-    instance_type: g5.48xlarge
-    resources:
-      custom_resources:
-        "accelerator_type:A10G": 1
-    min_workers: 0
-    max_workers: 1
-    use_spot: true
-    fallback_to_ondemand: true
-aws:
-  TagSpecifications:
-    - ResourceType: instance
-      Tags:
-        - Key: as-feature-multi-zone
-          Value: "true"
+  name: head
+  # TODO(ricky): The head node needs CUDA for now because rayllm_batch is imported eagerly.
+  instance_type: g5.xlarge
diff --git a/configs/batch-llm/gce.yaml b/configs/batch-llm/gce.yaml
@@ -0,0 +1,4 @@
+head_node_type:
+  name: head
+  # TODO(ricky): The head node needs CUDA for now because rayllm_batch is imported eagerly.
+  instance_type: g2-standard-4-nvidia-l4-1
diff --git a/configs/batch-llm/gcp.yaml b/configs/batch-llm/gcp.yaml