# File: service-with-auth.yaml
# service.yaml
# The newly-added `service` section to the `serve-openai-api.yaml` file.
#
# This task starts a vLLM OpenAI-compatible API server on port 8000 and
# protects it with an API key taken from the AUTH_TOKEN environment variable.
# The readiness probe sends the same token in its Authorization header.
service:
  # Path of the endpoint used to check the readiness of the service.
  readiness_probe:
    path: /v1/models
    # Set authorization headers here if needed.
    # NOTE(review): presumably $AUTH_TOKEN is substituted from `envs` below
    # by the consuming tool before the probe is sent -- confirm.
    headers:
      Authorization: Bearer $AUTH_TOKEN
  # How many replicas to manage.
  replicas: 1

# Fields below are the same as in `serve-openai-api.yaml`.
envs:
  MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
  # TODO: Fill with your own huggingface token, or use --env to pass.
  HF_TOKEN:
  # TODO: Fill with your own auth token (a random string), or use --env to
  # pass.
  AUTH_TOKEN:

resources:
  # NOTE(review): the entries are intentionally written without a space after
  # the colon (`L4:1`, not `L4: 1`); adding a space would change how this
  # value parses. Presumably this is the "any one of these accelerators"
  # form -- confirm against the consuming tool's YAML spec before editing.
  accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1}
  # Port the API server in `run` listens on (see `--port 8000`).
  ports: 8000

setup: |
  # Reuse the `vllm` conda environment if it exists; otherwise create it.
  conda activate vllm
  if [ $? -ne 0 ]; then
    conda create -n vllm python=3.10 -y
    conda activate vllm
  fi

  pip install transformers==4.38.0
  pip install vllm==0.3.2

  # Log in to the Hugging Face Hub with the token supplied via HF_TOKEN.
  python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"

run: |
  conda activate vllm
  echo 'Starting vllm openai api server...'
  # Variables are double-quoted so that values containing spaces or glob
  # characters are passed to the server as single arguments.
  python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_NAME" --tokenizer hf-internal-testing/llama-tokenizer \
    --host 0.0.0.0 --port 8000 --api-key "$AUTH_TOKEN"