Add a vLLM integration test for an AWQ-quantized llama2-13b model: a new
workflow step in rolling_batch_integration.yml and the matching
"llama2-13b-awq" model entry in tests/integration/llm/prepare.py.

NOTE(review): this patch was recovered from a copy whose line breaks had been
stripped; the leading indentation inside the hunks is reconstructed from the
visible nesting -- confirm against the target files before applying.
NOTE(review): the new step prepares "llama2-13b-awq" but invokes the client
with "llama2-13b"; presumably it reuses that client spec -- verify in
llm/client.py.

diff --git a/.github/workflows/rolling_batch_integration.yml b/.github/workflows/rolling_batch_integration.yml
index 92f034488e2..c5f32f5d471 100644
--- a/.github/workflows/rolling_batch_integration.yml
+++ b/.github/workflows/rolling_batch_integration.yml
@@ -327,6 +327,15 @@ jobs:
           serve -m test=file:/opt/ml/model/test/
           python3 llm/client.py vllm llama2-13b
           docker rm -f $(docker ps -aq)
+      - name: Test llama2-13b awq
+        working-directory: tests/integration
+        run: |
+          rm -rf models
+          python3 llm/prepare.py vllm llama2-13b-awq
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
+          serve -m test=file:/opt/ml/model/test/
+          python3 llm/client.py vllm llama2-13b
+          docker rm -f $(docker ps -aq)
       - name: Test gpt-neox-20b
         working-directory: tests/integration
         run: |
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index de618cca360..f814fb80dd8 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -512,6 +512,11 @@
         "option.dtype": "fp16",
         "option.tensor_parallel_degree": 4
     },
+    "llama2-13b-awq": {
+        "option.model_id": "TheBloke/Llama-2-13B-chat-AWQ",
+        "option.quantize": "awq",
+        "option.tensor_parallel_degree": 4
+    },
     "gpt-neox-20b": {
         "option.model_id": "s3://djl-llm/gpt-neox-20b",
         "option.task": "text-generation",