Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CI/CD] add neuron docker and ci test scripts #3571

Merged
merged 7 commits into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions .buildkite/run-neuron-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
set -e

# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
docker build -t neuron -f Dockerfile.neuron .

# Setup cleanup
remove_docker_container() { docker rm -f neuron || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &

# Wait for the server to start
wait_for_server_to_start() {
timeout=300
counter=0

while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
echo "Timeout after $timeout seconds"
break
fi
done
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'
5 changes: 5 additions & 0 deletions .buildkite/test-template.j2
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ steps:
queue: amd
command: bash .buildkite/run-amd-test.sh

- label: "Neuron Test"
agents:
queue: neuron
command: bash .buildkite/run-neuron-test.sh

- label: "CPU Test"
command: bash .buildkite/run-cpu-test.sh

Expand Down
36 changes: 36 additions & 0 deletions Dockerfile.neuron
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# default base image
ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"

FROM $BASE_IMAGE

RUN echo "Base image is $BASE_IMAGE"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y

### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U

COPY ./vllm /app/vllm/vllm
COPY ./setup.py /app/vllm/setup.py
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt

RUN cd /app/vllm \
&& python3 -m pip install -U -r requirements-neuron.txt

ENV VLLM_BUILD_WITH_NEURON 1
RUN cd /app/vllm \
&& pip install -e . \
&& cd ..

CMD ["/bin/bash"]
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,8 @@ def _is_neuron() -> bool:
subprocess.run(["neuron-ls"], capture_output=True, check=True)
except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
torch_neuronx_installed = False
return torch_neuronx_installed
return torch_neuronx_installed or os.environ.get("VLLM_BUILD_WITH_NEURON",
False)


def _is_cpu() -> bool:
Expand Down
4 changes: 2 additions & 2 deletions vllm/engine/async_llm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,8 +333,8 @@ def from_engine_args(
engine_config = engine_args.create_engine_config()

if engine_config.device_config.device_type == "neuron":
raise NotImplementedError("Neuron is not supported for "
"async engine yet.")
from vllm.executor.neuron_executor import NeuronExecutorAsync
executor_class = NeuronExecutorAsync
elif engine_config.parallel_config.worker_use_ray:
initialize_ray_cluster(engine_config.parallel_config)
from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
Expand Down
22 changes: 21 additions & 1 deletion vllm/executor/neuron_executor.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from typing import Dict, List, Set, Tuple

from vllm.executor.executor_base import ExecutorBase
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
from vllm.utils import make_async

logger = init_logger(__name__)

Expand Down Expand Up @@ -73,3 +74,22 @@ def check_health(self) -> None:
# NeuronExecutor will always be healthy as long as
# it's running.
return


class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):

async def execute_model_async(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
blocks_to_swap_in: Dict[int, int],
blocks_to_swap_out: Dict[int, int],
blocks_to_copy: Dict[int, List[int]],
) -> SamplerOutput:
output = await make_async(self.driver_worker.execute_model)(
seq_group_metadata_list=seq_group_metadata_list, )
return output

async def check_health_async(self) -> None:
# NeuronExecutor will always be healthy as long as
# it's running.
return
Loading