From cfba4cd94d5ef1bc54c50dfc543c6e00d6e4e17b Mon Sep 17 00:00:00 2001 From: simon-mo Date: Thu, 15 Feb 2024 21:53:52 +0000 Subject: [PATCH 01/28] CI: Add ROCm Docker Build --- .github/workflows/scripts/rocm.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/scripts/rocm.yml diff --git a/.github/workflows/scripts/rocm.yml b/.github/workflows/scripts/rocm.yml new file mode 100644 index 0000000000000..97730e7c34a2e --- /dev/null +++ b/.github/workflows/scripts/rocm.yml @@ -0,0 +1,23 @@ +name: AMD ROCm Build + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + ruff: + runs-on: self-hosted + steps: + - uses: actions/checkout@v2 + - name: Check environment + run: | + docker --version + - name: Build Docker + run: | + docker build -t rocm -f Dockerfile.rocm . From d237ff6cb3c20474b21329f7d7941fbfcce513a2 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Thu, 15 Feb 2024 22:37:50 +0000 Subject: [PATCH 02/28] fix name --- .github/workflows/scripts/rocm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scripts/rocm.yml b/.github/workflows/scripts/rocm.yml index 97730e7c34a2e..10991c71f7805 100644 --- a/.github/workflows/scripts/rocm.yml +++ b/.github/workflows/scripts/rocm.yml @@ -11,7 +11,7 @@ on: - main jobs: - ruff: + rocm: runs-on: self-hosted steps: - uses: actions/checkout@v2 From 03aba6b25e70d51432b475790a4a3208b5d55cbf Mon Sep 17 00:00:00 2001 From: simon-mo Date: Thu, 15 Feb 2024 22:39:34 +0000 Subject: [PATCH 03/28] move --- .github/workflows/{scripts => }/rocm.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{scripts => }/rocm.yml (100%) diff --git a/.github/workflows/scripts/rocm.yml b/.github/workflows/rocm.yml similarity index 100% rename from .github/workflows/scripts/rocm.yml rename to .github/workflows/rocm.yml From 
e226c598f9aa33b6d7974aeb900c6cf47a659dd8 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Mon, 19 Feb 2024 18:35:45 -0800 Subject: [PATCH 04/28] Add sanity test --- .github/workflows/rocm.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index 10991c71f7805..86b6412ad4fc6 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -21,3 +21,9 @@ jobs: - name: Build Docker run: | docker build -t rocm -f Dockerfile.rocm . + - name: Run Sanity Test + run: | + docker run --detached --p 8000:8000 rocm + while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do sleep 1; done + python examples/openai_completion_client.py + From 51d2d75f541b8e59bd9898a05ba15fd0d5c25e81 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Mon, 19 Feb 2024 22:02:40 -0800 Subject: [PATCH 05/28] Add sanity test --- .github/workflows/rocm.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index 86b6412ad4fc6..725aa9f14757e 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -23,7 +23,15 @@ jobs: docker build -t rocm -f Dockerfile.rocm . 
- name: Run Sanity Test run: | - docker run --detached --p 8000:8000 rocm + # delete any existing container + docker rm -f $(docker ps -a -q) + # run the container + docker run -d --p 8000:8000 rocm + # wait for the server to start while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do sleep 1; done + # run the sanity test python examples/openai_completion_client.py + # remove the container + docker rm -f $(docker ps -a -q) + From 5a70ad78131d6822e6c44283f918fa6a408b4529 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Tue, 20 Feb 2024 10:33:44 -0800 Subject: [PATCH 06/28] remove only the rocm container --- .github/workflows/rocm.yml | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index 725aa9f14757e..46461173abf36 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -23,15 +23,14 @@ jobs: docker build -t rocm -f Dockerfile.rocm . - name: Run Sanity Test run: | - # delete any existing container - docker rm -f $(docker ps -a -q) - # run the container - docker run -d --p 8000:8000 rocm - # wait for the server to start + remove_docker_container() { + docker rm -f rocm || true + } + trap remove_docker_container EXIT + + remove_docker_container + docker run -d --p 8000:8000 -n rocm rocm while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do sleep 1; done - # run the sanity test python examples/openai_completion_client.py - # remove the container - docker rm -f $(docker ps -a -q) From cd910dc933a568f4904f23fbbfe2435c4dfa2704 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 8 Mar 2024 13:56:32 -0800 Subject: [PATCH 07/28] migrate to buildkite --- .buildkite/test-template.j2 | 15 +++++++++++++++ .github/workflows/rocm.yml | 36 ------------------------------------ 2 files changed, 15 insertions(+), 36 deletions(-) delete mode 100644 .github/workflows/rocm.yml diff --git a/.buildkite/test-template.j2 
b/.buildkite/test-template.j2 index 7c1cf2b5a9b39..0b7aee29f327c 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -3,6 +3,19 @@ {% set default_working_dir = "/vllm-workspace/tests" %} steps: + - label: "AMD Test" + agents: + queue: amd + commands: + - docker build -t rocm -f Dockerfile.rocm . + - remove_docker_container() { docker rm -f rocm || true; }; trap remove_docker_container EXIT + - remove_docker_container + - docker run -d --p 8000:8000 -n rocm rocm + - while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do sleep 1; done + - python examples/openai_completion_client.py + +{# Ignoring CUDA build for now + - label: ":docker: build image" commands: - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ." @@ -54,3 +67,5 @@ steps: - mountPath: /dev/shm name: dshm {% endfor %} + +#} \ No newline at end of file diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml deleted file mode 100644 index 46461173abf36..0000000000000 --- a/.github/workflows/rocm.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: AMD ROCm Build - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - pull_request: - branches: - - main - -jobs: - rocm: - runs-on: self-hosted - steps: - - uses: actions/checkout@v2 - - name: Check environment - run: | - docker --version - - name: Build Docker - run: | - docker build -t rocm -f Dockerfile.rocm . 
- - name: Run Sanity Test - run: | - remove_docker_container() { - docker rm -f rocm || true - } - trap remove_docker_container EXIT - - remove_docker_container - docker run -d --p 8000:8000 -n rocm rocm - while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do sleep 1; done - python examples/openai_completion_client.py - - From 1c872cbf4e0b75f5dc48ee6eacaabfc46c89c88f Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 14:31:05 -0700 Subject: [PATCH 08/28] change to shell script --- .buildkite/test-template.j2 | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 0b7aee29f327c..b8ffa1a3f5b9c 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -6,13 +6,8 @@ steps: - label: "AMD Test" agents: queue: amd - commands: - - docker build -t rocm -f Dockerfile.rocm . - - remove_docker_container() { docker rm -f rocm || true; }; trap remove_docker_container EXIT - - remove_docker_container - - docker run -d --p 8000:8000 -n rocm rocm - - while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do sleep 1; done - - python examples/openai_completion_client.py + command: bash ./buildkite/run-amd-test.sh + {# Ignoring CUDA build for now From e1d886f2f856a423f2a2979a79bb4652662c1629 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 14:32:25 -0700 Subject: [PATCH 09/28] fix typo --- .buildkite/test-template.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index b8ffa1a3f5b9c..2a1c79be376df 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -6,7 +6,7 @@ steps: - label: "AMD Test" agents: queue: amd - command: bash ./buildkite/run-amd-test.sh + command: bash .buildkite/run-amd-test.sh {# Ignoring CUDA build for now From 4d87cec94ddd274b906aeaff2ff727407c4ae6a8 Mon Sep 17 00:00:00 2001 From: 
simon-mo Date: Fri, 15 Mar 2024 14:32:28 -0700 Subject: [PATCH 10/28] fix typo --- .buildkite/run-amd-test.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 .buildkite/run-amd-test.sh diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh new file mode 100644 index 0000000000000..c751729f51a33 --- /dev/null +++ b/.buildkite/run-amd-test.sh @@ -0,0 +1,13 @@ +set -e +set -x + +docker build -t rocm -f Dockerfile.rocm . + +remove_docker_container() { docker rm -f rocm || true; } +trap remove_docker_container EXIT + +remove_docker_container + +docker run -d --p 8000:8000 -n rocm rocm +while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do sleep 1; done +python examples/openai_completion_client.py From 456b76b55335e6dfacdfa0fc22572d613f1348c7 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 15:36:36 -0700 Subject: [PATCH 11/28] fix docker command --- .buildkite/run-amd-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index c751729f51a33..5c3610f2ed7ba 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -8,6 +8,6 @@ trap remove_docker_container EXIT remove_docker_container -docker run -d --p 8000:8000 -n rocm rocm +docker run -d --network host -n rocm rocm while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do sleep 1; done python examples/openai_completion_client.py From cb7989a340ea9ae0cab561e12d2c48108438eb70 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 16:07:35 -0700 Subject: [PATCH 12/28] fix docker command --- .buildkite/run-amd-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 5c3610f2ed7ba..52665235d82af 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -8,6 +8,6 @@ trap remove_docker_container EXIT remove_docker_container 
-docker run -d --network host -n rocm rocm +docker run --detached --network host --name rocm rocm while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do sleep 1; done python examples/openai_completion_client.py From 7a60114519dc3bc5e4bd6a205a907a5a6cf268b7 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 16:13:17 -0700 Subject: [PATCH 13/28] fix docker command --- .buildkite/run-amd-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 52665235d82af..c0e32e328e7af 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -8,6 +8,6 @@ trap remove_docker_container EXIT remove_docker_container -docker run --detached --network host --name rocm rocm +docker run --detach --network host --name rocm rocm while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do sleep 1; done python examples/openai_completion_client.py From 736730464805a40a28453d318972bcfa9c1d3139 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 16:21:17 -0700 Subject: [PATCH 14/28] run docker sync --- .buildkite/run-amd-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index c0e32e328e7af..b75542d952d91 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -8,6 +8,6 @@ trap remove_docker_container EXIT remove_docker_container -docker run --detach --network host --name rocm rocm +docker run --network host --name rocm rocm & while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do sleep 1; done python examples/openai_completion_client.py From ff78d364bdfdc12127d89b2ad67ab628c8bc84f2 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 16:38:21 -0700 Subject: [PATCH 15/28] add cmd --- .buildkite/run-amd-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index b75542d952d91..16d4cbbed6838 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -8,6 +8,6 @@ trap remove_docker_container EXIT remove_docker_container -docker run --network host --name rocm rocm & +docker run --network host --name rocm rocm python3 -m vllm.entrypoints.openai.api_server & while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do sleep 1; done python examples/openai_completion_client.py From 21b0dbb1a2f22ffd348ea7ec530943943bd2b5ba Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 16:43:20 -0700 Subject: [PATCH 16/28] add outlines --- requirements-rocm.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-rocm.txt b/requirements-rocm.txt index 53bd11de7c9de..d5a3bd423b6b3 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -11,3 +11,4 @@ fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. prometheus_client >= 0.18.0 +outlines == 0.0.34 \ No newline at end of file From fe983ccdacedfab06228f250ab73f843aa7c6405 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 17:02:54 -0700 Subject: [PATCH 17/28] Revert "Dynamically configure shared memory size for moe_align_block_size_kernel (#3376)" This reverts commit 78b6c4845ac9aa57ccf7e42cf4c7d3c4cdef14cf. 
--- csrc/moe_align_block_size_kernels.cu | 42 +++++++++------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/csrc/moe_align_block_size_kernels.cu b/csrc/moe_align_block_size_kernels.cu index 138615a4bfba0..de6a0ec0a972c 100644 --- a/csrc/moe_align_block_size_kernels.cu +++ b/csrc/moe_align_block_size_kernels.cu @@ -7,17 +7,10 @@ #include "cuda_compat.h" #include "dispatch_utils.h" +const static size_t NUM_MAX_EXPERTS = 64; #define CEILDIV(x,y) (((x) + (y) - 1) / (y)) namespace vllm { - -namespace { -__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, int32_t col) { - // don't worry about overflow because num_experts is relatively small - return row * total_col + col; -} -} - template __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, int32_t *sorted_token_ids, @@ -28,14 +21,10 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, size_t numel) { const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); const size_t start_idx = threadIdx.x * tokens_per_thread; - - extern __shared__ int32_t shared_mem[]; - - int32_t* tokens_cnts = shared_mem; // 2d tensor with shape (num_experts + 1, num_experts) - int32_t* cumsum = shared_mem + (num_experts + 1) * num_experts; // 1d tensor with shape (num_experts + 1) - + __shared__ int32_t tokens_cnts[NUM_MAX_EXPERTS + 1][NUM_MAX_EXPERTS]; + __shared__ int32_t cumsum[NUM_MAX_EXPERTS + 1]; for (int i = 0; i < num_experts; ++i) { - tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; + tokens_cnts[threadIdx.x + 1][i] = 0; } /** @@ -44,15 +33,15 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, * to expert expert_index. 
*/ for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { - ++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])]; + ++tokens_cnts[threadIdx.x + 1][topk_ids[i]]; } __syncthreads(); // For each expert we accumulate the token counts from the different threads. - tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0; + tokens_cnts[0][threadIdx.x] = 0; for (int i = 1; i <= blockDim.x; ++i) { - tokens_cnts[index(num_experts, i, threadIdx.x)] += tokens_cnts[index(num_experts, i-1, threadIdx.x)]; + tokens_cnts[i][threadIdx.x] += tokens_cnts[i-1][threadIdx.x]; } __syncthreads(); @@ -61,7 +50,7 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, if (threadIdx.x == 0) { cumsum[0] = 0; for (int i = 1; i <= num_experts; ++i) { - cumsum[i] = cumsum[i-1] + CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)], block_size) * block_size; + cumsum[i] = cumsum[i-1] + CEILDIV(tokens_cnts[blockDim.x][i - 1], block_size) * block_size; } *total_tokens_post_pad = cumsum[num_experts]; } @@ -89,9 +78,9 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, * stores the indices of the tokens processed by the expert with expert_id within * the current thread's token shard. 
*/ - int32_t rank_post_pad = tokens_cnts[index(num_experts, threadIdx.x, expert_id)] + cumsum[expert_id]; + int32_t rank_post_pad = tokens_cnts[threadIdx.x][expert_id] + cumsum[expert_id]; sorted_token_ids[rank_post_pad] = i; - ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)]; + ++tokens_cnts[threadIdx.x][expert_id]; } } } @@ -104,16 +93,11 @@ void moe_align_block_size( torch::Tensor experts_ids, torch::Tensor num_tokens_post_pad) { const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + assert(num_experts <= NUM_MAX_EXPERTS); VLLM_DISPATCH_INTEGRAL_TYPES( topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { - // calc needed amount of shared mem for `tokens_cnts` and `cumsum` tensors - const int32_t shared_mem = ((num_experts + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t); - - // set dynamic shared mem - auto kernel = vllm::moe_align_block_size_kernel; - AT_CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem)); - kernel<<<1, num_experts, shared_mem, stream>>>( - topk_ids.data_ptr(), + vllm::moe_align_block_size_kernel<<<1, num_experts, 0, stream>>>( + topk_ids.data_ptr(), sorted_token_ids.data_ptr(), experts_ids.data_ptr(), num_tokens_post_pad.data_ptr(), From 822bffd7a014afa32f29b7d6812066d32f1928db Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 17:03:10 -0700 Subject: [PATCH 18/28] add debug info --- .buildkite/run-amd-test.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 16d4cbbed6838..eb7d349179eeb 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -1,6 +1,8 @@ set -e set -x +rocminfo + docker build -t rocm -f Dockerfile.rocm . 
remove_docker_container() { docker rm -f rocm || true; } From 0125268e61b3a825cda8c63400a9d1f4785a7ec0 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 17:14:21 -0700 Subject: [PATCH 19/28] use test server due to outlines issue --- .buildkite/run-amd-test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index eb7d349179eeb..a609326d4868a 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -10,6 +10,6 @@ trap remove_docker_container EXIT remove_docker_container -docker run --network host --name rocm rocm python3 -m vllm.entrypoints.openai.api_server & +docker run --network host --name rocm rocm python3 -m vllm.entrypoints.api_server & while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do sleep 1; done -python examples/openai_completion_client.py +python examples/api_client.py From 5e3180d0ef67ea42b47ba7e690235714740d98be Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 17:21:50 -0700 Subject: [PATCH 20/28] actually use gpus --- .buildkite/run-amd-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index a609326d4868a..e04a341e0e760 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -10,6 +10,6 @@ trap remove_docker_container EXIT remove_docker_container -docker run --network host --name rocm rocm python3 -m vllm.entrypoints.api_server & +docker run --gpus all --network host --name rocm rocm python3 -m vllm.entrypoints.api_server & while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do sleep 1; done python examples/api_client.py From fbc07a9bff92533638e1f51ae7dbf5972bc92ea1 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 17:28:44 -0700 Subject: [PATCH 21/28] use rocm docker --- .buildkite/run-amd-test.sh | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 
deletions(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index e04a341e0e760..5f2caef57acd1 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -10,6 +10,18 @@ trap remove_docker_container EXIT remove_docker_container -docker run --gpus all --network host --name rocm rocm python3 -m vllm.entrypoints.api_server & -while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do sleep 1; done +docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server & + +timeout=300 +counter=0 + +while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do + sleep 1 + counter=$((counter+1)) + if [ $counter -ge $timeout ]; then + echo "Timeout after $timeout seconds" + break + fi +done + python examples/api_client.py From 83447e4b120ec654e0b2810cda12afa3572a9c51 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 17:39:41 -0700 Subject: [PATCH 22/28] use better healthcheck --- .buildkite/run-amd-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 5f2caef57acd1..fb9880095eae3 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -15,7 +15,7 @@ docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm p timeout=300 counter=0 -while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000)" != "200" ]; do +while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do sleep 1 counter=$((counter+1)) if [ $counter -ge $timeout ]; then From 1117e4683f3764e43aa41d87bc0e8be0beeb28e8 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 17:46:31 -0700 Subject: [PATCH 23/28] use curl on host instead --- .buildkite/run-amd-test.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 
fb9880095eae3..de5c07e306f9f 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -24,4 +24,6 @@ while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != " fi done -python examples/api_client.py +curl -X POST -H "Content-Type: application/json" + localhost:8000/generate + -d '{"prompt": "San Francisco is a"}' \ No newline at end of file From 1d8911fb158576d6ca4bba9c7ce49c04d732149e Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 17:51:20 -0700 Subject: [PATCH 24/28] fix curl --- .buildkite/run-amd-test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index de5c07e306f9f..025d5b96f47b8 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -24,6 +24,6 @@ while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != " fi done -curl -X POST -H "Content-Type: application/json" - localhost:8000/generate +curl -X POST -H "Content-Type: application/json" \ + localhost:8000/generate \ -d '{"prompt": "San Francisco is a"}' \ No newline at end of file From ec2d7ec9e03d46102d50734941d3d60dadcd4298 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 18:01:25 -0700 Subject: [PATCH 25/28] reset tests --- .buildkite/test-template.j2 | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 2a1c79be376df..5b012470390f0 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -8,9 +8,6 @@ steps: queue: amd command: bash .buildkite/run-amd-test.sh - -{# Ignoring CUDA build for now - - label: ":docker: build image" commands: - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ." 
@@ -62,5 +59,3 @@ steps: - mountPath: /dev/shm name: dshm {% endfor %} - -#} \ No newline at end of file From 94fa91d7fe62983d37f788c671bc5dffca90dd8f Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 18:01:57 -0700 Subject: [PATCH 26/28] newline --- .buildkite/run-amd-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 025d5b96f47b8..77f11ff5da4b6 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -26,4 +26,4 @@ done curl -X POST -H "Content-Type: application/json" \ localhost:8000/generate \ - -d '{"prompt": "San Francisco is a"}' \ No newline at end of file + -d '{"prompt": "San Francisco is a"}' From c5850ad99c8c76be26076234d48afd6ee0755a33 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 15 Mar 2024 18:03:38 -0700 Subject: [PATCH 27/28] add comments --- .buildkite/run-amd-test.sh | 39 +++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 77f11ff5da4b6..83a56e25aca73 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -1,29 +1,38 @@ -set -e -set -x +# This script builds the ROCm docker image and runs the API server inside the container. +# It serves as a sanity check for compilation and basic model usage. +set -ex +# Print ROCm version rocminfo +# Try building the docker image docker build -t rocm -f Dockerfile.rocm . 
+# Setup cleanup remove_docker_container() { docker rm -f rocm || true; } trap remove_docker_container EXIT - remove_docker_container +# Run the image docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server & -timeout=300 -counter=0 - -while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do - sleep 1 - counter=$((counter+1)) - if [ $counter -ge $timeout ]; then - echo "Timeout after $timeout seconds" - break - fi -done - +# Wait for the server to start +wait_for_server_to_start() { + timeout=300 + counter=0 + + while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do + sleep 1 + counter=$((counter + 1)) + if [ $counter -ge $timeout ]; then + echo "Timeout after $timeout seconds" + break + fi + done +} +wait_for_server_to_start + +# Test a simple prompt curl -X POST -H "Content-Type: application/json" \ localhost:8000/generate \ -d '{"prompt": "San Francisco is a"}' From 4a886324ede2865fe7c7bbb38225c619ef848df3 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Mon, 18 Mar 2024 11:38:40 -0700 Subject: [PATCH 28/28] Revert "Revert "Dynamically configure shared memory size for moe_align_block_size_kernel (#3376)"" This reverts commit fe983ccdacedfab06228f250ab73f843aa7c6405. 
--- csrc/moe_align_block_size_kernels.cu | 42 +++++++++++++++++++--------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/csrc/moe_align_block_size_kernels.cu b/csrc/moe_align_block_size_kernels.cu index de6a0ec0a972c..138615a4bfba0 100644 --- a/csrc/moe_align_block_size_kernels.cu +++ b/csrc/moe_align_block_size_kernels.cu @@ -7,10 +7,17 @@ #include "cuda_compat.h" #include "dispatch_utils.h" -const static size_t NUM_MAX_EXPERTS = 64; #define CEILDIV(x,y) (((x) + (y) - 1) / (y)) namespace vllm { + +namespace { +__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, int32_t col) { + // don't worry about overflow because num_experts is relatively small + return row * total_col + col; +} +} + template __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, int32_t *sorted_token_ids, @@ -21,10 +28,14 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, size_t numel) { const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); const size_t start_idx = threadIdx.x * tokens_per_thread; - __shared__ int32_t tokens_cnts[NUM_MAX_EXPERTS + 1][NUM_MAX_EXPERTS]; - __shared__ int32_t cumsum[NUM_MAX_EXPERTS + 1]; + + extern __shared__ int32_t shared_mem[]; + + int32_t* tokens_cnts = shared_mem; // 2d tensor with shape (num_experts + 1, num_experts) + int32_t* cumsum = shared_mem + (num_experts + 1) * num_experts; // 1d tensor with shape (num_experts + 1) + for (int i = 0; i < num_experts; ++i) { - tokens_cnts[threadIdx.x + 1][i] = 0; + tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; } /** @@ -33,15 +44,15 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, * to expert expert_index. 
*/ for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { - ++tokens_cnts[threadIdx.x + 1][topk_ids[i]]; + ++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])]; } __syncthreads(); // For each expert we accumulate the token counts from the different threads. - tokens_cnts[0][threadIdx.x] = 0; + tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0; for (int i = 1; i <= blockDim.x; ++i) { - tokens_cnts[i][threadIdx.x] += tokens_cnts[i-1][threadIdx.x]; + tokens_cnts[index(num_experts, i, threadIdx.x)] += tokens_cnts[index(num_experts, i-1, threadIdx.x)]; } __syncthreads(); @@ -50,7 +61,7 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, if (threadIdx.x == 0) { cumsum[0] = 0; for (int i = 1; i <= num_experts; ++i) { - cumsum[i] = cumsum[i-1] + CEILDIV(tokens_cnts[blockDim.x][i - 1], block_size) * block_size; + cumsum[i] = cumsum[i-1] + CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)], block_size) * block_size; } *total_tokens_post_pad = cumsum[num_experts]; } @@ -78,9 +89,9 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, * stores the indices of the tokens processed by the expert with expert_id within * the current thread's token shard. 
*/ - int32_t rank_post_pad = tokens_cnts[threadIdx.x][expert_id] + cumsum[expert_id]; + int32_t rank_post_pad = tokens_cnts[index(num_experts, threadIdx.x, expert_id)] + cumsum[expert_id]; sorted_token_ids[rank_post_pad] = i; - ++tokens_cnts[threadIdx.x][expert_id]; + ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)]; } } } @@ -93,11 +104,16 @@ void moe_align_block_size( torch::Tensor experts_ids, torch::Tensor num_tokens_post_pad) { const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - assert(num_experts <= NUM_MAX_EXPERTS); VLLM_DISPATCH_INTEGRAL_TYPES( topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { - vllm::moe_align_block_size_kernel<<<1, num_experts, 0, stream>>>( - topk_ids.data_ptr(), + // calc needed amount of shared mem for `tokens_cnts` and `cumsum` tensors + const int32_t shared_mem = ((num_experts + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t); + + // set dynamic shared mem + auto kernel = vllm::moe_align_block_size_kernel; + AT_CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem)); + kernel<<<1, num_experts, shared_mem, stream>>>( + topk_ids.data_ptr(), sorted_token_ids.data_ptr(), experts_ids.data_ptr(), num_tokens_post_pad.data_ptr(),