Merge remote-tracking branch 'upstream/main' into perf-sampler
esmeetu committed Mar 20, 2024
2 parents 744c0c5 + 6e435de commit 63db9e7
Showing 42 changed files with 1,945 additions and 469 deletions.
4 changes: 2 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -47,7 +47,7 @@ steps:
- pytest -v -s prefix_caching

- label: Samplers Test
command: pytest -v -s samplers --forked
command: pytest -v -s samplers

- label: LogitsProcessor Test
command: pytest -v -s test_logits_processor.py
@@ -59,7 +59,7 @@ steps:
command: pytest -v -s spec_decode

- label: LoRA Test %N
command: pytest -v -s lora --forked --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4

- label: Metrics Test
6 changes: 6 additions & 0 deletions CMakeLists.txt
@@ -49,6 +49,12 @@ endif()
#
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")

# Ensure the 'nvcc' command is in the PATH
find_program(NVCC_EXECUTABLE nvcc)
if (CUDA_FOUND AND NOT NVCC_EXECUTABLE)
message(FATAL_ERROR "nvcc not found")
endif()

#
# Import torch cmake configuration.
# Torch also imports CUDA (and partially HIP) languages with some customizations,
2 changes: 1 addition & 1 deletion Dockerfile
@@ -122,7 +122,7 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer
pip install accelerate hf_transfer modelscope

COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm
2 changes: 1 addition & 1 deletion docs/source/getting_started/amd-installation.rst
@@ -100,7 +100,7 @@ You can build and install vLLM from source:

Build a docker image from `Dockerfile.rocm`, and launch a docker container.

The `Dokerfile.rocm` is designed to support both ROCm 5.7 and ROCm 6.0 and later versions. It provides flexibility to customize the build of docker image using the following arguments:
The `Dockerfile.rocm` is designed to support both ROCm 5.7 and ROCm 6.0 and later versions. It provides flexibility to customize the build of docker image using the following arguments:

* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`
* `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
1 change: 1 addition & 0 deletions docs/source/getting_started/neuron-installation.rst
@@ -128,6 +128,7 @@ Once neuronx-cc and transformers-neuronx packages are installed, we will be able

.. code-block:: console
$ git clone https://github.com/vllm-project/vllm.git
$ cd vllm
$ pip install -U -r requirements-neuron.txt
$ pip install .
12 changes: 6 additions & 6 deletions setup.py
@@ -61,12 +61,12 @@ def compute_num_jobs(self):
except AttributeError:
num_jobs = os.cpu_count()

nvcc_cuda_version = get_nvcc_cuda_version()
if nvcc_cuda_version >= Version("11.2"):
nvcc_threads = int(os.getenv("NVCC_THREADS", 8))
num_jobs = max(1, round(num_jobs / (nvcc_threads / 4)))
else:
nvcc_threads = None
nvcc_threads = None
if _is_cuda():
nvcc_cuda_version = get_nvcc_cuda_version()
if nvcc_cuda_version >= Version("11.2"):
nvcc_threads = int(os.getenv("NVCC_THREADS", 8))
num_jobs = max(1, round(num_jobs / (nvcc_threads / 4)))

return num_jobs, nvcc_threads

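For context, a standalone sketch of the revised job-count logic (an illustrative re-implementation, not the commit's code: the 16-CPU figure is an assumed example and the CUDA >= 11.2 version check is omitted for brevity):

    import os

    def compute_num_jobs_sketch(is_cuda: bool, cpu_count: int = 16):
        # nvcc_threads now defaults to None and is only set for CUDA builds,
        # so ROCm/CPU-only builds never query the nvcc version.
        num_jobs = cpu_count
        nvcc_threads = None
        if is_cuda:
            nvcc_threads = int(os.getenv("NVCC_THREADS", 8))
            # e.g. 16 CPUs with NVCC_THREADS=8 -> max(1, round(16 / 2)) = 8 jobs
            num_jobs = max(1, round(num_jobs / (nvcc_threads / 4)))
        return num_jobs, nvcc_threads

    print(compute_num_jobs_sketch(is_cuda=False))  # (16, None)
    print(compute_num_jobs_sketch(is_cuda=True))   # (8, 8) with the default NVCC_THREADS
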
4 changes: 3 additions & 1 deletion tests/basic_correctness/test_basic_correctness.py
@@ -13,19 +13,21 @@
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False, True])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
enforce_eager: bool,
) -> None:
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model

vllm_model = vllm_runner(model, dtype=dtype)
vllm_model = vllm_runner(model, dtype=dtype, enforce_eager=enforce_eager)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model

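The new enforce_eager parameter runs the correctness check both with and without CUDA graph capture. A minimal usage sketch, assuming the public vllm.LLM constructor argument of the same name (model name and prompt are illustrative):

    from vllm import LLM, SamplingParams

    # enforce_eager=True asks vLLM to skip CUDA graph capture and run eagerly;
    # the parametrized test above now exercises both execution paths.
    llm = LLM(model="facebook/opt-125m", enforce_eager=True)
    outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=5))
    print(outputs[0].outputs[0].text)
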
14 changes: 8 additions & 6 deletions tests/core/test_block_manager.py
@@ -4,7 +4,7 @@

from vllm import SamplingParams
from vllm.block import PhysicalTokenBlock
from vllm.core.block_manager import (BlockAllocator, BlockSpaceManager,
from vllm.core.block_manager import (UncachedBlockAllocator, BlockSpaceManager,
AllocStatus)
from vllm.utils import Device
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob
@@ -15,7 +15,8 @@
def test_block_allocator_allocate():
block_size = 4
num_cpu_blocks = 4
cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks)
cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
num_cpu_blocks)

# Allocate all available cpu blocks.
num_free = num_cpu_blocks
@@ -24,7 +25,7 @@ def test_block_allocator_allocate():
block = cpu_allocator.allocate()
num_free -= 1

assert block.block_hash not in cpu_allocator.evictor
assert block not in cpu_allocator.free_blocks
assert cpu_allocator.get_num_free_blocks() == num_free

with pytest.raises(ValueError):
@@ -34,22 +35,23 @@ def test_block_allocator_allocate():
def test_block_allocator_free():
block_size = 4
num_cpu_blocks = 4
cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks)
cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
num_cpu_blocks)

# Allocate all available cpu blocks.
blocks: List[PhysicalTokenBlock] = []
for _ in range(num_cpu_blocks):
block = cpu_allocator.allocate()
blocks.append(block)
assert block.block_hash not in cpu_allocator.evictor
assert block not in cpu_allocator.free_blocks

# Free all allocated cpu blocks.
num_free = 0
assert cpu_allocator.get_num_free_blocks() == num_free
for block in blocks:
cpu_allocator.free(block)
num_free += 1
assert block.block_hash in cpu_allocator.evictor
assert block in cpu_allocator.free_blocks
assert cpu_allocator.get_num_free_blocks() == num_free

with pytest.raises(ValueError):
18 changes: 9 additions & 9 deletions tests/core/test_scheduler.py
@@ -10,7 +10,7 @@

def test_scheduler_add_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(100, 64, 1, 256)
scheduler_config = SchedulerConfig(100, 64, 1)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4
@@ -26,7 +26,7 @@ def test_scheduler_add_seq_group():

def test_scheduler_abort_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(100, 64, 1, 256)
scheduler_config = SchedulerConfig(100, 64, 1)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4
@@ -50,7 +50,7 @@ def test_scheduler_schedule_simple():
block_size = 4
num_seq_group = 4
max_model_len = 16
scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len, 256)
scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
@@ -64,10 +64,10 @@ def test_scheduler_schedule_simple():
running.append(seq_group)

# Schedule seq groups prompts.
num_tokens = block_size * num_seq_group
seq_group_meta, out = scheduler.schedule()
assert set(out.scheduled_seq_groups) == set(running)
assert out.num_batched_tokens == num_seq_group * seq_group.get_seqs(
)[0].get_len()
assert out.num_batched_tokens == num_tokens
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == num_seq_group
@@ -84,7 +84,7 @@ def test_scheduler_schedule_simple():
def test_scheduler_schedule_preempt_abort():
block_size = 4
max_model_len = 16
scheduler_config = SchedulerConfig(64, 2, max_model_len, 256)
scheduler_config = SchedulerConfig(64, 2, max_model_len)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 2
cache_config.num_gpu_blocks = 2
@@ -99,7 +99,7 @@ def test_scheduler_schedule_preempt_abort():
# Schedule seq groups prompts.
seq_group_meta, out = scheduler.schedule()
assert out.scheduled_seq_groups == [seq_group_a, seq_group_b]
assert out.num_batched_tokens == seq_group_a.get_seqs()[0].get_len() * 2
assert out.num_batched_tokens == block_size * 2 # seq_a and seq_b
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == 2
@@ -124,7 +124,7 @@ def test_scheduler_schedule_preempt_abort():
scheduler.abort_seq_group("1")
seq_group_meta, out = scheduler.schedule()
assert out.scheduled_seq_groups == [seq_group_b]
assert out.num_batched_tokens == seq_group_b.get_seqs()[0].get_len()
assert out.num_batched_tokens == 5 # 4 prompt + 1 generation.
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == 1
@@ -136,7 +136,7 @@ def test_scheduler_max_seqs():
num_seq_group = 4
max_seq_group = 2
max_model_len = 16
scheduler_config = SchedulerConfig(64, max_seq_group, max_model_len, 256)
scheduler_config = SchedulerConfig(64, max_seq_group, max_model_len)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
51 changes: 51 additions & 0 deletions tests/kernels/test_rand.py
@@ -0,0 +1,51 @@
import torch
import pytest
import random

from vllm.model_executor.layers.ops.rand import seeded_uniform
from vllm.model_executor.utils import set_random_seed


@pytest.mark.parametrize("dtype",
[torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("use_3d", [True, False])
def test_seeded_uniform(dtype: torch.dtype, use_3d: bool):
device = "cuda"
for seed in range(512):
set_random_seed(seed)
rows = random.randint(1, 512)
cols = random.randint(1, 64000)
if use_3d:
third_dim = random.randint(2, 10)
dims = [rows, third_dim, cols]
else:
dims = [rows, cols]
seeds = torch.randint(torch.iinfo(torch.long).min,
torch.iinfo(torch.long).max, (rows, ),
device=device)

# Test that the same seed produces the same output
out = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
out2 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
torch.testing.assert_close(out, out2)
# del to save memory
del out2

out3 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
torch.testing.assert_close(out, out3)
# del to save memory
del out3

# Initialize out tensor with garbage to ensure that it is overwritten
out_with_tensor = seeded_uniform(
*dims,
out=torch.full(
(*dims, ),
-1,
dtype=dtype,
device=device,
),
seeds=seeds,
dtype=dtype,
)
torch.testing.assert_close(out, out_with_tensor)
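
The kernel takes one seed per row, presumably so each sequence's random stream is reproducible independently of the rest of the batch. A minimal usage sketch, grounded only in the signature exercised by the test above (seed values, shapes, and the CUDA device are assumptions):

    import torch
    from vllm.model_executor.layers.ops.rand import seeded_uniform

    # One seed per row of the 2x8 output; rerunning with the same seeds
    # reproduces the same uniform samples.
    seeds = torch.tensor([1234, 5678], dtype=torch.long, device="cuda")
    a = seeded_uniform(2, 8, seeds=seeds, dtype=torch.float32, device="cuda")
    b = seeded_uniform(2, 8, seeds=seeds, dtype=torch.float32, device="cuda")
    torch.testing.assert_close(a, b)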