Merge remote-tracking branch 'upstream/main' into min-new-tokens
* upstream/main:
  [Misc] Bump up transformers to v4.39.0 & Remove StarCoder2Config (vllm-project#3551)
  [Misc][Log] Add log for tokenizer length not equal to vocabulary size (vllm-project#3500)
  [🚀 Ready to be merged] Added support for Jais models (vllm-project#3183)
  Fix 1D query issue from `_prune_hidden_states` (vllm-project#3539)
  [PREFIX CACHING FOLLOW UP] OrderedDict-based evictor (vllm-project#3431)
  [BugFix] Hot fix in setup.py for neuron build (vllm-project#3537)
  Migrate `logits` computation and gather to `model_runner` (vllm-project#3233)
  [1/n][Chunked Prefill] Refactor input query shapes (vllm-project#3236)
  [1/n] Triton sampling kernel (vllm-project#3186)
  [Bugfix] Fix ROCm support in CMakeLists.txt (vllm-project#3534)
tjohnson31415 committed Mar 21, 2024
2 parents b93e18f + c188ecb commit 90ac00c
Showing 71 changed files with 2,856 additions and 687 deletions.
7 changes: 5 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -47,7 +47,10 @@ steps:
- pytest -v -s prefix_caching

- label: Samplers Test
command: pytest -v -s samplers --forked
command: pytest -v -s samplers

- label: LogitsProcessor Test
command: pytest -v -s test_logits_processor.py

- label: Worker Test
command: pytest -v -s worker
@@ -56,7 +59,7 @@ steps:
command: pytest -v -s spec_decode

- label: LoRA Test %N
command: pytest -v -s lora --forked --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4

- label: Metrics Test
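The pipeline change above drops pytest's --forked flag from the Samplers and LoRA steps and adds a dedicated step for test_logits_processor.py, which accompanies the logits-computation migration listed in the commit message. For context only, a minimal sketch of the user-facing logits-processor hook follows; the assumption that SamplingParams accepts logits_processors callables of the form (token_ids, logits) -> logits is about the public vLLM API and is not shown anywhere in this diff, and the model name is purely illustrative.

import torch

from vllm import LLM, SamplingParams


def ban_token(token_ids: list[int], logits: torch.Tensor) -> torch.Tensor:
    # Illustrative processor: forbid token id 42 at every decoding step.
    logits[42] = float("-inf")
    return logits


llm = LLM(model="facebook/opt-125m")  # hypothetical model choice
params = SamplingParams(max_tokens=16, logits_processors=[ban_token])
print(llm.generate(["Hello, my name is"], params)[0].outputs[0].text)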
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -51,7 +51,7 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")

# Ensure the 'nvcc' command is in the PATH
find_program(NVCC_EXECUTABLE nvcc)
if (NOT NVCC_EXECUTABLE)
if (CUDA_FOUND AND NOT NVCC_EXECUTABLE)
message(FATAL_ERROR "nvcc not found")
endif()

1 change: 1 addition & 0 deletions README.md
@@ -76,6 +76,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
6 changes: 5 additions & 1 deletion docs/source/models/supported_models.rst
@@ -66,7 +66,11 @@ Alongside each architecture, we include some popular models that use it.
* - :code:`InternLM2ForCausalLM`
- InternLM2
- :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc.
-
-
* - :code:`JAISLMHeadModel`
- Jais
- :code:`core42/jais-13b`, :code:`core42/jais-13b-chat`, :code:`core42/jais-30b-v3`, :code:`core42/jais-30b-chat-v3`, etc.
-
* - :code:`LlamaForCausalLM`
- LLaMA, LLaMA-2, Vicuna, Alpaca, Yi
- :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
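The table rows above register the JAISLMHeadModel architecture alongside the Jais checkpoints also added to README.md. A minimal sketch of loading one of the listed models through the vLLM Python API follows; whether trust_remote_code is actually required depends on how the checkpoint publishes its config and tokenizer, so treat that flag as an assumption.

from vllm import LLM, SamplingParams

# Model name taken from the table above; trust_remote_code is an assumption.
llm = LLM(model="core42/jais-13b-chat", trust_remote_code=True)
params = SamplingParams(temperature=0.0, max_tokens=32)
print(llm.generate(["What is the capital of the UAE?"], params)[0].outputs[0].text)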
2 changes: 1 addition & 1 deletion requirements-rocm.txt
@@ -7,7 +7,7 @@ ray >= 2.9
sentencepiece # Required for LLaMA tokenizer.
numpy
tokenizers>=0.15.0
transformers >= 4.38.0 # Required for Gemma.
transformers >= 4.39.0 # Required for StarCoder2.
fastapi
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -5,7 +5,7 @@ ray >= 2.9
sentencepiece # Required for LLaMA tokenizer.
numpy
torch == 2.1.2
transformers >= 4.38.0 # Required for Gemma.
transformers >= 4.39.0 # Required for StarCoder2.
xformers == 0.0.23.post1 # Required for CUDA 12.1.
fastapi
uvicorn[standard]
2 changes: 1 addition & 1 deletion setup.py
@@ -168,7 +168,7 @@ def build_extensions(self) -> None:


def _is_cuda() -> bool:
return torch.version.cuda is not None
return torch.version.cuda is not None and not _is_neuron()


def _is_hip() -> bool:
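The setup.py hunk above keeps _is_cuda() from returning True on AWS Neuron machines, matching the "[BugFix] Hot fix in setup.py for neuron build" item in the commit message. The helper it calls is not shown in this diff; the following is a hypothetical sketch of one way such a probe could be written, not the repository's actual implementation.

import subprocess


def _is_neuron() -> bool:
    # Hypothetical probe: consider this a Neuron build if the `neuron-ls`
    # device-listing tool is installed and runs successfully.
    try:
        subprocess.run(["neuron-ls"], capture_output=True, check=True)
        return True
    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
        return False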
4 changes: 3 additions & 1 deletion tests/basic_correctness/test_basic_correctness.py
@@ -13,19 +13,21 @@
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False, True])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
enforce_eager: bool,
) -> None:
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model

vllm_model = vllm_runner(model, dtype=dtype)
vllm_model = vllm_runner(model, dtype=dtype, enforce_eager=enforce_eager)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model

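The test above now parametrizes enforce_eager, so each model runs once with CUDA graphs and once in eager mode. Assuming the vllm_runner fixture forwards its keyword arguments to vllm.LLM (which the call site suggests), the equivalent toggle through the public API looks roughly like this; the model name is chosen only for illustration.

from vllm import LLM, SamplingParams

# enforce_eager=True skips CUDA graph capture and runs plain eager PyTorch.
llm = LLM(model="facebook/opt-125m", enforce_eager=True)
params = SamplingParams(temperature=0.0, max_tokens=5)
print(llm.generate(["Hello, my name is"], params)[0].outputs[0].text)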
18 changes: 9 additions & 9 deletions tests/core/test_scheduler.py
@@ -10,7 +10,7 @@

def test_scheduler_add_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(100, 64, 1, 256)
scheduler_config = SchedulerConfig(100, 64, 1)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4
@@ -26,7 +26,7 @@ def test_scheduler_add_seq_group():

def test_scheduler_abort_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(100, 64, 1, 256)
scheduler_config = SchedulerConfig(100, 64, 1)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4
@@ -50,7 +50,7 @@ def test_scheduler_schedule_simple():
block_size = 4
num_seq_group = 4
max_model_len = 16
scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len, 256)
scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
@@ -64,10 +64,10 @@ def test_scheduler_schedule_simple():
running.append(seq_group)

# Schedule seq groups prompts.
num_tokens = block_size * num_seq_group
seq_group_meta, out = scheduler.schedule()
assert set(out.scheduled_seq_groups) == set(running)
assert out.num_batched_tokens == num_seq_group * seq_group.get_seqs(
)[0].get_len()
assert out.num_batched_tokens == num_tokens
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == num_seq_group
@@ -84,7 +84,7 @@ def test_scheduler_schedule_simple():
def test_scheduler_schedule_preempt_abort():
block_size = 4
max_model_len = 16
scheduler_config = SchedulerConfig(64, 2, max_model_len, 256)
scheduler_config = SchedulerConfig(64, 2, max_model_len)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 2
cache_config.num_gpu_blocks = 2
@@ -99,7 +99,7 @@ def test_scheduler_schedule_preempt_abort():
# Schedule seq groups prompts.
seq_group_meta, out = scheduler.schedule()
assert out.scheduled_seq_groups == [seq_group_a, seq_group_b]
assert out.num_batched_tokens == seq_group_a.get_seqs()[0].get_len() * 2
assert out.num_batched_tokens == block_size * 2 # seq_a and seq_b
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == 2
@@ -124,7 +124,7 @@ def test_scheduler_schedule_preempt_abort():
scheduler.abort_seq_group("1")
seq_group_meta, out = scheduler.schedule()
assert out.scheduled_seq_groups == [seq_group_b]
assert out.num_batched_tokens == seq_group_b.get_seqs()[0].get_len()
assert out.num_batched_tokens == 5 # 4 prompt + 1 generation.
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == 1
@@ -136,7 +136,7 @@ def test_scheduler_max_seqs():
num_seq_group = 4
max_seq_group = 2
max_model_len = 16
scheduler_config = SchedulerConfig(64, max_seq_group, max_model_len, 256)
scheduler_config = SchedulerConfig(64, max_seq_group, max_model_len)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
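Every hunk above drops the fourth positional argument from SchedulerConfig, in line with the chunked-prefill input-shape refactor listed in the commit message. A sketch of the updated construction with keyword arguments is shown below; the keyword names are an assumption about this vLLM version and may not match the actual signatures exactly.

from vllm.config import CacheConfig, SchedulerConfig

# Keyword names are assumed; the tests above pass these values positionally.
scheduler_config = SchedulerConfig(max_num_batched_tokens=64,
                                   max_num_seqs=2,
                                   max_model_len=16)
cache_config = CacheConfig(block_size=4,
                           gpu_memory_utilization=1.0,
                           swap_space=1,
                           cache_dtype="auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8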
51 changes: 51 additions & 0 deletions tests/kernels/test_rand.py
@@ -0,0 +1,51 @@
import torch
import pytest
import random

from vllm.model_executor.layers.ops.rand import seeded_uniform
from vllm.model_executor.utils import set_random_seed


@pytest.mark.parametrize("dtype",
                         [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("use_3d", [True, False])
def test_seeded_uniform(dtype: torch.dtype, use_3d: bool):
    device = "cuda"
    for seed in range(512):
        set_random_seed(seed)
        rows = random.randint(1, 512)
        cols = random.randint(1, 64000)
        if use_3d:
            third_dim = random.randint(2, 10)
            dims = [rows, third_dim, cols]
        else:
            dims = [rows, cols]
        seeds = torch.randint(torch.iinfo(torch.long).min,
                              torch.iinfo(torch.long).max, (rows, ),
                              device=device)

        # Test that the same seed produces the same output
        out = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
        out2 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
        torch.testing.assert_close(out, out2)
        # del to save memory
        del out2

        out3 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
        torch.testing.assert_close(out, out3)
        # del to save memory
        del out3

        # Initialize out tensor with garbage to ensure that it is overwritten
        out_with_tensor = seeded_uniform(
            *dims,
            out=torch.full(
                (*dims, ),
                -1,
                dtype=dtype,
                device=device,
            ),
            seeds=seeds,
            dtype=dtype,
        )
        torch.testing.assert_close(out, out_with_tensor)
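The new test above checks that seeded_uniform is deterministic for a given per-row seed tensor and that it honors a caller-provided out buffer. Distilled directly from those calls, a minimal usage sketch:

import torch

from vllm.model_executor.layers.ops.rand import seeded_uniform

# One seed per output row; identical seeds reproduce identical samples.
seeds = torch.randint(torch.iinfo(torch.long).min,
                      torch.iinfo(torch.long).max, (4, ),
                      device="cuda")
a = seeded_uniform(4, 8, seeds=seeds, dtype=torch.float32, device="cuda")
b = seeded_uniform(4, 8, seeds=seeds, dtype=torch.float32, device="cuda")
torch.testing.assert_close(a, b)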
