[CI/Build] Avoid CUDA initialization #8534

Merged · 22 commits · Sep 18, 2024
Changes from 17 commits
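The change repeated throughout this PR replaces ad-hoc seeding (`random.seed`, `torch.random.manual_seed`, and a guarded `torch.cuda.manual_seed`) with a single `seed_everything` helper from `vllm.utils`, so that benchmarks and tests no longer initialize CUDA as a side effect of seeding. The helper's implementation is not part of this diff; a minimal sketch, assuming it seeds Python, NumPy, and PyTorch and only touches the CUDA RNG when CUDA is already available, could look like this:

```python
# Hypothetical sketch of a seed_everything helper; the actual vllm.utils
# implementation may differ (for example, in whether and how it seeds NumPy).
import random
from typing import Optional

import numpy as np
import torch


def seed_everything(seed: Optional[int] = None) -> None:
    """Seed every RNG the benchmarks/tests rely on without forcing CUDA init."""
    if seed is None:
        return
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)  # seeds the CPU RNG; CUDA seeding is deferred
    if torch.cuda.is_available():  # only touch CUDA RNGs when CUDA exists
        torch.cuda.manual_seed_all(seed)
```

Call sites then reduce to a single `seed_everything(seed)` line, e.g. before `torch.set_default_device("cuda")` in the benchmark entry points.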
9 changes: 3 additions & 6 deletions benchmarks/kernels/benchmark_layernorm.py
@@ -1,10 +1,10 @@
-import random
import time

import torch

from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
+seed_everything)


@torch.inference_mode()
@@ -16,10 +16,7 @@ def main(num_tokens: int,
do_profile: bool = False,
num_warmup_iters: int = 5,
num_iters: int = 100) -> None:
-random.seed(seed)
-torch.random.manual_seed(seed)
-if torch.cuda.is_available():
-torch.cuda.manual_seed(seed)
+seed_everything(seed)
torch.set_default_device("cuda")

layer = RMSNorm(hidden_size).to(dtype=dtype)
6 changes: 3 additions & 3 deletions benchmarks/kernels/benchmark_moe.py
@@ -10,7 +10,7 @@
from transformers import AutoConfig

from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, seed_everything


class BenchmarkConfig(TypedDict):
@@ -166,7 +166,7 @@ class BenchmarkWorker:

def __init__(self, seed: int) -> None:
torch.set_default_device("cuda")
-torch.cuda.manual_seed_all(seed)
+seed_everything(seed)
self.seed = seed

def benchmark(
@@ -180,7 +180,7 @@ def benchmark(
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
) -> Tuple[Dict[str, int], float]:
-torch.cuda.manual_seed_all(self.seed)
+seed_everything(self.seed)
dtype_str = get_config_dtype_str(dtype,
use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8)
7 changes: 2 additions & 5 deletions benchmarks/kernels/benchmark_paged_attention.py
@@ -6,7 +6,7 @@

from vllm import _custom_ops as ops
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
-create_kv_caches_with_random)
+create_kv_caches_with_random, seed_everything)

NUM_BLOCKS = 1024
PARTITION_SIZE = 512
@@ -28,10 +28,7 @@ def main(
device: str = "cuda",
kv_cache_dtype: Optional[str] = None,
) -> None:
-random.seed(seed)
-torch.random.manual_seed(seed)
-if torch.cuda.is_available():
-torch.cuda.manual_seed(seed)
+seed_everything(seed)

scale = float(1.0 / (head_size**0.5))
query = torch.empty(num_seqs,
9 changes: 3 additions & 6 deletions benchmarks/kernels/benchmark_quant.py
@@ -1,10 +1,10 @@
-import random
import time

import torch

from vllm import _custom_ops as ops
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
+seed_everything)


@torch.inference_mode()
@@ -17,10 +17,7 @@ def main(num_tokens: int,
do_profile: bool = False,
num_warmup_iters: int = 5,
num_iters: int = 100) -> None:
-random.seed(seed)
-torch.random.manual_seed(seed)
-if torch.cuda.is_available():
-torch.cuda.manual_seed(seed)
+seed_everything(seed)
torch.set_default_device("cuda")

x = torch.randn(num_tokens, hidden_size, dtype=dtype)
6 changes: 2 additions & 4 deletions benchmarks/kernels/benchmark_rope.py
@@ -6,7 +6,7 @@

from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
get_rope)
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, seed_everything


def benchmark_rope_kernels_multi_lora(
@@ -22,9 +22,7 @@ def benchmark_rope_kernels_multi_lora(
max_position: int = 8192,
base: int = 10000,
) -> None:
-torch.random.manual_seed(seed)
-if torch.cuda.is_available():
-torch.cuda.manual_seed(seed)
+seed_everything(seed)
torch.set_default_device(device)
if rotary_dim is None:
rotary_dim = head_size
9 changes: 3 additions & 6 deletions tests/kernels/test_activation.py
@@ -7,6 +7,7 @@
from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
NewGELU, QuickGELU,
SiluAndMul)
+from vllm.utils import seed_everything

from .allclose_default import get_default_atol, get_default_rtol

@@ -34,9 +35,7 @@ def test_act_and_mul(
seed: int,
device: str,
) -> None:
-torch.random.manual_seed(seed)
-if torch.cuda.is_available():
-torch.cuda.manual_seed(seed)
+seed_everything(seed)
torch.set_default_device(device)
x = torch.randn(num_tokens, 2 * d, dtype=dtype)
if activation == "silu":
@@ -77,9 +76,7 @@ def test_activation(
seed: int,
device: str,
) -> None:
-torch.random.manual_seed(seed)
-if torch.cuda.is_available():
-torch.cuda.manual_seed(seed)
+seed_everything(seed)
torch.set_default_device(device)
x = torch.randn(num_tokens, d, dtype=dtype)
layer = activation[0]()
18 changes: 5 additions & 13 deletions tests/kernels/test_attention.py
@@ -6,7 +6,7 @@

from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops
-from vllm.utils import get_max_shared_memory_bytes, is_hip
+from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything

from .allclose_default import get_default_atol, get_default_rtol

@@ -139,10 +139,8 @@ def test_paged_attention(
) -> None:
if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip()
-random.seed(seed)
-torch.random.manual_seed(seed)
-if torch.cuda.is_available():
-torch.cuda.manual_seed(seed)
+
+seed_everything(seed)
torch.set_default_device(device)
scale = float(1.0 / (head_size**0.5))
num_query_heads, num_kv_heads = num_heads
@@ -354,10 +352,7 @@ def test_paged_attention_rocm(
seed: int,
device: str,
) -> None:
-random.seed(seed)
-torch.random.manual_seed(seed)
-if torch.cuda.is_available():
-torch.cuda.manual_seed(seed)
+seed_everything(seed)
torch.set_default_device(device)
scale = float(1.0 / (head_size**0.5))
num_query_heads, num_kv_heads = num_heads
@@ -506,10 +501,7 @@ def test_multi_query_kv_attention(
seed: int,
device: str,
) -> None:
-random.seed(seed)
-torch.random.manual_seed(seed)
-if torch.cuda.is_available():
-torch.cuda.manual_seed(seed)
+seed_everything(seed)
torch.set_default_device(device)
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
# As the xformers library is already tested with its own tests, we can use
2 changes: 1 addition & 1 deletion tests/kernels/test_attention_selector.py
@@ -45,7 +45,7 @@ def test_flash_attn(monkeypatch):
override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)

# Unsupported CUDA arch
with patch("torch.cuda.get_device_capability", return_value=[7, 5]):
with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16)
assert backend.name != STR_FLASH_ATTN_VAL

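One note on the `test_attention_selector.py` change above: `torch.cuda.get_device_capability()` returns a tuple such as `(7, 5)`, so mocking it with a tuple keeps the patched value type-faithful, whereas a list would break any caller that compares the capability against a tuple. A small, self-contained illustration (the helper below is hypothetical, not the vLLM code path):

```python
# Illustrative only: shows why the mocked capability should be a tuple.
from unittest.mock import patch

import torch


def supports_flash_attn() -> bool:
    # Flash-attention-style kernels typically need compute capability >= 8.0.
    return torch.cuda.get_device_capability() >= (8, 0)


with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
    assert not supports_flash_attn()  # (7, 5) compares cleanly against (8, 0)
    # return_value=[7, 5] would raise TypeError here: a list and a tuple
    # cannot be ordered against each other.
```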
5 changes: 3 additions & 2 deletions tests/kernels/test_awq_triton.py
@@ -7,6 +7,7 @@

from vllm.model_executor.layers.quantization.awq_triton import (
AWQ_TRITON_SUPPORTED_GROUP_SIZES, awq_dequantize_triton, awq_gemm_triton)
+from vllm.utils import seed_everything

device = "cuda"

@@ -79,7 +80,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
zeros_cols = qweight_cols
zeros_dtype = torch.int32

-torch.manual_seed(0)
+seed_everything(0)

qweight = torch.randint(0,
torch.iinfo(torch.int32).max,
@@ -133,7 +134,7 @@ def test_gemm(N, K, M, splitK, group_size):
qzeros_rows = scales_rows
qzeros_cols = qweight_cols

-torch.manual_seed(0)
+seed_everything(0)

input = torch.rand((input_rows, input_cols),
dtype=input_dtype,
12 changes: 3 additions & 9 deletions tests/kernels/test_blocksparse_attention.py
@@ -7,7 +7,7 @@
from vllm import _custom_ops as ops
from vllm.attention.ops.blocksparse_attention.interface import (
LocalStridedBlockSparseAttn)
-from vllm.utils import get_max_shared_memory_bytes, is_hip
+from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything

from .allclose_default import get_default_atol, get_default_rtol

@@ -172,10 +172,7 @@ def test_paged_attention(
blocksparse_block_size: int,
blocksparse_head_sliding_step: int,
) -> None:
-random.seed(seed)
-torch.random.manual_seed(seed)
-if torch.cuda.is_available():
-torch.cuda.manual_seed(seed)
+seed_everything(seed)
torch.set_default_device(device)
scale = float(1.0 / (head_size**0.5))
num_query_heads, num_kv_heads = num_heads
@@ -386,10 +383,7 @@ def test_varlen_blocksparse_attention_prefill(
seed: int,
device: str,
) -> None:
-random.seed(seed)
-torch.random.manual_seed(seed)
-if torch.cuda.is_available():
-torch.cuda.manual_seed(seed)
+seed_everything(seed)
torch.set_default_device(device)
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
# As the xformers library is already tested with its own tests, we can use
25 changes: 7 additions & 18 deletions tests/kernels/test_cache.py
@@ -6,6 +6,7 @@

from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
from vllm import _custom_ops as ops
+from vllm.utils import seed_everything

COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
DTYPES = [torch.half, torch.bfloat16, torch.float]
@@ -55,10 +56,7 @@ def test_copy_blocks(
) -> None:
if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip()
-random.seed(seed)
-torch.random.manual_seed(seed)
-if torch.cuda.is_available():
-torch.cuda.manual_seed(seed)
+seed_everything(seed)
torch.set_default_device(device)
# Generate random block mappings where each source block is mapped to two
# destination blocks.
@@ -134,10 +132,7 @@ def test_reshape_and_cache(
) -> None:
if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip()
-random.seed(seed)
-torch.random.manual_seed(seed)
-if torch.cuda.is_available():
-torch.cuda.manual_seed(seed)
+seed_everything(seed)
torch.set_default_device(device)
# Create a random slot mapping.
num_slots = block_size * num_blocks
@@ -229,9 +224,7 @@ def test_reshape_and_cache_flash(
device: str,
kv_cache_dtype: str,
) -> None:
-random.seed(seed)
-torch.random.manual_seed(seed)
-torch.cuda.manual_seed(seed)
+seed_everything(seed)
torch.set_default_device(device)

# Create a random slot mapping.
Expand Down Expand Up @@ -345,10 +338,8 @@ def test_swap_blocks(
pytest.skip()
if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip()
-random.seed(seed)
-torch.random.manual_seed(seed)
-if torch.cuda.is_available():
-torch.cuda.manual_seed(seed)
+
+seed_everything(seed)

src_device = device if direction[0] == "cuda" else 'cpu'
dst_device = device if direction[1] == "cuda" else 'cpu'
@@ -417,9 +408,7 @@ def test_fp8_e4m3_conversion(
seed: int,
device: str,
) -> None:
-random.seed(seed)
-torch.random.manual_seed(seed)
-torch.cuda.manual_seed(seed)
+seed_everything(seed)

low = -224.0
high = 224.0
5 changes: 3 additions & 2 deletions tests/kernels/test_causal_conv1d.py
@@ -7,6 +7,7 @@

from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
causal_conv1d_fn, causal_conv1d_update)
+from vllm.utils import seed_everything


def causal_conv1d_ref(
@@ -104,7 +105,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
if itype == torch.bfloat16:
rtol, atol = 1e-2, 5e-2
# set seed
-torch.random.manual_seed(0)
+seed_everything(0)
if not channel_last:
x = torch.randn(batch,
4096 + dim + 64,
@@ -175,7 +176,7 @@ def test_causal_conv1d_update(batch, dim, width, has_bias, silu_activation,
if itype == torch.bfloat16:
rtol, atol = 1e-2, 5e-2
# set seed
-torch.random.manual_seed(0)
+seed_everything(0)
batch = 2
x = torch.randn(batch, dim, device=device, dtype=itype)
conv_state = torch.randn(batch, dim, width, device=device, dtype=itype)