diff --git a/tests/conftest.py b/tests/conftest.py index 5c409c8cd5ee5..e00f3eb871e37 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -56,11 +56,15 @@ def cleanup(): @pytest.fixture() -def should_do_global_cleanup_after_test() -> bool: +def should_do_global_cleanup_after_test(request) -> bool: """Allow subdirectories to skip global cleanup by overriding this fixture. This can provide a ~10x speedup for non-GPU unit tests since they don't need to initialize torch. """ + + if request.node.get_closest_marker("skip_global_cleanup"): + return False + return True diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 80a960acf0be5..43cfd78ddb0cc 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -7,6 +7,7 @@ @pytest.mark.parametrize('num_target_seq_ids', [100]) +@pytest.mark.skip_global_cleanup def test_create_target_seq_id_iterator(num_target_seq_ids: int): """Verify all new sequence ids are greater than all input seq ids. @@ -27,6 +28,7 @@ def test_create_target_seq_id_iterator(num_target_seq_ids: int): @pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.skip_global_cleanup def test_get_token_ids_to_score(k: int): """Verify correct tokens are selected for scoring. """ @@ -53,6 +55,7 @@ def test_get_token_ids_to_score(k: int): @pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.skip_global_cleanup def test_create_single_target_seq_group_metadata(k: int): """Verify correct creation of a batch-expanded seq group metadata. """ diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 87d3716ca98d7..825d360671965 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -487,7 +487,7 @@ def test_empty_input_batch(k: int, batch_size: int): **execute_model_data.to_dict()) -@torch.inference_mode() +@pytest.mark.skip_global_cleanup def test_init_device(): """Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as well as other GPU initialization. @@ -537,7 +537,7 @@ def test_init_cache_engine(): @pytest.mark.parametrize('available_cpu_blocks', [500]) @pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096]) @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) -@torch.inference_mode() +@pytest.mark.skip_global_cleanup def test_profile_num_available_blocks(available_gpu_blocks: int, available_cpu_blocks: int, target_cache_block_size_bytes: int, @@ -584,7 +584,7 @@ def test_profile_num_available_blocks(available_gpu_blocks: int, @pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096, 2 * 2 * 8192]) @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) -@torch.inference_mode() +@pytest.mark.skip_global_cleanup def test_split_num_cache_blocks_evenly(available_gpu_blocks: int, target_cache_block_size_bytes: int, draft_kv_size_bytes: int):