Skip to content
This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

Permalink
Use naive decompress for SM<8.0 (#32)
Browse files Browse the repository at this point in the history
A warning will be printed out if this case is triggered:
```
WARNING 02-20 22:21:27 sparse_w16a16.py:32] Unstructured sparse kernels are not optimized for NVIDIA SM < 8.0. Naive decompress kernels will be used and can be slower than dense models
```

Works on a T4 with:
```python
from vllm import LLM, SamplingParams

model = LLM(
    "nm-testing/opt-125m-pruned2.4", 
    sparsity="sparse_w16a16",
    enforce_eager=True,
    dtype="float16",
)

sampling_params = SamplingParams(max_tokens=100, temperature=0)
outputs = model.generate("Hello my name is", sampling_params=sampling_params)
outputs[0].outputs[0].text
```

Test within colab:
https://colab.research.google.com/drive/15xRvWX5gNaTb00BcaXhxwMm6yxavIKGN?usp=sharing
  • Loading branch information
mgoin authored and tlrmchlsmth committed Feb 21, 2024
1 parent ae45b23 commit 9a7781c
Showing 1 changed file with 16 additions and 9 deletions.
25 changes: 16 additions & 9 deletions vllm/model_executor/layers/sparsity/sparse_w16a16.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,36 @@

import torch

from vllm.logger import init_logger
from vllm.model_executor.layers.sparsity.base_config import SparsityConfig

from .sparse_w16a16_linear_method import SparseW16A16LinearMethod
from magic_wand import (CompressedStorageFormat, SparseBEGemmStorageFormat)
from magic_wand import (CompressedStorageFormat, SparseBitmaskStorageFormat,
SparseBEGemmStorageFormat)

logger = init_logger(__name__)

class SparseW16A16Config(SparsityConfig):
"""Config class for SparseW16A16.

TODO: Add based on need
"""
class SparseW16A16Config(SparsityConfig):
"""Config class for SparseW16A16."""

def __init__(self) -> None:
# TODO: Add new configs here
pass

    def __repr__(self) -> str:
        """Return an eval-style representation; no fields yet, so no args shown."""
        return "SparseW16A16Config()"

@classmethod
def get_storage_format_cls(cls) -> Type[CompressedStorageFormat]:
return SparseBEGemmStorageFormat
cuda_compute_capability = torch.cuda.get_device_capability()
if cuda_compute_capability >= (8, 0):
return SparseBEGemmStorageFormat
else:
# For NVIDIA SM < 8.0
logger.warning("Unstructured sparse kernels are not optimized for "
"NVIDIA SM < 8.0. Naive decompress kernels will be "
"used and can be slower than dense models")
return SparseBitmaskStorageFormat

@classmethod
def get_name(cls) -> str:
Expand All @@ -35,8 +43,7 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]:

@classmethod
def get_min_capability(cls) -> int:
# TODO: Update after checks on more GPUs
return 80
return 70

@classmethod
def get_config_filenames(cls) -> List[str]:
Expand Down

0 comments on commit 9a7781c

Please sign in to comment.