From 79c94a1275bfe2f1e4deae927f288425bb9eaa33 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Tue, 23 Apr 2024 22:05:46 +0000
Subject: [PATCH 1/5] fixed fp8 conflict with aqlm

---
 vllm/model_executor/layers/quantization/fp8.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 8df82e0e18edd..01e494c870e71 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -64,12 +64,13 @@ def create_weights(
         self,
         layer: torch.nn.Module,
         input_size_per_partition: int,
-        output_size_per_partition: int,
+        output_partition_sizes: List[int],
         input_size: int,
         output_size: int,
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
+        output_size_per_partition = sum(output_partition_sizes)
         weight = Parameter(torch.empty(output_size_per_partition,
                                        input_size_per_partition,
                                        dtype=params_dtype),

From f8b57e4320303b6001949fd57ba7ce12892466df Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Tue, 23 Apr 2024 22:10:05 +0000
Subject: [PATCH 2/5] added quantization tests to buildkite

---
 .buildkite/test-pipeline.yaml  | 3 +++
 tests/quantization/test_fp8.py | 6 +++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f7c1569696249..11cda053260ec 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -96,6 +96,9 @@ steps:
 - label: Metrics Test
   command: pytest -v -s metrics
 
+- label: Quantization Test
+  command: pytest -v -s quantization
+
 - label: Benchmarks
   working_dir: "/vllm-workspace/.buildkite"
   commands:

diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index fa10e60de10a7..d643ebd38bb5d 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -12,9 +12,9 @@
 capability = capability[0] * 10 + capability[1]
 
 
-@pytest.mark.skipif(
-    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
-    reason="FP8 is not supported on this GPU type.")
+# @pytest.mark.skipif(
+#     capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
+#     reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
     llm = vllm_runner("facebook/opt-125m", quantization="fp8")

From 7175e5b119e1cebdabe7202a5a0387a77ae80c72 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Tue, 23 Apr 2024 22:11:40 +0000
Subject: [PATCH 3/5] removed commented out piece

---
 tests/quantization/test_fp8.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index d643ebd38bb5d..fa10e60de10a7 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -12,9 +12,9 @@
 capability = capability[0] * 10 + capability[1]
 
 
-# @pytest.mark.skipif(
-#     capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
-#     reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
+    reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
     llm = vllm_runner("facebook/opt-125m", quantization="fp8")
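For reference, the gate that patches 2 and 3 toggle works by folding the GPU's compute capability (major, minor) into a single integer and skipping the test when it falls below the FP8 method's minimum. A minimal self-contained sketch of that pattern follows; it is not the vLLM test itself, and MIN_FP8_CAPABILITY = 89 is an assumed stand-in for QUANTIZATION_METHODS["fp8"].get_min_capability().

import pytest
import torch

# Fold (major, minor) into one integer, e.g. (8, 9) -> 89, as test_fp8.py does.
if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    capability = major * 10 + minor
else:
    capability = 0

# Assumed stand-in for QUANTIZATION_METHODS["fp8"].get_min_capability().
MIN_FP8_CAPABILITY = 89


@pytest.mark.skipif(capability < MIN_FP8_CAPABILITY,
                    reason="FP8 is not supported on this GPU type.")
def test_fp8_capability_gate() -> None:
    # Only runs on GPUs that meet the assumed minimum capability.
    assert capability >= MIN_FP8_CAPABILITY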
From 8d07b561fdf9cea139658caf9af31cbc3129615e Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Tue, 23 Apr 2024 22:43:12 +0000
Subject: [PATCH 4/5] added docstring

---
 vllm/model_executor/layers/linear.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index e56af9075e2fd..82fe74e24bf8e 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -27,16 +27,24 @@ def adjust_marlin_shard(param, shard_size, shard_offset):
 
 class LinearMethodBase(ABC):
     """Base class for different (maybe quantized) linear methods."""
 
-    @abstractmethod
     def create_weights(self, layer: torch.nn.Module,
                        input_size_per_partition: int,
                        output_partition_sizes: List[int], input_size: int,
                        output_size: int, params_dtype: torch.dtype,
                        **extra_weight_attrs):
-        """Create weights for a linear layer.
-
-        The weights will be set as attributes of the layer."""
+        """Create weights for a linear layer. The weights will be set as attributes of the layer.
+
+        Args:
+            layer: The layer that is using the LinearMethodBase factory.
+            input_size_per_partition: Size of the input dim of the weight on rank X.
+            output_partition_sizes: Sizes of the output dim of each logical weight
+                on rank X. E.g., output_partition_sizes for QKVParallelLinear is
+                a list contains the width of Wq, Wk, Wv on rank X.
+            input_size: Size of the input dim of the weight across all ranks.
+            output_size: Size of the output dim of the weight across all ranks.
+            params_dtype: Datatype of the parameters.
+        """
         raise NotImplementedError
 
     @abstractmethod

From e6fe6a587dcd883a6bd267cadedb01f0ddff9bb8 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Tue, 23 Apr 2024 22:46:37 +0000
Subject: [PATCH 5/5] fixed format.sh

---
 vllm/model_executor/layers/linear.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 82fe74e24bf8e..6ad7ae0f63197 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -27,20 +27,22 @@ def adjust_marlin_shard(param, shard_size, shard_offset):
 
 class LinearMethodBase(ABC):
     """Base class for different (maybe quantized) linear methods."""
 
+    @abstractmethod
     def create_weights(self, layer: torch.nn.Module,
                        input_size_per_partition: int,
                        output_partition_sizes: List[int], input_size: int,
                        output_size: int, params_dtype: torch.dtype,
                        **extra_weight_attrs):
-        """Create weights for a linear layer. The weights will be set as attributes of the layer.
+        """Create weights for a linear layer.
+           The weights will be set as attributes of the layer.
 
         Args:
             layer: The layer that is using the LinearMethodBase factory.
-            input_size_per_partition: Size of the input dim of the weight on rank X.
-            output_partition_sizes: Sizes of the output dim of each logical weight
-                on rank X. E.g., output_partition_sizes for QKVParallelLinear is
-                a list contains the width of Wq, Wk, Wv on rank X.
+            input_size_per_partition: Size of the weight input dim on rank X.
+            output_partition_sizes: Sizes of the output dim of each logical
+                weight on rank X. E.g., output_partition_sizes for QKVLinear
+                is a list contains the width of Wq, Wk, Wv on rank X.
             input_size: Size of the input dim of the weight across all ranks.
             output_size: Size of the output dim of the weight across all ranks.
             params_dtype: Datatype of the parameters.
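The contract documented in patches 4 and 5 is that an implementation builds its weights from the per-partition sizes and attaches them to the layer as attributes; summing output_partition_sizes (as the fp8.py change in patch 1 does) yields one contiguous weight covering all logical outputs. Below is a minimal standalone sketch of such a method under those assumptions; the class name SketchLinearMethod and the example sizes are illustrative only, and the class does not subclass the real vLLM LinearMethodBase.

from typing import List

import torch
from torch.nn import Parameter


class SketchLinearMethod:
    """Illustrative stand-in for a LinearMethodBase implementation."""

    def create_weights(self, layer: torch.nn.Module,
                       input_size_per_partition: int,
                       output_partition_sizes: List[int], input_size: int,
                       output_size: int, params_dtype: torch.dtype,
                       **extra_weight_attrs):
        # One contiguous weight covering every logical output (e.g. Wq, Wk, Wv).
        output_size_per_partition = sum(output_partition_sizes)
        weight = Parameter(torch.empty(output_size_per_partition,
                                       input_size_per_partition,
                                       dtype=params_dtype),
                           requires_grad=False)
        # Per the docstring, the weight is set as an attribute of the layer.
        layer.register_parameter("weight", weight)


# Example: a fused QKV projection with widths 1024/256/256 on this rank.
layer = torch.nn.Module()
SketchLinearMethod().create_weights(layer,
                                    input_size_per_partition=1024,
                                    output_partition_sizes=[1024, 256, 256],
                                    input_size=1024,
                                    output_size=1536,
                                    params_dtype=torch.float16)
assert layer.weight.shape == (1536, 1024)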