From 79c94a1275bfe2f1e4deae927f288425bb9eaa33 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Tue, 23 Apr 2024 22:05:46 +0000
Subject: [PATCH 1/5] fixed fp8 conflict with aqlm

---
 vllm/model_executor/layers/quantization/fp8.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 8df82e0e18edd..01e494c870e71 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -64,12 +64,13 @@ def create_weights(
         self,
         layer: torch.nn.Module,
         input_size_per_partition: int,
-        output_size_per_partition: int,
+        output_partition_sizes: List[int],
         input_size: int,
         output_size: int,
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
+        output_size_per_partition = sum(output_partition_sizes)
         weight = Parameter(torch.empty(output_size_per_partition,
                                        input_size_per_partition,
                                        dtype=params_dtype),

From f8b57e4320303b6001949fd57ba7ce12892466df Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Tue, 23 Apr 2024 22:10:05 +0000
Subject: [PATCH 2/5] added quantization tests to buildkite

---
 .buildkite/test-pipeline.yaml  | 3 +++
 tests/quantization/test_fp8.py | 6 +++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f7c1569696249..11cda053260ec 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -96,6 +96,9 @@ steps:
 - label: Metrics Test
   command: pytest -v -s metrics
 
+- label: Quantization Test
+  command: pytest -v -s quantization
+
 - label: Benchmarks
   working_dir: "/vllm-workspace/.buildkite"
   commands:

diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index fa10e60de10a7..d643ebd38bb5d 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -12,9 +12,9 @@
 capability = capability[0] * 10 + capability[1]
 
 
-@pytest.mark.skipif(
-    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
-    reason="FP8 is not supported on this GPU type.")
+# @pytest.mark.skipif(
+#     capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
+#     reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
     llm = vllm_runner("facebook/opt-125m", quantization="fp8")

From 7175e5b119e1cebdabe7202a5a0387a77ae80c72 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Tue, 23 Apr 2024 22:11:40 +0000
Subject: [PATCH 3/5] removed commented out piece

---
 tests/quantization/test_fp8.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index d643ebd38bb5d..fa10e60de10a7 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -12,9 +12,9 @@
 capability = capability[0] * 10 + capability[1]
 
 
-# @pytest.mark.skipif(
-#     capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
-#     reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
+    reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
     llm = vllm_runner("facebook/opt-125m", quantization="fp8")
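For reference, the gate that patches 2 and 3 toggle works by folding the GPU's compute capability (major, minor) into a single integer and skipping the test when it falls below the FP8 method's minimum. A minimal self-contained sketch of that pattern follows; it is not the vLLM test itself, and MIN_FP8_CAPABILITY = 89 is an assumed stand-in for QUANTIZATION_METHODS["fp8"].get_min_capability().

import pytest
import torch

# Fold (major, minor) into one integer, e.g. (8, 9) -> 89, as test_fp8.py does.
if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    capability = major * 10 + minor
else:
    capability = 0

# Assumed stand-in for QUANTIZATION_METHODS["fp8"].get_min_capability().
MIN_FP8_CAPABILITY = 89


@pytest.mark.skipif(capability < MIN_FP8_CAPABILITY,
                    reason="FP8 is not supported on this GPU type.")
def test_fp8_capability_gate() -> None:
    # Only runs on GPUs that meet the assumed minimum capability.
    assert capability >= MIN_FP8_CAPABILITY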
From 8d07b561fdf9cea139658caf9af31cbc3129615e Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Tue, 23 Apr 2024 22:43:12 +0000
Subject: [PATCH 4/5] added docstring

---
 vllm/model_executor/layers/linear.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index e56af9075e2fd..82fe74e24bf8e 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -27,16 +27,24 @@ def adjust_marlin_shard(param, shard_size, shard_offset):
 
 class LinearMethodBase(ABC):
     """Base class for different (maybe quantized) linear methods."""
 
-    @abstractmethod
     def create_weights(self, layer: torch.nn.Module,
                        input_size_per_partition: int,
                        output_partition_sizes: List[int], input_size: int,
                        output_size: int, params_dtype: torch.dtype,
                        **extra_weight_attrs):
-        """Create weights for a linear layer.
-
-        The weights will be set as attributes of the layer."""
+        """Create weights for a linear layer. The weights will be set as attributes of the layer.
+
+        Args:
+            layer: The layer that is using the LinearMethodBase factory.
+            input_size_per_partition: Size of the input dim of the weight on rank X.
+            output_partition_sizes: Sizes of the output dim of each logical weight
+                on rank X. E.g., output_partition_sizes for QKVParallelLinear is
+                a list contains the width of Wq, Wk, Wv on rank X.
+            input_size: Size of the input dim of the weight across all ranks.
+            output_size: Size of the output dim of the weight across all ranks.
+            params_dtype: Datatype of the parameters.
+        """
         raise NotImplementedError
 
     @abstractmethod

From e6fe6a587dcd883a6bd267cadedb01f0ddff9bb8 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Tue, 23 Apr 2024 22:46:37 +0000
Subject: [PATCH 5/5] fixed format.sh

---
 vllm/model_executor/layers/linear.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 82fe74e24bf8e..6ad7ae0f63197 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -27,20 +27,22 @@ def adjust_marlin_shard(param, shard_size, shard_offset):
 
 class LinearMethodBase(ABC):
     """Base class for different (maybe quantized) linear methods."""
 
+    @abstractmethod
     def create_weights(self, layer: torch.nn.Module,
                        input_size_per_partition: int,
                        output_partition_sizes: List[int], input_size: int,
                        output_size: int, params_dtype: torch.dtype,
                        **extra_weight_attrs):
-        """Create weights for a linear layer. The weights will be set as attributes of the layer.
+        """Create weights for a linear layer.
+           The weights will be set as attributes of the layer.
 
         Args:
             layer: The layer that is using the LinearMethodBase factory.
-            input_size_per_partition: Size of the input dim of the weight on rank X.
-            output_partition_sizes: Sizes of the output dim of each logical weight
-                on rank X. E.g., output_partition_sizes for QKVParallelLinear is
-                a list contains the width of Wq, Wk, Wv on rank X.
+            input_size_per_partition: Size of the weight input dim on rank X.
+            output_partition_sizes: Sizes of the output dim of each logical
+                weight on rank X. E.g., output_partition_sizes for QKVLinear
+                is a list contains the width of Wq, Wk, Wv on rank X.
             input_size: Size of the input dim of the weight across all ranks.
             output_size: Size of the output dim of the weight across all ranks.
             params_dtype: Datatype of the parameters.
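The contract documented in patches 4 and 5 is that an implementation builds its weights from the per-partition sizes and attaches them to the layer as attributes; summing output_partition_sizes (as the fp8.py change in patch 1 does) yields one contiguous weight covering all logical outputs. Below is a minimal standalone sketch of such a method under those assumptions; the class name SketchLinearMethod and the example sizes are illustrative only, and the class does not subclass the real vLLM LinearMethodBase.

from typing import List

import torch
from torch.nn import Parameter


class SketchLinearMethod:
    """Illustrative stand-in for a LinearMethodBase implementation."""

    def create_weights(self, layer: torch.nn.Module,
                       input_size_per_partition: int,
                       output_partition_sizes: List[int], input_size: int,
                       output_size: int, params_dtype: torch.dtype,
                       **extra_weight_attrs):
        # One contiguous weight covering every logical output (e.g. Wq, Wk, Wv).
        output_size_per_partition = sum(output_partition_sizes)
        weight = Parameter(torch.empty(output_size_per_partition,
                                       input_size_per_partition,
                                       dtype=params_dtype),
                           requires_grad=False)
        # Per the docstring, the weight is set as an attribute of the layer.
        layer.register_parameter("weight", weight)


# Example: a fused QKV projection with widths 1024/256/256 on this rank.
layer = torch.nn.Module()
SketchLinearMethod().create_weights(layer,
                                    input_size_per_partition=1024,
                                    output_partition_sizes=[1024, 256, 256],
                                    input_size=1024,
                                    output_size=1536,
                                    params_dtype=torch.float16)
assert layer.weight.shape == (1536, 1024)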