From 079cba53b7fc41d646b951feee845d59fdb363e4 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Mon, 26 Feb 2024 14:47:05 -0500 Subject: [PATCH 01/96] actual add kernel --- csrc/ops.h | 9 + csrc/quantization/aqlm/aqlm_cuda_kernel.cu | 213 ++++++++++++++++++ .../layers/quantization/aqlm.py | 211 +++++++++++++++++ 3 files changed, 433 insertions(+) create mode 100644 csrc/quantization/aqlm/aqlm_cuda_kernel.cu create mode 100644 vllm/model_executor/layers/quantization/aqlm.py diff --git a/csrc/ops.h b/csrc/ops.h index dbdd2c2c57945..ebd7b7a03a352 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -100,6 +100,15 @@ torch::Tensor gptq_gemm( torch::Tensor b_g_idx, bool use_exllama); +torch::Tensor aqlm_gemm( + torch::Tensor a, + torch::Tensor b_q_weight, + torch::Tensor b_gptq_qzeros, + torch::Tensor b_gptq_scales, + torch::Tensor b_g_idx, + bool use_exllama); + + void gptq_shuffle( torch::Tensor q_weight, torch::Tensor q_perm); diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu new file mode 100644 index 0000000000000..0f97e93d678e6 --- /dev/null +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu @@ -0,0 +1,213 @@ +#include +#include +#include +#include + +#include + +__global__ void Code1x16MatVec( + const int4* __restrict__ A, + const int4* __restrict__ B, + int4* __restrict__ C, + const int4* __restrict__ codebook, + int prob_m, + int prob_k +) { + int a_gl_stride = prob_k / 8 / 8; + int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; + int b_gl_rd = 0; + int c_gl_wr = a_gl_rd; + a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; + int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; + + __shared__ int4 sh_b[32 * 9]; + float res = 0; + + int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32); + while (iters--) { + // We pad shared memory to avoid bank conflicts during reads + __syncthreads(); + for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { + if (b_gl_rd + i < prob_k / 8) + sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; + } + __syncthreads(); + b_gl_rd += 32 * 8; + + int b_sh_rd = 9 * (threadIdx.x % 32); + if (pred && a_gl_rd < a_gl_end) { + const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); + #pragma unroll + for (int i = 0; i < 8; i++) { + uint32_t dec[4]; + // We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't + // actually help us; this brings > 2x speedup. 
+ asm volatile ( + "ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) + : "l"((void*) &codebook[enc[i]]) + ); + half2* a = reinterpret_cast(&dec); + half2* b = reinterpret_cast(&sh_b[b_sh_rd]); + half2 res2 = {}; + #pragma unroll + for (int j = 0; j < 4; j++) + res2 = __hfma2(a[j], b[j], res2); + res += __half2float(res2.x) + __half2float(res2.y); + b_sh_rd++; + } + a_gl_rd += 32; + } + } + + if (pred) { + #pragma unroll + for (int i = 16; i > 0; i /= 2) + res += __shfl_down_sync(0xffffffff, res, i); + if (threadIdx.x % 32 == 0) + reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); + } +} + +__global__ void Code2x8MatVec( + const int4* __restrict__ A, + const int4* __restrict__ B, + int4* __restrict__ C, + const int4* __restrict__ codebook, + int prob_m, + int prob_k +) { + int a_gl_stride = prob_k / 8 / 8; + int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; + int b_gl_rd = 0; + int c_gl_wr = a_gl_rd; + a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; + int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; + int lane = threadIdx.x % 8; + + extern __shared__ int4 sh[]; + int4* sh_b = sh; + int4* sh_code = sh_b + 32 * 9; + int4* sh_code0 = sh_code; + int4* sh_code1 = sh_code + 256 * 8; + + for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { + int4 dec = codebook[i]; + #pragma unroll + for (int j = 0; j < 8; j++) + sh_code[8 * i + (j + lane) % 8] = dec; + } + __syncthreads(); + + float res = 0; + + int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32); + while (iters--) { + // We pad shared memory to avoid bank conflicts during reads + __syncthreads(); + for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { + if (b_gl_rd + i < prob_k / 8) + sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; + } + __syncthreads(); + b_gl_rd += 32 * 8; + + int b_sh_rd = 9 * (threadIdx.x % 32); + if (pred && a_gl_rd < a_gl_end) { + const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); + #pragma unroll + for (int i = 0; i < 8; i++) { + half2* a0 = reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); + half2* a1 = reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); + half2* b = reinterpret_cast(&sh_b[b_sh_rd]); + half2 res2 = {}; + #pragma unroll + for (int j = 0; j < 4; j++) + res2 = __hfma2(__hadd2(a0[j], a1[j]), b[j], res2); + res += __half2float(res2.x) + __half2float(res2.y); + b_sh_rd++; + } + a_gl_rd += 32; + } + } + + if (pred) { + #pragma unroll + for (int i = 16; i > 0; i /= 2) + res += __shfl_down_sync(0xffffffff, res, i); + if (threadIdx.x % 32 == 0) + reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); + } +} + +inline int ceildiv(int a, int b) { + return (a + b - 1) / b; +} + +const int THREAD_M = 16; + +void code1x16_matvec_cuda( + const void* __restrict__ A, + const void* __restrict__ B, + void* __restrict__ C, + const void* __restrict__ codebook, + int prob_m, + int prob_k +) { + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); + int waves = 0; + int thread_m; + do { + waves++; + thread_m = ceildiv(prob_m, waves * sms); + } while (thread_m > THREAD_M); + + int blocks = ceildiv(prob_m, thread_m); + int threads = 32 * thread_m; + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + Code1x16MatVec<<>>( + (const int4*) A, + (const int4*) B, + (int4*) C, + (const int4*) codebook, + prob_m, + prob_k + ); +} + +void code2x8_matvec_cuda( + const void* __restrict__ A, + const void* __restrict__ B, + void* __restrict__ C, + const void* 
__restrict__ codebook, + int prob_m, + int prob_k +) { + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); + int waves = 0; + int thread_m; + do { + waves++; + thread_m = ceildiv(prob_m, waves * sms); + } while (thread_m > THREAD_M); + + int blocks = ceildiv(prob_m, thread_m); + int threads = 32 * thread_m; + int shared = 16 * (2 * 256 * 8 + 32 * 9); + cudaFuncSetAttribute( + Code2x8MatVec, cudaFuncAttributeMaxDynamicSharedMemorySize, shared + ); + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + Code2x8MatVec<<>>( + (const int4*) A, + (const int4*) B, + (int4*) C, + (const int4*) codebook, + prob_m, + prob_k + ); +} diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py new file mode 100644 index 0000000000000..7218760fbe55d --- /dev/null +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -0,0 +1,211 @@ +import enum +from enum import Enum +from typing import Any, Dict, List, Optional + +import torch +from torch.nn.parameter import Parameter + +from vllm._C import ops +from vllm.model_executor.layers.linear import (LinearMethodBase, + set_weight_attrs) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + + +class GPTQConfig(QuantizationConfig): + """Config class for GPTQ. + + Reference: https://arxiv.org/abs/2210.17323 + """ + + def __init__( + self, + weight_bits: int, + group_size: int, + desc_act: bool, + ) -> None: + self.weight_bits = weight_bits + self.group_size = group_size + self.desc_act = desc_act + self.pack_factor = 32 // self.weight_bits + # exllama kernel v1 only supports 4 bit + if self.weight_bits != 4: + raise ValueError( + "Currently, only 4-bit weight quantization is supported for " + f"GPTQ, but got {self.weight_bits} bits.") + + def __repr__(self) -> str: + return (f"GPTQConfig(weight_bits={self.weight_bits}, " + f"group_size={self.group_size}, " + f"desc_act={self.desc_act})") + + @classmethod + def get_name(cls) -> str: + return "gptq" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.half] + + @classmethod + # Need to figure it out + def get_min_capability(cls) -> int: + return 60 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "GPTQConfig": + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"]) + desc_act = cls.get_from_keys(config, ["desc_act"]) + return cls(weight_bits, group_size, desc_act) + + def get_linear_method(self) -> "GPTQLinearMethod": + return GPTQLinearMethod(self) + + def get_scaled_act_names(self) -> List[str]: + return [] + + +class ExllamaState(Enum): + + UNUSED = enum.auto() + UNINITIALIZED = enum.auto() + READY = enum.auto() + + +class GPTQLinearMethod(LinearMethodBase): + """Linear method for GPTQ. + + Args: + quant_config: The GPTQ quantization config. + """ + + def __init__(self, quant_config: GPTQConfig): + self.quant_config = quant_config + + def create_weights( + self, + input_size_per_partition: int, + output_size_per_partition: int, + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: + del output_size # Unused. + if input_size_per_partition % self.quant_config.group_size != 0: + raise ValueError( + "The input size is not aligned with the quantized " + "weight shape. 
This can be caused by too large " + "tensor parallel size.") + if output_size_per_partition % self.quant_config.pack_factor != 0: + raise ValueError( + "The output size is not aligned with the quantized " + "weight shape. This can be caused by too large " + "tensor parallel size.") + + if self.quant_config.group_size != -1: + group_size = self.quant_config.group_size + else: + group_size = input_size + exllama_state = ExllamaState.UNINITIALIZED + scale_and_zero_size = input_size // group_size + scale_and_zero_input_dim = None + if input_size != input_size_per_partition and self.quant_config.group_size != -1: + # For act-order models, we cannot use Exllama for row parallel layer + if self.quant_config.desc_act: + exllama_state = ExllamaState.UNUSED + else: + # we need to partition qzeros and scales for exllama kernel + scale_and_zero_size = input_size_per_partition // group_size + scale_and_zero_input_dim = 0 + + qweight = Parameter( + torch.empty( + input_size_per_partition // self.quant_config.pack_factor, + output_size_per_partition, + dtype=torch.int32, + ), + requires_grad=False, + ) + set_weight_attrs( + qweight, { + "input_dim": 0, + "output_dim": 1, + "packed_dim": 0, + "pack_factor": self.quant_config.pack_factor, + }) + g_idx = Parameter( + torch.tensor( + [ + i // self.quant_config.group_size + for i in range(input_size_per_partition) + ], + dtype=torch.int32, + ), + requires_grad=False, + ) + # Ignore warning from fused linear layers such as QKVParallelLinear. + set_weight_attrs(g_idx, {"input_dim": 0, "ignore_warning": True}) + qzeros = Parameter( + torch.empty( + scale_and_zero_size, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + requires_grad=False, + ) + set_weight_attrs( + qzeros, { + "input_dim": scale_and_zero_input_dim, + "output_dim": 1, + "packed_dim": 1, + "pack_factor": self.quant_config.pack_factor, + }) + scales = Parameter( + torch.empty( + scale_and_zero_size, + output_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + set_weight_attrs(scales, { + "input_dim": scale_and_zero_input_dim, + "output_dim": 1, + }) + return { + "qweight": qweight, + "g_idx": g_idx, + "qzeros": qzeros, + "scales": scales, + "exllama_state": exllama_state, + } + + def apply_weights(self, + weights: Dict[str, Any], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + qweight = weights["qweight"] + out_shape = x.shape[:-1] + (qweight.shape[-1], ) + reshaped_x = x.reshape(-1, x.shape[-1]) + # exllama needs to shuffle the weight after the weight is loaded + # here we do the shuffle on first forward pass + if weights["exllama_state"] == ExllamaState.UNINITIALIZED: + if self.quant_config.desc_act: + weights["g_idx"] = torch.argsort(weights["g_idx"]).to( + torch.int) + else: + weights["g_idx"] = torch.empty((1, 1), device="meta") + weights["exllama_state"] = ExllamaState.READY + ops.gptq_shuffle(weights["qweight"], weights["g_idx"]) + output = ops.gptq_gemm(reshaped_x, weights["qweight"], + weights["qzeros"], weights["scales"], + weights["g_idx"], + weights["exllama_state"] == ExllamaState.READY) + if bias is not None: + output = output + bias + return output.reshape(out_shape) From 23c3f7727be8f73c6cb42e6b168942e6d4118bd5 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Mon, 26 Feb 2024 16:20:06 -0500 Subject: [PATCH 02/96] getting serious --- csrc/ops.h | 17 +- vllm/config.py | 181 +++++++++++------- .../layers/quantization/__init__.py | 2 + .../layers/quantization/aqlm.py | 179 +++++++++-------- 
vllm/model_executor/weight_utils.py | 67 +++---- 5 files changed, 252 insertions(+), 194 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index ebd7b7a03a352..351c4cade7a09 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -69,6 +69,14 @@ void gelu_fast( torch::Tensor& out, torch::Tensor& input); +torch::Tensor aqlm_gemm( + torch::Tensor a, + torch::Tensor b_q_weight, + torch::Tensor b_gptq_qzeros, + torch::Tensor b_gptq_scales, + torch::Tensor b_g_idx, + bool use_exllama); + #ifndef USE_ROCM torch::Tensor awq_gemm( torch::Tensor _in_feats, @@ -100,15 +108,6 @@ torch::Tensor gptq_gemm( torch::Tensor b_g_idx, bool use_exllama); -torch::Tensor aqlm_gemm( - torch::Tensor a, - torch::Tensor b_q_weight, - torch::Tensor b_gptq_qzeros, - torch::Tensor b_gptq_scales, - torch::Tensor b_g_idx, - bool use_exllama); - - void gptq_shuffle( torch::Tensor q_weight, torch::Tensor q_perm); diff --git a/vllm/config.py b/vllm/config.py index bd0dc89b585f7..f2452baf8796c 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,4 +1,4 @@ -from typing import Optional, Union, ClassVar +from typing import Any, Optional, Union, ClassVar from dataclasses import dataclass import os from packaging.version import Version @@ -45,7 +45,7 @@ class ModelConfig: a tag name, or a commit id. If unspecified, will use the default version. code_revision: The specific revision to use for the model code on - Hugging Face Hub. It can be a branch name, a tag name, or a + Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. tokenizer_revision: The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use @@ -98,52 +98,55 @@ def __init__( # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C + if not os.path.exists(model): - model_path = snapshot_download(model_id=model, - cache_dir=download_dir, - revision=revision) + model_path = snapshot_download( + model_id=model, cache_dir=download_dir, revision=revision + ) else: model_path = model self.model = model_path self.download_dir = model_path self.tokenizer = model_path - self.hf_config = get_config(self.model, trust_remote_code, revision, - code_revision) + self.hf_config = get_config( + self.model, trust_remote_code, revision, code_revision + ) self.dtype = _get_and_verify_dtype(self.hf_config, dtype) - self.max_model_len = _get_and_verify_max_len(self.hf_config, - max_model_len) + self.max_model_len = _get_and_verify_max_len(self.hf_config, max_model_len) self._verify_load_format() self._verify_tokenizer_mode() - self._verify_quantization() + self.hf_quant_config = self._get_and_verify_quantization() self._verify_cuda_graph() def _verify_load_format(self) -> None: load_format = self.load_format.lower() - supported_load_format = [ - "auto", "pt", "safetensors", "npcache", "dummy" - ] + supported_load_format = ["auto", "pt", "safetensors", "npcache", "dummy"] rocm_not_supported_load_format = [] if load_format not in supported_load_format: raise ValueError( f"Unknown load format: {self.load_format}. Must be one of " - "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'.") + "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'." 
+ ) if is_hip() and load_format in rocm_not_supported_load_format: rocm_supported_load_format = [ - f for f in supported_load_format + f + for f in supported_load_format if (f not in rocm_not_supported_load_format) ] raise ValueError( - f"load format \'{load_format}\' is not supported in ROCm. " + f"load format '{load_format}' is not supported in ROCm. " f"Supported load format are " - f"{rocm_supported_load_format}") + f"{rocm_supported_load_format}" + ) # TODO: Remove this check once HF updates the pt weights of Mixtral. architectures = getattr(self.hf_config, "architectures", []) if "MixtralForCausalLM" in architectures and load_format == "pt": raise ValueError( "Currently, the 'pt' format is not supported for Mixtral. " - "Please use the 'safetensors' format instead. ") + "Please use the 'safetensors' format instead. " + ) self.load_format = load_format def _verify_tokenizer_mode(self) -> None: @@ -151,47 +154,63 @@ def _verify_tokenizer_mode(self) -> None: if tokenizer_mode not in ["auto", "slow"]: raise ValueError( f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be " - "either 'auto' or 'slow'.") + "either 'auto' or 'slow'." + ) self.tokenizer_mode = tokenizer_mode - def _verify_quantization(self) -> None: - supported_quantization = ["awq", "gptq", "squeezellm"] + def _get_and_verify_quantization(self) -> Any | None: + supported_quantization = ["aqlm", "awq", "gptq", "squeezellm"] rocm_not_supported_quantization = ["awq"] if self.quantization is not None: self.quantization = self.quantization.lower() # Parse quantization method from the HF model config, if available. + hf_quant_method = None hf_quant_config = getattr(self.hf_config, "quantization_config", None) if hf_quant_config is not None: hf_quant_method = str(hf_quant_config["quant_method"]).lower() - if self.quantization is None: - self.quantization = hf_quant_method - elif self.quantization != hf_quant_method: - raise ValueError( - "Quantization method specified in the model config " - f"({hf_quant_method}) does not match the quantization " - f"method specified in the `quantization` argument " - f"({self.quantization}).") + else: + # HF models such as https://huggingface.co/BlackSamorez/Llama-2-70b-AQLM-4Bit-2x16-hf/blob/main/config.json + # only have an aqlm block, no quantization_config block. + hf_quant_config = getattr(self.hf_config, "aqlm", None) + if hf_quant_config is not None: + hf_quant_method = "aqlm" + + if hf_quant_method is not None and self.quantization is None: + self.quantization = hf_quant_method + elif self.quantization != hf_quant_method: + raise ValueError( + "Quantization method specified in the model config " + f"({hf_quant_method}) does not match the quantization " + f"method specified in the `quantization` argument " + f"({self.quantization})." + ) if self.quantization is not None: if self.quantization not in supported_quantization: raise ValueError( f"Unknown quantization method: {self.quantization}. Must " - f"be one of {supported_quantization}.") - if is_hip( - ) and self.quantization in rocm_not_supported_quantization: + f"be one of {supported_quantization}." + ) + if is_hip() and self.quantization in rocm_not_supported_quantization: raise ValueError( f"{self.quantization} quantization is currently not supported " - f"in ROCm.") - logger.warning(f"{self.quantization} quantization is not fully " - "optimized yet. The speed can be slower than " - "non-quantized models.") + f"in ROCm." + ) + logger.warning( + f"{self.quantization} quantization is not fully " + "optimized yet. 
The speed can be slower than " + "non-quantized models." + ) + + return hf_quant_config def _verify_cuda_graph(self) -> None: if self.max_context_len_to_capture is None: self.max_context_len_to_capture = self.max_model_len - self.max_context_len_to_capture = min(self.max_context_len_to_capture, - self.max_model_len) + self.max_context_len_to_capture = min( + self.max_context_len_to_capture, self.max_model_len + ) def verify_with_parallel_config( self, @@ -203,7 +222,8 @@ def verify_with_parallel_config( raise ValueError( f"Total number of attention heads ({total_num_attention_heads})" " must be divisible by tensor parallel size " - f"({tensor_parallel_size}).") + f"({tensor_parallel_size})." + ) total_num_hidden_layers = self.hf_config.num_hidden_layers pipeline_parallel_size = parallel_config.pipeline_parallel_size @@ -211,7 +231,8 @@ def verify_with_parallel_config( raise ValueError( f"Total number of hidden layers ({total_num_hidden_layers}) " "must be divisible by pipeline parallel size " - f"({pipeline_parallel_size}).") + f"({pipeline_parallel_size})." + ) def get_sliding_window(self) -> Optional[int]: return getattr(self.hf_config, "sliding_window", None) @@ -237,9 +258,11 @@ def get_total_num_kv_heads(self) -> int: falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] new_decoder_arch_falcon = ( self.hf_config.model_type in falcon_model_types - and getattr(self.hf_config, "new_decoder_architecture", False)) - if not new_decoder_arch_falcon and getattr(self.hf_config, - "multi_query", False): + and getattr(self.hf_config, "new_decoder_architecture", False) + ) + if not new_decoder_arch_falcon and getattr( + self.hf_config, "multi_query", False + ): # Multi-query attention, only one KV head. # Currently, tensor parallelism is not supported in this case. return 1 @@ -269,8 +292,7 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: # the tensor parallel size. We will replicate the KV heads in the # case where the number of KV heads is smaller than the tensor # parallel size so each GPU has at least one KV head. - return max(1, - total_num_kv_heads // parallel_config.tensor_parallel_size) + return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size) def get_num_layers(self, parallel_config: "ParallelConfig") -> int: total_num_hidden_layers = self.hf_config.num_hidden_layers @@ -312,7 +334,8 @@ def _verify_args(self) -> None: if self.gpu_memory_utilization > 1.0: raise ValueError( "GPU memory utilization must be less than 1.0. Got " - f"{self.gpu_memory_utilization}.") + f"{self.gpu_memory_utilization}." + ) def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": @@ -326,13 +349,15 @@ def _verify_cache_dtype(self) -> None: device_name = torch.cuda.get_device_name() if "AMD" in device_name: raise NotImplementedError( - "FP8_E5M2 KV Cache on AMD GPU has not been supported yet.") + "FP8_E5M2 KV Cache on AMD GPU has not been supported yet." + ) logger.info( "Using fp8_e5m2 data type to store kv cache. It reduces " "the GPU memory footprint and boosts the performance. " "But it may cause slight accuracy drop. " "Currently we only support fp8 without scaling factors and " - "make e5m2 as a default format.") + "make e5m2 as a default format." 
+ ) else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") @@ -346,9 +371,11 @@ def verify_with_parallel_config( num_gpus_per_node = parallel_config.tensor_parallel_size cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node - msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of " - f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is " - "allocated for the swap space.") + msg = ( + f"{cpu_memory_usage / _GB:.2f} GiB out of " + f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is " + "allocated for the swap space." + ) if cpu_memory_usage > 0.7 * total_cpu_memory: raise ValueError("Too large swap space. " + msg) elif cpu_memory_usage > 0.4 * total_cpu_memory: @@ -392,19 +419,20 @@ def __init__( def _verify_args(self) -> None: if self.pipeline_parallel_size > 1: - raise NotImplementedError( - "Pipeline parallelism is not supported yet.") + raise NotImplementedError("Pipeline parallelism is not supported yet.") if not self.disable_custom_all_reduce and self.world_size > 1: if is_hip(): self.disable_custom_all_reduce = True logger.info( "Disabled the custom all-reduce kernel because it is not " - "supported on AMD GPUs.") + "supported on AMD GPUs." + ) elif self.pipeline_parallel_size > 1: self.disable_custom_all_reduce = True logger.info( "Disabled the custom all-reduce kernel because it is not " - "supported with pipeline parallelism.") + "supported with pipeline parallelism." + ) # FIXME(woosuk): Fix the stability issues and re-enable the custom # all-reduce kernel. @@ -413,7 +441,8 @@ def _verify_args(self) -> None: logger.info( "Custom all-reduce kernels are temporarily disabled due to " "stability issues. We will re-enable them once the issues are " - "resolved.") + "resolved." + ) class SchedulerConfig: @@ -455,16 +484,17 @@ def _verify_args(self) -> None: "This effectively limits the maximum sequence length to " "max_num_batched_tokens and makes vLLM reject longer " "sequences. Please increase max_num_batched_tokens or " - "decrease max_model_len.") + "decrease max_model_len." + ) if self.max_num_batched_tokens < self.max_num_seqs: raise ValueError( f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " "be greater than or equal to max_num_seqs " - f"({self.max_num_seqs}).") + f"({self.max_num_seqs})." + ) class DeviceConfig: - def __init__(self, device: str = "cuda") -> None: self.device = torch.device(device) @@ -486,11 +516,13 @@ def __post_init__(self): if self.max_lora_rank not in possible_max_ranks: raise ValueError( f"max_lora_rank ({self.max_lora_rank}) must be one of " - f"{possible_max_ranks}.") + f"{possible_max_ranks}." + ) if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size: raise ValueError( f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) " - f"must be one of {possible_lora_extra_vocab_size}.") + f"must be one of {possible_lora_extra_vocab_size}." 
+ ) if self.max_loras < 1: raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.") if self.max_cpu_loras is None: @@ -498,7 +530,8 @@ def __post_init__(self): elif self.max_cpu_loras < self.max_loras: raise ValueError( f"max_cpu_loras ({self.max_cpu_loras}) must be >= " - f"max_loras ({self.max_loras})") + f"max_loras ({self.max_loras})" + ) def verify_with_model_config(self, model_config: ModelConfig): if self.lora_dtype in (None, "auto"): @@ -506,15 +539,15 @@ def verify_with_model_config(self, model_config: ModelConfig): elif isinstance(self.lora_dtype, str): self.lora_dtype = getattr(torch, self.lora_dtype) if model_config.quantization is not None: - raise ValueError( - "LoRA is not supported with quantized models yet.") + raise ValueError("LoRA is not supported with quantized models yet.") def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): if scheduler_config.max_num_batched_tokens > 65528: raise ValueError( "Due to limitations of the custom LoRA CUDA kernel, " "max_num_batched_tokens must be <= 65528 when " - "LoRA is enabled.") + "LoRA is enabled." + ) _STR_DTYPE_TO_TORCH_DTYPE = { @@ -558,11 +591,14 @@ def _get_and_verify_dtype( if is_hip() and torch_dtype == torch.float32: rocm_supported_dtypes = [ - k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() + k + for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() if (k not in _ROCM_NOT_SUPPORTED_DTYPE) ] - raise ValueError(f"dtype \'{dtype}\' is not supported in ROCm. " - f"Supported dtypes are {rocm_supported_dtypes}") + raise ValueError( + f"dtype '{dtype}' is not supported in ROCm. " + f"Supported dtypes are {rocm_supported_dtypes}" + ) # Verify the dtype. if torch_dtype != config_dtype: @@ -613,7 +649,8 @@ def _get_and_verify_max_len( "The model's config.json does not contain any of the following " "keys to determine the original maximum length of the model: " f"{possible_keys}. Assuming the model's maximum length is " - f"{default_max_len}.") + f"{default_max_len}." + ) derived_max_model_len = default_max_len rope_scaling = getattr(hf_config, "rope_scaling", None) @@ -621,8 +658,7 @@ def _get_and_verify_max_len( assert "factor" in rope_scaling scaling_factor = rope_scaling["factor"] if rope_scaling["type"] == "yarn": - derived_max_model_len = rope_scaling[ - "original_max_position_embeddings"] + derived_max_model_len = rope_scaling["original_max_position_embeddings"] derived_max_model_len *= scaling_factor if max_model_len is None: @@ -633,5 +669,6 @@ def _get_and_verify_max_len( f"the derived max_model_len ({max_len_key}={derived_max_model_len}" " in model's config.json). This may lead to incorrect model " "outputs or CUDA errors. Make sure the value is correct and " - "within the model context size.") + "within the model context size." 
+ ) return int(max_model_len) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index b3449eaff0e35..98d9351785a36 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -1,11 +1,13 @@ from typing import Type from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.aqlm import AQLMConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig _QUANTIZATION_CONFIG_REGISTRY = { + "aqlm": AQLMConfig, "awq": AWQConfig, "gptq": GPTQConfig, "squeezellm": SqueezeLLMConfig, diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 7218760fbe55d..2a0b0c794c43c 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -6,42 +6,46 @@ from torch.nn.parameter import Parameter from vllm._C import ops -from vllm.model_executor.layers.linear import (LinearMethodBase, - set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) +from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig -class GPTQConfig(QuantizationConfig): - """Config class for GPTQ. +class AQLMConfig(QuantizationConfig): + """Config class for AQLM. - Reference: https://arxiv.org/abs/2210.17323 + Reference: https://github.com/Vahe1994/AQLM """ def __init__( self, - weight_bits: int, - group_size: int, - desc_act: bool, + in_group_size: int, + nbits_per_codebook: int, + num_codebooks: int, + out_group_size: int, ) -> None: - self.weight_bits = weight_bits - self.group_size = group_size - self.desc_act = desc_act - self.pack_factor = 32 // self.weight_bits + self.in_group_size = in_group_size + self.nbits_per_codebook = nbits_per_codebook + self.num_codebooks = num_codebooks + self.out_group_size = out_group_size + # self.pack_factor = 32 // self.weight_bits # exllama kernel v1 only supports 4 bit - if self.weight_bits != 4: - raise ValueError( - "Currently, only 4-bit weight quantization is supported for " - f"GPTQ, but got {self.weight_bits} bits.") + # if self.weight_bits != 4: + # raise ValueError( + # "Currently, only 4-bit weight quantization is supported for " + # f"GPTQ, but got {self.weight_bits} bits." + # ) def __repr__(self) -> str: - return (f"GPTQConfig(weight_bits={self.weight_bits}, " - f"group_size={self.group_size}, " - f"desc_act={self.desc_act})") + return ( + f"AQLMConfig(in_group_size={self.in_group_size}, " + f"nbits_per_codebook={self.nbits_per_codebook}, " + f"num_codebooks={self.num_codebooks}, " + f"out_group_size={self.out_group_size})" + ) @classmethod def get_name(cls) -> str: - return "gptq" + return "aqlm" @classmethod def get_supported_act_dtypes(cls) -> List[torch.dtype]: @@ -52,39 +56,58 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]: def get_min_capability(cls) -> int: return 60 + # such as. 
(This one looks correct) + # https://huggingface.co/BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf/blob/main/config.json + # + # "quantization_config": { + # "in_group_size": 8, + # "nbits_per_codebook": 16, + # "num_codebooks": 1, + # "out_group_size": 1, + + # "linear_weights_not_to_quantize": [ <--- hmmm ???? + # "model.embed_tokens.weight", + # "lm_head.weight" + + # "quant_method": "aqlm" duh <- shows it's aqlm. Do we auto-detect? How? + # }, + + # this one looks non-standard, has no quantization_config, just an AQLM block. + # https://huggingface.co/BlackSamorez/Llama-2-70b-AQLM-4Bit-2x16-hf/blob/main/config.json + # "aqlm": { + # "in_group_size": 8, + # "nbits_per_codebook": 16, + # "num_codebooks": 2, + # " "out_group_size": 1 + @classmethod def get_config_filenames(cls) -> List[str]: - return ["quantize_config.json"] + return [] # no extra configs. @classmethod - def from_config(cls, config: Dict[str, Any]) -> "GPTQConfig": - weight_bits = cls.get_from_keys(config, ["bits"]) - group_size = cls.get_from_keys(config, ["group_size"]) - desc_act = cls.get_from_keys(config, ["desc_act"]) - return cls(weight_bits, group_size, desc_act) + def from_config(cls, config: Dict[str, Any]) -> "AQLMConfig": + in_group_size = cls.get_from_keys(config, ["in_group_size"]) + nbits_per_codebook = cls.get_from_keys(config, ["nbits_per_codebook"]) + num_code_books = cls.get_from_keys(config, ["num_codebooks"]) + out_group_size = cls.get_from_keys(config, ["out_group_size"]) + # TODO linear_weights_not_to_quantize ? + return cls(in_group_size, nbits_per_codebook, num_code_books, out_group_size) - def get_linear_method(self) -> "GPTQLinearMethod": - return GPTQLinearMethod(self) + def get_linear_method(self) -> "AQLMLinearMethod": + return AQLMLinearMethod(self) def get_scaled_act_names(self) -> List[str]: return [] -class ExllamaState(Enum): - - UNUSED = enum.auto() - UNINITIALIZED = enum.auto() - READY = enum.auto() - - -class GPTQLinearMethod(LinearMethodBase): - """Linear method for GPTQ. +class AQLMLinearMethod(LinearMethodBase): + """Linear method for AQLM. Args: - quant_config: The GPTQ quantization config. + quant_config: The AQLM quantization config. """ - def __init__(self, quant_config: GPTQConfig): + def __init__(self, quant_config: AQLMConfig): self.quant_config = quant_config def create_weights( @@ -100,28 +123,21 @@ def create_weights( raise ValueError( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " - "tensor parallel size.") + "tensor parallel size." + ) if output_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( "The output size is not aligned with the quantized " "weight shape. This can be caused by too large " - "tensor parallel size.") + "tensor parallel size." 
+ ) if self.quant_config.group_size != -1: group_size = self.quant_config.group_size else: group_size = input_size - exllama_state = ExllamaState.UNINITIALIZED scale_and_zero_size = input_size // group_size scale_and_zero_input_dim = None - if input_size != input_size_per_partition and self.quant_config.group_size != -1: - # For act-order models, we cannot use Exllama for row parallel layer - if self.quant_config.desc_act: - exllama_state = ExllamaState.UNUSED - else: - # we need to partition qzeros and scales for exllama kernel - scale_and_zero_size = input_size_per_partition // group_size - scale_and_zero_input_dim = 0 qweight = Parameter( torch.empty( @@ -132,12 +148,14 @@ def create_weights( requires_grad=False, ) set_weight_attrs( - qweight, { + qweight, + { "input_dim": 0, "output_dim": 1, "packed_dim": 0, "pack_factor": self.quant_config.pack_factor, - }) + }, + ) g_idx = Parameter( torch.tensor( [ @@ -159,12 +177,14 @@ def create_weights( requires_grad=False, ) set_weight_attrs( - qzeros, { + qzeros, + { "input_dim": scale_and_zero_input_dim, "output_dim": 1, "packed_dim": 1, "pack_factor": self.quant_config.pack_factor, - }) + }, + ) scales = Parameter( torch.empty( scale_and_zero_size, @@ -173,39 +193,36 @@ def create_weights( ), requires_grad=False, ) - set_weight_attrs(scales, { - "input_dim": scale_and_zero_input_dim, - "output_dim": 1, - }) + set_weight_attrs( + scales, + { + "input_dim": scale_and_zero_input_dim, + "output_dim": 1, + }, + ) return { "qweight": qweight, "g_idx": g_idx, "qzeros": qzeros, - "scales": scales, - "exllama_state": exllama_state, + "scales": scales } - def apply_weights(self, - weights: Dict[str, Any], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + def apply_weights( + self, + weights: Dict[str, Any], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: qweight = weights["qweight"] - out_shape = x.shape[:-1] + (qweight.shape[-1], ) + out_shape = x.shape[:-1] + (qweight.shape[-1],) reshaped_x = x.reshape(-1, x.shape[-1]) - # exllama needs to shuffle the weight after the weight is loaded - # here we do the shuffle on first forward pass - if weights["exllama_state"] == ExllamaState.UNINITIALIZED: - if self.quant_config.desc_act: - weights["g_idx"] = torch.argsort(weights["g_idx"]).to( - torch.int) - else: - weights["g_idx"] = torch.empty((1, 1), device="meta") - weights["exllama_state"] = ExllamaState.READY - ops.gptq_shuffle(weights["qweight"], weights["g_idx"]) - output = ops.gptq_gemm(reshaped_x, weights["qweight"], - weights["qzeros"], weights["scales"], - weights["g_idx"], - weights["exllama_state"] == ExllamaState.READY) + output = ops.aqlm_gemm( + reshaped_x, + weights["qweight"], + weights["qzeros"], + weights["scales"], + weights["g_idx"], + ) if bias is not None: output = output + bias return output.reshape(out_shape) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 3570366887e78..37c9725033d49 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -15,14 +15,15 @@ from vllm.config import ModelConfig from vllm.logger import init_logger -from vllm.model_executor.layers.quantization import (get_quantization_config, - QuantizationConfig) +from vllm.model_executor.layers.quantization import ( + get_quantization_config, + QuantizationConfig, +) logger = init_logger(__name__) class Disabledtqdm(tqdm): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs, disable=True) @@ -68,10 +69,12 @@ def 
convert_bin_to_safetensor_file( sf_size = os.stat(sf_filename).st_size pt_size = os.stat(pt_filename).st_size if (sf_size - pt_size) / pt_size > 0.01: - raise RuntimeError(f"""The file size different is more than 1%: + raise RuntimeError( + f"""The file size different is more than 1%: - {sf_filename}: {sf_size} - {pt_filename}: {pt_size} - """) + """ + ) # check if the tensors are the same reloaded = load_file(sf_filename) @@ -85,36 +88,36 @@ def convert_bin_to_safetensor_file( # TODO(woosuk): Move this to other place. def get_quant_config(model_config: ModelConfig) -> QuantizationConfig: quant_cls = get_quantization_config(model_config.quantization) - # Read the quantization config from the HF model config, if available. - hf_quant_config = getattr(model_config.hf_config, "quantization_config", - None) - if hf_quant_config is not None: - return quant_cls.from_config(hf_quant_config) + if model_config.hf_quant_config is not None: + return quant_cls.from_config(model_config.hf_quant_config) model_name_or_path = model_config.model is_local = os.path.isdir(model_name_or_path) if not is_local: # Download the config files. with get_lock(model_name_or_path, model_config.download_dir): - hf_folder = snapshot_download(model_name_or_path, - revision=model_config.revision, - allow_patterns="*.json", - cache_dir=model_config.download_dir, - tqdm_class=Disabledtqdm) + hf_folder = snapshot_download( + model_name_or_path, + revision=model_config.revision, + allow_patterns="*.json", + cache_dir=model_config.download_dir, + tqdm_class=Disabledtqdm, + ) else: hf_folder = model_name_or_path config_files = glob.glob(os.path.join(hf_folder, "*.json")) quant_config_files = [ - f for f in config_files if any( - f.endswith(x) for x in quant_cls.get_config_filenames()) + f + for f in config_files + if any(f.endswith(x) for x in quant_cls.get_config_filenames()) ] if len(quant_config_files) == 0: - raise ValueError( - f"Cannot find the config file for {model_config.quantization}") + raise ValueError(f"Cannot find the config file for {model_config.quantization}") if len(quant_config_files) > 1: raise ValueError( f"Found multiple config files for {model_config.quantization}: " - f"{quant_config_files}") + f"{quant_config_files}" + ) quant_config_file = quant_config_files[0] with open(quant_config_file, "r") as f: @@ -164,11 +167,13 @@ def prepare_hf_model_weights( # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. 
with get_lock(model_name_or_path, cache_dir): - hf_folder = snapshot_download(model_name_or_path, - allow_patterns=allow_patterns, - cache_dir=cache_dir, - tqdm_class=Disabledtqdm, - revision=revision) + hf_folder = snapshot_download( + model_name_or_path, + allow_patterns=allow_patterns, + cache_dir=cache_dir, + tqdm_class=Disabledtqdm, + revision=revision, + ) else: hf_folder = model_name_or_path hf_weights_files: List[str] = [] @@ -189,13 +194,11 @@ def prepare_hf_model_weights( "scaler.pt", ] hf_weights_files = [ - f for f in hf_weights_files - if not any(f.endswith(x) for x in blacklist) + f for f in hf_weights_files if not any(f.endswith(x) for x in blacklist) ] if len(hf_weights_files) == 0: - raise RuntimeError( - f"Cannot find any model weights with `{model_name_or_path}`") + raise RuntimeError(f"Cannot find any model weights with `{model_name_or_path}`") return hf_folder, hf_weights_files, use_safetensors @@ -212,7 +215,8 @@ def hf_model_weights_iterator( cache_dir=cache_dir, load_format=load_format, fall_back_to_pt=fall_back_to_pt, - revision=revision) + revision=revision, + ) if load_format == "npcache": # Currently np_cache only support *.bin checkpoints @@ -276,8 +280,7 @@ def convert_pyslice_to_tensor(x: Any) -> torch.Tensor: return x -def default_weight_loader(param: torch.Tensor, - loaded_weight: torch.Tensor) -> None: +def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None: """Default weight loader.""" assert param.size() == loaded_weight.size() param.data.copy_(loaded_weight) From 20a71fdb941b641276220be6752246a74a5e8efa Mon Sep 17 00:00:00 2001 From: James Fleming Date: Mon, 26 Feb 2024 17:56:17 -0500 Subject: [PATCH 03/96] adding in mat mat, need to move the pytorch stuff, maybe add some aqlm prefixes. 
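The code1x16_matmat / code2x8_matmat wrappers added below reduce the matrix-matrix product to one CUDA mat-vec launch per row of the flattened input, then apply the per-output-channel scales and the optional bias. In terms of what gets computed, the 1x16 path is equivalent to dequantizing the codebook weights and doing an ordinary matmul; a rough PyTorch sketch follows (the 4-D codes/codebooks/scales layout is assumed from the AQLM reference implementation and is illustrative only, not something this patch defines):

    import torch

    def aqlm_1x16_matmat_reference(x, codes, codebooks, scales, bias=None):
        # Assumed AQLM 1x16 layout (one 2**16-entry codebook, in_group_size = 8,
        # out_group_size = 1); shapes follow the AQLM reference code, not this patch:
        #   codes:     [out_features, in_features // 8, 1]   int16 (bit patterns of uint16 indices)
        #   codebooks: [1, 2**16, 1, 8]                       fp16 codebook entries
        #   scales:    [out_features, 1, 1, 1]                per-output-channel scales
        codebook = codebooks[0, :, 0, :]                  # [65536, 8]
        idx = codes[..., 0].to(torch.int64) & 0xFFFF      # reinterpret int16 codes as unsigned
        weight = codebook[idx].flatten(1)                 # dequantized weight, [out_features, in_features]
        weight = weight * scales.flatten().unsqueeze(-1)  # scale each output row
        out = x @ weight.t()
        return out if bias is None else out + bias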
--- csrc/quantization/aqlm/aqlm_cuda_kernel.cpp | 142 ++++++++++++++++++ .../layers/quantization/aqlm.py | 2 + 2 files changed, 144 insertions(+) create mode 100644 csrc/quantization/aqlm/aqlm_cuda_kernel.cpp diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cpp b/csrc/quantization/aqlm/aqlm_cuda_kernel.cpp new file mode 100644 index 0000000000000..301e8439b24ae --- /dev/null +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cpp @@ -0,0 +1,142 @@ +#include +#include +#include + +void code1x16_matvec_cuda( + const void* A, + const void* B, + void* C, + const void* codebook, + int prob_m, + int prob_k +); + +void code2x8_matvec_cuda( + const void* A, + const void* B, + void* C, + const void* codebook, + int prob_m, + int prob_k +); + +void code1x16_matvec( + const torch::Tensor& A, + const torch::Tensor& B, + torch::Tensor& C, + const torch::Tensor& codebook +) { + const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); + int prob_m = C.size(0); + int prob_k = B.size(0); + code1x16_matvec_cuda( + A.data_ptr(), + B.data_ptr(), + C.data_ptr(), + codebook.data_ptr(), + prob_m, + prob_k + ); +} + +torch::Tensor code1x16_matmat( + const torch::Tensor& input, + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales, + const std::optional& bias +) { + auto input_sizes = input.sizes(); + auto out_features = codes.size(0) * codebooks.size(2); + auto flat_input = input.reshape({-1, input.size(-1)}); + auto flat_output = torch::empty({flat_input.size(0), out_features}, + torch::TensorOptions() + .dtype(input.dtype()) + .device(input.device()) + ); + + for (int i = 0; i < flat_input.size(0); ++i) { + auto input_vec = flat_input.index({i}); + auto output_vec = flat_output.index({i}); + code1x16_matvec( + codes.squeeze(2), + input_vec, + output_vec, + codebooks + ); + } + flat_output *= scales.flatten().unsqueeze(0); + if (bias.has_value()) { + flat_output += bias->unsqueeze(0); + } + + auto output_sizes = input_sizes.vec(); + output_sizes.pop_back(); + output_sizes.push_back(-1); + auto output = flat_output.reshape(output_sizes).clone(); + return output; +} + +void code2x8_matvec( + const torch::Tensor& A, + const torch::Tensor& B, + torch::Tensor& C, + const torch::Tensor& codebook +) { + const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); + int prob_m = C.size(0); + int prob_k = B.size(0); + code2x8_matvec_cuda( + A.data_ptr(), + B.data_ptr(), + C.data_ptr(), + codebook.data_ptr(), + prob_m, + prob_k + ); +} + +torch::Tensor code2x8_matmat( + const torch::Tensor& input, + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales, + const std::optional& bias +) { + auto input_sizes = input.sizes(); + auto out_features = codes.size(0) * codebooks.size(2); + auto flat_input = input.reshape({-1, input.size(-1)}); + auto flat_output = torch::empty({flat_input.size(0), out_features}, + torch::TensorOptions() + .dtype(input.dtype()) + .device(input.device()) + ); + + for (int i = 0; i < flat_input.size(0); ++i) { + auto input_vec = flat_input.index({i}); + auto output_vec = flat_output.index({i}); + code2x8_matvec( + codes.squeeze(2), + input_vec, + output_vec, + codebooks + ); + } + flat_output *= scales.flatten().unsqueeze(0); + if (bias.has_value()) { + flat_output += bias->unsqueeze(0); + } + + auto output_sizes = input_sizes.vec(); + output_sizes.pop_back(); + output_sizes.push_back(-1); + auto output = flat_output.reshape(output_sizes).clone(); + return output; +} + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + 
m.def("code1x16_matmat", &code1x16_matmat, "1x16 (2bit) codebook matrix-matrix product."); + m.def("code2x8_matmat", &code2x8_matmat, "2x8 (2bit) codebook matrix-matrix product."); +} + diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 2a0b0c794c43c..5745487067227 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -72,6 +72,8 @@ def get_min_capability(cls) -> int: # "quant_method": "aqlm" duh <- shows it's aqlm. Do we auto-detect? How? # }, + #https://huggingface.co/meta-llama/Llama-2-7b-hf + # this one looks non-standard, has no quantization_config, just an AQLM block. # https://huggingface.co/BlackSamorez/Llama-2-70b-AQLM-4Bit-2x16-hf/blob/main/config.json # "aqlm": { From d0cf25a850218914e1c3c9abf509eb36c0df1c1e Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 27 Feb 2024 14:49:45 -0500 Subject: [PATCH 04/96] load the codebooks, codes, and scales. --- vllm/model_executor/layers/linear.py | 245 +++++++++++------- .../layers/quantization/aqlm.py | 123 +++++---- vllm/model_executor/models/llama.py | 136 +++++----- 3 files changed, 295 insertions(+), 209 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 55d38b763b2b5..cd9a17b7ef864 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -6,11 +6,14 @@ from torch.nn.parameter import Parameter from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) from vllm.model_executor.parallel_utils.communication_op import ( - tensor_model_parallel_all_reduce, tensor_model_parallel_all_gather) -from vllm.model_executor.parallel_utils.utils import ( - divide, split_tensor_along_last_dim) + tensor_model_parallel_all_reduce, + tensor_model_parallel_all_gather, +) +from vllm.model_executor.parallel_utils.utils import divide, split_tensor_along_last_dim from vllm.model_executor.utils import set_weight_attrs from vllm.logger import init_logger @@ -21,18 +24,24 @@ class LinearMethodBase(ABC): """Base class for different (maybe quantized) linear methods.""" @abstractmethod - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: + def create_weights( + self, + input_size_per_partition: int, + output_size_per_partition: int, + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: """Create weights for a linear layer.""" raise NotImplementedError @abstractmethod - def apply_weights(self, - weights: Dict[str, torch.Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + def apply_weights( + self, + weights: Dict[str, torch.Tensor], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: """Apply the weights to the input tensor.""" raise NotImplementedError @@ -48,21 +57,29 @@ class UnquantizedLinearMethod(LinearMethodBase): def __init__(self, separate_bias_add: bool = False): self.separate_bias_add = separate_bias_add - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: - weight = Parameter(torch.empty(output_size_per_partition, - 
input_size_per_partition, - dtype=params_dtype), - requires_grad=False) + def create_weights( + self, + input_size_per_partition: int, + output_size_per_partition: int, + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: + weight = Parameter( + torch.empty( + output_size_per_partition, input_size_per_partition, dtype=params_dtype + ), + requires_grad=False, + ) set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) return {"weight": weight} - def apply_weights(self, - weights: Dict[str, torch.Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + def apply_weights( + self, + weights: Dict[str, torch.Tensor], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: weight = weights["weight"] if self.separate_bias_add: if bias: @@ -105,14 +122,19 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size, self.output_size, self.input_size, - self.output_size, self.params_dtype) + self.input_size, + self.output_size, + self.input_size, + self.output_size, + self.params_dtype, + ) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) if bias: self.bias = Parameter( - torch.empty(self.output_size, dtype=self.params_dtype)) + torch.empty(self.output_size, dtype=self.params_dtype) + ) set_weight_attrs(self.bias, {"output_dim": 0}) else: self.register_parameter("bias", None) @@ -171,20 +193,27 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size, self.output_size_per_partition, self.input_size, - self.output_size, self.params_dtype) + self.input_size, + self.output_size_per_partition, + self.input_size, + self.output_size, + self.params_dtype, + ) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) set_weight_attrs(weight, {"weight_loader": self.weight_loader}) if bias: self.bias = Parameter( - torch.empty(self.output_size_per_partition, - dtype=params_dtype)) - set_weight_attrs(self.bias, { - "output_dim": 0, - "weight_loader": self.weight_loader, - }) + torch.empty(self.output_size_per_partition, dtype=params_dtype) + ) + set_weight_attrs( + self.bias, + { + "output_dim": 0, + "weight_loader": self.weight_loader, + }, + ) else: self.register_parameter("bias", None) @@ -195,8 +224,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): if output_dim is not None: shard_size = param_data.shape[output_dim] start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -205,7 +233,8 @@ def forward(self, input_): # Matrix multiply. output_parallel = self.linear_method.apply_weights( - self.linear_weights, input_, bias) + self.linear_weights, input_, bias + ) if self.gather_output: # All-gather across the partitions. 
output = tensor_model_parallel_all_gather(output_parallel) @@ -249,13 +278,22 @@ def __init__( self.output_sizes = output_sizes tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) - super().__init__(input_size, sum(output_sizes), bias, gather_output, - skip_bias_add, params_dtype, linear_method) - - def weight_loader(self, - param: Parameter, - loaded_weight: torch.Tensor, - loaded_shard_id: Optional[int] = None): + super().__init__( + input_size, + sum(output_sizes), + bias, + gather_output, + skip_bias_add, + params_dtype, + linear_method, + ) + + def weight_loader( + self, + param: Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[int] = None, + ): param_data = param.data output_dim = getattr(param, "output_dim", None) if loaded_shard_id is None: @@ -277,7 +315,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor loaded_weight_shard = loaded_weight.narrow( - output_dim, shard_offset, shard_size) + output_dim, shard_offset, shard_size + ) self.weight_loader(param, loaded_weight_shard, shard_id) return @@ -293,18 +332,17 @@ def weight_loader(self, if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - param_data = param_data.narrow(output_dim, shard_offset, - shard_size) + param_data = param_data.narrow(output_dim, shard_offset, shard_size) start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: logger.warning( "Loading a weight without `output_dim` attribute in " "MergedColumnParallelLinear, assume the weight is " - "the same for all partitions.") + "the same for all partitions." 
+ ) assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -355,21 +393,30 @@ def __init__( self.num_heads = divide(self.total_num_heads, tp_size) if tp_size >= self.total_num_kv_heads: self.num_kv_heads = 1 - self.num_kv_head_replicas = divide(tp_size, - self.total_num_kv_heads) + self.num_kv_head_replicas = divide(tp_size, self.total_num_kv_heads) else: self.num_kv_heads = divide(self.total_num_kv_heads, tp_size) self.num_kv_head_replicas = 1 input_size = self.hidden_size - output_size = (self.num_heads + - 2 * self.num_kv_heads) * tp_size * self.head_size - super().__init__(input_size, output_size, bias, False, skip_bias_add, - params_dtype, linear_method) - - def weight_loader(self, - param: Parameter, - loaded_weight: torch.Tensor, - loaded_shard_id: Optional[str] = None): + output_size = ( + (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size + ) + super().__init__( + input_size, + output_size, + bias, + False, + skip_bias_add, + params_dtype, + linear_method, + ) + + def weight_loader( + self, + param: Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[str] = None, + ): param_data = param.data output_dim = getattr(param, "output_dim", None) if loaded_shard_id is None: @@ -381,10 +428,16 @@ def weight_loader(self, shard_offsets = [ # (shard_id, shard_offset, shard_size) ("q", 0, self.total_num_heads * self.head_size), - ("k", self.total_num_heads * self.head_size, - self.total_num_kv_heads * self.head_size), - ("v", (self.total_num_heads + self.total_num_kv_heads) * - self.head_size, self.total_num_kv_heads * self.head_size), + ( + "k", + self.total_num_heads * self.head_size, + self.total_num_kv_heads * self.head_size, + ), + ( + "v", + (self.total_num_heads + self.total_num_kv_heads) * self.head_size, + self.total_num_kv_heads * self.head_size, + ), ] packed_dim = getattr(param, "packed_dim", None) for shard_id, shard_offset, shard_size in shard_offsets: @@ -394,7 +447,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor loaded_weight_shard = loaded_weight.narrow( - output_dim, shard_offset, shard_size) + output_dim, shard_offset, shard_size + ) self.weight_loader(param, loaded_weight_shard, shard_id) return @@ -408,8 +462,7 @@ def weight_loader(self, shard_offset = self.num_heads * self.head_size shard_size = self.num_kv_heads * self.head_size elif loaded_shard_id == "v": - shard_offset = (self.num_heads + - self.num_kv_heads) * self.head_size + shard_offset = (self.num_heads + self.num_kv_heads) * self.head_size shard_size = self.num_kv_heads * self.head_size # If quantized, we need to adjust the offset and size to account # for the packing. 
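The packed-dim adjustment made in the hunk just below converts shard offsets counted in logical output channels into packed storage elements. A small worked example with illustrative numbers (pack_factor = 32 // 4 = 8 matches the 4-bit, int32-packed GPTQ layout defined earlier in this series):

    pack_factor = 32 // 4                    # 8 quantized values per int32 word
    output_dim, packed_dim = 1, 1            # e.g. GPTQ qzeros, packed along its output dim
    shard_offset, shard_size = 4096, 1024    # this shard, in logical output channels
    if packed_dim == output_dim:
        shard_offset //= pack_factor         # 512 packed elements into the tensor
        shard_size //= pack_factor           # 128 packed elements to narrow and copy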
@@ -417,22 +470,21 @@ def weight_loader(self, if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - param_data = param_data.narrow(output_dim, shard_offset, - shard_size) + param_data = param_data.narrow(output_dim, shard_offset, shard_size) if loaded_shard_id == "q": shard_id = tp_rank else: shard_id = tp_rank // self.num_kv_head_replicas start_idx = shard_id * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: logger.warning( "Loading a weight without `output_dim` attribute in " "QKVParallelLinear, assume the weight is the same " - "for all partitions.") + "for all partitions." + ) assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -492,24 +544,32 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size_per_partition, self.output_size, self.input_size, - self.output_size, self.params_dtype) + self.input_size_per_partition, + self.output_size, + self.input_size, + self.output_size, + self.params_dtype, + ) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) set_weight_attrs(weight, {"weight_loader": self.weight_loader}) if not reduce_results and (bias and not skip_bias_add): - raise ValueError("When not reduce the results, adding bias to the " - "results can lead to incorrect results") + raise ValueError( + "When not reduce the results, adding bias to the " + "results can lead to incorrect results" + ) if bias: - self.bias = Parameter( - torch.empty(self.output_size, dtype=params_dtype)) - set_weight_attrs(self.bias, { - "output_dim": 0, - "weight_loader": self.weight_loader, - }) + self.bias = Parameter(torch.empty(self.output_size, dtype=params_dtype)) + set_weight_attrs( + self.bias, + { + "output_dim": 0, + "weight_loader": self.weight_loader, + }, + ) else: self.register_parameter("bias", None) @@ -517,12 +577,19 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): tp_rank = get_tensor_model_parallel_rank() input_dim = getattr(param, "input_dim", None) param_data = param.data + + # TEST + print("param data shape is ", param_data.shape) + print("loaded_weight is ", loaded_weight.shape) + if input_dim is not None: shard_size = param_data.shape[input_dim] start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(input_dim, start_idx, - shard_size) + loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) + print("sharded loaded_weight is ", loaded_weight.shape) + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) def forward(self, input_): @@ -532,12 +599,14 @@ def forward(self, input_): else: tp_rank = get_tensor_model_parallel_rank() splitted_input = split_tensor_along_last_dim( - input_, num_partitions=self.tp_size) + input_, num_partitions=self.tp_size + ) input_parallel = splitted_input[tp_rank].contiguous() # Matrix multiply. 
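        # Editor's note (sketch, not part of the patch): in the row-parallel layout each
        # TP rank holds an [output_size, input_size // tp_size] weight shard and multiplies
        # it with its slice of the input; the per-rank partial products are then summed by
        # tensor_model_parallel_all_reduce, roughly
        #     output = sum over ranks r of (input_parallel_r @ weight_shard_r.T) + bias
        # Adding the bias on every rank before such a sum would count it tp_size times,
        # which is presumably why bias without reduce_results is rejected in __init__ above.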
output_parallel = self.linear_method.apply_weights( - self.linear_weights, input_parallel) + self.linear_weights, input_parallel + ) if self.reduce_results and self.tp_size > 1: output_ = tensor_model_parallel_all_reduce(output_parallel) else: diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 5745487067227..561375c23b62b 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -10,6 +10,18 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +def get_int_dtype(nbits: int) -> torch.dtype: + if nbits <= 8: + return torch.int8 + if nbits <= 16: + return torch.int16 + if nbits <= 32: + return torch.int32 + if nbits <= 64: + return torch.int64 + raise ValueError(f"No dtype available for {nbits}-bit codebooks") + + class AQLMConfig(QuantizationConfig): """Config class for AQLM. @@ -27,13 +39,13 @@ def __init__( self.nbits_per_codebook = nbits_per_codebook self.num_codebooks = num_codebooks self.out_group_size = out_group_size - # self.pack_factor = 32 // self.weight_bits - # exllama kernel v1 only supports 4 bit - # if self.weight_bits != 4: - # raise ValueError( - # "Currently, only 4-bit weight quantization is supported for " - # f"GPTQ, but got {self.weight_bits} bits." - # ) + + # I think pack factor is *probably* how many elements fit into one quantized tensor element. + # though out group size makes it interesting, because really we are doing 2D blocks, potentially. + # maybe this is vllms first 2D packing? Arg. + self.pack_factor = ( + self.in_group_size * self.out_group_size // self.num_codebooks + ) def __repr__(self) -> str: return ( @@ -64,23 +76,21 @@ def get_min_capability(cls) -> int: # "nbits_per_codebook": 16, # "num_codebooks": 1, # "out_group_size": 1, - + # "quant_method": "aqlm" # "linear_weights_not_to_quantize": [ <--- hmmm ???? # "model.embed_tokens.weight", # "lm_head.weight" - - # "quant_method": "aqlm" duh <- shows it's aqlm. Do we auto-detect? How? # }, - #https://huggingface.co/meta-llama/Llama-2-7b-hf + # https://huggingface.co/meta-llama/Llama-2-7b-hf <- can't see it, locked behind meta. - # this one looks non-standard, has no quantization_config, just an AQLM block. + # this is no-standard, has no "quantization_config", just an "aqlm" block. # https://huggingface.co/BlackSamorez/Llama-2-70b-AQLM-4Bit-2x16-hf/blob/main/config.json # "aqlm": { # "in_group_size": 8, # "nbits_per_codebook": 16, # "num_codebooks": 2, - # " "out_group_size": 1 + # "out_group_size": 1 @classmethod def get_config_filenames(cls) -> List[str]: @@ -121,76 +131,65 @@ def create_weights( params_dtype: torch.dtype, ) -> Dict[str, Any]: del output_size # Unused. - if input_size_per_partition % self.quant_config.group_size != 0: + del input_size # Unused. + + if params_dtype != torch.half: + raise ValueError("Only half is currently supported by aqlm") + if input_size_per_partition % self.quant_config.in_group_size != 0: raise ValueError( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size." ) - if output_size_per_partition % self.quant_config.pack_factor != 0: + if output_size_per_partition % self.quant_config.out_group_size != 0: raise ValueError( "The output size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size." 
) - if self.quant_config.group_size != -1: - group_size = self.quant_config.group_size - else: - group_size = input_size - scale_and_zero_size = input_size // group_size - scale_and_zero_input_dim = None - - qweight = Parameter( + # or does this need more dimensions and use the correct nbits_per_codebook as an int type. Does that pack them? + codes = Parameter( torch.empty( + output_size_per_partition, # not entirely sure what to do with out groups, if we need this pack factor. input_size_per_partition // self.quant_config.pack_factor, - output_size_per_partition, - dtype=torch.int32, + 1, + dtype=get_int_dtype(self.quant_config.nbits_per_codebook), ), requires_grad=False, ) + set_weight_attrs( - qweight, + codes, { - "input_dim": 0, - "output_dim": 1, - "packed_dim": 0, + "input_dim": 1, + "output_dim": 0, + "packed_dim": 1, "pack_factor": self.quant_config.pack_factor, }, ) - g_idx = Parameter( - torch.tensor( - [ - i // self.quant_config.group_size - for i in range(input_size_per_partition) - ], - dtype=torch.int32, - ), - requires_grad=False, - ) - # Ignore warning from fused linear layers such as QKVParallelLinear. - set_weight_attrs(g_idx, {"input_dim": 0, "ignore_warning": True}) - qzeros = Parameter( + + codebooks = Parameter( torch.empty( - scale_and_zero_size, - output_size_per_partition // self.quant_config.pack_factor, - dtype=torch.int32, + self.quant_config.num_codebooks, + 2**self.quant_config.nbits_per_codebook, + self.quant_config.out_group_size, + self.quant_config.in_group_size, + dtype=params_dtype, ), requires_grad=False, ) - set_weight_attrs( - qzeros, - { - "input_dim": scale_and_zero_input_dim, - "output_dim": 1, - "packed_dim": 1, - "pack_factor": self.quant_config.pack_factor, - }, - ) + # no attributes? It's fixed size, no input or output dim, need the whole thing. + # looks like named attributes are for sharding so it knows how to split something up. + scales = Parameter( torch.empty( - scale_and_zero_size, - output_size_per_partition, + ( + output_size_per_partition // self.quant_config.out_group_size, + 1, # do we really need these other dimensions? They don't count, or? + 1, + 1, + ), dtype=params_dtype, ), requires_grad=False, @@ -198,15 +197,15 @@ def create_weights( set_weight_attrs( scales, { - "input_dim": scale_and_zero_input_dim, - "output_dim": 1, + "output_dim": 0, + # "pack_factor": self.quant_config.pack_factor, I guess not really a pack factor, just smaller? 
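                # Editor's note (worked example, not part of the patch): with the 1x16
                # settings quoted in the config comments above (in_group_size=8,
                # out_group_size=1, num_codebooks=1, nbits_per_codebook=16) the tensors
                # built in this method come out as
                #   pack_factor = 8 * 1 // 1 = 8
                #   codes:     (out_features, in_features // 8, 1)   dtype torch.int16
                #   codebooks: (1, 2**16, 1, 8)                      dtype params_dtype
                #   scales:    (out_features, 1, 1, 1)               dtype params_dtype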
}, ) + return { - "qweight": qweight, - "g_idx": g_idx, - "qzeros": qzeros, - "scales": scales + "codes": codes, + "codebooks": codebooks, + "scales": scales, } def apply_weights( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index b7f6b8f3ec374..217edb20049ce 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -31,19 +31,27 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import PagedAttention from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) +from vllm.model_executor.layers.linear import ( + LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) + VocabParallelEmbedding, + ParallelLMHead, + DEFAULT_VOCAB_PADDING_SIZE, +) from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) + get_tensor_model_parallel_world_size, +) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) +from vllm.model_executor.weight_utils import ( + default_weight_loader, + hf_model_weights_iterator, +) from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig @@ -51,7 +59,6 @@ class LlamaMLP(nn.Module): - def __init__( self, hidden_size: int, @@ -61,16 +68,19 @@ def __init__( ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, + hidden_size, + [intermediate_size] * 2, bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) + linear_method=linear_method, + ) + self.down_proj = RowParallelLinear( + intermediate_size, hidden_size, bias=False, linear_method=linear_method + ) if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") + raise ValueError( + f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now." 
+ ) self.act_fn = SiluAndMul() def forward(self, x): @@ -81,7 +91,6 @@ def forward(self, x): class LlamaAttention(nn.Module): - def __init__( self, hidden_size: int, @@ -139,11 +148,13 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=sliding_window) + self.attn = PagedAttention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=sliding_window, + ) def forward( self, @@ -162,7 +173,6 @@ def forward( class LlamaDecoderLayer(nn.Module): - def __init__( self, config: LlamaConfig, @@ -172,14 +182,14 @@ def __init__( self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) + max_position_embeddings = getattr(config, "max_position_embeddings", 8192) sliding_window = getattr(config, "sliding_window", None) self.self_attn = LlamaAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, - num_kv_heads=getattr(config, "num_key_value_heads", - config.num_attention_heads), + num_kv_heads=getattr( + config, "num_key_value_heads", config.num_attention_heads + ), rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, @@ -193,10 +203,10 @@ def __init__( hidden_act=config.hidden_act, linear_method=linear_method, ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) def forward( self, @@ -211,8 +221,7 @@ def forward( residual = hidden_states hidden_states = self.input_layernorm(hidden_states) else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) + hidden_states, residual = self.input_layernorm(hidden_states, residual) hidden_states = self.self_attn( positions=positions, hidden_states=hidden_states, @@ -221,14 +230,12 @@ def forward( ) # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) hidden_states = self.mlp(hidden_states) return hidden_states, residual class LlamaModel(nn.Module): - def __init__( self, config: LlamaConfig, @@ -238,8 +245,11 @@ def __init__( super().__init__() self.config = config self.padding_idx = config.pad_token_id - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 + lora_vocab = ( + (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) + if lora_config + else 0 + ) self.vocab_size = config.vocab_size + lora_vocab self.org_vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( @@ -247,10 +257,12 @@ def __init__( config.hidden_size, org_num_embeddings=config.vocab_size, ) - self.layers = nn.ModuleList([ - LlamaDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) + self.layers = nn.ModuleList( + [ + LlamaDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ] + ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) def forward( @@ -323,7 +335,8 @@ def __init__( 
padding_size=DEFAULT_VOCAB_PADDING_SIZE # We need bigger padding if using lora for kernel # compatibility - if not lora_config else lora_config.lora_vocab_padding_size, + if not lora_config + else lora_config.lora_vocab_padding_size, ) self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size) @@ -334,8 +347,7 @@ def forward( kv_caches: List[KVCache], input_metadata: InputMetadata, ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) + hidden_states = self.model(input_ids, positions, kv_caches, input_metadata) return hidden_states def sample( @@ -343,15 +355,18 @@ def sample( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) + next_tokens = self.sampler( + self.lm_head.weight, hidden_states, sampling_metadata + ) return next_tokens - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): + def load_weights( + self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None, + ): stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -362,15 +377,15 @@ def load_weights(self, ] params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): + model_name_or_path, cache_dir, load_format, revision + ): if "rotary_emb.inv_freq" in name: continue - if ("rotary_emb.cos_cached" in name - or "rotary_emb.sin_cached" in name): + if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. 
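                # Editor's note on the stacked-parameter loop below (not part of the
                # patch): a checkpoint tensor named e.g.
                # "model.layers.0.self_attn.q_proj.weight" has "q_proj" rewritten to
                # "qkv_proj" and is passed to that fused parameter's weight_loader with
                # shard_id "q"; gate_proj/up_proj are folded into gate_up_proj the same
                # way. Names that match no mapping entry fall through unchanged to
                # default_weight_loader (or the param's own weight_loader).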
continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue name = name.replace(weight_name, param_name) @@ -379,6 +394,8 @@ def load_weights(self, continue param = params_dict[name] weight_loader = param.weight_loader + # TEST + print("loading ", name) weight_loader(param, loaded_weight, shard_id) break else: @@ -386,6 +403,7 @@ def load_weights(self, if name.endswith(".bias") and name not in params_dict: continue param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) + weight_loader = getattr(param, "weight_loader", default_weight_loader) + # TEST + print("loading ", name) weight_loader(param, loaded_weight) From 40463e3e1e424a26a03586a00eff97d7a0164ac1 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 27 Feb 2024 15:43:55 -0500 Subject: [PATCH 05/96] try to bind cpp aqlm entry point to python --- csrc/ops.h | 14 +- csrc/pybind.cpp | 2 + ...lm_cuda_kernel.cpp => aqlm_cuda_entry.cpp} | 9 +- setup.py | 126 +++++++++++------- .../layers/quantization/aqlm.py | 21 +-- 5 files changed, 105 insertions(+), 67 deletions(-) rename csrc/quantization/aqlm/{aqlm_cuda_kernel.cpp => aqlm_cuda_entry.cpp} (96%) diff --git a/csrc/ops.h b/csrc/ops.h index 351c4cade7a09..5ff16e0a27393 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -69,13 +69,13 @@ void gelu_fast( torch::Tensor& out, torch::Tensor& input); -torch::Tensor aqlm_gemm( - torch::Tensor a, - torch::Tensor b_q_weight, - torch::Tensor b_gptq_qzeros, - torch::Tensor b_gptq_scales, - torch::Tensor b_g_idx, - bool use_exllama); +torch::Tensor code1x16_matmat( + const torch::Tensor& input, + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales + //const std::optional& bias +); #ifndef USE_ROCM torch::Tensor awq_gemm( diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 24c22020131e8..d1410071d3afe 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -53,6 +53,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); // Quantization ops + ops.def("aqlm_gemm", &code1x16_matmat, "Quantized GEMM for AQLM"); + #ifndef USE_ROCM ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ"); diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp similarity index 96% rename from csrc/quantization/aqlm/aqlm_cuda_kernel.cpp rename to csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 301e8439b24ae..66d452df82424 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_kernel.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -43,8 +43,8 @@ torch::Tensor code1x16_matmat( const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, - const torch::Tensor& scales, - const std::optional& bias + const torch::Tensor& scales + //const std::optional& bias ) { auto input_sizes = input.sizes(); auto out_features = codes.size(0) * codebooks.size(2); @@ -66,9 +66,12 @@ torch::Tensor code1x16_matmat( ); } flat_output *= scales.flatten().unsqueeze(0); + +/* not sure how to bridge this yet. 
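   Editor's note (reference sketch, not part of the patch): for the 1x16 case the kernel
   calls above are equivalent to, in PyTorch terms, with W the dequantized
   (out_features, in_features) weight,

       W[o, 8*k : 8*k + 8] = codebooks[0, codes[o, k, 0], 0]   # codes read as uint16
       out = (input @ W.t()) * scales.flatten()

   The wrapper already flattens the activation to 2D and restores the leading
   dimensions through output_sizes below, so the Python caller should not need to
   reshape. The bias can be bridged as an optional tensor argument (a later commit in
   this series does exactly that), at which point the block below is simply applied
   after the scale.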
if (bias.has_value()) { flat_output += bias->unsqueeze(0); } + */ auto output_sizes = input_sizes.vec(); output_sizes.pop_back(); @@ -135,8 +138,10 @@ torch::Tensor code2x8_matmat( } +/* PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("code1x16_matmat", &code1x16_matmat, "1x16 (2bit) codebook matrix-matrix product."); m.def("code2x8_matmat", &code2x8_matmat, "2x8 (2bit) codebook matrix-matrix product."); } +*/ diff --git a/setup.py b/setup.py index 8fcb86394f76d..ba00f1b220add 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,12 @@ import setuptools import torch import torch.utils.cpp_extension as torch_cpp_ext -from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME +from torch.utils.cpp_extension import ( + BuildExtension, + CUDAExtension, + CUDA_HOME, + ROCM_HOME, +) ROOT_DIR = os.path.dirname(__file__) @@ -61,7 +66,8 @@ def _is_cuda() -> bool: if _is_cuda() and CUDA_HOME is None: raise RuntimeError( - "Cannot find CUDA_HOME. CUDA must be available to build the package.") + "Cannot find CUDA_HOME. CUDA must be available to build the package." + ) ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] @@ -70,10 +76,12 @@ def _is_cuda() -> bool: def get_hipcc_rocm_version(): # Run the hipcc --version command - result = subprocess.run(['hipcc', '--version'], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True) + result = subprocess.run( + ["hipcc", "--version"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) # Check if the command was executed successfully if result.returncode != 0: @@ -81,7 +89,7 @@ def get_hipcc_rocm_version(): return None # Extract the version using a regular expression - match = re.search(r'HIP version: (\S+)', result.stdout) + match = re.search(r"HIP version: (\S+)", result.stdout) if match: # Return the version string return match.group(1) @@ -97,9 +105,9 @@ def glob(pattern: str): def get_neuronxcc_version(): import sysconfig + site_dir = sysconfig.get_paths()["purelib"] - version_file = os.path.join(site_dir, "neuronxcc", "version", - "__init__.py") + version_file = os.path.join(site_dir, "neuronxcc", "version", "__init__.py") # Check if the command was executed successfully with open(version_file, "rt") as fp: @@ -119,8 +127,9 @@ def get_nvcc_cuda_version(cuda_dir: str) -> Version: Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py """ - nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], - universal_newlines=True) + nvcc_output = subprocess.check_output( + [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True + ) output = nvcc_output.split() release_idx = output.index("release") + 1 nvcc_cuda_version = parse(output[release_idx].split(",")[0]) @@ -142,8 +151,12 @@ def get_pytorch_rocm_arch() -> Set[str]: # If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator if env_arch_list is None: command = "rocm_agent_enumerator" - env_arch_list = subprocess.check_output([command]).decode('utf-8')\ - .strip().replace("\n", ";") + env_arch_list = ( + subprocess.check_output([command]) + .decode("utf-8") + .strip() + .replace("\n", ";") + ) arch_source_str = "rocm_agent_enumerator" else: arch_source_str = "PYTORCH_ROCM_ARCH env variable" @@ -159,7 +172,8 @@ def get_pytorch_rocm_arch() -> Set[str]: raise RuntimeError( f"None of the ROCM architectures in {arch_source_str} " f"({env_arch_list}) is supported. 
" - f"Supported ROCM architectures are: {ROCM_SUPPORTED_ARCHS}.") + f"Supported ROCM architectures are: {ROCM_SUPPORTED_ARCHS}." + ) invalid_arch_list = pytorch_rocm_arch - ROCM_SUPPORTED_ARCHS if invalid_arch_list: warnings.warn( @@ -167,7 +181,8 @@ def get_pytorch_rocm_arch() -> Set[str]: f"excluded from the {arch_source_str} output " f"({env_arch_list}). Supported ROCM architectures are: " f"{ROCM_SUPPORTED_ARCHS}.", - stacklevel=2) + stacklevel=2, + ) return arch_list @@ -189,15 +204,16 @@ def get_torch_arch_list() -> Set[str]: # Filter out the invalid architectures and print a warning. valid_archs = NVIDIA_SUPPORTED_ARCHS.union( - {s + "+PTX" - for s in NVIDIA_SUPPORTED_ARCHS}) + {s + "+PTX" for s in NVIDIA_SUPPORTED_ARCHS} + ) arch_list = torch_arch_list.intersection(valid_archs) # If none of the specified architectures are valid, raise an error. if not arch_list: raise RuntimeError( "None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env " f"variable ({env_arch_list}) is supported. " - f"Supported CUDA architectures are: {valid_archs}.") + f"Supported CUDA architectures are: {valid_archs}." + ) invalid_arch_list = torch_arch_list - valid_archs if invalid_arch_list: warnings.warn( @@ -205,7 +221,8 @@ def get_torch_arch_list() -> Set[str]: "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " f"({env_arch_list}). Supported CUDA architectures are: " f"{valid_archs}.", - stacklevel=2) + stacklevel=2, + ) return arch_list @@ -224,7 +241,8 @@ def get_torch_arch_list() -> Set[str]: major, minor = torch.cuda.get_device_capability(i) if major < 7: raise RuntimeError( - "GPUs with compute capability below 7.0 are not supported.") + "GPUs with compute capability below 7.0 are not supported." + ) compute_capabilities.add(f"{major}.{minor}") ext_modules = [] @@ -242,12 +260,13 @@ def get_torch_arch_list() -> Set[str]: compute_capabilities.remove("9.0") # Validate the NVCC CUDA version. if nvcc_cuda_version < Version("11.0"): + raise RuntimeError("CUDA 11.0 or higher is required to build the package.") + if nvcc_cuda_version < Version("11.1") and any( + cc.startswith("8.6") for cc in compute_capabilities + ): raise RuntimeError( - "CUDA 11.0 or higher is required to build the package.") - if (nvcc_cuda_version < Version("11.1") - and any(cc.startswith("8.6") for cc in compute_capabilities)): - raise RuntimeError( - "CUDA 11.1 or higher is required for compute capability 8.6.") + "CUDA 11.1 or higher is required for compute capability 8.6." + ) if nvcc_cuda_version < Version("11.8"): if any(cc.startswith("8.9") for cc in compute_capabilities): # CUDA 11.8 is required to generate the code targeting compute capability 8.9. @@ -258,13 +277,16 @@ def get_torch_arch_list() -> Set[str]: warnings.warn( "CUDA 11.8 or higher is required for compute capability 8.9. " "Targeting compute capability 8.0 instead.", - stacklevel=2) - compute_capabilities = set(cc for cc in compute_capabilities - if not cc.startswith("8.9")) + stacklevel=2, + ) + compute_capabilities = set( + cc for cc in compute_capabilities if not cc.startswith("8.9") + ) compute_capabilities.add("8.0+PTX") if any(cc.startswith("9.0") for cc in compute_capabilities): raise RuntimeError( - "CUDA 11.8 or higher is required for compute capability 9.0.") + "CUDA 11.8 or higher is required for compute capability 9.0." 
+ ) NVCC_FLAGS_PUNICA = NVCC_FLAGS.copy() @@ -273,16 +295,13 @@ def get_torch_arch_list() -> Set[str]: num = capability[0] + capability[2] NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] if capability.endswith("+PTX"): - NVCC_FLAGS += [ - "-gencode", f"arch=compute_{num},code=compute_{num}" - ] + NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=compute_{num}"] if int(capability[0]) >= 8: - NVCC_FLAGS_PUNICA += [ - "-gencode", f"arch=compute_{num},code=sm_{num}" - ] + NVCC_FLAGS_PUNICA += ["-gencode", f"arch=compute_{num},code=sm_{num}"] if capability.endswith("+PTX"): NVCC_FLAGS_PUNICA += [ - "-gencode", f"arch=compute_{num},code=compute_{num}" + "-gencode", + f"arch=compute_{num},code=compute_{num}", ] # Use NVCC threads to parallelize the build. @@ -297,10 +316,10 @@ def get_torch_arch_list() -> Set[str]: # changes for punica kernels NVCC_FLAGS += torch_cpp_ext.COMMON_NVCC_FLAGS REMOVE_NVCC_FLAGS = [ - '-D__CUDA_NO_HALF_OPERATORS__', - '-D__CUDA_NO_HALF_CONVERSIONS__', - '-D__CUDA_NO_BFLOAT16_CONVERSIONS__', - '-D__CUDA_NO_HALF2_OPERATORS__', + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", ] for flag in REMOVE_NVCC_FLAGS: with contextlib.suppress(ValueError): @@ -317,13 +336,13 @@ def get_torch_arch_list() -> Set[str]: ext_modules.append( CUDAExtension( name="vllm._punica_C", - sources=["csrc/punica/punica_ops.cc"] + - glob("csrc/punica/bgmv/*.cu"), + sources=["csrc/punica/punica_ops.cc"] + glob("csrc/punica/bgmv/*.cu"), extra_compile_args={ "cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS_PUNICA, }, - )) + ) + ) elif _is_neuron(): neuronxcc_version = get_neuronxcc_version() @@ -333,6 +352,8 @@ def get_torch_arch_list() -> Set[str]: "csrc/pos_encoding_kernels.cu", "csrc/activation_kernels.cu", "csrc/layernorm_kernels.cu", + "csrc/quantization/aqlm/aqlm_cuda_entry.cpp", + "csrc/quantization/aqlm/aqlm_cuda_kernal.cu", "csrc/quantization/squeezellm/quant_cuda_kernel.cu", "csrc/quantization/gptq/q_gemm.cu", "csrc/cuda_utils_kernels.cu", @@ -353,7 +374,8 @@ def get_torch_arch_list() -> Set[str]: "cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS, }, - )) + ) + ) if not _is_neuron(): vllm_extension = CUDAExtension( @@ -378,8 +400,9 @@ def find_version(filepath: str) -> str: Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py """ with open(filepath) as fp: - version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", - fp.read(), re.M) + version_match = re.search( + r"^__version__ = ['\"]([^'\"]*)['\"]", fp.read(), re.M + ) if version_match: return version_match.group(1) raise RuntimeError("Unable to find version string.") @@ -442,8 +465,10 @@ def get_requirements() -> List[str]: version=get_vllm_version(), author="vLLM Team", license="Apache 2.0", - description=("A high-throughput and memory-efficient inference and " - "serving engine for LLMs"), + description=( + "A high-throughput and memory-efficient inference and " + "serving engine for LLMs" + ), long_description=read_readme(), long_description_content_type="text/markdown", url="https://github.com/vllm-project/vllm", @@ -459,8 +484,9 @@ def get_requirements() -> List[str]: "License :: OSI Approved :: Apache Software License", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], - packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs", - "examples", "tests")), + packages=setuptools.find_packages( + exclude=("benchmarks", "csrc", "docs", "examples", "tests") + 
), python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 561375c23b62b..ed559de1cfa72 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -214,16 +214,21 @@ def apply_weights( x: torch.Tensor, bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - qweight = weights["qweight"] - out_shape = x.shape[:-1] + (qweight.shape[-1],) - reshaped_x = x.reshape(-1, x.shape[-1]) + # qweight = weights["qweight"] do I need the same flattening? + # out_shape = x.shape[:-1] + (qweight.shape[-1],) + # reshaped_x = x.reshape(-1, x.shape[-1]) # + + print("input shape is ", x) + output = ops.aqlm_gemm( - reshaped_x, - weights["qweight"], - weights["qzeros"], + x, # hmm, reshape? + weights["codes"], + weights["codebooks"], weights["scales"], - weights["g_idx"], ) + + print("output shape is ", output) + if bias is not None: output = output + bias - return output.reshape(out_shape) + return output # .reshape(out_shape) ??? From 0e03c2315cb1ca19f7dffdcc9f12c802bf98ba1e Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 27 Feb 2024 15:52:10 -0500 Subject: [PATCH 06/96] add aqlm --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ba00f1b220add..c2b972b54b1b7 100644 --- a/setup.py +++ b/setup.py @@ -352,8 +352,6 @@ def get_torch_arch_list() -> Set[str]: "csrc/pos_encoding_kernels.cu", "csrc/activation_kernels.cu", "csrc/layernorm_kernels.cu", - "csrc/quantization/aqlm/aqlm_cuda_entry.cpp", - "csrc/quantization/aqlm/aqlm_cuda_kernal.cu", "csrc/quantization/squeezellm/quant_cuda_kernel.cu", "csrc/quantization/gptq/q_gemm.cu", "csrc/cuda_utils_kernels.cu", @@ -362,6 +360,8 @@ def get_torch_arch_list() -> Set[str]: ] if _is_cuda(): + vllm_extension_sources.append("csrc/quantization/aqlm/aqlm_cuda_entry.cpp") + vllm_extension_sources.append("csrc/quantization/aqlm/aqlm_cuda_kernel.cu") vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") vllm_extension_sources.append("csrc/custom_all_reduce.cu") From 26f8d8318433d899223e0b32be46be809c03ae70 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 27 Feb 2024 16:09:31 -0500 Subject: [PATCH 07/96] fix print statements --- examples/aqlm_test.py | 8 ++++++++ vllm/model_executor/layers/quantization/aqlm.py | 6 +++--- 2 files changed, 11 insertions(+), 3 deletions(-) create mode 100644 examples/aqlm_test.py diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py new file mode 100644 index 0000000000000..f745c236236a5 --- /dev/null +++ b/examples/aqlm_test.py @@ -0,0 +1,8 @@ +from vllm import LLM, SamplingParams + +#model = LLM("nm-testing/llama2.c-stories110M-pruned2.4") +model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf") + +sampling_params = SamplingParams(max_tokens=100, temperature=0) +outputs = model.generate("Hello my name is", sampling_params=sampling_params) +print(outputs[0].outputs[0].text) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index ed559de1cfa72..0f9672023989f 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -218,7 +218,7 @@ def apply_weights( # out_shape = x.shape[:-1] + (qweight.shape[-1],) # reshaped_x = x.reshape(-1, x.shape[-1]) # - print("input shape is ", x) + print("input shape is ", x.shape) output = ops.aqlm_gemm( x, 
# hmm, reshape? @@ -227,8 +227,8 @@ def apply_weights( weights["scales"], ) - print("output shape is ", output) - + print("output shape is ", output.shape) + if bias is not None: output = output + bias return output # .reshape(out_shape) ??? From dad66ce052a003c3d5999898aa0e3ad3cc4bcd00 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Wed, 28 Feb 2024 18:34:09 +0000 Subject: [PATCH 08/96] add comment --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 66d452df82424..4b4b6bfc69a01 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -67,7 +67,7 @@ torch::Tensor code1x16_matmat( } flat_output *= scales.flatten().unsqueeze(0); -/* not sure how to bridge this yet. +/* not sure how to bridge this yet. may not need to. if (bias.has_value()) { flat_output += bias->unsqueeze(0); } From 77a89136448e1d30fe5540c81da453894bb38195 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Wed, 28 Feb 2024 19:18:34 +0000 Subject: [PATCH 09/96] remove unused enum --- vllm/model_executor/layers/quantization/aqlm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 0f9672023989f..58dad5aaa5b4e 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -1,5 +1,3 @@ -import enum -from enum import Enum from typing import Any, Dict, List, Optional import torch From 2bb6871febf8e394ad94e9c3c860782fbddcc6bc Mon Sep 17 00:00:00 2001 From: James Fleming Date: Wed, 28 Feb 2024 21:39:28 +0000 Subject: [PATCH 10/96] add a bunch of prints, add bias --- csrc/ops.h | 4 +-- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 6 ++--- examples/aqlm_test.py | 10 +++++--- vllm/model_executor/layers/linear.py | 17 ++++++++++--- .../layers/quantization/aqlm.py | 25 ++++++++++++------- 5 files changed, 41 insertions(+), 21 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index 5ff16e0a27393..246862ee048f0 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -73,8 +73,8 @@ torch::Tensor code1x16_matmat( const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, - const torch::Tensor& scales - //const std::optional& bias + const torch::Tensor& scales, + const std::optional& bias ); #ifndef USE_ROCM diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 4b4b6bfc69a01..6e4aa751c113d 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -43,8 +43,8 @@ torch::Tensor code1x16_matmat( const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, - const torch::Tensor& scales - //const std::optional& bias + const torch::Tensor& scales, + const std::optional& bias ) { auto input_sizes = input.sizes(); auto out_features = codes.size(0) * codebooks.size(2); @@ -67,11 +67,9 @@ torch::Tensor code1x16_matmat( } flat_output *= scales.flatten().unsqueeze(0); -/* not sure how to bridge this yet. may not need to. 
if (bias.has_value()) { flat_output += bias->unsqueeze(0); } - */ auto output_sizes = input_sizes.vec(); output_sizes.pop_back(); diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index f745c236236a5..fcb64223ca2a5 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -1,8 +1,12 @@ from vllm import LLM, SamplingParams #model = LLM("nm-testing/llama2.c-stories110M-pruned2.4") -model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf") +#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) -sampling_params = SamplingParams(max_tokens=100, temperature=0) -outputs = model.generate("Hello my name is", sampling_params=sampling_params) +model = LLM("/home/jaemz/.cache/huggingface/hub/models--BlackSamorez--Llama-2-7b-AQLM-2Bit-1x16-hf/snapshots/1756949fed7fe691c00f015e2ebd18503a975f3b", enforce_eager=True) + +sampling_params = SamplingParams(max_tokens=200, temperature=0) +outputs = model.generate("How are you ", sampling_params=sampling_params) +print("generated!") print(outputs[0].outputs[0].text) +print("output above!") diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index cd9a17b7ef864..de5b52f7e7f0c 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -326,6 +326,12 @@ def weight_loader( if output_dim is not None: shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size shard_size = self.output_sizes[loaded_shard_id] // tp_size + #TEST + if loaded_shard_id > 0: + print(" loading a shard ", loaded_shard_id) + print(" param_data shape ", param_data.shape) + print(" loaded_weight shape ", loaded_weight.shape) + # If quantized, we need to adjust the offset and size to account # for the packing. packed_dim = getattr(param, "packed_dim", None) @@ -579,14 +585,19 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): param_data = param.data # TEST - print("param data shape is ", param_data.shape) - print("loaded_weight is ", loaded_weight.shape) + print(" param data shape is ", param_data.shape) + print(" loaded_weight is ", loaded_weight.shape) if input_dim is not None: shard_size = param_data.shape[input_dim] start_idx = tp_rank * shard_size + print(" loaded_weight dtype is ", loaded_weight.dtype) + print(" data_param dtype is ", param_data.dtype) + #TEST + assert(start_idx == 0 and shard_size == loaded_weight.shape[input_dim]) + loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) - print("sharded loaded_weight is ", loaded_weight.shape) + print( "sharded loaded_weight is ", loaded_weight.shape) assert param_data.shape == loaded_weight.shape diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 58dad5aaa5b4e..d6bc67bd24873 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -128,6 +128,9 @@ def create_weights( output_size: int, params_dtype: torch.dtype, ) -> Dict[str, Any]: + #TEST + assert(output_size == output_size_per_partition) + assert(input_size == input_size_per_partition) del output_size # Unused. del input_size # Unused. @@ -149,14 +152,16 @@ def create_weights( # or does this need more dimensions and use the correct nbits_per_codebook as an int type. Does that pack them? codes = Parameter( torch.empty( - output_size_per_partition, # not entirely sure what to do with out groups, if we need this pack factor. 
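                # Editor's note (assumption, not from the patch): in the reference AQLM
                # layout codes are (out_features // out_group_size,
                # in_features // in_group_size, num_codebooks), so the trailing 1 below
                # presumably stands for num_codebooks and the division is really by
                # in_group_size; for the 1x16 models these coincide with the pack_factor
                # (8) and a single codebook, which is why this shape works here.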
+ output_size_per_partition, # not entirely sure what to do with num_out_groups, if we need this pack factor. input_size_per_partition // self.quant_config.pack_factor, - 1, + 1, # probably should be num codebooks. dtype=get_int_dtype(self.quant_config.nbits_per_codebook), ), requires_grad=False, ) + print(codes.shape) + set_weight_attrs( codes, { @@ -177,14 +182,12 @@ def create_weights( ), requires_grad=False, ) - # no attributes? It's fixed size, no input or output dim, need the whole thing. - # looks like named attributes are for sharding so it knows how to split something up. scales = Parameter( torch.empty( ( output_size_per_partition // self.quant_config.out_group_size, - 1, # do we really need these other dimensions? They don't count, or? + 1, 1, 1, ), @@ -218,15 +221,19 @@ def apply_weights( print("input shape is ", x.shape) + if (x.shape[1] == 5) : + print("codes shape is ", weights["codes"].shape) + print("codebooks shape is ", weights["codebooks"].shape) + print("scales shape is ", weights["scales"].shape) + print("x is ", x) + output = ops.aqlm_gemm( x, # hmm, reshape? weights["codes"], weights["codebooks"], weights["scales"], + bias, ) print("output shape is ", output.shape) - - if bias is not None: - output = output + bias - return output # .reshape(out_shape) ??? + return output From 5f0c319b01d8b7a1902b7d8e8721816623180c52 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Wed, 28 Feb 2024 21:45:15 +0000 Subject: [PATCH 11/96] minor fix for scales --- vllm/model_executor/layers/quantization/aqlm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index d6bc67bd24873..bb75d76e47e6d 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -199,7 +199,8 @@ def create_weights( scales, { "output_dim": 0, - # "pack_factor": self.quant_config.pack_factor, I guess not really a pack factor, just smaller? 
+ "packed_dim": 0, + "pack_factor": self.quant_config.out_group_size }, ) From 024b54ca48c5bf41dba258f0984a74d254289eea Mon Sep 17 00:00:00 2001 From: James Fleming Date: Wed, 28 Feb 2024 21:51:50 +0000 Subject: [PATCH 12/96] change --- examples/aqlm_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index fcb64223ca2a5..47936b19c9dd3 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -2,7 +2,6 @@ #model = LLM("nm-testing/llama2.c-stories110M-pruned2.4") #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) - model = LLM("/home/jaemz/.cache/huggingface/hub/models--BlackSamorez--Llama-2-7b-AQLM-2Bit-1x16-hf/snapshots/1756949fed7fe691c00f015e2ebd18503a975f3b", enforce_eager=True) sampling_params = SamplingParams(max_tokens=200, temperature=0) From 84c2e2a178e3d798d37ebbf27a963a1135a11e69 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 15:26:11 +0000 Subject: [PATCH 13/96] format --- examples/aqlm_test.py | 4 +- setup.py | 82 +++++----- vllm/config.py | 144 +++++++----------- vllm/model_executor/layers/linear.py | 82 +++++----- .../layers/quantization/aqlm.py | 35 ++--- vllm/model_executor/models/llama.py | 76 +++++---- vllm/model_executor/weight_utils.py | 27 ++-- 7 files changed, 205 insertions(+), 245 deletions(-) diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index 47936b19c9dd3..26d90584a7858 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -2,7 +2,9 @@ #model = LLM("nm-testing/llama2.c-stories110M-pruned2.4") #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) -model = LLM("/home/jaemz/.cache/huggingface/hub/models--BlackSamorez--Llama-2-7b-AQLM-2Bit-1x16-hf/snapshots/1756949fed7fe691c00f015e2ebd18503a975f3b", enforce_eager=True) +model = LLM( + "/home/jaemz/.cache/huggingface/hub/models--BlackSamorez--Llama-2-7b-AQLM-2Bit-1x16-hf/snapshots/1756949fed7fe691c00f015e2ebd18503a975f3b", + enforce_eager=True) sampling_params = SamplingParams(max_tokens=200, temperature=0) outputs = model.generate("How are you ", sampling_params=sampling_params) diff --git a/setup.py b/setup.py index c2b972b54b1b7..ba351402f7b16 100644 --- a/setup.py +++ b/setup.py @@ -66,8 +66,7 @@ def _is_cuda() -> bool: if _is_cuda() and CUDA_HOME is None: raise RuntimeError( - "Cannot find CUDA_HOME. CUDA must be available to build the package." - ) + "Cannot find CUDA_HOME. 
CUDA must be available to build the package.") ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] @@ -107,7 +106,8 @@ def get_neuronxcc_version(): import sysconfig site_dir = sysconfig.get_paths()["purelib"] - version_file = os.path.join(site_dir, "neuronxcc", "version", "__init__.py") + version_file = os.path.join(site_dir, "neuronxcc", "version", + "__init__.py") # Check if the command was executed successfully with open(version_file, "rt") as fp: @@ -127,9 +127,8 @@ def get_nvcc_cuda_version(cuda_dir: str) -> Version: Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py """ - nvcc_output = subprocess.check_output( - [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True - ) + nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], + universal_newlines=True) output = nvcc_output.split() release_idx = output.index("release") + 1 nvcc_cuda_version = parse(output[release_idx].split(",")[0]) @@ -151,12 +150,8 @@ def get_pytorch_rocm_arch() -> Set[str]: # If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator if env_arch_list is None: command = "rocm_agent_enumerator" - env_arch_list = ( - subprocess.check_output([command]) - .decode("utf-8") - .strip() - .replace("\n", ";") - ) + env_arch_list = (subprocess.check_output( + [command]).decode("utf-8").strip().replace("\n", ";")) arch_source_str = "rocm_agent_enumerator" else: arch_source_str = "PYTORCH_ROCM_ARCH env variable" @@ -172,8 +167,7 @@ def get_pytorch_rocm_arch() -> Set[str]: raise RuntimeError( f"None of the ROCM architectures in {arch_source_str} " f"({env_arch_list}) is supported. " - f"Supported ROCM architectures are: {ROCM_SUPPORTED_ARCHS}." - ) + f"Supported ROCM architectures are: {ROCM_SUPPORTED_ARCHS}.") invalid_arch_list = pytorch_rocm_arch - ROCM_SUPPORTED_ARCHS if invalid_arch_list: warnings.warn( @@ -204,16 +198,15 @@ def get_torch_arch_list() -> Set[str]: # Filter out the invalid architectures and print a warning. valid_archs = NVIDIA_SUPPORTED_ARCHS.union( - {s + "+PTX" for s in NVIDIA_SUPPORTED_ARCHS} - ) + {s + "+PTX" + for s in NVIDIA_SUPPORTED_ARCHS}) arch_list = torch_arch_list.intersection(valid_archs) # If none of the specified architectures are valid, raise an error. if not arch_list: raise RuntimeError( "None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env " f"variable ({env_arch_list}) is supported. " - f"Supported CUDA architectures are: {valid_archs}." - ) + f"Supported CUDA architectures are: {valid_archs}.") invalid_arch_list = torch_arch_list - valid_archs if invalid_arch_list: warnings.warn( @@ -241,8 +234,7 @@ def get_torch_arch_list() -> Set[str]: major, minor = torch.cuda.get_device_capability(i) if major < 7: raise RuntimeError( - "GPUs with compute capability below 7.0 are not supported." - ) + "GPUs with compute capability below 7.0 are not supported.") compute_capabilities.add(f"{major}.{minor}") ext_modules = [] @@ -260,13 +252,12 @@ def get_torch_arch_list() -> Set[str]: compute_capabilities.remove("9.0") # Validate the NVCC CUDA version. 
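# Editor's note (worked example, device values assumed, not part of the patch): an
# RTX 3090 reports compute capability "8.6", so the checks below require nvcc >= 11.1,
# and the gencode loop further down turns it into
# "-gencode arch=compute_86,code=sm_86" (plus "code=compute_86" if "+PTX" was requested).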
if nvcc_cuda_version < Version("11.0"): - raise RuntimeError("CUDA 11.0 or higher is required to build the package.") + raise RuntimeError( + "CUDA 11.0 or higher is required to build the package.") if nvcc_cuda_version < Version("11.1") and any( - cc.startswith("8.6") for cc in compute_capabilities - ): + cc.startswith("8.6") for cc in compute_capabilities): raise RuntimeError( - "CUDA 11.1 or higher is required for compute capability 8.6." - ) + "CUDA 11.1 or higher is required for compute capability 8.6.") if nvcc_cuda_version < Version("11.8"): if any(cc.startswith("8.9") for cc in compute_capabilities): # CUDA 11.8 is required to generate the code targeting compute capability 8.9. @@ -279,14 +270,12 @@ def get_torch_arch_list() -> Set[str]: "Targeting compute capability 8.0 instead.", stacklevel=2, ) - compute_capabilities = set( - cc for cc in compute_capabilities if not cc.startswith("8.9") - ) + compute_capabilities = set(cc for cc in compute_capabilities + if not cc.startswith("8.9")) compute_capabilities.add("8.0+PTX") if any(cc.startswith("9.0") for cc in compute_capabilities): raise RuntimeError( - "CUDA 11.8 or higher is required for compute capability 9.0." - ) + "CUDA 11.8 or higher is required for compute capability 9.0.") NVCC_FLAGS_PUNICA = NVCC_FLAGS.copy() @@ -295,9 +284,13 @@ def get_torch_arch_list() -> Set[str]: num = capability[0] + capability[2] NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] if capability.endswith("+PTX"): - NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=compute_{num}"] + NVCC_FLAGS += [ + "-gencode", f"arch=compute_{num},code=compute_{num}" + ] if int(capability[0]) >= 8: - NVCC_FLAGS_PUNICA += ["-gencode", f"arch=compute_{num},code=sm_{num}"] + NVCC_FLAGS_PUNICA += [ + "-gencode", f"arch=compute_{num},code=sm_{num}" + ] if capability.endswith("+PTX"): NVCC_FLAGS_PUNICA += [ "-gencode", @@ -336,13 +329,13 @@ def get_torch_arch_list() -> Set[str]: ext_modules.append( CUDAExtension( name="vllm._punica_C", - sources=["csrc/punica/punica_ops.cc"] + glob("csrc/punica/bgmv/*.cu"), + sources=["csrc/punica/punica_ops.cc"] + + glob("csrc/punica/bgmv/*.cu"), extra_compile_args={ "cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS_PUNICA, }, - ) - ) + )) elif _is_neuron(): neuronxcc_version = get_neuronxcc_version() @@ -374,8 +367,7 @@ def get_torch_arch_list() -> Set[str]: "cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS, }, - ) - ) + )) if not _is_neuron(): vllm_extension = CUDAExtension( @@ -400,9 +392,8 @@ def find_version(filepath: str) -> str: Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py """ with open(filepath) as fp: - version_match = re.search( - r"^__version__ = ['\"]([^'\"]*)['\"]", fp.read(), re.M - ) + version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", + fp.read(), re.M) if version_match: return version_match.group(1) raise RuntimeError("Unable to find version string.") @@ -465,10 +456,8 @@ def get_requirements() -> List[str]: version=get_vllm_version(), author="vLLM Team", license="Apache 2.0", - description=( - "A high-throughput and memory-efficient inference and " - "serving engine for LLMs" - ), + description=("A high-throughput and memory-efficient inference and " + "serving engine for LLMs"), long_description=read_readme(), long_description_content_type="text/markdown", url="https://github.com/vllm-project/vllm", @@ -484,9 +473,8 @@ def get_requirements() -> List[str]: "License :: OSI Approved :: Apache Software License", "Topic :: Scientific/Engineering :: 
Artificial Intelligence", ], - packages=setuptools.find_packages( - exclude=("benchmarks", "csrc", "docs", "examples", "tests") - ), + packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs", + "examples", "tests")), python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, diff --git a/vllm/config.py b/vllm/config.py index f2452baf8796c..19f1c0e27b103 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -100,20 +100,20 @@ def __init__( from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C if not os.path.exists(model): - model_path = snapshot_download( - model_id=model, cache_dir=download_dir, revision=revision - ) + model_path = snapshot_download(model_id=model, + cache_dir=download_dir, + revision=revision) else: model_path = model self.model = model_path self.download_dir = model_path self.tokenizer = model_path - self.hf_config = get_config( - self.model, trust_remote_code, revision, code_revision - ) + self.hf_config = get_config(self.model, trust_remote_code, revision, + code_revision) self.dtype = _get_and_verify_dtype(self.hf_config, dtype) - self.max_model_len = _get_and_verify_max_len(self.hf_config, max_model_len) + self.max_model_len = _get_and_verify_max_len(self.hf_config, + max_model_len) self._verify_load_format() self._verify_tokenizer_mode() self.hf_quant_config = self._get_and_verify_quantization() @@ -121,32 +121,30 @@ def __init__( def _verify_load_format(self) -> None: load_format = self.load_format.lower() - supported_load_format = ["auto", "pt", "safetensors", "npcache", "dummy"] + supported_load_format = [ + "auto", "pt", "safetensors", "npcache", "dummy" + ] rocm_not_supported_load_format = [] if load_format not in supported_load_format: raise ValueError( f"Unknown load format: {self.load_format}. Must be one of " - "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'." - ) + "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'.") if is_hip() and load_format in rocm_not_supported_load_format: rocm_supported_load_format = [ - f - for f in supported_load_format + f for f in supported_load_format if (f not in rocm_not_supported_load_format) ] raise ValueError( f"load format '{load_format}' is not supported in ROCm. " f"Supported load format are " - f"{rocm_supported_load_format}" - ) + f"{rocm_supported_load_format}") # TODO: Remove this check once HF updates the pt weights of Mixtral. architectures = getattr(self.hf_config, "architectures", []) if "MixtralForCausalLM" in architectures and load_format == "pt": raise ValueError( "Currently, the 'pt' format is not supported for Mixtral. " - "Please use the 'safetensors' format instead. " - ) + "Please use the 'safetensors' format instead. ") self.load_format = load_format def _verify_tokenizer_mode(self) -> None: @@ -154,8 +152,7 @@ def _verify_tokenizer_mode(self) -> None: if tokenizer_mode not in ["auto", "slow"]: raise ValueError( f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be " - "either 'auto' or 'slow'." - ) + "either 'auto' or 'slow'.") self.tokenizer_mode = tokenizer_mode def _get_and_verify_quantization(self) -> Any | None: @@ -183,34 +180,29 @@ def _get_and_verify_quantization(self) -> Any | None: "Quantization method specified in the model config " f"({hf_quant_method}) does not match the quantization " f"method specified in the `quantization` argument " - f"({self.quantization})." 
- ) + f"({self.quantization}).") if self.quantization is not None: if self.quantization not in supported_quantization: raise ValueError( f"Unknown quantization method: {self.quantization}. Must " - f"be one of {supported_quantization}." - ) - if is_hip() and self.quantization in rocm_not_supported_quantization: + f"be one of {supported_quantization}.") + if is_hip( + ) and self.quantization in rocm_not_supported_quantization: raise ValueError( f"{self.quantization} quantization is currently not supported " - f"in ROCm." - ) - logger.warning( - f"{self.quantization} quantization is not fully " - "optimized yet. The speed can be slower than " - "non-quantized models." - ) + f"in ROCm.") + logger.warning(f"{self.quantization} quantization is not fully " + "optimized yet. The speed can be slower than " + "non-quantized models.") return hf_quant_config def _verify_cuda_graph(self) -> None: if self.max_context_len_to_capture is None: self.max_context_len_to_capture = self.max_model_len - self.max_context_len_to_capture = min( - self.max_context_len_to_capture, self.max_model_len - ) + self.max_context_len_to_capture = min(self.max_context_len_to_capture, + self.max_model_len) def verify_with_parallel_config( self, @@ -222,8 +214,7 @@ def verify_with_parallel_config( raise ValueError( f"Total number of attention heads ({total_num_attention_heads})" " must be divisible by tensor parallel size " - f"({tensor_parallel_size})." - ) + f"({tensor_parallel_size}).") total_num_hidden_layers = self.hf_config.num_hidden_layers pipeline_parallel_size = parallel_config.pipeline_parallel_size @@ -231,8 +222,7 @@ def verify_with_parallel_config( raise ValueError( f"Total number of hidden layers ({total_num_hidden_layers}) " "must be divisible by pipeline parallel size " - f"({pipeline_parallel_size})." - ) + f"({pipeline_parallel_size}).") def get_sliding_window(self) -> Optional[int]: return getattr(self.hf_config, "sliding_window", None) @@ -258,11 +248,9 @@ def get_total_num_kv_heads(self) -> int: falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] new_decoder_arch_falcon = ( self.hf_config.model_type in falcon_model_types - and getattr(self.hf_config, "new_decoder_architecture", False) - ) - if not new_decoder_arch_falcon and getattr( - self.hf_config, "multi_query", False - ): + and getattr(self.hf_config, "new_decoder_architecture", False)) + if not new_decoder_arch_falcon and getattr(self.hf_config, + "multi_query", False): # Multi-query attention, only one KV head. # Currently, tensor parallelism is not supported in this case. return 1 @@ -292,7 +280,8 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: # the tensor parallel size. We will replicate the KV heads in the # case where the number of KV heads is smaller than the tensor # parallel size so each GPU has at least one KV head. - return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size) + return max(1, + total_num_kv_heads // parallel_config.tensor_parallel_size) def get_num_layers(self, parallel_config: "ParallelConfig") -> int: total_num_hidden_layers = self.hf_config.num_hidden_layers @@ -334,8 +323,7 @@ def _verify_args(self) -> None: if self.gpu_memory_utilization > 1.0: raise ValueError( "GPU memory utilization must be less than 1.0. Got " - f"{self.gpu_memory_utilization}." 
- ) + f"{self.gpu_memory_utilization}.") def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": @@ -349,15 +337,13 @@ def _verify_cache_dtype(self) -> None: device_name = torch.cuda.get_device_name() if "AMD" in device_name: raise NotImplementedError( - "FP8_E5M2 KV Cache on AMD GPU has not been supported yet." - ) + "FP8_E5M2 KV Cache on AMD GPU has not been supported yet.") logger.info( "Using fp8_e5m2 data type to store kv cache. It reduces " "the GPU memory footprint and boosts the performance. " "But it may cause slight accuracy drop. " "Currently we only support fp8 without scaling factors and " - "make e5m2 as a default format." - ) + "make e5m2 as a default format.") else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") @@ -371,11 +357,9 @@ def verify_with_parallel_config( num_gpus_per_node = parallel_config.tensor_parallel_size cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node - msg = ( - f"{cpu_memory_usage / _GB:.2f} GiB out of " - f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is " - "allocated for the swap space." - ) + msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of " + f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is " + "allocated for the swap space.") if cpu_memory_usage > 0.7 * total_cpu_memory: raise ValueError("Too large swap space. " + msg) elif cpu_memory_usage > 0.4 * total_cpu_memory: @@ -419,20 +403,19 @@ def __init__( def _verify_args(self) -> None: if self.pipeline_parallel_size > 1: - raise NotImplementedError("Pipeline parallelism is not supported yet.") + raise NotImplementedError( + "Pipeline parallelism is not supported yet.") if not self.disable_custom_all_reduce and self.world_size > 1: if is_hip(): self.disable_custom_all_reduce = True logger.info( "Disabled the custom all-reduce kernel because it is not " - "supported on AMD GPUs." - ) + "supported on AMD GPUs.") elif self.pipeline_parallel_size > 1: self.disable_custom_all_reduce = True logger.info( "Disabled the custom all-reduce kernel because it is not " - "supported with pipeline parallelism." - ) + "supported with pipeline parallelism.") # FIXME(woosuk): Fix the stability issues and re-enable the custom # all-reduce kernel. @@ -441,8 +424,7 @@ def _verify_args(self) -> None: logger.info( "Custom all-reduce kernels are temporarily disabled due to " "stability issues. We will re-enable them once the issues are " - "resolved." - ) + "resolved.") class SchedulerConfig: @@ -484,17 +466,16 @@ def _verify_args(self) -> None: "This effectively limits the maximum sequence length to " "max_num_batched_tokens and makes vLLM reject longer " "sequences. Please increase max_num_batched_tokens or " - "decrease max_model_len." - ) + "decrease max_model_len.") if self.max_num_batched_tokens < self.max_num_seqs: raise ValueError( f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " "be greater than or equal to max_num_seqs " - f"({self.max_num_seqs})." - ) + f"({self.max_num_seqs}).") class DeviceConfig: + def __init__(self, device: str = "cuda") -> None: self.device = torch.device(device) @@ -516,13 +497,11 @@ def __post_init__(self): if self.max_lora_rank not in possible_max_ranks: raise ValueError( f"max_lora_rank ({self.max_lora_rank}) must be one of " - f"{possible_max_ranks}." - ) + f"{possible_max_ranks}.") if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size: raise ValueError( f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) " - f"must be one of {possible_lora_extra_vocab_size}." 
- ) + f"must be one of {possible_lora_extra_vocab_size}.") if self.max_loras < 1: raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.") if self.max_cpu_loras is None: @@ -530,8 +509,7 @@ def __post_init__(self): elif self.max_cpu_loras < self.max_loras: raise ValueError( f"max_cpu_loras ({self.max_cpu_loras}) must be >= " - f"max_loras ({self.max_loras})" - ) + f"max_loras ({self.max_loras})") def verify_with_model_config(self, model_config: ModelConfig): if self.lora_dtype in (None, "auto"): @@ -539,15 +517,15 @@ def verify_with_model_config(self, model_config: ModelConfig): elif isinstance(self.lora_dtype, str): self.lora_dtype = getattr(torch, self.lora_dtype) if model_config.quantization is not None: - raise ValueError("LoRA is not supported with quantized models yet.") + raise ValueError( + "LoRA is not supported with quantized models yet.") def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): if scheduler_config.max_num_batched_tokens > 65528: raise ValueError( "Due to limitations of the custom LoRA CUDA kernel, " "max_num_batched_tokens must be <= 65528 when " - "LoRA is enabled." - ) + "LoRA is enabled.") _STR_DTYPE_TO_TORCH_DTYPE = { @@ -591,14 +569,11 @@ def _get_and_verify_dtype( if is_hip() and torch_dtype == torch.float32: rocm_supported_dtypes = [ - k - for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() + k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() if (k not in _ROCM_NOT_SUPPORTED_DTYPE) ] - raise ValueError( - f"dtype '{dtype}' is not supported in ROCm. " - f"Supported dtypes are {rocm_supported_dtypes}" - ) + raise ValueError(f"dtype '{dtype}' is not supported in ROCm. " + f"Supported dtypes are {rocm_supported_dtypes}") # Verify the dtype. if torch_dtype != config_dtype: @@ -649,8 +624,7 @@ def _get_and_verify_max_len( "The model's config.json does not contain any of the following " "keys to determine the original maximum length of the model: " f"{possible_keys}. Assuming the model's maximum length is " - f"{default_max_len}." - ) + f"{default_max_len}.") derived_max_model_len = default_max_len rope_scaling = getattr(hf_config, "rope_scaling", None) @@ -658,7 +632,8 @@ def _get_and_verify_max_len( assert "factor" in rope_scaling scaling_factor = rope_scaling["factor"] if rope_scaling["type"] == "yarn": - derived_max_model_len = rope_scaling["original_max_position_embeddings"] + derived_max_model_len = rope_scaling[ + "original_max_position_embeddings"] derived_max_model_len *= scaling_factor if max_model_len is None: @@ -669,6 +644,5 @@ def _get_and_verify_max_len( f"the derived max_model_len ({max_len_key}={derived_max_model_len}" " in model's config.json). This may lead to incorrect model " "outputs or CUDA errors. Make sure the value is correct and " - "within the model context size." 
- ) + "within the model context size.") return int(max_model_len) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index de5b52f7e7f0c..abbdf7c649557 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -66,9 +66,9 @@ def create_weights( params_dtype: torch.dtype, ) -> Dict[str, Any]: weight = Parameter( - torch.empty( - output_size_per_partition, input_size_per_partition, dtype=params_dtype - ), + torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=params_dtype), requires_grad=False, ) set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) @@ -133,8 +133,7 @@ def __init__( self.register_parameter(name, weight) if bias: self.bias = Parameter( - torch.empty(self.output_size, dtype=self.params_dtype) - ) + torch.empty(self.output_size, dtype=self.params_dtype)) set_weight_attrs(self.bias, {"output_dim": 0}) else: self.register_parameter("bias", None) @@ -205,8 +204,8 @@ def __init__( set_weight_attrs(weight, {"weight_loader": self.weight_loader}) if bias: self.bias = Parameter( - torch.empty(self.output_size_per_partition, dtype=params_dtype) - ) + torch.empty(self.output_size_per_partition, + dtype=params_dtype)) set_weight_attrs( self.bias, { @@ -224,7 +223,8 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): if output_dim is not None: shard_size = param_data.shape[output_dim] start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -233,8 +233,7 @@ def forward(self, input_): # Matrix multiply. output_parallel = self.linear_method.apply_weights( - self.linear_weights, input_, bias - ) + self.linear_weights, input_, bias) if self.gather_output: # All-gather across the partitions. output = tensor_model_parallel_all_gather(output_parallel) @@ -315,8 +314,7 @@ def weight_loader( shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor loaded_weight_shard = loaded_weight.narrow( - output_dim, shard_offset, shard_size - ) + output_dim, shard_offset, shard_size) self.weight_loader(param, loaded_weight_shard, shard_id) return @@ -338,17 +336,18 @@ def weight_loader( if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - param_data = param_data.narrow(output_dim, shard_offset, shard_size) + param_data = param_data.narrow(output_dim, shard_offset, + shard_size) start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: logger.warning( "Loading a weight without `output_dim` attribute in " "MergedColumnParallelLinear, assume the weight is " - "the same for all partitions." 
- ) + "the same for all partitions.") assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -399,14 +398,14 @@ def __init__( self.num_heads = divide(self.total_num_heads, tp_size) if tp_size >= self.total_num_kv_heads: self.num_kv_heads = 1 - self.num_kv_head_replicas = divide(tp_size, self.total_num_kv_heads) + self.num_kv_head_replicas = divide(tp_size, + self.total_num_kv_heads) else: self.num_kv_heads = divide(self.total_num_kv_heads, tp_size) self.num_kv_head_replicas = 1 input_size = self.hidden_size - output_size = ( - (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size - ) + output_size = ((self.num_heads + 2 * self.num_kv_heads) * tp_size * + self.head_size) super().__init__( input_size, output_size, @@ -441,7 +440,8 @@ def weight_loader( ), ( "v", - (self.total_num_heads + self.total_num_kv_heads) * self.head_size, + (self.total_num_heads + self.total_num_kv_heads) * + self.head_size, self.total_num_kv_heads * self.head_size, ), ] @@ -453,8 +453,7 @@ def weight_loader( shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor loaded_weight_shard = loaded_weight.narrow( - output_dim, shard_offset, shard_size - ) + output_dim, shard_offset, shard_size) self.weight_loader(param, loaded_weight_shard, shard_id) return @@ -468,7 +467,8 @@ def weight_loader( shard_offset = self.num_heads * self.head_size shard_size = self.num_kv_heads * self.head_size elif loaded_shard_id == "v": - shard_offset = (self.num_heads + self.num_kv_heads) * self.head_size + shard_offset = (self.num_heads + + self.num_kv_heads) * self.head_size shard_size = self.num_kv_heads * self.head_size # If quantized, we need to adjust the offset and size to account # for the packing. @@ -476,21 +476,22 @@ def weight_loader( if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - param_data = param_data.narrow(output_dim, shard_offset, shard_size) + param_data = param_data.narrow(output_dim, shard_offset, + shard_size) if loaded_shard_id == "q": shard_id = tp_rank else: shard_id = tp_rank // self.num_kv_head_replicas start_idx = shard_id * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: logger.warning( "Loading a weight without `output_dim` attribute in " "QKVParallelLinear, assume the weight is the same " - "for all partitions." 
- ) + "for all partitions.") assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -562,13 +563,12 @@ def __init__( set_weight_attrs(weight, {"weight_loader": self.weight_loader}) if not reduce_results and (bias and not skip_bias_add): - raise ValueError( - "When not reduce the results, adding bias to the " - "results can lead to incorrect results" - ) + raise ValueError("When not reduce the results, adding bias to the " + "results can lead to incorrect results") if bias: - self.bias = Parameter(torch.empty(self.output_size, dtype=params_dtype)) + self.bias = Parameter( + torch.empty(self.output_size, dtype=params_dtype)) set_weight_attrs( self.bias, { @@ -593,11 +593,13 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): start_idx = tp_rank * shard_size print(" loaded_weight dtype is ", loaded_weight.dtype) print(" data_param dtype is ", param_data.dtype) - #TEST - assert(start_idx == 0 and shard_size == loaded_weight.shape[input_dim]) + #TEST + assert (start_idx == 0 + and shard_size == loaded_weight.shape[input_dim]) - loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) - print( "sharded loaded_weight is ", loaded_weight.shape) + loaded_weight = loaded_weight.narrow(input_dim, start_idx, + shard_size) + print("sharded loaded_weight is ", loaded_weight.shape) assert param_data.shape == loaded_weight.shape @@ -610,14 +612,12 @@ def forward(self, input_): else: tp_rank = get_tensor_model_parallel_rank() splitted_input = split_tensor_along_last_dim( - input_, num_partitions=self.tp_size - ) + input_, num_partitions=self.tp_size) input_parallel = splitted_input[tp_rank].contiguous() # Matrix multiply. output_parallel = self.linear_method.apply_weights( - self.linear_weights, input_parallel - ) + self.linear_weights, input_parallel) if self.reduce_results and self.tp_size > 1: output_ = tensor_model_parallel_all_reduce(output_parallel) else: diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index bb75d76e47e6d..0f299d770b4ee 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -41,17 +41,14 @@ def __init__( # I think pack factor is *probably* how many elements fit into one quantized tensor element. # though out group size makes it interesting, because really we are doing 2D blocks, potentially. # maybe this is vllms first 2D packing? Arg. - self.pack_factor = ( - self.in_group_size * self.out_group_size // self.num_codebooks - ) + self.pack_factor = (self.in_group_size * self.out_group_size // + self.num_codebooks) def __repr__(self) -> str: - return ( - f"AQLMConfig(in_group_size={self.in_group_size}, " - f"nbits_per_codebook={self.nbits_per_codebook}, " - f"num_codebooks={self.num_codebooks}, " - f"out_group_size={self.out_group_size})" - ) + return (f"AQLMConfig(in_group_size={self.in_group_size}, " + f"nbits_per_codebook={self.nbits_per_codebook}, " + f"num_codebooks={self.num_codebooks}, " + f"out_group_size={self.out_group_size})") @classmethod def get_name(cls) -> str: @@ -101,7 +98,8 @@ def from_config(cls, config: Dict[str, Any]) -> "AQLMConfig": num_code_books = cls.get_from_keys(config, ["num_codebooks"]) out_group_size = cls.get_from_keys(config, ["out_group_size"]) # TODO linear_weights_not_to_quantize ? 
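The pack-factor comment above can be made concrete with a little arithmetic. For a 1x16, 2-bit checkpoint like the one exercised in examples/aqlm_test.py, the parameters are typically num_codebooks=1, nbits_per_codebook=16, in_group_size=8, out_group_size=1 (illustrative values, not read from the checkpoint here), so each stored 16-bit code selects one 1x8 codebook entry and therefore stands in for 8 weights. A minimal sketch of how that pack factor relates the codes tensor to the dequantized weight shape, mirroring the shapes used in create_weights:

def aqlm_shapes(out_features: int, in_features: int,
                num_codebooks: int = 1, nbits_per_codebook: int = 16,
                in_group_size: int = 8, out_group_size: int = 1):
    # Same formula as AQLMConfig: how many weight elements one code index covers.
    pack_factor = in_group_size * out_group_size // num_codebooks
    # codes: one index per input group per codebook;
    # codebooks: 2**nbits entries of out_group_size x in_group_size each.
    codes = (out_features, in_features // pack_factor, num_codebooks)
    codebooks = (num_codebooks, 2 ** nbits_per_codebook,
                 out_group_size, in_group_size)
    return pack_factor, codes, codebooks

print(aqlm_shapes(4096, 4096))
# (8, (4096, 512, 1), (1, 65536, 1, 8))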
- return cls(in_group_size, nbits_per_codebook, num_code_books, out_group_size) + return cls(in_group_size, nbits_per_codebook, num_code_books, + out_group_size) def get_linear_method(self) -> "AQLMLinearMethod": return AQLMLinearMethod(self) @@ -129,8 +127,8 @@ def create_weights( params_dtype: torch.dtype, ) -> Dict[str, Any]: #TEST - assert(output_size == output_size_per_partition) - assert(input_size == input_size_per_partition) + assert (output_size == output_size_per_partition) + assert (input_size == input_size_per_partition) del output_size # Unused. del input_size # Unused. @@ -140,21 +138,19 @@ def create_weights( raise ValueError( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " - "tensor parallel size." - ) + "tensor parallel size.") if output_size_per_partition % self.quant_config.out_group_size != 0: raise ValueError( "The output size is not aligned with the quantized " "weight shape. This can be caused by too large " - "tensor parallel size." - ) + "tensor parallel size.") # or does this need more dimensions and use the correct nbits_per_codebook as an int type. Does that pack them? codes = Parameter( torch.empty( output_size_per_partition, # not entirely sure what to do with num_out_groups, if we need this pack factor. input_size_per_partition // self.quant_config.pack_factor, - 1, # probably should be num codebooks. + 1, # probably should be num codebooks. dtype=get_int_dtype(self.quant_config.nbits_per_codebook), ), requires_grad=False, @@ -186,7 +182,8 @@ def create_weights( scales = Parameter( torch.empty( ( - output_size_per_partition // self.quant_config.out_group_size, + output_size_per_partition // + self.quant_config.out_group_size, 1, 1, 1, @@ -222,7 +219,7 @@ def apply_weights( print("input shape is ", x.shape) - if (x.shape[1] == 5) : + if (x.shape[1] == 5): print("codes shape is ", weights["codes"].shape) print("codebooks shape is ", weights["codebooks"].shape) print("scales shape is ", weights["scales"].shape) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 217edb20049ce..17ffc56ff42cf 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -45,8 +45,7 @@ DEFAULT_VOCAB_PADDING_SIZE, ) from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size, -) + get_tensor_model_parallel_world_size, ) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.weight_utils import ( default_weight_loader, @@ -59,6 +58,7 @@ class LlamaMLP(nn.Module): + def __init__( self, hidden_size: int, @@ -73,14 +73,13 @@ def __init__( bias=False, linear_method=linear_method, ) - self.down_proj = RowParallelLinear( - intermediate_size, hidden_size, bias=False, linear_method=linear_method - ) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) if hidden_act != "silu": - raise ValueError( - f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now." - ) + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") self.act_fn = SiluAndMul() def forward(self, x): @@ -91,6 +90,7 @@ def forward(self, x): class LlamaAttention(nn.Module): + def __init__( self, hidden_size: int, @@ -173,6 +173,7 @@ def forward( class LlamaDecoderLayer(nn.Module): + def __init__( self, config: LlamaConfig, @@ -182,14 +183,14 @@ def __init__( self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) sliding_window = getattr(config, "sliding_window", None) self.self_attn = LlamaAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, - num_kv_heads=getattr( - config, "num_key_value_heads", config.num_attention_heads - ), + num_kv_heads=getattr(config, "num_key_value_heads", + config.num_attention_heads), rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, @@ -203,10 +204,10 @@ def __init__( hidden_act=config.hidden_act, linear_method=linear_method, ) - self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) def forward( self, @@ -221,7 +222,8 @@ def forward( residual = hidden_states hidden_states = self.input_layernorm(hidden_states) else: - hidden_states, residual = self.input_layernorm(hidden_states, residual) + hidden_states, residual = self.input_layernorm( + hidden_states, residual) hidden_states = self.self_attn( positions=positions, hidden_states=hidden_states, @@ -230,12 +232,14 @@ def forward( ) # Fully Connected - hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) hidden_states = self.mlp(hidden_states) return hidden_states, residual class LlamaModel(nn.Module): + def __init__( self, config: LlamaConfig, @@ -245,11 +249,8 @@ def __init__( super().__init__() self.config = config self.padding_idx = config.pad_token_id - lora_vocab = ( - (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) - if lora_config - else 0 - ) + lora_vocab = ((lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0) self.vocab_size = config.vocab_size + lora_vocab self.org_vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( @@ -257,12 +258,10 @@ def __init__( config.hidden_size, org_num_embeddings=config.vocab_size, ) - self.layers = nn.ModuleList( - [ - LlamaDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ] - ) + self.layers = nn.ModuleList([ + LlamaDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) def forward( @@ -335,8 +334,7 @@ def __init__( padding_size=DEFAULT_VOCAB_PADDING_SIZE # We need bigger padding if using lora for kernel # compatibility - if not lora_config - else lora_config.lora_vocab_padding_size, + if not lora_config else lora_config.lora_vocab_padding_size, ) self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size) @@ -347,7 +345,8 @@ def forward( kv_caches: List[KVCache], 
input_metadata: InputMetadata, ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, input_metadata) + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) return hidden_states def sample( @@ -355,9 +354,8 @@ def sample( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[SamplerOutput]: - next_tokens = self.sampler( - self.lm_head.weight, hidden_states, sampling_metadata - ) + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) return next_tokens def load_weights( @@ -377,8 +375,7 @@ def load_weights( ] params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision - ): + model_name_or_path, cache_dir, load_format, revision): if "rotary_emb.inv_freq" in name: continue if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: @@ -403,7 +400,8 @@ def load_weights( if name.endswith(".bias") and name not in params_dict: continue param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) # TEST print("loading ", name) weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 37c9725033d49..48900a8b02271 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -24,6 +24,7 @@ class Disabledtqdm(tqdm): + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs, disable=True) @@ -69,12 +70,10 @@ def convert_bin_to_safetensor_file( sf_size = os.stat(sf_filename).st_size pt_size = os.stat(pt_filename).st_size if (sf_size - pt_size) / pt_size > 0.01: - raise RuntimeError( - f"""The file size different is more than 1%: + raise RuntimeError(f"""The file size different is more than 1%: - {sf_filename}: {sf_size} - {pt_filename}: {pt_size} - """ - ) + """) # check if the tensors are the same reloaded = load_file(sf_filename) @@ -107,17 +106,16 @@ def get_quant_config(model_config: ModelConfig) -> QuantizationConfig: config_files = glob.glob(os.path.join(hf_folder, "*.json")) quant_config_files = [ - f - for f in config_files - if any(f.endswith(x) for x in quant_cls.get_config_filenames()) + f for f in config_files if any( + f.endswith(x) for x in quant_cls.get_config_filenames()) ] if len(quant_config_files) == 0: - raise ValueError(f"Cannot find the config file for {model_config.quantization}") + raise ValueError( + f"Cannot find the config file for {model_config.quantization}") if len(quant_config_files) > 1: raise ValueError( f"Found multiple config files for {model_config.quantization}: " - f"{quant_config_files}" - ) + f"{quant_config_files}") quant_config_file = quant_config_files[0] with open(quant_config_file, "r") as f: @@ -194,11 +192,13 @@ def prepare_hf_model_weights( "scaler.pt", ] hf_weights_files = [ - f for f in hf_weights_files if not any(f.endswith(x) for x in blacklist) + f for f in hf_weights_files + if not any(f.endswith(x) for x in blacklist) ] if len(hf_weights_files) == 0: - raise RuntimeError(f"Cannot find any model weights with `{model_name_or_path}`") + raise RuntimeError( + f"Cannot find any model weights with `{model_name_or_path}`") return hf_folder, hf_weights_files, use_safetensors @@ -280,7 +280,8 @@ def convert_pyslice_to_tensor(x: Any) -> torch.Tensor: return x -def default_weight_loader(param: torch.Tensor, loaded_weight: 
torch.Tensor) -> None: +def default_weight_loader(param: torch.Tensor, + loaded_weight: torch.Tensor) -> None: """Default weight loader.""" assert param.size() == loaded_weight.size() param.data.copy_(loaded_weight) From 8ea4d9d458d30f897693b3e4e2136ddb101c9b34 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 15:31:29 +0000 Subject: [PATCH 14/96] try reversing some formatting changes --- setup.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/setup.py b/setup.py index ba351402f7b16..56864a91b3a3b 100644 --- a/setup.py +++ b/setup.py @@ -11,12 +11,7 @@ import setuptools import torch import torch.utils.cpp_extension as torch_cpp_ext -from torch.utils.cpp_extension import ( - BuildExtension, - CUDAExtension, - CUDA_HOME, - ROCM_HOME, -) +from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME ROOT_DIR = os.path.dirname(__file__) From b993971ade28f4abaa8ad7339931e1a9866681d5 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 15:34:28 +0000 Subject: [PATCH 15/96] restored --- setup.py | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/setup.py b/setup.py index 56864a91b3a3b..8fcb86394f76d 100644 --- a/setup.py +++ b/setup.py @@ -70,12 +70,10 @@ def _is_cuda() -> bool: def get_hipcc_rocm_version(): # Run the hipcc --version command - result = subprocess.run( - ["hipcc", "--version"], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - ) + result = subprocess.run(['hipcc', '--version'], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True) # Check if the command was executed successfully if result.returncode != 0: @@ -83,7 +81,7 @@ def get_hipcc_rocm_version(): return None # Extract the version using a regular expression - match = re.search(r"HIP version: (\S+)", result.stdout) + match = re.search(r'HIP version: (\S+)', result.stdout) if match: # Return the version string return match.group(1) @@ -99,7 +97,6 @@ def glob(pattern: str): def get_neuronxcc_version(): import sysconfig - site_dir = sysconfig.get_paths()["purelib"] version_file = os.path.join(site_dir, "neuronxcc", "version", "__init__.py") @@ -145,8 +142,8 @@ def get_pytorch_rocm_arch() -> Set[str]: # If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator if env_arch_list is None: command = "rocm_agent_enumerator" - env_arch_list = (subprocess.check_output( - [command]).decode("utf-8").strip().replace("\n", ";")) + env_arch_list = subprocess.check_output([command]).decode('utf-8')\ + .strip().replace("\n", ";") arch_source_str = "rocm_agent_enumerator" else: arch_source_str = "PYTORCH_ROCM_ARCH env variable" @@ -170,8 +167,7 @@ def get_pytorch_rocm_arch() -> Set[str]: f"excluded from the {arch_source_str} output " f"({env_arch_list}). Supported ROCM architectures are: " f"{ROCM_SUPPORTED_ARCHS}.", - stacklevel=2, - ) + stacklevel=2) return arch_list @@ -209,8 +205,7 @@ def get_torch_arch_list() -> Set[str]: "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " f"({env_arch_list}). 
Supported CUDA architectures are: " f"{valid_archs}.", - stacklevel=2, - ) + stacklevel=2) return arch_list @@ -249,8 +244,8 @@ def get_torch_arch_list() -> Set[str]: if nvcc_cuda_version < Version("11.0"): raise RuntimeError( "CUDA 11.0 or higher is required to build the package.") - if nvcc_cuda_version < Version("11.1") and any( - cc.startswith("8.6") for cc in compute_capabilities): + if (nvcc_cuda_version < Version("11.1") + and any(cc.startswith("8.6") for cc in compute_capabilities)): raise RuntimeError( "CUDA 11.1 or higher is required for compute capability 8.6.") if nvcc_cuda_version < Version("11.8"): @@ -263,8 +258,7 @@ def get_torch_arch_list() -> Set[str]: warnings.warn( "CUDA 11.8 or higher is required for compute capability 8.9. " "Targeting compute capability 8.0 instead.", - stacklevel=2, - ) + stacklevel=2) compute_capabilities = set(cc for cc in compute_capabilities if not cc.startswith("8.9")) compute_capabilities.add("8.0+PTX") @@ -288,8 +282,7 @@ def get_torch_arch_list() -> Set[str]: ] if capability.endswith("+PTX"): NVCC_FLAGS_PUNICA += [ - "-gencode", - f"arch=compute_{num},code=compute_{num}", + "-gencode", f"arch=compute_{num},code=compute_{num}" ] # Use NVCC threads to parallelize the build. @@ -304,10 +297,10 @@ def get_torch_arch_list() -> Set[str]: # changes for punica kernels NVCC_FLAGS += torch_cpp_ext.COMMON_NVCC_FLAGS REMOVE_NVCC_FLAGS = [ - "-D__CUDA_NO_HALF_OPERATORS__", - "-D__CUDA_NO_HALF_CONVERSIONS__", - "-D__CUDA_NO_BFLOAT16_CONVERSIONS__", - "-D__CUDA_NO_HALF2_OPERATORS__", + '-D__CUDA_NO_HALF_OPERATORS__', + '-D__CUDA_NO_HALF_CONVERSIONS__', + '-D__CUDA_NO_BFLOAT16_CONVERSIONS__', + '-D__CUDA_NO_HALF2_OPERATORS__', ] for flag in REMOVE_NVCC_FLAGS: with contextlib.suppress(ValueError): @@ -348,8 +341,6 @@ def get_torch_arch_list() -> Set[str]: ] if _is_cuda(): - vllm_extension_sources.append("csrc/quantization/aqlm/aqlm_cuda_entry.cpp") - vllm_extension_sources.append("csrc/quantization/aqlm/aqlm_cuda_kernel.cu") vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") vllm_extension_sources.append("csrc/custom_all_reduce.cu") From 17668866afc1471c57e91f245fd987f661e4304a Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 15:38:36 +0000 Subject: [PATCH 16/96] add aqlm_cuda --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8fcb86394f76d..d177a6e1d8d4d 100644 --- a/setup.py +++ b/setup.py @@ -343,7 +343,8 @@ def get_torch_arch_list() -> Set[str]: if _is_cuda(): vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") vllm_extension_sources.append("csrc/custom_all_reduce.cu") - + vllm_extension_sources.append("csrc/quantization/aqlm/aqlm_cuda_entry.cpp") + vllm_extension_sources.append("csrc/quantization/aqlm/aqlm_cuda_kernel.cu") # Add MoE kernels. ext_modules.append( CUDAExtension( From b673f4791d9a3b71659f7df3f270bd61c64eea76 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 15:49:46 +0000 Subject: [PATCH 17/96] restore formatting --- vllm/config.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 19f1c0e27b103..70a5f3b77eba1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -98,7 +98,6 @@ def __init__( # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. 
from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C - if not os.path.exists(model): model_path = snapshot_download(model_id=model, cache_dir=download_dir, @@ -135,7 +134,7 @@ def _verify_load_format(self) -> None: if (f not in rocm_not_supported_load_format) ] raise ValueError( - f"load format '{load_format}' is not supported in ROCm. " + f"load format \'{load_format}\' is not supported in ROCm. " f"Supported load format are " f"{rocm_supported_load_format}") @@ -572,7 +571,7 @@ def _get_and_verify_dtype( k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() if (k not in _ROCM_NOT_SUPPORTED_DTYPE) ] - raise ValueError(f"dtype '{dtype}' is not supported in ROCm. " + raise ValueError(f"dtype \'{dtype}\' is not supported in ROCm. " f"Supported dtypes are {rocm_supported_dtypes}") # Verify the dtype. From 4e7d39808f0df1c9f0b51499debbecc0ed84c3f4 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 15:52:50 +0000 Subject: [PATCH 18/96] restore format --- vllm/model_executor/layers/linear.py | 42 ++++++++++------------------ 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index abbdf7c649557..edec583e2585c 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -6,13 +6,9 @@ from torch.nn.parameter import Parameter from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, -) + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.parallel_utils.communication_op import ( - tensor_model_parallel_all_reduce, - tensor_model_parallel_all_gather, -) + tensor_model_parallel_all_reduce, tensor_model_parallel_all_gather) from vllm.model_executor.parallel_utils.utils import divide, split_tensor_along_last_dim from vllm.model_executor.utils import set_weight_attrs from vllm.logger import init_logger @@ -24,24 +20,18 @@ class LinearMethodBase(ABC): """Base class for different (maybe quantized) linear methods.""" @abstractmethod - def create_weights( - self, - input_size_per_partition: int, - output_size_per_partition: int, - input_size: int, - output_size: int, - params_dtype: torch.dtype, - ) -> Dict[str, Any]: + def create_weights(self, input_size_per_partition: int, + output_size_per_partition: int, input_size: int, + output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: """Create weights for a linear layer.""" raise NotImplementedError @abstractmethod - def apply_weights( - self, - weights: Dict[str, torch.Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + def apply_weights(self, + weights: Dict[str, torch.Tensor], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: """Apply the weights to the input tensor.""" raise NotImplementedError @@ -57,14 +47,10 @@ class UnquantizedLinearMethod(LinearMethodBase): def __init__(self, separate_bias_add: bool = False): self.separate_bias_add = separate_bias_add - def create_weights( - self, - input_size_per_partition: int, - output_size_per_partition: int, - input_size: int, - output_size: int, - params_dtype: torch.dtype, - ) -> Dict[str, Any]: + def create_weights(self, input_size_per_partition: int, + output_size_per_partition: int, input_size: int, + output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: weight = Parameter( torch.empty(output_size_per_partition, input_size_per_partition, From 
4fc1426e0d5699d34ccce30f82591a2689dbbad0 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 15:56:11 +0000 Subject: [PATCH 19/96] more formatting --- vllm/model_executor/layers/linear.py | 42 +++++++++++----------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index edec583e2585c..c43508b6f86c7 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -9,7 +9,8 @@ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.parallel_utils.communication_op import ( tensor_model_parallel_all_reduce, tensor_model_parallel_all_gather) -from vllm.model_executor.parallel_utils.utils import divide, split_tensor_along_last_dim +from vllm.model_executor.parallel_utils.utils import ( + divide, split_tensor_along_last_dim) from vllm.model_executor.utils import set_weight_attrs from vllm.logger import init_logger @@ -51,21 +52,17 @@ def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype) -> Dict[str, Any]: - weight = Parameter( - torch.empty(output_size_per_partition, - input_size_per_partition, - dtype=params_dtype), - requires_grad=False, - ) + weight = Parameter(torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=params_dtype), + requires_grad=False) set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) return {"weight": weight} - def apply_weights( - self, - weights: Dict[str, torch.Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + def apply_weights(self, + weights: Dict[str, torch.Tensor], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: weight = weights["weight"] if self.separate_bias_add: if bias: @@ -108,12 +105,8 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size, - self.output_size, - self.input_size, - self.output_size, - self.params_dtype, - ) + self.input_size, self.output_size, self.input_size, + self.output_size, self.params_dtype) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -192,13 +185,10 @@ def __init__( self.bias = Parameter( torch.empty(self.output_size_per_partition, dtype=params_dtype)) - set_weight_attrs( - self.bias, - { - "output_dim": 0, - "weight_loader": self.weight_loader, - }, - ) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) else: self.register_parameter("bias", None) From ac2ef816258c775d0c44a08b3002432270905ea6 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 16:46:13 +0000 Subject: [PATCH 20/96] format --- vllm/model_executor/layers/linear.py | 44 ++++++++-------------------- 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index c43508b6f86c7..90e2e7ca0c15e 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -171,12 +171,8 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size, - self.output_size_per_partition, - self.input_size, - self.output_size, - self.params_dtype, - ) + 
self.input_size, self.output_size_per_partition, self.input_size, + self.output_size, self.params_dtype) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -253,22 +249,13 @@ def __init__( self.output_sizes = output_sizes tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) - super().__init__( - input_size, - sum(output_sizes), - bias, - gather_output, - skip_bias_add, - params_dtype, - linear_method, - ) + super().__init__(input_size, sum(output_sizes), bias, gather_output, + skip_bias_add, params_dtype, linear_method) - def weight_loader( - self, - param: Parameter, - loaded_weight: torch.Tensor, - loaded_shard_id: Optional[int] = None, - ): + def weight_loader(self, + param: Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[int] = None): param_data = param.data output_dim = getattr(param, "output_dim", None) if loaded_shard_id is None: @@ -380,17 +367,10 @@ def __init__( self.num_kv_heads = divide(self.total_num_kv_heads, tp_size) self.num_kv_head_replicas = 1 input_size = self.hidden_size - output_size = ((self.num_heads + 2 * self.num_kv_heads) * tp_size * - self.head_size) - super().__init__( - input_size, - output_size, - bias, - False, - skip_bias_add, - params_dtype, - linear_method, - ) + output_size = (self.num_heads + + 2 * self.num_kv_heads) * tp_size * self.head_size + super().__init__(input_size, output_size, bias, False, skip_bias_add, + params_dtype, linear_method) def weight_loader( self, From 30d2d42550e5a2a5b03943899651829266ef8ce6 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 16:52:51 +0000 Subject: [PATCH 21/96] restore formatting --- vllm/model_executor/layers/linear.py | 44 +++++++-------------- vllm/model_executor/models/llama.py | 57 +++++++++++----------------- 2 files changed, 36 insertions(+), 65 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 90e2e7ca0c15e..e086b735ca8ca 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -372,12 +372,10 @@ def __init__( super().__init__(input_size, output_size, bias, False, skip_bias_add, params_dtype, linear_method) - def weight_loader( - self, - param: Parameter, - loaded_weight: torch.Tensor, - loaded_shard_id: Optional[str] = None, - ): + def weight_loader(self, + param: Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[str] = None): param_data = param.data output_dim = getattr(param, "output_dim", None) if loaded_shard_id is None: @@ -389,17 +387,10 @@ def weight_loader( shard_offsets = [ # (shard_id, shard_offset, shard_size) ("q", 0, self.total_num_heads * self.head_size), - ( - "k", - self.total_num_heads * self.head_size, - self.total_num_kv_heads * self.head_size, - ), - ( - "v", - (self.total_num_heads + self.total_num_kv_heads) * - self.head_size, - self.total_num_kv_heads * self.head_size, - ), + ("k", self.total_num_heads * self.head_size, + self.total_num_kv_heads * self.head_size), + ("v", (self.total_num_heads + self.total_num_kv_heads) * + self.head_size, self.total_num_kv_heads * self.head_size), ] packed_dim = getattr(param, "packed_dim", None) for shard_id, shard_offset, shard_size in shard_offsets: @@ -507,12 +498,8 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size_per_partition, - 
self.output_size, - self.input_size, - self.output_size, - self.params_dtype, - ) + self.input_size_per_partition, self.output_size, self.input_size, + self.output_size, self.params_dtype) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -525,13 +512,10 @@ def __init__( if bias: self.bias = Parameter( torch.empty(self.output_size, dtype=params_dtype)) - set_weight_attrs( - self.bias, - { - "output_dim": 0, - "weight_loader": self.weight_loader, - }, - ) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) else: self.register_parameter("bias", None) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 17ffc56ff42cf..e928ff1d83fd4 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -31,26 +31,19 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import PagedAttention from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import ( - LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear, -) +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, - ParallelLMHead, - DEFAULT_VOCAB_PADDING_SIZE, -) + VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size, ) + get_tensor_model_parallel_world_size) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import ( - default_weight_loader, - hf_model_weights_iterator, -) +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig @@ -68,11 +61,9 @@ def __init__( ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, - [intermediate_size] * 2, + hidden_size, [intermediate_size] * 2, bias=False, - linear_method=linear_method, - ) + linear_method=linear_method) self.down_proj = RowParallelLinear(intermediate_size, hidden_size, bias=False, @@ -148,13 +139,11 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=sliding_window, - ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=sliding_window) def forward( self, @@ -249,8 +238,8 @@ def __init__( super().__init__() self.config = config self.padding_idx = config.pad_token_id - lora_vocab = ((lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0) + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 self.vocab_size = config.vocab_size + lora_vocab self.org_vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( @@ -358,13 +347,11 @@ def sample( sampling_metadata) return next_tokens - def load_weights( - self, - model_name_or_path: str, - cache_dir: 
Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None, - ): + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), From 3fcb9446deeda3201fd96f0145bedc614e8801d9 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 16:54:27 +0000 Subject: [PATCH 22/96] restore formatting --- vllm/model_executor/models/llama.py | 5 +++-- vllm/model_executor/weight_utils.py | 5 +---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e928ff1d83fd4..1ce2223ed2de4 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -365,11 +365,12 @@ def load_weights(self, model_name_or_path, cache_dir, load_format, revision): if "rotary_emb.inv_freq" in name: continue - if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - for param_name, weight_name, shard_id in stacked_params_mapping: + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue name = name.replace(weight_name, param_name) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 48900a8b02271..6c32ac9125d45 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -15,10 +15,7 @@ from vllm.config import ModelConfig from vllm.logger import init_logger -from vllm.model_executor.layers.quantization import ( - get_quantization_config, - QuantizationConfig, -) +from vllm.model_executor.layers.quantization import (get_quantization_config, QuantizationConfig) logger = init_logger(__name__) From 4e7291aebe80fbba81f6f79f0d041e8e75a2696a Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 16:56:40 +0000 Subject: [PATCH 23/96] formta --- vllm/model_executor/weight_utils.py | 30 +++++++++++++---------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 6c32ac9125d45..bdcb9d5976576 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -15,7 +15,8 @@ from vllm.config import ModelConfig from vllm.logger import init_logger -from vllm.model_executor.layers.quantization import (get_quantization_config, QuantizationConfig) +from vllm.model_executor.layers.quantization import (get_quantization_config, + QuantizationConfig) logger = init_logger(__name__) @@ -91,13 +92,11 @@ def get_quant_config(model_config: ModelConfig) -> QuantizationConfig: if not is_local: # Download the config files. 
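For context on what get_quant_config ends up locating here: an AQLM checkpoint carries its parameters in the quantization_config block of config.json, and from_config above consumes only the four group/codebook keys (the linear_weights_not_to_quantize list is still a TODO). A hedged sketch of the expected shape of that block, with illustrative values for a 2-bit 1x16 model rather than values copied from a real file:

# Hypothetical quantization_config block; the key names are the ones
# AQLMConfig.from_config reads, the values are illustrative.
quantization_config = {
    "quant_method": "aqlm",   # assumed tag used to select the AQLM backend
    "in_group_size": 8,
    "out_group_size": 1,
    "num_codebooks": 1,
    "nbits_per_codebook": 16,
    # "linear_weights_not_to_quantize": [...]  # present in real configs, unused here
}

# from_config(quantization_config) would then build
# AQLMConfig(in_group_size=8, nbits_per_codebook=16,
#            num_codebooks=1, out_group_size=1).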
with get_lock(model_name_or_path, model_config.download_dir): - hf_folder = snapshot_download( - model_name_or_path, - revision=model_config.revision, - allow_patterns="*.json", - cache_dir=model_config.download_dir, - tqdm_class=Disabledtqdm, - ) + hf_folder = snapshot_download(model_name_or_path, + revision=model_config.revision, + allow_patterns="*.json", + cache_dir=model_config.download_dir, + tqdm_class=Disabledtqdm) else: hf_folder = model_name_or_path config_files = glob.glob(os.path.join(hf_folder, "*.json")) @@ -162,13 +161,11 @@ def prepare_hf_model_weights( # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. with get_lock(model_name_or_path, cache_dir): - hf_folder = snapshot_download( - model_name_or_path, - allow_patterns=allow_patterns, - cache_dir=cache_dir, - tqdm_class=Disabledtqdm, - revision=revision, - ) + hf_folder = snapshot_download(model_name_or_path, + allow_patterns=allow_patterns, + cache_dir=cache_dir, + tqdm_class=Disabledtqdm, + revision=revision) else: hf_folder = model_name_or_path hf_weights_files: List[str] = [] @@ -212,8 +209,7 @@ def hf_model_weights_iterator( cache_dir=cache_dir, load_format=load_format, fall_back_to_pt=fall_back_to_pt, - revision=revision, - ) + revision=revision) if load_format == "npcache": # Currently np_cache only support *.bin checkpoints From 39abbc0c18322602d03e9ce2f0678c5afdc8f479 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 20:27:14 +0000 Subject: [PATCH 24/96] first working aqlm --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 9 --- examples/aqlm_test.py | 14 ++-- vllm/model_executor/layers/linear.py | 59 ++++++++------ .../layers/quantization/aqlm.py | 77 ++++++++++++------- .../model_executor/layers/quantization/awq.py | 4 +- .../layers/quantization/gptq.py | 12 +-- .../layers/quantization/squeezellm.py | 4 +- vllm/model_executor/models/llama.py | 5 +- 8 files changed, 98 insertions(+), 86 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 6e4aa751c113d..fb4c9d54efdee 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -134,12 +134,3 @@ torch::Tensor code2x8_matmat( auto output = flat_output.reshape(output_sizes).clone(); return output; } - - -/* -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("code1x16_matmat", &code1x16_matmat, "1x16 (2bit) codebook matrix-matrix product."); - m.def("code2x8_matmat", &code2x8_matmat, "2x8 (2bit) codebook matrix-matrix product."); -} -*/ - diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index 26d90584a7858..cbc9b37857452 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -1,13 +1,11 @@ from vllm import LLM, SamplingParams #model = LLM("nm-testing/llama2.c-stories110M-pruned2.4") -#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) -model = LLM( - "/home/jaemz/.cache/huggingface/hub/models--BlackSamorez--Llama-2-7b-AQLM-2Bit-1x16-hf/snapshots/1756949fed7fe691c00f015e2ebd18503a975f3b", - enforce_eager=True) +model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) +#model = LLM( +# "/home/jaemz/.cache/huggingface/hub/models--BlackSamorez--Llama-2-7b-AQLM-2Bit-1x16-hf/snapshots/1756949fed7fe691c00f015e2ebd18503a975f3b", +# enforce_eager=True) -sampling_params = SamplingParams(max_tokens=200, temperature=0) -outputs = model.generate("How are you ", sampling_params=sampling_params) -print("generated!") +sampling_params = 
SamplingParams(max_tokens=100, temperature=0) +outputs = model.generate("Hello my name is", sampling_params=sampling_params) print(outputs[0].outputs[0].text) -print("output above!") diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index e086b735ca8ca..c20d28054c29a 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -23,8 +23,8 @@ class LinearMethodBase(ABC): @abstractmethod def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: + output_size: int, params_dtype: torch.dtype, + shards: int) -> Dict[str, Any]: """Create weights for a linear layer.""" raise NotImplementedError @@ -50,8 +50,8 @@ def __init__(self, separate_bias_add: bool = False): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: + output_size: int, params_dtype: torch.dtype, + shards: int) -> Dict[str, Any]: weight = Parameter(torch.empty(output_size_per_partition, input_size_per_partition, dtype=params_dtype), @@ -106,7 +106,7 @@ def __init__( self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( self.input_size, self.output_size, self.input_size, - self.output_size, self.params_dtype) + self.output_size, self.params_dtype, 1) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -142,6 +142,7 @@ class ColumnParallelLinear(torch.nn.Module): skip adding bias but instead return it. params_dtype: Data type for the parameters. linear_method: (Maybe quantized) linear method. + shards: Number of packed shards, like for QKV this would be 3 """ def __init__( @@ -153,6 +154,7 @@ def __init__( skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, linear_method: Optional[LinearMethodBase] = None, + shards: int = 1, ): super().__init__() @@ -160,6 +162,7 @@ def __init__( self.input_size = input_size self.output_size = output_size self.gather_output = gather_output + self.shards = shards # Divide the weight matrix along the last dimension. tp_size = get_tensor_model_parallel_world_size() self.output_size_per_partition = divide(output_size, tp_size) @@ -172,7 +175,7 @@ def __init__( self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( self.input_size, self.output_size_per_partition, self.input_size, - self.output_size, self.params_dtype) + self.output_size, self.params_dtype, self.shards) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -250,14 +253,17 @@ def __init__( tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) super().__init__(input_size, sum(output_sizes), bias, gather_output, - skip_bias_add, params_dtype, linear_method) + skip_bias_add, params_dtype, linear_method, + len(self.output_sizes)) def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor, loaded_shard_id: Optional[int] = None): + param_data = param.data output_dim = getattr(param, "output_dim", None) + shard_dim = getattr(param, "shard_dim", None) if loaded_shard_id is None: # Loaded weight is already packed. 
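The shards argument threaded through create_weights in these hunks exists because merged layers (QKVParallelLinear with 3 projections, MergedColumnParallelLinear with len(output_sizes)) each bring their own AQLM codebooks; those are stacked along dim 0 of a single codebooks tensor, and the weight loader then picks the right slice via the shard_dim attribute. A rough sketch of that layout, assuming a QKV layer with one codebook per projection (shapes illustrative):

import torch

# Stacked codebooks for a merged QKV layer: shards=3, num_codebooks=1 each.
num_codebooks, shards = 1, 3
nbits, out_group, in_group = 16, 1, 8
codebooks = torch.empty(num_codebooks * shards, 2 ** nbits, out_group, in_group)

def load_codebook_shard(stacked: torch.Tensor, shard_index: int,
                        loaded: torch.Tensor) -> None:
    # Mirrors the shard_dim branch of the loader: narrow to this projection's
    # slice along dim 0 and copy the checkpoint tensor into it.
    shard_size = loaded.shape[0]
    stacked.narrow(0, shard_index * shard_size, shard_size).copy_(loaded)

# "q" -> index 0, "k" -> index 1, "v" -> index 2
load_codebook_shard(codebooks, 1,
                    torch.randn(num_codebooks, 2 ** nbits, out_group, in_group))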
if output_dim is None: @@ -287,12 +293,6 @@ def weight_loader(self, if output_dim is not None: shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size shard_size = self.output_sizes[loaded_shard_id] // tp_size - #TEST - if loaded_shard_id > 0: - print(" loading a shard ", loaded_shard_id) - print(" param_data shape ", param_data.shape) - print(" loaded_weight shape ", loaded_weight.shape) - # If quantized, we need to adjust the offset and size to account # for the packing. packed_dim = getattr(param, "packed_dim", None) @@ -304,6 +304,13 @@ def weight_loader(self, start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + elif shard_dim is not None: + shard_size = loaded_weight.shape[shard_dim] + shard_offset = loaded_shard_id * shard_size + param_data = param_data.narrow(shard_dim, shard_offset, shard_size) + # TODO what is up with this TP rank? + #start_idx = tp_rank * shard_size + #loaded_weight = loaded_weight.narrow(output_dim, start_idx,shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -370,14 +377,17 @@ def __init__( output_size = (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size super().__init__(input_size, output_size, bias, False, skip_bias_add, - params_dtype, linear_method) + params_dtype, linear_method, 3) def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor, loaded_shard_id: Optional[str] = None): param_data = param.data + output_dim = getattr(param, "output_dim", None) + shard_dim = getattr(param, "shard_dim", None) + if loaded_shard_id is None: # Loaded weight is already packed. if output_dim is None: @@ -432,6 +442,16 @@ def weight_loader(self, start_idx = shard_id * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + elif shard_dim is not None: + shard_size = loaded_weight.shape[shard_dim] + if loaded_shard_id == "q": + shard_index = 0 + elif loaded_shard_id == "k": + shard_index = 1 + elif loaded_shard_id == "v": + shard_index = 2 + param_data = param_data.narrow(shard_dim, shard_index * shard_size, + shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -499,7 +519,7 @@ def __init__( self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( self.input_size_per_partition, self.output_size, self.input_size, - self.output_size, self.params_dtype) + self.output_size, self.params_dtype, 1) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -524,23 +544,14 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): input_dim = getattr(param, "input_dim", None) param_data = param.data - # TEST - print(" param data shape is ", param_data.shape) - print(" loaded_weight is ", loaded_weight.shape) - if input_dim is not None: shard_size = param_data.shape[input_dim] start_idx = tp_rank * shard_size - print(" loaded_weight dtype is ", loaded_weight.dtype) - print(" data_param dtype is ", param_data.dtype) - #TEST assert (start_idx == 0 and shard_size == loaded_weight.shape[input_dim]) loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) - print("sharded loaded_weight is ", loaded_weight.shape) - assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 0f299d770b4ee..daa7f88a3adef 
100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -118,15 +118,10 @@ class AQLMLinearMethod(LinearMethodBase): def __init__(self, quant_config: AQLMConfig): self.quant_config = quant_config - def create_weights( - self, - input_size_per_partition: int, - output_size_per_partition: int, - input_size: int, - output_size: int, - params_dtype: torch.dtype, - ) -> Dict[str, Any]: - #TEST + def create_weights(self, input_size_per_partition: int, + output_size_per_partition: int, input_size: int, + output_size: int, params_dtype: torch.dtype, + shards: int) -> Dict[str, Any]: assert (output_size == output_size_per_partition) assert (input_size == input_size_per_partition) del output_size # Unused. @@ -145,19 +140,16 @@ def create_weights( "weight shape. This can be caused by too large " "tensor parallel size.") - # or does this need more dimensions and use the correct nbits_per_codebook as an int type. Does that pack them? codes = Parameter( torch.empty( output_size_per_partition, # not entirely sure what to do with num_out_groups, if we need this pack factor. input_size_per_partition // self.quant_config.pack_factor, - 1, # probably should be num codebooks. + 1, # probably should be num codebooks and change pack factor? dtype=get_int_dtype(self.quant_config.nbits_per_codebook), ), requires_grad=False, ) - print(codes.shape) - set_weight_attrs( codes, { @@ -170,7 +162,7 @@ def create_weights( codebooks = Parameter( torch.empty( - self.quant_config.num_codebooks, + self.quant_config.num_codebooks * shards, 2**self.quant_config.nbits_per_codebook, self.quant_config.out_group_size, self.quant_config.in_group_size, @@ -178,6 +170,13 @@ def create_weights( ), requires_grad=False, ) + set_weight_attrs( + codebooks, + { + "shard_dim": 0, + "shards": shards + }, + ) scales = Parameter( torch.empty( @@ -213,25 +212,45 @@ def apply_weights( x: torch.Tensor, bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - # qweight = weights["qweight"] do I need the same flattening? - # out_shape = x.shape[:-1] + (qweight.shape[-1],) - # reshaped_x = x.reshape(-1, x.shape[-1]) # - - print("input shape is ", x.shape) - if (x.shape[1] == 5): - print("codes shape is ", weights["codes"].shape) - print("codebooks shape is ", weights["codebooks"].shape) - print("scales shape is ", weights["scales"].shape) - print("x is ", x) + codebooks = weights["codebooks"] + codes = weights["codes"] + scales = weights["scales"] + + shard_dim = getattr(codebooks, "shard_dim", None) + if shard_dim is not None: + output_shape = x.shape[:-1] + (scales.shape[0], ) + output = torch.empty(output_shape, dtype=x.dtype, device=x.device) + shards = getattr(codebooks, "shards", None) + # break the shards apart and combine them. + assert (shard_dim == 0) + num_codebooks = codebooks.shape[shard_dim] // shards + + assert (scales.shape[0] == codes.shape[0]) + assert (scales.shape[0] % shards == 0) + base_size = scales.shape[0] // shards + + for shard_id in range(shards): + shard_output = ops.aqlm_gemm( + x, codes.narrow(0, shard_id * base_size, base_size), + codebooks.narrow(shard_dim, shard_id * num_codebooks, + num_codebooks), + scales.narrow(0, shard_id * base_size, base_size), + None if bias is None else bias.narrow( + 0, shard_id * base_size, base_size)) + + output_slice = output.narrow(-1, shard_id * base_size, + base_size) + assert (output_slice.shape == shard_output.shape) + output_slice.copy_(shard_output) + return output output = ops.aqlm_gemm( - x, # hmm, reshape? 
- weights["codes"], - weights["codebooks"], - weights["scales"], + x, + codes, + codebooks, + scales, bias, ) - print("output shape is ", output.shape) return output diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 3e1c814dd233c..a3623ae5b0417 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -79,8 +79,8 @@ def __init__(self, quant_config: AWQConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: + output_size: int, params_dtype: torch.dtype, + shards: int) -> Dict[str, Any]: if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( "The input size is not aligned with the quantized " diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 7218760fbe55d..45b06947f3799 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -87,14 +87,10 @@ class GPTQLinearMethod(LinearMethodBase): def __init__(self, quant_config: GPTQConfig): self.quant_config = quant_config - def create_weights( - self, - input_size_per_partition: int, - output_size_per_partition: int, - input_size: int, - output_size: int, - params_dtype: torch.dtype, - ) -> Dict[str, Any]: + def create_weights(self, input_size_per_partition: int, + output_size_per_partition: int, input_size: int, + output_size: int, params_dtype: torch.dtype, + shards: int) -> Dict[str, Any]: del output_size # Unused. if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index 9244e88552756..091ff22b9b095 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -69,8 +69,8 @@ def __init__(self, quant_config: SqueezeLLMConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: + output_size: int, params_dtype: torch.dtype, + shards: int) -> Dict[str, Any]: if input_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( "The input size is not aligned with the quantized " diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 1ce2223ed2de4..4c07fb19c490a 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -360,6 +360,7 @@ def load_weights(self, ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] + params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): @@ -379,8 +380,6 @@ def load_weights(self, continue param = params_dict[name] weight_loader = param.weight_loader - # TEST - print("loading ", name) weight_loader(param, loaded_weight, shard_id) break else: @@ -390,6 +389,4 @@ def load_weights(self, param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - # TEST - print("loading ", name) weight_loader(param, loaded_weight) From 8d7fa9669f419efad1f77e08dfac8b3595b383d8 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 20:32:36 +0000 Subject: [PATCH 
25/96] some improvements --- vllm/model_executor/layers/linear.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index c20d28054c29a..35da806a9097d 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -263,6 +263,7 @@ def weight_loader(self, param_data = param.data output_dim = getattr(param, "output_dim", None) + # shard_dim indicates fixed size concatenated at shard_id shard_dim = getattr(param, "shard_dim", None) if loaded_shard_id is None: # Loaded weight is already packed. @@ -308,9 +309,6 @@ def weight_loader(self, shard_size = loaded_weight.shape[shard_dim] shard_offset = loaded_shard_id * shard_size param_data = param_data.narrow(shard_dim, shard_offset, shard_size) - # TODO what is up with this TP rank? - #start_idx = tp_rank * shard_size - #loaded_weight = loaded_weight.narrow(output_dim, start_idx,shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -444,12 +442,7 @@ def weight_loader(self, shard_size) elif shard_dim is not None: shard_size = loaded_weight.shape[shard_dim] - if loaded_shard_id == "q": - shard_index = 0 - elif loaded_shard_id == "k": - shard_index = 1 - elif loaded_shard_id == "v": - shard_index = 2 + shard_index = ["q", "k", "v"].index(loaded_shard_id) param_data = param_data.narrow(shard_dim, shard_index * shard_size, shard_size) else: From 9a3dbe1daf4a0a640b77d827e238df0ea3bb7726 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 20:37:07 +0000 Subject: [PATCH 26/96] restore format --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 2 +- vllm/model_executor/models/llama.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index fb4c9d54efdee..991e59f5022a7 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -74,7 +74,7 @@ torch::Tensor code1x16_matmat( auto output_sizes = input_sizes.vec(); output_sizes.pop_back(); output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes).clone(); + auto output = flat_output.reshape(output_sizes); // .clone(); return output; } diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 4c07fb19c490a..b7f6b8f3ec374 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -360,7 +360,6 @@ def load_weights(self, ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] - params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): From e7c2601efcedfeb2c3bbd39d5f6d994919d32b74 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 21:42:03 +0000 Subject: [PATCH 27/96] make a central c++ aqlm entry point --- csrc/ops.h | 2 +- csrc/pybind.cpp | 2 +- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 27 ++++++++++++++++++++-- examples/aqlm_test.py | 8 +++---- 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index 246862ee048f0..c70a04e1a8694 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -69,7 +69,7 @@ void gelu_fast( torch::Tensor& out, torch::Tensor& input); -torch::Tensor code1x16_matmat( +torch::Tensor aqlm_gemm( const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, diff --git a/csrc/pybind.cpp 
b/csrc/pybind.cpp index d1410071d3afe..51664eeb6b461 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -53,7 +53,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); // Quantization ops - ops.def("aqlm_gemm", &code1x16_matmat, "Quantized GEMM for AQLM"); + ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM"); #ifndef USE_ROCM ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 991e59f5022a7..ac620e9361854 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -74,7 +74,7 @@ torch::Tensor code1x16_matmat( auto output_sizes = input_sizes.vec(); output_sizes.pop_back(); output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes); // .clone(); + auto output = flat_output.reshape(output_sizes); return output; } @@ -131,6 +131,29 @@ torch::Tensor code2x8_matmat( auto output_sizes = input_sizes.vec(); output_sizes.pop_back(); output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes).clone(); + auto output = flat_output.reshape(output_sizes); return output; } + +torch::Tensor aqlm_gemm( + const torch::Tensor& input, + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales, + const std::optional& bias +) +{ + int const nbooks = codebooks.size(0); + int const entries = codebooks.size(1); + + if (nbooks == 1 && entries == (1 << 16)) + { + return code1x16_matmat(input, codes, codebooks, scales, bias); + } + if (nbooks == 2 && entries == (1 << 8)) + { + return code2x8_matmat(input, codes, codebooks, scales, bias); + } + // TODO error somehow. + return {}; +} diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index cbc9b37857452..1ca5400db6065 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -1,10 +1,10 @@ from vllm import LLM, SamplingParams -#model = LLM("nm-testing/llama2.c-stories110M-pruned2.4") +# 1x16 model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) -#model = LLM( -# "/home/jaemz/.cache/huggingface/hub/models--BlackSamorez--Llama-2-7b-AQLM-2Bit-1x16-hf/snapshots/1756949fed7fe691c00f015e2ebd18503a975f3b", -# enforce_eager=True) + +# 2 x 8 +#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf", enforce_eager=True) sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = model.generate("Hello my name is", sampling_params=sampling_params) From 6eba0357701bf259389825432fc192a9fff2996a Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 22:03:14 +0000 Subject: [PATCH 28/96] add support for 2x8, worked shockingly easily --- examples/aqlm_test.py | 4 ++-- vllm/model_executor/layers/quantization/aqlm.py | 6 ++---- vllm/model_executor/models/llama.py | 1 + 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index 1ca5400db6065..a2fd5a9b7a7c5 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -1,10 +1,10 @@ from vllm import LLM, SamplingParams # 1x16 -model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) +#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) # 2 x 8 -#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf", enforce_eager=True) +model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf", enforce_eager=True) sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = 
model.generate("Hello my name is", sampling_params=sampling_params) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index daa7f88a3adef..848efcbfe5d97 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -41,8 +41,7 @@ def __init__( # I think pack factor is *probably* how many elements fit into one quantized tensor element. # though out group size makes it interesting, because really we are doing 2D blocks, potentially. # maybe this is vllms first 2D packing? Arg. - self.pack_factor = (self.in_group_size * self.out_group_size // - self.num_codebooks) + self.pack_factor = (self.in_group_size * self.out_group_size) def __repr__(self) -> str: return (f"AQLMConfig(in_group_size={self.in_group_size}, " @@ -144,7 +143,7 @@ def create_weights(self, input_size_per_partition: int, torch.empty( output_size_per_partition, # not entirely sure what to do with num_out_groups, if we need this pack factor. input_size_per_partition // self.quant_config.pack_factor, - 1, # probably should be num codebooks and change pack factor? + self.quant_config.num_codebooks, dtype=get_int_dtype(self.quant_config.nbits_per_codebook), ), requires_grad=False, @@ -212,7 +211,6 @@ def apply_weights( x: torch.Tensor, bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - codebooks = weights["codebooks"] codes = weights["codes"] scales = weights["scales"] diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index b7f6b8f3ec374..88aea8de02845 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -388,4 +388,5 @@ def load_weights(self, param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) From 604f66fef9b5329f7276046a2692f39a727d2a03 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 18:05:57 +0000 Subject: [PATCH 29/96] support more than one model --- examples/aqlm_test.py | 10 ++++-- vllm/model_executor/layers/linear.py | 25 +++++++------- .../layers/quantization/aqlm.py | 33 +++++++++++-------- .../model_executor/layers/quantization/awq.py | 2 +- .../layers/quantization/gptq.py | 2 +- .../layers/quantization/squeezellm.py | 2 +- vllm/model_executor/models/llama.py | 4 ++- 7 files changed, 46 insertions(+), 32 deletions(-) diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index a2fd5a9b7a7c5..5a4bd4cc7572e 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -1,10 +1,14 @@ from vllm import LLM, SamplingParams -# 1x16 #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) -# 2 x 8 -model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf", enforce_eager=True) +#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf", enforce_eager=True) + +model = LLM("BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", enforce_eager=True) + +# These have custom code and the old format, and puzzling and conflicting stats, which probably I shouldn't even try to support. 
+#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) +#model = LLM("BlackSamorez/Llama-2-13b-AQLM-2Bit-1x16-hf", enforce_eager=True, trust_remote_code=True) sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = model.generate("Hello my name is", sampling_params=sampling_params) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 35da806a9097d..4c7d246ca519e 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -24,7 +24,7 @@ class LinearMethodBase(ABC): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - shards: int) -> Dict[str, Any]: + output_sizes: List[int]) -> Dict[str, Any]: """Create weights for a linear layer.""" raise NotImplementedError @@ -51,7 +51,7 @@ def __init__(self, separate_bias_add: bool = False): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - shards: int) -> Dict[str, Any]: + output_sizes: List[int]) -> Dict[str, Any]: weight = Parameter(torch.empty(output_size_per_partition, input_size_per_partition, dtype=params_dtype), @@ -106,7 +106,7 @@ def __init__( self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( self.input_size, self.output_size, self.input_size, - self.output_size, self.params_dtype, 1) + self.output_size, self.params_dtype, [self.output_size]) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -142,7 +142,7 @@ class ColumnParallelLinear(torch.nn.Module): skip adding bias but instead return it. params_dtype: Data type for the parameters. linear_method: (Maybe quantized) linear method. - shards: Number of packed shards, like for QKV this would be 3 + output_sizes: list of output sizes packed into one output, like for QKV the list would be size 3. """ def __init__( @@ -154,7 +154,7 @@ def __init__( skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, linear_method: Optional[LinearMethodBase] = None, - shards: int = 1, + output_sizes: List[int] = [0], ): super().__init__() @@ -162,7 +162,6 @@ def __init__( self.input_size = input_size self.output_size = output_size self.gather_output = gather_output - self.shards = shards # Divide the weight matrix along the last dimension. 
tp_size = get_tensor_model_parallel_world_size() self.output_size_per_partition = divide(output_size, tp_size) @@ -175,7 +174,7 @@ def __init__( self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( self.input_size, self.output_size_per_partition, self.input_size, - self.output_size, self.params_dtype, self.shards) + self.output_size, self.params_dtype, output_sizes) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -254,7 +253,7 @@ def __init__( assert all(output_size % tp_size == 0 for output_size in output_sizes) super().__init__(input_size, sum(output_sizes), bias, gather_output, skip_bias_add, params_dtype, linear_method, - len(self.output_sizes)) + self.output_sizes) def weight_loader(self, param: Parameter, @@ -374,15 +373,19 @@ def __init__( input_size = self.hidden_size output_size = (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size + super().__init__(input_size, output_size, bias, False, skip_bias_add, - params_dtype, linear_method, 3) + params_dtype, linear_method, [ + self.num_heads * tp_size * self.head_size, + self.num_kv_heads * tp_size * self.head_size, + self.num_kv_heads * tp_size * self.head_size + ]) def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor, loaded_shard_id: Optional[str] = None): param_data = param.data - output_dim = getattr(param, "output_dim", None) shard_dim = getattr(param, "shard_dim", None) @@ -512,7 +515,7 @@ def __init__( self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( self.input_size_per_partition, self.output_size, self.input_size, - self.output_size, self.params_dtype, 1) + self.output_size, self.params_dtype, [self.output_size]) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 848efcbfe5d97..a9732257e462d 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -120,7 +120,7 @@ def __init__(self, quant_config: AQLMConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - shards: int) -> Dict[str, Any]: + output_sizes: List[int]) -> Dict[str, Any]: assert (output_size == output_size_per_partition) assert (input_size == input_size_per_partition) del output_size # Unused. @@ -156,12 +156,13 @@ def create_weights(self, input_size_per_partition: int, "output_dim": 0, "packed_dim": 1, "pack_factor": self.quant_config.pack_factor, + "output_sizes": output_sizes }, ) codebooks = Parameter( torch.empty( - self.quant_config.num_codebooks * shards, + self.quant_config.num_codebooks * len(output_sizes), 2**self.quant_config.nbits_per_codebook, self.quant_config.out_group_size, self.quant_config.in_group_size, @@ -173,7 +174,6 @@ def create_weights(self, input_size_per_partition: int, codebooks, { "shard_dim": 0, - "shards": shards }, ) @@ -219,28 +219,33 @@ def apply_weights( if shard_dim is not None: output_shape = x.shape[:-1] + (scales.shape[0], ) output = torch.empty(output_shape, dtype=x.dtype, device=x.device) - shards = getattr(codebooks, "shards", None) + output_sizes = getattr(codes, "output_sizes", None) + outputs = len(output_sizes) + # break the shards apart and combine them. 
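Note: the loop that follows runs one GEMM per packed output partition and writes each result into its slice of the fused output. The same pattern with plain dense matmuls standing in for aqlm_gemm, using hypothetical sizes:

import torch

def fused_matmul_by_partition(x, partition_weights, output_partition_sizes):
    out = torch.empty(*x.shape[:-1], sum(output_partition_sizes))
    offset = 0
    for w, size in zip(partition_weights, output_partition_sizes):
        # One matmul per partition, copied into its slice of the fused output.
        out.narrow(-1, offset, size).copy_(x @ w.t())
        offset += size
    return out

x = torch.randn(2, 16)
ws = [torch.randn(8, 16), torch.randn(4, 16), torch.randn(4, 16)]  # q/k/v-like partitions
y = fused_matmul_by_partition(x, ws, [8, 4, 4])
assert torch.allclose(y, x @ torch.cat(ws).t())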
assert (shard_dim == 0) - num_codebooks = codebooks.shape[shard_dim] // shards + num_codebooks = codebooks.shape[shard_dim] // outputs assert (scales.shape[0] == codes.shape[0]) - assert (scales.shape[0] % shards == 0) - base_size = scales.shape[0] // shards + assert (scales.shape[0] == sum(output_sizes)) - for shard_id in range(shards): + output_offset = 0 + codebooks_offset = 0 + for output_size in output_sizes: shard_output = ops.aqlm_gemm( - x, codes.narrow(0, shard_id * base_size, base_size), - codebooks.narrow(shard_dim, shard_id * num_codebooks, + x, codes.narrow(0, output_offset, output_size), + codebooks.narrow(shard_dim, codebooks_offset, num_codebooks), - scales.narrow(0, shard_id * base_size, base_size), + scales.narrow(0, output_offset, output_size), None if bias is None else bias.narrow( - 0, shard_id * base_size, base_size)) + 0, output_offset, output_size)) - output_slice = output.narrow(-1, shard_id * base_size, - base_size) + output_slice = output.narrow(-1, output_offset, output_size) assert (output_slice.shape == shard_output.shape) output_slice.copy_(shard_output) + output_offset += output_size + codebooks_offset += num_codebooks + return output output = ops.aqlm_gemm( diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index a3623ae5b0417..60afacea9c2af 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -80,7 +80,7 @@ def __init__(self, quant_config: AWQConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - shards: int) -> Dict[str, Any]: + output_sizes: List[int]) -> Dict[str, Any]: if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( "The input size is not aligned with the quantized " diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 45b06947f3799..ae2929dcd22f8 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -90,7 +90,7 @@ def __init__(self, quant_config: GPTQConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - shards: int) -> Dict[str, Any]: + output_sizes: List[int]) -> Dict[str, Any]: del output_size # Unused. 
if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index 091ff22b9b095..0ec5be06abbd6 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -70,7 +70,7 @@ def __init__(self, quant_config: SqueezeLLMConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - shards: int) -> Dict[str, Any]: + output_sizes: List[int]) -> Dict[str, Any]: if input_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( "The input size is not aligned with the quantized " diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 88aea8de02845..d5e61a6e1ed33 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -360,6 +360,7 @@ def load_weights(self, ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] + params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): @@ -378,6 +379,7 @@ def load_weights(self, if name.endswith(".bias") and name not in params_dict: continue param = params_dict[name] + weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -388,5 +390,5 @@ def load_weights(self, param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - + weight_loader(param, loaded_weight) From ce639374af6a6d7f69247b8a9ecbcc5fbad0fb96 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 18:23:45 +0000 Subject: [PATCH 30/96] formatting --- vllm/model_executor/layers/linear.py | 5 ----- vllm/model_executor/models/llama.py | 3 --- 2 files changed, 8 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 4c7d246ca519e..1a27fb35ee5e9 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -539,17 +539,12 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): tp_rank = get_tensor_model_parallel_rank() input_dim = getattr(param, "input_dim", None) param_data = param.data - if input_dim is not None: shard_size = param_data.shape[input_dim] start_idx = tp_rank * shard_size - assert (start_idx == 0 - and shard_size == loaded_weight.shape[input_dim]) - loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) assert param_data.shape == loaded_weight.shape - param_data.copy_(loaded_weight) def forward(self, input_): diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d5e61a6e1ed33..b7f6b8f3ec374 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -360,7 +360,6 @@ def load_weights(self, ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] - params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): @@ -379,7 +378,6 @@ def load_weights(self, if name.endswith(".bias") and name not in params_dict: continue param = params_dict[name] - weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -390,5 +388,4 @@ def load_weights(self, param = params_dict[name] weight_loader = 
getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) From 6cbdff7c972418b6dd9a476957ca2280f23a9d9a Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 18:29:27 +0000 Subject: [PATCH 31/96] remove secondary aqlm loading --- vllm/config.py | 23 ++++++++--------------- vllm/model_executor/weight_utils.py | 7 +++++-- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 70a5f3b77eba1..bf972b53e5c6c 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -165,21 +165,14 @@ def _get_and_verify_quantization(self) -> Any | None: hf_quant_config = getattr(self.hf_config, "quantization_config", None) if hf_quant_config is not None: hf_quant_method = str(hf_quant_config["quant_method"]).lower() - else: - # HF models such as https://huggingface.co/BlackSamorez/Llama-2-70b-AQLM-4Bit-2x16-hf/blob/main/config.json - # only have an aqlm block, no quantization_config block. - hf_quant_config = getattr(self.hf_config, "aqlm", None) - if hf_quant_config is not None: - hf_quant_method = "aqlm" - - if hf_quant_method is not None and self.quantization is None: - self.quantization = hf_quant_method - elif self.quantization != hf_quant_method: - raise ValueError( - "Quantization method specified in the model config " - f"({hf_quant_method}) does not match the quantization " - f"method specified in the `quantization` argument " - f"({self.quantization}).") + if self.quantization is None: + self.quantization = hf_quant_method + elif self.quantization != hf_quant_method: + raise ValueError( + "Quantization method specified in the model config " + f"({hf_quant_method}) does not match the quantization " + f"method specified in the `quantization` argument " + f"({self.quantization}).") if self.quantization is not None: if self.quantization not in supported_quantization: diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index bdcb9d5976576..3570366887e78 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -85,8 +85,11 @@ def convert_bin_to_safetensor_file( # TODO(woosuk): Move this to other place. def get_quant_config(model_config: ModelConfig) -> QuantizationConfig: quant_cls = get_quantization_config(model_config.quantization) - if model_config.hf_quant_config is not None: - return quant_cls.from_config(model_config.hf_quant_config) + # Read the quantization config from the HF model config, if available. + hf_quant_config = getattr(model_config.hf_config, "quantization_config", + None) + if hf_quant_config is not None: + return quant_cls.from_config(hf_quant_config) model_name_or_path = model_config.model is_local = os.path.isdir(model_name_or_path) if not is_local: From a58d369c987671c80ab8eccdb8367ca5990472c4 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 18:30:41 +0000 Subject: [PATCH 32/96] restore trailing space --- vllm/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index bf972b53e5c6c..b6ef7a69471b7 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -45,7 +45,7 @@ class ModelConfig: a tag name, or a commit id. If unspecified, will use the default version. code_revision: The specific revision to use for the model code on - Hugging Face Hub. It can be a branch name, a tag name, or a + Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. 
tokenizer_revision: The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use @@ -166,7 +166,7 @@ def _get_and_verify_quantization(self) -> Any | None: if hf_quant_config is not None: hf_quant_method = str(hf_quant_config["quant_method"]).lower() if self.quantization is None: - self.quantization = hf_quant_method + self.quantization = hf_quant_method elif self.quantization != hf_quant_method: raise ValueError( "Quantization method specified in the model config " From 31f0ddc3899abda11a62a833bc99b2d1869d99b4 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 18:32:13 +0000 Subject: [PATCH 33/96] remove some code --- vllm/config.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index b6ef7a69471b7..4448ee9dac017 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -115,7 +115,7 @@ def __init__( max_model_len) self._verify_load_format() self._verify_tokenizer_mode() - self.hf_quant_config = self._get_and_verify_quantization() + self._verify_quantization() self._verify_cuda_graph() def _verify_load_format(self) -> None: @@ -154,14 +154,13 @@ def _verify_tokenizer_mode(self) -> None: "either 'auto' or 'slow'.") self.tokenizer_mode = tokenizer_mode - def _get_and_verify_quantization(self) -> Any | None: + def _verify_quantization(self) -> None: supported_quantization = ["aqlm", "awq", "gptq", "squeezellm"] rocm_not_supported_quantization = ["awq"] if self.quantization is not None: self.quantization = self.quantization.lower() # Parse quantization method from the HF model config, if available. - hf_quant_method = None hf_quant_config = getattr(self.hf_config, "quantization_config", None) if hf_quant_config is not None: hf_quant_method = str(hf_quant_config["quant_method"]).lower() @@ -188,8 +187,6 @@ def _get_and_verify_quantization(self) -> Any | None: "optimized yet. The speed can be slower than " "non-quantized models.") - return hf_quant_config - def _verify_cuda_graph(self) -> None: if self.max_context_len_to_capture is None: self.max_context_len_to_capture = self.max_model_len From edc80c61e05d70be213abebba995633e8a7f0fe0 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 18:35:49 +0000 Subject: [PATCH 34/96] remove some comments --- .../layers/quantization/aqlm.py | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index a9732257e462d..744f1b7cb13ac 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -62,30 +62,6 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]: def get_min_capability(cls) -> int: return 60 - # such as. (This one looks correct) - # https://huggingface.co/BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf/blob/main/config.json - # - # "quantization_config": { - # "in_group_size": 8, - # "nbits_per_codebook": 16, - # "num_codebooks": 1, - # "out_group_size": 1, - # "quant_method": "aqlm" - # "linear_weights_not_to_quantize": [ <--- hmmm ???? - # "model.embed_tokens.weight", - # "lm_head.weight" - # }, - - # https://huggingface.co/meta-llama/Llama-2-7b-hf <- can't see it, locked behind meta. - - # this is no-standard, has no "quantization_config", just an "aqlm" block. 
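Note: condensing the removed comment above into a runnable snippet, the supported checkpoints carry a quantization_config block shaped roughly like this, which from_config() turns into an AQLMConfig; values are copied from the 1x16 example and shown only for illustration:

from vllm.model_executor.layers.quantization.aqlm import AQLMConfig

# Example quantization_config block for a 1x16 AQLM checkpoint.
hf_quantization_config = {
    "in_group_size": 8,
    "nbits_per_codebook": 16,
    "num_codebooks": 1,
    "out_group_size": 1,
    "quant_method": "aqlm",
}
config = AQLMConfig.from_config(hf_quantization_config)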
- # https://huggingface.co/BlackSamorez/Llama-2-70b-AQLM-4Bit-2x16-hf/blob/main/config.json - # "aqlm": { - # "in_group_size": 8, - # "nbits_per_codebook": 16, - # "num_codebooks": 2, - # "out_group_size": 1 - @classmethod def get_config_filenames(cls) -> List[str]: return [] # no extra configs. @@ -96,7 +72,6 @@ def from_config(cls, config: Dict[str, Any]) -> "AQLMConfig": nbits_per_codebook = cls.get_from_keys(config, ["nbits_per_codebook"]) num_code_books = cls.get_from_keys(config, ["num_codebooks"]) out_group_size = cls.get_from_keys(config, ["out_group_size"]) - # TODO linear_weights_not_to_quantize ? return cls(in_group_size, nbits_per_codebook, num_code_books, out_group_size) From 3253dc77dfe53318870b90833d5503024dee688b Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 18:38:43 +0000 Subject: [PATCH 35/96] add some attributions --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 2 ++ csrc/quantization/aqlm/aqlm_cuda_kernel.cu | 2 ++ vllm/model_executor/layers/quantization/aqlm.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index ac620e9361854..729b6f854e6dc 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -1,3 +1,5 @@ +// Adapted from https://github.com/Vahe1994/AQLM + #include #include #include diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu index 0f97e93d678e6..52d4b2e960cea 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu @@ -1,3 +1,5 @@ +// Adapted from https://github.com/Vahe1994/AQLM + #include #include #include diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 744f1b7cb13ac..aedae6faeb5f7 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -1,3 +1,5 @@ +# Supports AQLM compression, see https://github.com/Vahe1994/AQLM and https://arxiv.org/pdf/2401.06118.pdf + from typing import Any, Dict, List, Optional import torch From fefe1c874ac404cb7ef98d91829ea6362f3f7c4e Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 19:18:38 +0000 Subject: [PATCH 36/96] support 2 tp --- examples/aqlm_test.py | 2 +- vllm/model_executor/layers/quantization/aqlm.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index 5a4bd4cc7572e..7995a2db5a328 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -4,7 +4,7 @@ #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf", enforce_eager=True) -model = LLM("BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", enforce_eager=True) +model = LLM("BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", enforce_eager=True, tensor_parallel_size=2) # These have custom code and the old format, and puzzling and conflicting stats, which probably I shouldn't even try to support. 
#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index aedae6faeb5f7..38528b6f13df0 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -98,8 +98,6 @@ def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, output_sizes: List[int]) -> Dict[str, Any]: - assert (output_size == output_size_per_partition) - assert (input_size == input_size_per_partition) del output_size # Unused. del input_size # Unused. @@ -202,13 +200,13 @@ def apply_weights( # break the shards apart and combine them. assert (shard_dim == 0) num_codebooks = codebooks.shape[shard_dim] // outputs - assert (scales.shape[0] == codes.shape[0]) - assert (scales.shape[0] == sum(output_sizes)) - + assert (sum(output_sizes) % scales.shape[0] == 0) + out_tp = sum(output_sizes) // scales.shape[0] output_offset = 0 codebooks_offset = 0 for output_size in output_sizes: + output_size //= out_tp shard_output = ops.aqlm_gemm( x, codes.narrow(0, output_offset, output_size), codebooks.narrow(shard_dim, codebooks_offset, From 4b12ed62c3221c348ef1ef0b51408190a8d73413 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 19:34:01 +0000 Subject: [PATCH 37/96] better tp support --- vllm/model_executor/layers/linear.py | 8 ++++--- .../layers/quantization/aqlm.py | 21 +++++++++---------- .../model_executor/layers/quantization/awq.py | 2 +- .../layers/quantization/gptq.py | 2 +- .../layers/quantization/squeezellm.py | 2 +- 5 files changed, 18 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 1a27fb35ee5e9..3b880709733e2 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -24,7 +24,7 @@ class LinearMethodBase(ABC): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - output_sizes: List[int]) -> Dict[str, Any]: + output_partition_sizes: List[int]) -> Dict[str, Any]: """Create weights for a linear layer.""" raise NotImplementedError @@ -51,7 +51,7 @@ def __init__(self, separate_bias_add: bool = False): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - output_sizes: List[int]) -> Dict[str, Any]: + output_partition_sizes: List[int]) -> Dict[str, Any]: weight = Parameter(torch.empty(output_size_per_partition, input_size_per_partition, dtype=params_dtype), @@ -174,7 +174,9 @@ def __init__( self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( self.input_size, self.output_size_per_partition, self.input_size, - self.output_size, self.params_dtype, output_sizes) + self.output_size, self.params_dtype, + [x // tp_size for x in output_sizes]) + for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 38528b6f13df0..d7180c0226c88 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -97,7 +97,7 @@ def __init__(self, quant_config: AQLMConfig): def 
create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - output_sizes: List[int]) -> Dict[str, Any]: + output_partition_sizes: List[int]) -> Dict[str, Any]: del output_size # Unused. del input_size # Unused. @@ -131,13 +131,13 @@ def create_weights(self, input_size_per_partition: int, "output_dim": 0, "packed_dim": 1, "pack_factor": self.quant_config.pack_factor, - "output_sizes": output_sizes + "output_partition_sizes": output_partition_sizes }, ) codebooks = Parameter( torch.empty( - self.quant_config.num_codebooks * len(output_sizes), + self.quant_config.num_codebooks * len(output_partition_sizes), 2**self.quant_config.nbits_per_codebook, self.quant_config.out_group_size, self.quant_config.in_group_size, @@ -194,19 +194,18 @@ def apply_weights( if shard_dim is not None: output_shape = x.shape[:-1] + (scales.shape[0], ) output = torch.empty(output_shape, dtype=x.dtype, device=x.device) - output_sizes = getattr(codes, "output_sizes", None) - outputs = len(output_sizes) + output_partition_sizes = getattr(codes, "output_partition_sizes", + None) + num_outputs = len(output_partition_sizes) - # break the shards apart and combine them. + # break the inputs and codebooks apart then combine the outputs. assert (shard_dim == 0) - num_codebooks = codebooks.shape[shard_dim] // outputs + num_codebooks = codebooks.shape[shard_dim] // num_outputs assert (scales.shape[0] == codes.shape[0]) - assert (sum(output_sizes) % scales.shape[0] == 0) - out_tp = sum(output_sizes) // scales.shape[0] + assert (sum(output_partition_sizes) == scales.shape[0]) output_offset = 0 codebooks_offset = 0 - for output_size in output_sizes: - output_size //= out_tp + for output_size in output_partition_sizes: shard_output = ops.aqlm_gemm( x, codes.narrow(0, output_offset, output_size), codebooks.narrow(shard_dim, codebooks_offset, diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 60afacea9c2af..e6c7c658e1e19 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -80,7 +80,7 @@ def __init__(self, quant_config: AWQConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - output_sizes: List[int]) -> Dict[str, Any]: + output_partition_sizes: List[int]) -> Dict[str, Any]: if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( "The input size is not aligned with the quantized " diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index ae2929dcd22f8..7ca29b941eeb0 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -90,7 +90,7 @@ def __init__(self, quant_config: GPTQConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - output_sizes: List[int]) -> Dict[str, Any]: + output_partition_sizes: List[int]) -> Dict[str, Any]: del output_size # Unused. 
if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index 0ec5be06abbd6..e266e8a74af69 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -70,7 +70,7 @@ def __init__(self, quant_config: SqueezeLLMConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - output_sizes: List[int]) -> Dict[str, Any]: + output_partition_sizes: List[int]) -> Dict[str, Any]: if input_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( "The input size is not aligned with the quantized " From e5c2010fb50d99152576a882d195cab37f9f5fa5 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 19:36:44 +0000 Subject: [PATCH 38/96] format --- vllm/model_executor/layers/quantization/gptq.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 7ca29b941eeb0..36a60990a8049 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -87,10 +87,15 @@ class GPTQLinearMethod(LinearMethodBase): def __init__(self, quant_config: GPTQConfig): self.quant_config = quant_config - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, params_dtype: torch.dtype, - output_partition_sizes: List[int]) -> Dict[str, Any]: + def create_weights( + self, + input_size_per_partition: int, + output_size_per_partition: int, + input_size: int, + output_size: int, + params_dtype: torch.dtype, + output_partition_sizes: List[int], + ) -> Dict[str, Any]: del output_size # Unused. if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( From eef729fb79c6e079b01ae91390af8454d0f5c7b9 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 21:06:21 +0000 Subject: [PATCH 39/96] comments --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 7 ++++++- examples/aqlm_test.py | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 729b6f854e6dc..2e58d13d82295 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -4,6 +4,9 @@ #include #include +#include +#include + void code1x16_matvec_cuda( const void* A, const void* B, @@ -156,6 +159,8 @@ torch::Tensor aqlm_gemm( { return code2x8_matmat(input, codes, codebooks, scales, bias); } - // TODO error somehow. + + std::cerr << "AQLM does not support " << nbooks << " codebooks with " << entries << " entries"; + std::abort(); return {}; } diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index 7995a2db5a328..35289103b2e70 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -6,9 +6,14 @@ model = LLM("BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", enforce_eager=True, tensor_parallel_size=2) -# These have custom code and the old format, and puzzling and conflicting stats, which probably I shouldn't even try to support. +# this has the codes 0 and 1 transposed. 
#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) +# this model hangs +#model = LLM("BlackSamorez/Mixtral-8x7B-Instruct-v0_1-AQLM-2Bit-1x16-hf", enforce_eager=True) + +# These have custom code and the old format, and puzzling and conflicting stats, which probably I shouldn't even try to support. #model = LLM("BlackSamorez/Llama-2-13b-AQLM-2Bit-1x16-hf", enforce_eager=True, trust_remote_code=True) +#model = LLM("BlackSamorez/Mixtral-8x7b-AQLM-2Bit-1x16-hf", enforce_eager=True) sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = model.generate("Hello my name is", sampling_params=sampling_params) From d31241b8744a88fc6fc38f4e809f03ff32c53923 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 21:38:24 +0000 Subject: [PATCH 40/96] comments --- vllm/model_executor/layers/quantization/aqlm.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index d7180c0226c88..9dcf36ef6ef5b 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -40,9 +40,8 @@ def __init__( self.num_codebooks = num_codebooks self.out_group_size = out_group_size - # I think pack factor is *probably* how many elements fit into one quantized tensor element. - # though out group size makes it interesting, because really we are doing 2D blocks, potentially. - # maybe this is vllms first 2D packing? Arg. + # out_group_size > 1 is untested, and probably won't work as-is. + assert(self.out_group_size == 1) self.pack_factor = (self.in_group_size * self.out_group_size) def __repr__(self) -> str: @@ -116,7 +115,11 @@ def create_weights(self, input_size_per_partition: int, codes = Parameter( torch.empty( - output_size_per_partition, # not entirely sure what to do with num_out_groups, if we need this pack factor. + # There could actually be two pack factors, one along input and one along output, + # but we don't currently support out_group_size, + # and only the one along output needs to be marked with "packed_dim". + # in order for QKVLinear to work. + output_size_per_partition, input_size_per_partition // self.quant_config.pack_factor, self.quant_config.num_codebooks, dtype=get_int_dtype(self.quant_config.nbits_per_codebook), From ba3c1256b7135bab0fa6a6fcb4b61240445d3288 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 21:41:42 +0000 Subject: [PATCH 41/96] rename aqlm_test --- examples/{aqlm_test.py => aqlm_example.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/{aqlm_test.py => aqlm_example.py} (100%) diff --git a/examples/aqlm_test.py b/examples/aqlm_example.py similarity index 100% rename from examples/aqlm_test.py rename to examples/aqlm_example.py From 703fa798254da2012501f0205e84d595744934e0 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 22:02:52 +0000 Subject: [PATCH 42/96] better comments --- examples/aqlm_example.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 35289103b2e70..ef548a22d5893 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -6,12 +6,12 @@ model = LLM("BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", enforce_eager=True, tensor_parallel_size=2) -# this has the codes 0 and 1 transposed. +# this has the dimensions 0 and 1 transposed for the codes, and we don't currently support 8x8 anyway. 
#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) -# this model hangs +# this model hangs, need to investigate. #model = LLM("BlackSamorez/Mixtral-8x7B-Instruct-v0_1-AQLM-2Bit-1x16-hf", enforce_eager=True) -# These have custom code and the old format, and puzzling and conflicting stats, which probably I shouldn't even try to support. +# These have custom code and no quantization_config block. #model = LLM("BlackSamorez/Llama-2-13b-AQLM-2Bit-1x16-hf", enforce_eager=True, trust_remote_code=True) #model = LLM("BlackSamorez/Mixtral-8x7b-AQLM-2Bit-1x16-hf", enforce_eager=True) From 6e47ff649cbb1c4d8867d4191c13c706582e10f9 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 22:05:41 +0000 Subject: [PATCH 43/96] better comment --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 2e58d13d82295..35aeb67046070 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -160,7 +160,7 @@ torch::Tensor aqlm_gemm( return code2x8_matmat(input, codes, codebooks, scales, bias); } - std::cerr << "AQLM does not support " << nbooks << " codebooks with " << entries << " entries"; + std::cerr << "AQLM with " << nbooks << " codebooks and " << entries << " entries is not currently supported."; std::abort(); return {}; } From 556178f134e9c705251c4e01330e1d22416d4417 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Mon, 4 Mar 2024 20:25:56 +0000 Subject: [PATCH 44/96] first attempt --- csrc/ops.h | 1 + csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 47 ++++++++++++++++--- csrc/quantization/aqlm/aqlm_cuda_kernel.cu | 23 +++++++-- .../layers/quantization/aqlm.py | 43 +++++------------ 4 files changed, 71 insertions(+), 43 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index c70a04e1a8694..ec7ee30bf5015 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -74,6 +74,7 @@ torch::Tensor aqlm_gemm( const torch::Tensor& codes, const torch::Tensor& codebooks, const torch::Tensor& scales, + const torch::Tensor& codebook_partition_sizes, const std::optional& bias ); diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 35aeb67046070..e534c911b2b7e 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -13,7 +13,9 @@ void code1x16_matvec_cuda( void* C, const void* codebook, int prob_m, - int prob_k + int prob_k, + const int codebook_a_sizes[4], // cumulative sizes of A spanning each codebook, at most 3 long. + const int codebook_stride // as int4. ); void code2x8_matvec_cuda( @@ -29,8 +31,19 @@ void code1x16_matvec( const torch::Tensor& A, const torch::Tensor& B, torch::Tensor& C, - const torch::Tensor& codebook + const torch::Tensor& codebook, + const int codebook_a_sizes[4] // cumulative sizes of A spanning each codebook, at most 3 long. 
) { + + // @TEST + int stride = codebook.stride(0) * codebook.element_size() / sizeof(int4); + printf("codebook rank is %ld: %ld %ld %ld %ld", codebook.dim(),codebook.size(0),codebook.size(1),codebook.size(2),codebook.size(3)); + std::cout << "codebook element size is " << codebook.element_size() << "\n"; + std::cout << "sizeof int4 is " << sizeof(int4) << "\n"; + std::cout << "stride is " << stride << "\n"; + //std::cout << "codebook dtype is " << codebook.dtype << "\n"; + assert(false); + const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); int prob_m = C.size(0); int prob_k = B.size(0); @@ -40,7 +53,9 @@ void code1x16_matvec( C.data_ptr(), codebook.data_ptr(), prob_m, - prob_k + prob_k, + codebook_a_sizes, + codebook.stride(0) * codebook.element_size() / sizeof(int4) ); } @@ -49,8 +64,8 @@ torch::Tensor code1x16_matmat( const torch::Tensor& codes, const torch::Tensor& codebooks, const torch::Tensor& scales, - const std::optional& bias -) { + const int codebook_a_sizes[4], + const std::optional& bias) { auto input_sizes = input.sizes(); auto out_features = codes.size(0) * codebooks.size(2); auto flat_input = input.reshape({-1, input.size(-1)}); @@ -67,7 +82,8 @@ torch::Tensor code1x16_matmat( codes.squeeze(2), input_vec, output_vec, - codebooks + codebooks, + codebook_a_sizes ); } flat_output *= scales.flatten().unsqueeze(0); @@ -145,6 +161,7 @@ torch::Tensor aqlm_gemm( const torch::Tensor& codes, const torch::Tensor& codebooks, const torch::Tensor& scales, + const torch::Tensor& codebook_partition_sizes, const std::optional& bias ) { @@ -153,7 +170,23 @@ torch::Tensor aqlm_gemm( if (nbooks == 1 && entries == (1 << 16)) { - return code1x16_matmat(input, codes, codebooks, scales, bias); + int cumulative_sizes[4]; + int i =0; + int last = 0; + for (; i < codebook_partition_sizes.size(0); ++i) + { + cumulative_sizes[i] = codebook_partition_sizes[i] + last; + printf("cum size %d is %d", i, cumulative_sizes[i]); + last = cumulative_sizes[i]; + } + // just fill in the rest with unreachable. + for (; i < 4; ++i) + { + cumulative_sizes[i] = last*10; + printf("cum size %d is %d", i, cumulative_sizes[i]); + } + + return code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); } if (nbooks == 2 && entries == (1 << 8)) { diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu index 52d4b2e960cea..2bd6edd5f03e7 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu @@ -12,11 +12,22 @@ __global__ void Code1x16MatVec( const int4* __restrict__ B, int4* __restrict__ C, const int4* __restrict__ codebook, - int prob_m, - int prob_k + const int prob_m, + const int prob_k, + const int codebook_a_sizes[4], // cumulative sizes of A spanning each codebook, at most 3 long. + const int codebook_stride // as int4. ) { int a_gl_stride = prob_k / 8 / 8; int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + + // advance to the codebook we are in, this easy because we only multiply one column of the codebook. 
+ int codebook_index = 0; + while (a_gl_rd >= codebook_a_sizes[codebook_index]) + { + codebook += codebook_stride; + ++codebook_index; + } + bool pred = a_gl_rd < prob_m; int b_gl_rd = 0; int c_gl_wr = a_gl_rd; @@ -156,7 +167,9 @@ void code1x16_matvec_cuda( void* __restrict__ C, const void* __restrict__ codebook, int prob_m, - int prob_k + int prob_k, + const int codebook_a_sizes[4], + const int codebook_stride ) { int sms; cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); @@ -176,7 +189,9 @@ void code1x16_matvec_cuda( (int4*) C, (const int4*) codebook, prob_m, - prob_k + prob_k, + codebook_a_sizes, + codebook_stride ); } diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 9dcf36ef6ef5b..b115ea7d37b0d 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -134,7 +134,6 @@ def create_weights(self, input_size_per_partition: int, "output_dim": 0, "packed_dim": 1, "pack_factor": self.quant_config.pack_factor, - "output_partition_sizes": output_partition_sizes }, ) @@ -152,6 +151,7 @@ def create_weights(self, input_size_per_partition: int, codebooks, { "shard_dim": 0, + "output_partition_sizes": output_partition_sizes, }, ) @@ -192,45 +192,24 @@ def apply_weights( codebooks = weights["codebooks"] codes = weights["codes"] scales = weights["scales"] + output_partition_sizes = getattr(codebooks, "output_partition_sizes", + None) - shard_dim = getattr(codebooks, "shard_dim", None) - if shard_dim is not None: - output_shape = x.shape[:-1] + (scales.shape[0], ) - output = torch.empty(output_shape, dtype=x.dtype, device=x.device) - output_partition_sizes = getattr(codes, "output_partition_sizes", - None) - num_outputs = len(output_partition_sizes) - - # break the inputs and codebooks apart then combine the outputs. 
- assert (shard_dim == 0) - num_codebooks = codebooks.shape[shard_dim] // num_outputs - assert (scales.shape[0] == codes.shape[0]) - assert (sum(output_partition_sizes) == scales.shape[0]) - output_offset = 0 - codebooks_offset = 0 - for output_size in output_partition_sizes: - shard_output = ops.aqlm_gemm( - x, codes.narrow(0, output_offset, output_size), - codebooks.narrow(shard_dim, codebooks_offset, - num_codebooks), - scales.narrow(0, output_offset, output_size), - None if bias is None else bias.narrow( - 0, output_offset, output_size)) - - output_slice = output.narrow(-1, output_offset, output_size) - assert (output_slice.shape == shard_output.shape) - output_slice.copy_(shard_output) - output_offset += output_size - codebooks_offset += num_codebooks - - return output + #test + print("codes shape", codes.shape) + print("code books shape", codebooks.shape) + print("partition sizes", output_partition_sizes) + print("input shape", x.shape) output = ops.aqlm_gemm( x, codes, codebooks, scales, + output_partition_sizes, bias, ) + print("output shape", output.shape) + return output From e23f1cd5bab0abcc8c8ab4d734f2955902cce2bb Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 16:21:40 +0000 Subject: [PATCH 45/96] got it working --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 37 +++++++------------ csrc/quantization/aqlm/aqlm_cuda_kernel.cu | 12 +++--- examples/aqlm_example.py | 4 +- vllm/model_executor/layers/linear.py | 2 +- .../layers/quantization/aqlm.py | 2 +- 5 files changed, 24 insertions(+), 33 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index e534c911b2b7e..ec271435fe15d 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -14,7 +14,7 @@ void code1x16_matvec_cuda( const void* codebook, int prob_m, int prob_k, - const int codebook_a_sizes[4], // cumulative sizes of A spanning each codebook, at most 3 long. + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. const int codebook_stride // as int4. ); @@ -32,21 +32,12 @@ void code1x16_matvec( const torch::Tensor& B, torch::Tensor& C, const torch::Tensor& codebook, - const int codebook_a_sizes[4] // cumulative sizes of A spanning each codebook, at most 3 long. + const int4 codebook_a_sizes // cumulative sizes of A spanning each codebook, at most 3 long. 
) { - - // @TEST - int stride = codebook.stride(0) * codebook.element_size() / sizeof(int4); - printf("codebook rank is %ld: %ld %ld %ld %ld", codebook.dim(),codebook.size(0),codebook.size(1),codebook.size(2),codebook.size(3)); - std::cout << "codebook element size is " << codebook.element_size() << "\n"; - std::cout << "sizeof int4 is " << sizeof(int4) << "\n"; - std::cout << "stride is " << stride << "\n"; - //std::cout << "codebook dtype is " << codebook.dtype << "\n"; - assert(false); - const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); int prob_m = C.size(0); int prob_k = B.size(0); + code1x16_matvec_cuda( A.data_ptr(), B.data_ptr(), @@ -64,7 +55,7 @@ torch::Tensor code1x16_matmat( const torch::Tensor& codes, const torch::Tensor& codebooks, const torch::Tensor& scales, - const int codebook_a_sizes[4], + const int4 codebook_a_sizes, const std::optional& bias) { auto input_sizes = input.sizes(); auto out_features = codes.size(0) * codebooks.size(2); @@ -165,25 +156,25 @@ torch::Tensor aqlm_gemm( const std::optional& bias ) { - int const nbooks = codebooks.size(0); + int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); int const entries = codebooks.size(1); if (nbooks == 1 && entries == (1 << 16)) { - int cumulative_sizes[4]; + int4 cumulative_sizes; + auto cumulative_size = &cumulative_sizes.x; int i =0; int last = 0; - for (; i < codebook_partition_sizes.size(0); ++i) + assert(codebook_partition_sizes.size(0) <= 4); + for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) { - cumulative_sizes[i] = codebook_partition_sizes[i] + last; - printf("cum size %d is %d", i, cumulative_sizes[i]); - last = cumulative_sizes[i]; + *cumulative_size = codebook_partition_sizes[i].item() + last; + last = *cumulative_size; } - // just fill in the rest with unreachable. - for (; i < 4; ++i) + // fill in the rest with unreachable. + for (; i < 4; ++i, ++cumulative_size) { - cumulative_sizes[i] = last*10; - printf("cum size %d is %d", i, cumulative_sizes[i]); + *cumulative_size = last*10; } return code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu index 2bd6edd5f03e7..4d8bdc6e47861 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu @@ -14,18 +14,18 @@ __global__ void Code1x16MatVec( const int4* __restrict__ codebook, const int prob_m, const int prob_k, - const int codebook_a_sizes[4], // cumulative sizes of A spanning each codebook, at most 3 long. + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. const int codebook_stride // as int4. ) { int a_gl_stride = prob_k / 8 / 8; int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - // advance to the codebook we are in, this easy because we only multiply one column of the codebook. - int codebook_index = 0; - while (a_gl_rd >= codebook_a_sizes[codebook_index]) + // advance to the correct codebook, this easy because we only multiply one column of the codebook. 
+ auto codebook_size = &codebook_a_sizes.x; + while (a_gl_rd >= *codebook_size) { codebook += codebook_stride; - ++codebook_index; + ++codebook_size; } bool pred = a_gl_rd < prob_m; @@ -168,7 +168,7 @@ void code1x16_matvec_cuda( const void* __restrict__ codebook, int prob_m, int prob_k, - const int codebook_a_sizes[4], + const int4 codebook_a_sizes, const int codebook_stride ) { int sms; diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index ef548a22d5893..4ba4b4e16a9e5 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -1,10 +1,10 @@ from vllm import LLM, SamplingParams -#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) +model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf", enforce_eager=True) -model = LLM("BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", enforce_eager=True, tensor_parallel_size=2) +#model = LLM("BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", enforce_eager=True) # this has the dimensions 0 and 1 transposed for the codes, and we don't currently support 8x8 anyway. #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 3b880709733e2..4825dcff138c9 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -264,7 +264,7 @@ def weight_loader(self, param_data = param.data output_dim = getattr(param, "output_dim", None) - # shard_dim indicates fixed size concatenated at shard_id + # shard_dim indicates fixed size concatenated along shard_id shard_dim = getattr(param, "shard_dim", None) if loaded_shard_id is None: # Loaded weight is already packed. 
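The bookkeeping introduced in the entry point and kernel above is easier to see outside CUDA: aqlm_gemm folds the per-partition output sizes into an int4 of cumulative row boundaries, padded with unreachable sentinels, and each warp walks that short list to find the codebook its output row belongs to. The following is a rough Python sketch of that logic only; it is not part of the patch, and the function names and example partition sizes are illustrative.

def cumulative_codebook_boundaries(partition_sizes, max_entries=4):
    # Mirrors the C++ side: cumulative sizes of the output partitions,
    # padded with values no output row can reach (the kernel reads at
    # most four entries, packed into an int4).
    assert len(partition_sizes) <= max_entries
    boundaries, last = [], 0
    for size in partition_sizes:
        last += size
        boundaries.append(last)
    boundaries += [last * 10] * (max_entries - len(boundaries))
    return boundaries

def codebook_index_for_row(row, boundaries):
    # Mirrors the kernel's walk: advance while the row is past a boundary.
    index = 0
    while row >= boundaries[index]:
        index += 1
    return index

# Example: a merged projection with three output partitions.
sizes = [4096, 1024, 1024]
bounds = cumulative_codebook_boundaries(sizes)    # [4096, 5120, 6144, 61440]
assert codebook_index_for_row(0, bounds) == 0     # first partition's codebook
assert codebook_index_for_row(5000, bounds) == 1  # second partition's codebook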
diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index b115ea7d37b0d..91b153a5bb8b3 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -151,7 +151,7 @@ def create_weights(self, input_size_per_partition: int, codebooks, { "shard_dim": 0, - "output_partition_sizes": output_partition_sizes, + "output_partition_sizes": torch.tensor(output_partition_sizes, device='cpu'), }, ) From 6253807fbad681d6998a1c883097ed398d27dc8e Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 16:23:40 +0000 Subject: [PATCH 46/96] remove prints --- vllm/model_executor/layers/quantization/aqlm.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 91b153a5bb8b3..6ca0e7d0cafeb 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -195,12 +195,6 @@ def apply_weights( output_partition_sizes = getattr(codebooks, "output_partition_sizes", None) - #test - print("codes shape", codes.shape) - print("code books shape", codebooks.shape) - print("partition sizes", output_partition_sizes) - print("input shape", x.shape) - output = ops.aqlm_gemm( x, codes, @@ -210,6 +204,4 @@ def apply_weights( bias, ) - print("output shape", output.shape) - return output From 05ccd5072a98daa865d750c0c5defdeee0fc9383 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 17:22:37 +0000 Subject: [PATCH 47/96] add arguments and options --- examples/aqlm_example.py | 41 +++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index ef548a22d5893..65b73c26a6080 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -1,20 +1,35 @@ from vllm import LLM, SamplingParams +import argparse -#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) +def main(): -#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf", enforce_eager=True) + # Create argument parser + parser = argparse.ArgumentParser(description='Example script with command-line arguments') -model = LLM("BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", enforce_eager=True, tensor_parallel_size=2) + # Add arguments + parser.add_argument('--model', '-m', type=int, default=0, help='Model ID [0-2]') + parser.add_argument('--tensor_parallel_size', '-t', type=int, default=1, help='tensor parallel size') -# this has the dimensions 0 and 1 transposed for the codes, and we don't currently support 8x8 anyway. -#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) -# this model hangs, need to investigate. -#model = LLM("BlackSamorez/Mixtral-8x7B-Instruct-v0_1-AQLM-2Bit-1x16-hf", enforce_eager=True) + # Parse the command-line arguments + args = parser.parse_args() -# These have custom code and no quantization_config block. -#model = LLM("BlackSamorez/Llama-2-13b-AQLM-2Bit-1x16-hf", enforce_eager=True, trust_remote_code=True) -#model = LLM("BlackSamorez/Mixtral-8x7b-AQLM-2Bit-1x16-hf", enforce_eager=True) + # These are the verified working models. 
+ models = ["BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", "BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", "BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf"] -sampling_params = SamplingParams(max_tokens=100, temperature=0) -outputs = model.generate("Hello my name is", sampling_params=sampling_params) -print(outputs[0].outputs[0].text) + model = LLM(models[args.model], enforce_eager=True, tensor_parallel_size=args.tensor_parallel_size) + + # this has the dimensions 0 and 1 transposed for the codes, and we don't currently support 8x8 anyway. + #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) + # this model hangs, need to investigate. + #model = LLM("BlackSamorez/Mixtral-8x7B-Instruct-v0_1-AQLM-2Bit-1x16-hf", enforce_eager=True) + + # These have custom code and no quantization_config block. + #model = LLM("BlackSamorez/Llama-2-13b-AQLM-2Bit-1x16-hf", enforce_eager=True, trust_remote_code=True) + #model = LLM("BlackSamorez/Mixtral-8x7b-AQLM-2Bit-1x16-hf", enforce_eager=True) + + sampling_params = SamplingParams(max_tokens=100, temperature=0) + outputs = model.generate("Hello my name is", sampling_params=sampling_params) + print(outputs[0].outputs[0].text) + +if __name__ == '__main__': + main() \ No newline at end of file From 7b67492cc6d01243c198cde9a8d3603de1b923a4 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 17:24:02 +0000 Subject: [PATCH 48/96] rename shard_dim to just bool is_metadata --- vllm/model_executor/layers/linear.py | 20 +++++++++---------- .../layers/quantization/aqlm.py | 12 +++++------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 3b880709733e2..8f66bf6b1e677 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -264,8 +264,7 @@ def weight_loader(self, param_data = param.data output_dim = getattr(param, "output_dim", None) - # shard_dim indicates fixed size concatenated at shard_id - shard_dim = getattr(param, "shard_dim", None) + is_metadata = getattr(param, "is_metadata", False) if loaded_shard_id is None: # Loaded weight is already packed. if output_dim is None: @@ -306,10 +305,11 @@ def weight_loader(self, start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - elif shard_dim is not None: - shard_size = loaded_weight.shape[shard_dim] + elif is_metadata: + # metadata indicates fixed size concatenated along dim 0 + shard_size = loaded_weight.shape[0] shard_offset = loaded_shard_id * shard_size - param_data = param_data.narrow(shard_dim, shard_offset, shard_size) + param_data = param_data.narrow(0, shard_offset, shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -389,8 +389,7 @@ def weight_loader(self, loaded_shard_id: Optional[str] = None): param_data = param.data output_dim = getattr(param, "output_dim", None) - shard_dim = getattr(param, "shard_dim", None) - + is_metadata = getattr(param, "is_metadata", False) if loaded_shard_id is None: # Loaded weight is already packed. 
if output_dim is None: @@ -445,10 +444,11 @@ def weight_loader(self, start_idx = shard_id * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - elif shard_dim is not None: - shard_size = loaded_weight.shape[shard_dim] + elif is_metadata: + # metadata indicates fixed size concatenated along dim 0 + shard_size = loaded_weight.shape[0] shard_index = ["q", "k", "v"].index(loaded_shard_id) - param_data = param_data.narrow(shard_dim, shard_index * shard_size, + param_data = param_data.narrow(0, shard_index * shard_size, shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 9dcf36ef6ef5b..b606370909a04 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -151,7 +151,8 @@ def create_weights(self, input_size_per_partition: int, set_weight_attrs( codebooks, { - "shard_dim": 0, + # metadata indicates fixed size concatenated along dim 0 + "is_metadata": True, }, ) @@ -193,8 +194,8 @@ def apply_weights( codes = weights["codes"] scales = weights["scales"] - shard_dim = getattr(codebooks, "shard_dim", None) - if shard_dim is not None: + is_metadata = getattr(codebooks, "is_metadata", False) + if is_metadata: output_shape = x.shape[:-1] + (scales.shape[0], ) output = torch.empty(output_shape, dtype=x.dtype, device=x.device) output_partition_sizes = getattr(codes, "output_partition_sizes", @@ -202,8 +203,7 @@ def apply_weights( num_outputs = len(output_partition_sizes) # break the inputs and codebooks apart then combine the outputs. - assert (shard_dim == 0) - num_codebooks = codebooks.shape[shard_dim] // num_outputs + num_codebooks = codebooks.shape[0] // num_outputs assert (scales.shape[0] == codes.shape[0]) assert (sum(output_partition_sizes) == scales.shape[0]) output_offset = 0 @@ -211,7 +211,7 @@ def apply_weights( for output_size in output_partition_sizes: shard_output = ops.aqlm_gemm( x, codes.narrow(0, output_offset, output_size), - codebooks.narrow(shard_dim, codebooks_offset, + codebooks.narrow(0, codebooks_offset, num_codebooks), scales.narrow(0, output_offset, output_size), None if bias is None else bias.narrow( From 3aafb3cab762e8851ff8ab77d7037fa70c69227a Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 17:39:22 +0000 Subject: [PATCH 49/96] use TORCH_CHECK --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index ec271435fe15d..c7a1d606911d5 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -184,7 +184,6 @@ torch::Tensor aqlm_gemm( return code2x8_matmat(input, codes, codebooks, scales, bias); } - std::cerr << "AQLM with " << nbooks << " codebooks and " << entries << " entries is not currently supported."; - std::abort(); + TORCH_CHECK(False, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") return {}; } From ef608a612c1f98c8c9fa91a8a4ac1018f7cd32c7 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 17:40:39 +0000 Subject: [PATCH 50/96] cleanup aqlm_example --- examples/aqlm_example.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 65b73c26a6080..eee3877d73327 100644 --- a/examples/aqlm_example.py +++ 
b/examples/aqlm_example.py @@ -3,14 +3,11 @@ def main(): - # Create argument parser parser = argparse.ArgumentParser(description='Example script with command-line arguments') - # Add arguments parser.add_argument('--model', '-m', type=int, default=0, help='Model ID [0-2]') parser.add_argument('--tensor_parallel_size', '-t', type=int, default=1, help='tensor parallel size') - # Parse the command-line arguments args = parser.parse_args() # These are the verified working models. From 5bacc9d0fbb8db708220ed19ea0a10afcab0bee9 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 19:10:02 +0000 Subject: [PATCH 51/96] format --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 3 +- examples/aqlm_example.py | 34 ++++++++++++++----- vllm/config.py | 2 +- .../layers/quantization/aqlm.py | 5 ++- 4 files changed, 30 insertions(+), 14 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 35aeb67046070..c2fa92c678df5 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -160,7 +160,6 @@ torch::Tensor aqlm_gemm( return code2x8_matmat(input, codes, codebooks, scales, bias); } - std::cerr << "AQLM with " << nbooks << " codebooks and " << entries << " entries is not currently supported."; - std::abort(); + TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") return {}; } diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index eee3877d73327..6e6dc07a9f7a8 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -1,20 +1,36 @@ from vllm import LLM, SamplingParams import argparse + def main(): - parser = argparse.ArgumentParser(description='Example script with command-line arguments') + parser = argparse.ArgumentParser( + description='Example script with command-line arguments') - parser.add_argument('--model', '-m', type=int, default=0, help='Model ID [0-2]') - parser.add_argument('--tensor_parallel_size', '-t', type=int, default=1, help='tensor parallel size') + parser.add_argument('--model', + '-m', + type=int, + default=0, + help='Model ID [0-2]') + parser.add_argument('--tensor_parallel_size', + '-t', + type=int, + default=1, + help='tensor parallel size') args = parser.parse_args() # These are the verified working models. - models = ["BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", "BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", "BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf"] + models = [ + "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", + "BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", + "BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf" + ] + + model = LLM(models[args.model], + enforce_eager=True, + tensor_parallel_size=args.tensor_parallel_size) - model = LLM(models[args.model], enforce_eager=True, tensor_parallel_size=args.tensor_parallel_size) - # this has the dimensions 0 and 1 transposed for the codes, and we don't currently support 8x8 anyway. #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) # this model hangs, need to investigate. 
@@ -25,8 +41,10 @@ def main(): #model = LLM("BlackSamorez/Mixtral-8x7b-AQLM-2Bit-1x16-hf", enforce_eager=True) sampling_params = SamplingParams(max_tokens=100, temperature=0) - outputs = model.generate("Hello my name is", sampling_params=sampling_params) + outputs = model.generate("Hello my name is", + sampling_params=sampling_params) print(outputs[0].outputs[0].text) + if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/vllm/config.py b/vllm/config.py index 4448ee9dac017..d8825397e0fe8 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, Union, ClassVar +from typing import Optional, Union, ClassVar from dataclasses import dataclass import os from packaging.version import Version diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index b606370909a04..9bea3dfb87e37 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -41,7 +41,7 @@ def __init__( self.out_group_size = out_group_size # out_group_size > 1 is untested, and probably won't work as-is. - assert(self.out_group_size == 1) + assert (self.out_group_size == 1) self.pack_factor = (self.in_group_size * self.out_group_size) def __repr__(self) -> str: @@ -211,8 +211,7 @@ def apply_weights( for output_size in output_partition_sizes: shard_output = ops.aqlm_gemm( x, codes.narrow(0, output_offset, output_size), - codebooks.narrow(0, codebooks_offset, - num_codebooks), + codebooks.narrow(0, codebooks_offset, num_codebooks), scales.narrow(0, output_offset, output_size), None if bias is None else bias.narrow( 0, output_offset, output_size)) From 2def434a0c543c7d3394bcbd0e59d4d3d0a185c5 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 19:17:45 +0000 Subject: [PATCH 52/96] some stuff --- vllm/model_executor/layers/linear.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 8f66bf6b1e677..eb1778b95548e 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -154,7 +154,7 @@ def __init__( skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, linear_method: Optional[LinearMethodBase] = None, - output_sizes: List[int] = [0], + output_sizes: Optional[List[int]] = None, ): super().__init__() @@ -171,6 +171,8 @@ def __init__( self.params_dtype = params_dtype if linear_method is None: linear_method = UnquantizedLinearMethod() + if output_sizes is None: + output_sizes = [output_size] self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( self.input_size, self.output_size_per_partition, self.input_size, From 821ee99ef29be6aba8993d27b08e6d91921e54da Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 19:45:24 +0000 Subject: [PATCH 53/96] change 60 to 70 for min cap --- vllm/model_executor/layers/quantization/aqlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 9bea3dfb87e37..8c0a30c86e3dd 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -61,7 +61,7 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]: @classmethod # Need to figure it out def get_min_capability(cls) -> int: - return 60 + return 70 @classmethod def 
get_config_filenames(cls) -> List[str]: From d0816bf1f53d72cac1ade0f0d65fefae18419bd4 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 19:49:52 +0000 Subject: [PATCH 54/96] format --- vllm/model_executor/layers/quantization/aqlm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index ff0a7763c7e49..ce7824be74b4e 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -151,8 +151,10 @@ def create_weights(self, input_size_per_partition: int, codebooks, { # metadata indicates fixed size concatenated along dim 0 - "is_metadata": True, - "output_partition_sizes": torch.tensor(output_partition_sizes, device='cpu'), + "is_metadata": + True, + "output_partition_sizes": + torch.tensor(output_partition_sizes, device='cpu'), }, ) From 6372c64eeed9bed56ef5a7a4298bb299b02fb820 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 19:59:24 +0000 Subject: [PATCH 55/96] make aqlm not rocm supported --- vllm/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index d8825397e0fe8..7d53d03c1abb6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -156,7 +156,7 @@ def _verify_tokenizer_mode(self) -> None: def _verify_quantization(self) -> None: supported_quantization = ["aqlm", "awq", "gptq", "squeezellm"] - rocm_not_supported_quantization = ["awq"] + rocm_not_supported_quantization = ["aqlm", "awq"] if self.quantization is not None: self.quantization = self.quantization.lower() From 83c207077429c45b96d486a6736663def2488728 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 21:58:00 +0000 Subject: [PATCH 56/96] Add LICENSE file --- csrc/quantization/aqlm/LICENSE | 201 +++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 csrc/quantization/aqlm/LICENSE diff --git a/csrc/quantization/aqlm/LICENSE b/csrc/quantization/aqlm/LICENSE new file mode 100644 index 0000000000000..6d83e5c5d2c26 --- /dev/null +++ b/csrc/quantization/aqlm/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2024] [AQLM authors] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
From 267b3390be9c3424b36660af37a1c164ab5a9f3f Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 21:59:58 +0000 Subject: [PATCH 57/96] add reference --- csrc/quantization/aqlm/LICENSE | 2 ++ 1 file changed, 2 insertions(+) diff --git a/csrc/quantization/aqlm/LICENSE b/csrc/quantization/aqlm/LICENSE index 6d83e5c5d2c26..bfa740da977e9 100644 --- a/csrc/quantization/aqlm/LICENSE +++ b/csrc/quantization/aqlm/LICENSE @@ -1,3 +1,5 @@ +Contains code from https://github.com/Vahe1994/AQLM + Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ From 040878966c7097bd726fce8659dc930232dff997 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 22:02:32 +0000 Subject: [PATCH 58/96] add better license headers --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 17 ++++++++++++++++- csrc/quantization/aqlm/aqlm_cuda_kernel.cu | 17 ++++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 1b5e69893617d..302a0a3fda6fe 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -1,4 +1,19 @@ -// Adapted from https://github.com/Vahe1994/AQLM +/* + * Modified by Neural Magic + * Adapted from https://github.com/Vahe1994/AQLM + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include #include diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu index 4d8bdc6e47861..1d8046eb01c5c 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu @@ -1,4 +1,19 @@ -// Adapted from https://github.com/Vahe1994/AQLM +/* + * Modified by Neural Magic + * Adapted from https://github.com/Vahe1994/AQLM + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ #include #include From 48838b8a822c3c1a4980320d29bd8f46f654ae34 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 7 Mar 2024 17:26:08 +0000 Subject: [PATCH 59/96] add support for 2x8 optimization --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 21 ++++++---- csrc/quantization/aqlm/aqlm_cuda_kernel.cu | 40 ++++++++++++++----- .../layers/quantization/aqlm.py | 1 - 3 files changed, 45 insertions(+), 17 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 302a0a3fda6fe..2e4ed8ac4eb41 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -39,7 +39,9 @@ void code2x8_matvec_cuda( void* C, const void* codebook, int prob_m, - int prob_k + int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. + const int codebook_stride // as int4. ); void code1x16_matvec( @@ -109,7 +111,8 @@ void code2x8_matvec( const torch::Tensor& A, const torch::Tensor& B, torch::Tensor& C, - const torch::Tensor& codebook + const torch::Tensor& codebook, + const int4 codebook_a_sizes ) { const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); int prob_m = C.size(0); @@ -120,7 +123,9 @@ void code2x8_matvec( C.data_ptr(), codebook.data_ptr(), prob_m, - prob_k + prob_k, + codebook_a_sizes, + 2 * codebook.stride(0) * codebook.element_size() / sizeof(int4) ); } @@ -129,6 +134,7 @@ torch::Tensor code2x8_matmat( const torch::Tensor& codes, const torch::Tensor& codebooks, const torch::Tensor& scales, + const int4 codebook_a_sizes, const std::optional& bias ) { auto input_sizes = input.sizes(); @@ -147,7 +153,8 @@ torch::Tensor code2x8_matmat( codes.squeeze(2), input_vec, output_vec, - codebooks + codebooks, + codebook_a_sizes ); } flat_output *= scales.flatten().unsqueeze(0); @@ -174,8 +181,6 @@ torch::Tensor aqlm_gemm( int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); int const entries = codebooks.size(1); - if (nbooks == 1 && entries == (1 << 16)) - { int4 cumulative_sizes; auto cumulative_size = &cumulative_sizes.x; int i =0; @@ -192,11 +197,13 @@ torch::Tensor aqlm_gemm( *cumulative_size = last*10; } + if (nbooks == 1 && entries == (1 << 16)) + { return code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); } if (nbooks == 2 && entries == (1 << 8)) { - return code2x8_matmat(input, codes, codebooks, scales, bias); + return code2x8_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); } TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu index 1d8046eb01c5c..9ae6a7eeb1587 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu @@ -34,16 +34,19 @@ __global__ void Code1x16MatVec( ) { int a_gl_stride = prob_k / 8 / 8; int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; - // advance to the correct codebook, this easy because we only multiply one column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) + if (pred) { - codebook += codebook_stride; - ++codebook_size; + // advance to the correct codebook, this easy because we only multiply one column of the codebook. 
+ auto codebook_size = &codebook_a_sizes.x; + while (a_gl_rd >= *codebook_size) + { + codebook += codebook_stride; + ++codebook_size; + } } - bool pred = a_gl_rd < prob_m; int b_gl_rd = 0; int c_gl_wr = a_gl_rd; a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; @@ -104,11 +107,26 @@ __global__ void Code2x8MatVec( int4* __restrict__ C, const int4* __restrict__ codebook, int prob_m, - int prob_k + int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. + const int codebook_stride // as int4. + ) { int a_gl_stride = prob_k / 8 / 8; int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); bool pred = a_gl_rd < prob_m; + + if (pred) + { + // advance to the correct codebook, this easy because we only multiply one column of the codebook. + auto codebook_size = &codebook_a_sizes.x; + while (a_gl_rd >= *codebook_size) + { + codebook += codebook_stride; + ++codebook_size; + } + } + int b_gl_rd = 0; int c_gl_wr = a_gl_rd; a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; @@ -216,7 +234,9 @@ void code2x8_matvec_cuda( void* __restrict__ C, const void* __restrict__ codebook, int prob_m, - int prob_k + int prob_k, + const int4 codebook_a_sizes, + const int codebook_stride ) { int sms; cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); @@ -240,6 +260,8 @@ void code2x8_matvec_cuda( (int4*) C, (const int4*) codebook, prob_m, - prob_k + prob_k, + codebook_a_sizes, + codebook_stride ); } diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index ce7824be74b4e..f4f95cec91174 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -9,7 +9,6 @@ from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs from vllm.model_executor.layers.quantization.base_config import QuantizationConfig - def get_int_dtype(nbits: int) -> torch.dtype: if nbits <= 8: return torch.int8 From 482262947aa78df4972135bc4c2a890917f1ed1a Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 7 Mar 2024 17:26:23 +0000 Subject: [PATCH 60/96] format --- vllm/model_executor/layers/quantization/aqlm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index f4f95cec91174..ce7824be74b4e 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -9,6 +9,7 @@ from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs from vllm.model_executor.layers.quantization.base_config import QuantizationConfig + def get_int_dtype(nbits: int) -> torch.dtype: if nbits <= 8: return torch.int8 From c255f443885472035b8dafe44e7cb3d3c1d89e94 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 7 Mar 2024 19:10:17 +0000 Subject: [PATCH 61/96] add better example models, and replace output_partition_size with sizes --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 2 +- examples/aqlm_example.py | 2 ++ vllm/model_executor/layers/linear.py | 26 +++++++++---------- .../layers/quantization/aqlm.py | 14 +++++++--- .../model_executor/layers/quantization/awq.py | 14 +++++++--- .../layers/quantization/gptq.py | 4 +-- .../layers/quantization/squeezellm.py | 14 +++++++--- 7 files changed, 48 insertions(+), 28 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 2e4ed8ac4eb41..4cb874b27e698 100644 --- 
a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -198,7 +198,7 @@ torch::Tensor aqlm_gemm( } if (nbooks == 1 && entries == (1 << 16)) - { + { return code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); } if (nbooks == 2 && entries == (1 << 8)) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 6e6dc07a9f7a8..691dbb68f0685 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -22,6 +22,8 @@ def main(): # These are the verified working models. models = [ + "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", + "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf", "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", "BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", "BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf" diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index eb1778b95548e..857367340ec34 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -22,9 +22,9 @@ class LinearMethodBase(ABC): @abstractmethod def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, params_dtype: torch.dtype, - output_partition_sizes: List[int]) -> Dict[str, Any]: + output_partition_sizes: List[int], input_size: int, + output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: """Create weights for a linear layer.""" raise NotImplementedError @@ -49,9 +49,10 @@ def __init__(self, separate_bias_add: bool = False): self.separate_bias_add = separate_bias_add def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, params_dtype: torch.dtype, - output_partition_sizes: List[int]) -> Dict[str, Any]: + output_partition_sizes: List[int], input_size: int, + output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + output_size_per_partition = sum(output_partition_sizes) weight = Parameter(torch.empty(output_size_per_partition, input_size_per_partition, dtype=params_dtype), @@ -105,8 +106,8 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size, self.output_size, self.input_size, - self.output_size, self.params_dtype, [self.output_size]) + self.input_size, [self.output_size], self.input_size, + self.output_size, self.params_dtype) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -175,9 +176,8 @@ def __init__( output_sizes = [output_size] self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size, self.output_size_per_partition, self.input_size, - self.output_size, self.params_dtype, - [x // tp_size for x in output_sizes]) + self.input_size, [x // tp_size for x in output_sizes], + self.input_size, self.output_size, self.params_dtype) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): @@ -518,8 +518,8 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size_per_partition, self.output_size, self.input_size, - self.output_size, self.params_dtype, [self.output_size]) + self.input_size_per_partition, [self.output_size], self.input_size, + self.output_size, self.params_dtype) for name, weight in self.linear_weights.items(): if 
isinstance(weight, torch.Tensor): self.register_parameter(name, weight) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index ce7824be74b4e..e1d33d4ae168d 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -93,10 +93,14 @@ class AQLMLinearMethod(LinearMethodBase): def __init__(self, quant_config: AQLMConfig): self.quant_config = quant_config - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, params_dtype: torch.dtype, - output_partition_sizes: List[int]) -> Dict[str, Any]: + def create_weights( + self, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: del output_size # Unused. del input_size # Unused. @@ -107,6 +111,8 @@ def create_weights(self, input_size_per_partition: int, "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size.") + + output_size_per_partition = sum(output_partition_sizes) if output_size_per_partition % self.quant_config.out_group_size != 0: raise ValueError( "The output size is not aligned with the quantized " diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index e6c7c658e1e19..5751920590bd5 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -77,15 +77,21 @@ class AWQLinearMethod(LinearMethodBase): def __init__(self, quant_config: AWQConfig): self.quant_config = quant_config - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, params_dtype: torch.dtype, - output_partition_sizes: List[int]) -> Dict[str, Any]: + def create_weights( + self, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size.") + + output_size_per_partition = sum(output_partition_sizes) if output_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( "The output size is not aligned with the quantized " diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 36a60990a8049..a044d5f219d3f 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -90,11 +90,10 @@ def __init__(self, quant_config: GPTQConfig): def create_weights( self, input_size_per_partition: int, - output_size_per_partition: int, + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, - output_partition_sizes: List[int], ) -> Dict[str, Any]: del output_size # Unused. if input_size_per_partition % self.quant_config.group_size != 0: @@ -102,6 +101,7 @@ def create_weights( "The input size is not aligned with the quantized " "weight shape. 
This can be caused by too large " "tensor parallel size.") + output_size_per_partition = sum(output_partition_sizes) if output_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( "The output size is not aligned with the quantized " diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index e266e8a74af69..0769cc71e8d0c 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -67,15 +67,21 @@ class SqueezeLLMLinearMethod(LinearMethodBase): def __init__(self, quant_config: SqueezeLLMConfig): self.quant_config = quant_config - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, params_dtype: torch.dtype, - output_partition_sizes: List[int]) -> Dict[str, Any]: + def create_weights( + self, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: if input_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size.") + + output_size_per_partition = sum(output_partition_sizes) qweight = Parameter( torch.empty( input_size_per_partition // self.quant_config.pack_factor, From 15d7206f4dd7452f0eae8d3bf61c1b158abc91f8 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 7 Mar 2024 19:20:05 +0000 Subject: [PATCH 62/96] format --- vllm/config.py | 4 +++- vllm/model_executor/layers/linear.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 598b8cf7a3aef..a62865f954490 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -157,7 +157,9 @@ def _verify_tokenizer_mode(self) -> None: self.tokenizer_mode = tokenizer_mode def _verify_quantization(self) -> None: - supported_quantization = ["aqlm", "awq", "gptq", "squeezellm", "marlin"] + supported_quantization = [ + "aqlm", "awq", "gptq", "squeezellm", "marlin" + ] rocm_not_supported_quantization = ["aqlm", "awq", "marlin"] if self.quantization is not None: self.quantization = self.quantization.lower() diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 8c0e654a09919..3101536c9e0d6 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -410,7 +410,7 @@ def weight_loader(self, param_data = param.data output_dim = getattr(param, "output_dim", None) is_metadata = getattr(param, "is_metadata", False) - + if loaded_shard_id is None: # Loaded weight is already packed. if output_dim is None: From 8df10d974bc8f8a285bd0def150619e7fccc66c3 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 7 Mar 2024 21:11:35 +0000 Subject: [PATCH 63/96] Add test_aqlm.py --- tests/models/test_aqlm.py | 77 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 tests/models/test_aqlm.py diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py new file mode 100644 index 0000000000000..65eeeafe38c56 --- /dev/null +++ b/tests/models/test_aqlm.py @@ -0,0 +1,77 @@ +"""Compare the outputs of a AQLM model between vLLM and HF Transformers + +Run `pytest tests/models/test_aqlm.py --forked`. 
+""" + +import pytest +import torch +from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY + +capability = torch.cuda.get_device_capability() +capability = capability[0] * 10 + capability[1] +aqlm_not_supported = ( + capability < _QUANTIZATION_CONFIG_REGISTRY["aqlm"].get_min_capability()) + +# In this test we hardcode prompts and generations for the model so we don't need to require the AQLM package as a dependency +example_prompts = [ + 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.\n', + 'Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.\n', + 'Compare and contrast artificial intelligence with human intelligence in terms of processing information.\n', + 'Describe the basic components of a neural network and how it can be trained.\n', + 'Write a short story about a robot that dreams for the first time.\n', + 'Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.\n', + 'Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.\n', + "Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'\n" +] + +# These ground truth generations were generated using `transformers==4.38.1 aqlm==1.1.0 torch==2.2.0` +# and the below code: +# ```python +# from transformers import AutoTokenizer, AutoModelForCausalLM +# model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf" +# quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="cuda").cuda() +# tokenizer = AutoTokenizer.from_pretrained(model_id) +# outputs = [] +# for prompt in example_prompts: +# input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda") +# hf_outputs = quantized_model.generate(input_ids, max_new_tokens=32) +# outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:])) +# ``` +ground_truth_generations = [ + '\n### Features\n\n- **High-throughput**: vLLM is designed to be memory-efficient and high-throughput. It', + 'The major milestones in the development of artificial intelligence from 1950 to 2020 are as follows:\n1950', + 'Compare and contrast artificial intelligence with human intelligence in terms of processing information. The processing of information is a key component of artificial intelligence. The processing of information is', + 'Explain the difference between supervised and unsupervised learning.\nExplain the difference between feedforward and recurrent neural networks.\nExplain the difference', + 'Write a short story about a robot that dreams for the first time. The story should be about 1000 words.\nThe story should be', + 'Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. The COVID-19 pandemic has had a', + 'The Mona Lisa is a painting by Leonardo da Vinci, and it is considered to be one of the most famous paintings in the world. The', + 'The early bird catches the worm.\nThe early bird catches the worm. 
(Japanese)\nLe petit oiseau attrait' +] + + +@pytest.mark.skipif(aqlm_not_supported, + reason="AQLM is not supported on this GPU type.") +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [3]) +def test_models( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: + + vllm_model = vllm_runner(model, dtype=dtype, enforce_eager=True) + vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, + max_tokens, + num_logprobs) + + # loop through the prompts to compare against the ground truth generations + for prompt_idx in range(len(example_prompts)): + vllm_output_ids, vllm_output_str, vllm_logprobs = vllm_outputs[ + prompt_idx] + + assert vllm_output_str == ground_truth_generations[prompt_idx] From a3039dd1c86cc66d0340684cb1a934682431cddb Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 7 Mar 2024 23:00:07 +0000 Subject: [PATCH 64/96] remove comments --- examples/aqlm_example.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 691dbb68f0685..8a358e53a485c 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -20,28 +20,18 @@ def main(): args = parser.parse_args() - # These are the verified working models. models = [ "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf", + "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf", + "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf", "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", - "BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", - "BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf" ] model = LLM(models[args.model], enforce_eager=True, tensor_parallel_size=args.tensor_parallel_size) - # this has the dimensions 0 and 1 transposed for the codes, and we don't currently support 8x8 anyway. - #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) - # this model hangs, need to investigate. - #model = LLM("BlackSamorez/Mixtral-8x7B-Instruct-v0_1-AQLM-2Bit-1x16-hf", enforce_eager=True) - - # These have custom code and no quantization_config block. 
- #model = LLM("BlackSamorez/Llama-2-13b-AQLM-2Bit-1x16-hf", enforce_eager=True, trust_remote_code=True) - #model = LLM("BlackSamorez/Mixtral-8x7b-AQLM-2Bit-1x16-hf", enforce_eager=True) - sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = model.generate("Hello my name is", sampling_params=sampling_params) From 2ecce81309035714770b21cbc768872c4b529f8d Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 8 Mar 2024 15:10:33 +0000 Subject: [PATCH 65/96] put aqlm inside rocm block --- csrc/ops.h | 2 +- csrc/pybind.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index 3a81874ff4b24..ea0722adf9621 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -69,6 +69,7 @@ void gelu_fast( torch::Tensor& out, torch::Tensor& input); +#ifndef USE_ROCM torch::Tensor aqlm_gemm( const torch::Tensor& input, const torch::Tensor& codes, @@ -78,7 +79,6 @@ torch::Tensor aqlm_gemm( const std::optional& bias ); -#ifndef USE_ROCM torch::Tensor awq_gemm( torch::Tensor _in_feats, torch::Tensor _kernel, diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 6b23c706e252f..ba2c00147d7f4 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -53,8 +53,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); // Quantization ops - ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM"); #ifndef USE_ROCM + ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM"); ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ"); ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ"); From 5864a00352f00a9686b6bcf1dc75fde33a7d3cac Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 8 Mar 2024 18:39:56 +0000 Subject: [PATCH 66/96] add model to example --- examples/aqlm_example.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 8a358e53a485c..ca014be91405e 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -5,13 +5,18 @@ def main(): parser = argparse.ArgumentParser( - description='Example script with command-line arguments') + description='AQLM examples') parser.add_argument('--model', '-m', + type=str, + default=None, + help='model path, as for HF') + parser.add_argument('--choice', + '-c', type=int, default=0, - help='Model ID [0-2]') + help='known good models by index, [0-4]') parser.add_argument('--tensor_parallel_size', '-t', type=int, @@ -28,7 +33,7 @@ def main(): "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", ] - model = LLM(models[args.model], + model = LLM(args.model if args.model is not None else models[args.choice], enforce_eager=True, tensor_parallel_size=args.tensor_parallel_size) From 58dbb014f326fde320af85fba0545bfc922a0522 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 8 Mar 2024 18:46:27 +0000 Subject: [PATCH 67/96] remove comment --- vllm/model_executor/layers/quantization/aqlm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index e1d33d4ae168d..c069e8c006861 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -59,7 +59,6 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.half] @classmethod - # Need to figure it out def get_min_capability(cls) -> int: return 70 From 
7dc5f83a775ed1a6b97f5689f7c3329f6db83c8d Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 8 Mar 2024 20:23:02 +0000 Subject: [PATCH 68/96] format --- examples/aqlm_example.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index ca014be91405e..468364d935d0c 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -4,8 +4,7 @@ def main(): - parser = argparse.ArgumentParser( - description='AQLM examples') + parser = argparse.ArgumentParser(description='AQLM examples') parser.add_argument('--model', '-m', From 8069375b6303502bdfa7ce819bc2ac18b5984af0 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 8 Mar 2024 20:36:27 +0000 Subject: [PATCH 69/96] fix test --- tests/models/test_aqlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 65eeeafe38c56..2464e7e20aa70 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -51,7 +51,7 @@ @pytest.mark.skipif(aqlm_not_supported, reason="AQLM is not supported on this GPU type.") -@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [3]) From 9891e22a2c95e9dc36d1b42eb1fe10c4b0018454 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 12 Mar 2024 16:57:23 +0000 Subject: [PATCH 70/96] Add dequantization kernel --- .../layers/quantization/aqlm.py | 101 ++++++++++++++++-- 1 file changed, 93 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index c069e8c006861..a192f120cd9b8 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -2,8 +2,10 @@ from typing import Any, Dict, List, Optional +import math import torch from torch.nn.parameter import Parameter +import torch.nn.functional as F from vllm._C import ops from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs @@ -22,6 +24,60 @@ def get_int_dtype(nbits: int) -> torch.dtype: raise ValueError(f"No dtype available for {nbits}-bit codebooks") +@torch.inference_mode() +def unpack_int_data(data: torch.IntTensor, nbits: int) -> torch.IntTensor: + return data.to(torch.int64) % (2**nbits) + + +def dequantize_weight(codes: torch.Tensor, + codebooks: torch.Tensor, + scales: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + Decode float weights from quantization codes. Differentiable. 
+ :param codes: tensor of integer quantization codes, shape [*dims, num_out_groups, num_in_groups, num_codebooks] + :param codebooks: tensor of vectors for each quantization code, [num_codebooks, codebook_size, out_group_size, in_group_size] + :param scales: weight will be multiplied by this factor, must be broadcastble with [*dims, out_groups, num_in_groups, out_group_size, in_group_size] + :return: reconstructed weight tensor of shape [*dims, num_in_groups*group_size] + """ + num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:] + num_codebooks, codebook_size, out_group_size, in_group_size = codebooks.shape + out_features = num_out_groups * out_group_size + in_features = num_in_groups * in_group_size + codebook_offsets = torch.arange( + 0, num_codebooks * codebook_size, codebook_size, + device=codes.device) # shape: [num_codebooks] + reconstructed_weight_flat = F.embedding_bag( + codes.flatten(0, -2) + codebook_offsets, + codebooks.flatten(0, 1).flatten(-2, -1), + mode="sum" + ) # [prod(dims) * num_out_groups * num_in_groups, out_group_size * in_group_size] + + reconstructed_weight_groupwise = reconstructed_weight_flat.view( + list(codes.shape[:-3]) + + [num_out_groups, num_in_groups, out_group_size, in_group_size]) + if scales is not None: + reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul( + scales) + return reconstructed_weight_groupwise.swapaxes( + -3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features]) + + +def dequantize_gemm( + input: torch.Tensor, # [..., in_features] + codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] + codebooks: torch. + Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] + scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + bias: Optional[torch.Tensor], +) -> torch.Tensor: + dequantized_weight = dequantize_weight( + unpack_int_data(codes, codebooks.shape[1].bit_length() - 1), + codebooks, + scales, + ) + return F.linear(input, dequantized_weight, bias) + + class AQLMConfig(QuantizationConfig): """Config class for AQLM. @@ -203,13 +259,42 @@ def apply_weights( output_partition_sizes = getattr(codebooks, "output_partition_sizes", None) - output = ops.aqlm_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) + output = None + + use_gemv = math.prod(x.shape[:-1]) <= 32 + + if not use_gemv: + output_shape = x.shape[:-1] + (scales.shape[0], ) + output = torch.empty(output_shape, dtype=x.dtype, device=x.device) + num_outputs = len(output_partition_sizes) + + # break the inputs and codebooks apart then combine the outputs. 
+ num_codebooks = codebooks.shape[0] // num_outputs + assert (scales.shape[0] == codes.shape[0]) + assert (sum(output_partition_sizes) == scales.shape[0]) + output_offset = 0 + codebooks_offset = 0 + for output_size in output_partition_sizes: + shard_output = dequantize_gemm( + x, codes.narrow(0, output_offset, output_size), + codebooks.narrow(0, codebooks_offset, num_codebooks), + scales.narrow(0, output_offset, output_size), + None if bias is None else bias.narrow( + 0, output_offset, output_size)) + + output_slice = output.narrow(-1, output_offset, output_size) + assert (output_slice.shape == shard_output.shape) + output_slice.copy_(shard_output) + output_offset += output_size + codebooks_offset += num_codebooks + else: + output = ops.aqlm_gemm( + x, + codes, + codebooks, + scales, + output_partition_sizes, + bias, + ) return output From a51192f9debccf25879cdee214ad91e6603207a8 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 12 Mar 2024 14:22:51 -0400 Subject: [PATCH 71/96] Update csrc/quantization/aqlm/aqlm_cuda_entry.cpp --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 30 +++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 4cb874b27e698..81a5c36f5afef 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -181,21 +181,21 @@ torch::Tensor aqlm_gemm( int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); int const entries = codebooks.size(1); - int4 cumulative_sizes; - auto cumulative_size = &cumulative_sizes.x; - int i =0; - int last = 0; - assert(codebook_partition_sizes.size(0) <= 4); - for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) - { - *cumulative_size = codebook_partition_sizes[i].item() + last; - last = *cumulative_size; - } - // fill in the rest with unreachable. - for (; i < 4; ++i, ++cumulative_size) - { - *cumulative_size = last*10; - } + int4 cumulative_sizes; + auto cumulative_size = &cumulative_sizes.x; + int i =0; + int last = 0; + assert(codebook_partition_sizes.size(0) <= 4); + for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) + { + *cumulative_size = codebook_partition_sizes[i].item() + last; + last = *cumulative_size; + } + // fill in the rest with unreachable. 
+ for (; i < 4; ++i, ++cumulative_size) + { + *cumulative_size = last*10; + } if (nbooks == 1 && entries == (1 << 16)) { From 992d5849fbf34e9c69df46759cd316ba31a95159 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 12 Mar 2024 14:23:24 -0400 Subject: [PATCH 72/96] Update csrc/quantization/aqlm/aqlm_cuda_entry.cpp --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 81a5c36f5afef..435cb90e69233 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -183,7 +183,7 @@ torch::Tensor aqlm_gemm( int4 cumulative_sizes; auto cumulative_size = &cumulative_sizes.x; - int i =0; + int i = 0; int last = 0; assert(codebook_partition_sizes.size(0) <= 4); for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) From 9143b453b168a5ecdd6dac0dde2bde1db6142f5f Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 12 Mar 2024 20:56:31 +0000 Subject: [PATCH 73/96] set gpu_memory_utilization --- examples/aqlm_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 468364d935d0c..766fc93809bac 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -33,7 +33,7 @@ def main(): ] model = LLM(args.model if args.model is not None else models[args.choice], - enforce_eager=True, + gpu_memory_utilization=.85, tensor_parallel_size=args.tensor_parallel_size) sampling_params = SamplingParams(max_tokens=100, temperature=0) From 5d24991d3ec3fc1b331648e0b4c46a392e7665a1 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 14 Mar 2024 19:58:51 +0000 Subject: [PATCH 74/96] add benchmark and refactor a bit. --- benchmarks/kernels/benchmark_aqlm.py | 122 ++++++++++++++++++ .../layers/quantization/aqlm.py | 90 +++++++------ 2 files changed, 175 insertions(+), 37 deletions(-) create mode 100644 benchmarks/kernels/benchmark_aqlm.py diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py new file mode 100644 index 0000000000000..f31ab6cbfb489 --- /dev/null +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -0,0 +1,122 @@ +import json +import os +import sys + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +from vllm.model_executor.layers.quantization.aqlm import dequantize_partioned_gemm +from vllm._C import ops + +import torch +import torch.nn.functional as F + +def main(): + methods = [ + dequantize_partioned_gemm, ops.aqlm_gemm + ] + + filename = "./benchmark.csv" + print(f"writing benchmarks to file {filename}") + with open(filename, "a") as f: + sys.stdout = f + + print('m | k | n', end='') + for method in methods: + print(f' | {method.__name__}', end='') + print('') + + # These are reasonable prefill sizes. + ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )), + (4096, (11008, 11008)), (11008, (4096, ))) + + # reasonable ranges for m. + for m in [ + 1, 2, 4, 8, #16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536, + #2048, 3072, 4096 + ]: + print(f'{m}', file=sys.__stdout__) + for ksp in ksandpartions: + run_grid(m, ksp[0], torch.tensor(ksp[1]), methods) + + sys.stdout = sys.__stdout__ + + +def run_grid(m: int, k: int, parts: torch.tensor, methods): + + num_warmup_trials = 1 + num_trials = 1 + + num_calls = 100 + + # warmup. 
+ for method in methods: + for _ in range(num_warmup_trials): + run_timing( + num_calls=num_calls, + m=m, + k=k, + parts=parts, + method=method, + ) + + n = parts.sum().item() + print(f'{m} | {k} | {n}:{parts.tolist()}', end='') + + for method in methods: + best_time_us = 1e20 + for _ in range(num_trials): + kernel_dur_ms = run_timing( + num_calls=num_calls, + m=m, + k=k, + parts=parts, + method=method, + ) + + kernel_dur_us = 1000 * kernel_dur_ms + + if kernel_dur_us < best_time_us: + best_time_us = kernel_dur_us + + print(f' | {kernel_dur_us:.0f}', end='') + + print('') + + +def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor, + method) -> float: + + n = parts.sum().item() + + device = torch.device('cuda:0') + + input = torch.randn((1, m, k), dtype=torch.float16, device=device) + + codes = torch.randint(-32768, + 32768, + size=(n, k // 8, 1), + dtype=torch.int16, + device=device) + + codebooks = torch.randn(size=(parts.shape[0], 65536, 1, 8), + dtype=torch.float16, + device=device) + + scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + start_event.record() + for i in range(num_calls): + output = method(input, codes, codebooks, scales, parts, None) + + end_event.record() + end_event.synchronize() + + dur_ms = start_event.elapsed_time(end_event) / num_calls + return dur_ms + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index a192f120cd9b8..13b9751d79898 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -78,6 +78,41 @@ def dequantize_gemm( return F.linear(input, dequantized_weight, bias) +def dequantize_partioned_gemm( + input: torch.Tensor, # [..., in_features] + codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] + codebooks: torch. + Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] + scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + output_shape = input.shape[:-1] + (scales.shape[0], ) + output = torch.empty(output_shape, dtype=input.dtype, device=input.device) + num_outputs = len(output_partition_sizes) + + # break the inputs and codebooks apart then combine the outputs. + # Surprisingly (to me) this is faster than doing 3 de-quants and 1 big multiply at the end. + num_codebooks = codebooks.shape[0] // num_outputs + assert (scales.shape[0] == codes.shape[0]) + assert (sum(output_partition_sizes) == scales.shape[0]) + output_offset = 0 + codebooks_offset = 0 + for output_size in output_partition_sizes: + shard_output = dequantize_gemm( + input, codes.narrow(0, output_offset, output_size), + codebooks.narrow(0, codebooks_offset, num_codebooks), + scales.narrow(0, output_offset, output_size), None + if bias is None else bias.narrow(0, output_offset, output_size)) + + output_slice = output.narrow(-1, output_offset, output_size) + assert (output_slice.shape == shard_output.shape) + output_slice.copy_(shard_output) + output_offset += output_size + codebooks_offset += num_codebooks + return output + + class AQLMConfig(QuantizationConfig): """Config class for AQLM. 
@@ -259,42 +294,23 @@ def apply_weights( output_partition_sizes = getattr(codebooks, "output_partition_sizes", None) - output = None - - use_gemv = math.prod(x.shape[:-1]) <= 32 - - if not use_gemv: - output_shape = x.shape[:-1] + (scales.shape[0], ) - output = torch.empty(output_shape, dtype=x.dtype, device=x.device) - num_outputs = len(output_partition_sizes) - - # break the inputs and codebooks apart then combine the outputs. - num_codebooks = codebooks.shape[0] // num_outputs - assert (scales.shape[0] == codes.shape[0]) - assert (sum(output_partition_sizes) == scales.shape[0]) - output_offset = 0 - codebooks_offset = 0 - for output_size in output_partition_sizes: - shard_output = dequantize_gemm( - x, codes.narrow(0, output_offset, output_size), - codebooks.narrow(0, codebooks_offset, num_codebooks), - scales.narrow(0, output_offset, output_size), - None if bias is None else bias.narrow( - 0, output_offset, output_size)) - - output_slice = output.narrow(-1, output_offset, output_size) - assert (output_slice.shape == shard_output.shape) - output_slice.copy_(shard_output) - output_offset += output_size - codebooks_offset += num_codebooks - else: - output = ops.aqlm_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) + use_gemv = math.prod( + x.shape[:-1]) <= 32 or output_partition_sizes is None + + output = ops.aqlm_gemm( + x, + codes, + codebooks, + scales, + output_partition_sizes, + bias, + ) if use_gemv else dequantize_partioned_gemm( + x, + codes, + codebooks, + scales, + output_partition_sizes, + bias, + ) return output From d9152e206bfb29c55109acc386e1eaf255a20934 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 17:31:54 +0000 Subject: [PATCH 75/96] add aqlm --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 66842e6845edd..eb1e88eb405b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -156,6 +156,8 @@ set(VLLM_EXT_SRC if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC + "csrc/quantization/aqlm/aqlm_cuda_entry.cpp" + "csrc/quantization/aqlm/aqlm_cuda_kernel.cu" "csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/marlin/marlin_cuda_kernel.cu" "csrc/custom_all_reduce.cu") From 0574dfffd892c9382b5b5f8fb9d4d4bc1cb9e737 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 17:41:32 +0000 Subject: [PATCH 76/96] Add dequant methods --- benchmarks/kernels/benchmark_aqlm.py | 192 ++++++++++++++++-- csrc/ops.h | 6 + csrc/pybind.cpp | 1 + csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 96 +++++++-- csrc/quantization/aqlm/aqlm_cuda_kernel.cu | 159 +++++++++++++++ vllm/model_executor/layers/linear.py | 3 +- .../layers/quantization/aqlm.py | 110 +++++++--- 7 files changed, 507 insertions(+), 60 deletions(-) diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index f31ab6cbfb489..8f2323c695830 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,28 +1,163 @@ -import json import os import sys +from typing import Optional os.environ['CUDA_VISIBLE_DEVICES'] = '0' -from vllm.model_executor.layers.quantization.aqlm import dequantize_partioned_gemm +from vllm.model_executor.layers.quantization.aqlm import ( + generic_dequantize_gemm, optimized_dequantize_gemm, dequantize_weight, + get_int_dtype) from vllm._C import ops import torch import torch.nn.functional as F + +def torch_mult( + input: torch.Tensor, # [..., in_features] + weights: torch.Tensor, + scales: torch.Tensor, # 
[num_out_groups, 1, 1, 1] +) -> torch.Tensor: + output = F.linear(input, weights) + return output + + +def dequant_out_scale( + input: torch.Tensor, # [..., in_features] + codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] + codebooks: torch. + Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] + scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + + weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) + + if bias is None: + output = F.linear(input, weights, bias) + orig_shape = output.shape + flattened_output = output.view(-1, output.size(-1)) + f_scales = scales.view(-1, scales.shape[0]) + b_scales = f_scales.expand(flattened_output.shape[0], -1) + flattened_output *= b_scales + return flattened_output.view(orig_shape) + else: + b_scales = scales.view(scales.shape[:-3] + (-1, )).expand( + -1, weights.shape[1]) + weights *= b_scales + return F.linear(input, weights, bias) + + +def dequant_weight_scale( + input: torch.Tensor, # [..., in_features] + codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] + codebooks: torch. + Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] + scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + + weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) + + b_scales = scales.view(scales.shape[:-3] + (-1, )).expand( + -1, weights.shape[1]) + weights *= b_scales + return F.linear(input, weights, bias) + + +def dequant_no_scale( + input: torch.Tensor, # [..., in_features] + codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] + codebooks: torch. + Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] + scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + + weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) + + return F.linear(input, weights, bias) + + +# Compare my kernel against the gold standard. +def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: + + n = parts.sum().item() + + device = torch.device('cuda:0') + + code_range = (1 << bits) // 2 + ingroups = 8 + + codes = torch.randint(-code_range, + code_range, + size=(n, k // ingroups, nbooks), + dtype=get_int_dtype(bits), + device=device) + + codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), + dtype=torch.float16, + device=device) + + count = 0 + for index in range(16): + for i in range(8): + for book in range(nbooks): + codebooks[book, index, 0, i] = count * (10**book) + count += 1 + + print("codes shape", codes.shape) + + for i in range(16): + for book in range(nbooks): + codes[0, i, book] = i + codes[0, -i, book] = i + + weights = dequantize_weight(codes, codebooks, None) # TODO Scales. 
+ weights2 = ops.aqlm_dequant(codes, codebooks, parts) + + print("weights shape:", weights.shape) + print("weights2 shape:", weights2.shape) + + print("weights are:", weights) + print("weights2 are:", weights2) + + print("first 128 weights are", weights[0, 0:128].to(torch.int32)) + print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32)) + + print("last 128 weights are", weights[0, -128:]) + print("last 128 weights2 are:", weights2[0, -128:]) + + def main(): + + nbooks = 2 + bits = 8 + + dequant_test(4096, torch.tensor((4096, )), nbooks, bits) + return + methods = [ - dequantize_partioned_gemm, ops.aqlm_gemm + ops.aqlm_gemm, + dequant_out_scale, + generic_dequantize_gemm, + optimized_dequantize_gemm, + dequant_weight_scale, + torch_mult, + dequant_no_scale, ] - filename = "./benchmark.csv" + filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv" print(f"writing benchmarks to file {filename}") - with open(filename, "a") as f: + with open(filename, "w") as f: sys.stdout = f - print('m | k | n', end='') + print('m | k | n | n parts', end='') for method in methods: - print(f' | {method.__name__}', end='') + print(f" | {method.__name__.replace('_', ' ')} (µs)", end='') print('') # These are reasonable prefill sizes. @@ -31,17 +166,19 @@ def main(): # reasonable ranges for m. for m in [ - 1, 2, 4, 8, #16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536, - #2048, 3072, 4096 + 1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112, + 128, 256, 512, 1024, 1536, 2048, 3072, 4096 ]: print(f'{m}', file=sys.__stdout__) for ksp in ksandpartions: - run_grid(m, ksp[0], torch.tensor(ksp[1]), methods) + run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, + methods) sys.stdout = sys.__stdout__ -def run_grid(m: int, k: int, parts: torch.tensor, methods): +def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, + methods): num_warmup_trials = 1 num_trials = 1 @@ -56,11 +193,13 @@ def run_grid(m: int, k: int, parts: torch.tensor, methods): m=m, k=k, parts=parts, + nbooks=nbooks, + bits=bits, method=method, ) n = parts.sum().item() - print(f'{m} | {k} | {n}:{parts.tolist()}', end='') + print(f'{m} | {k} | {n} | {parts.tolist()}', end='') for method in methods: best_time_us = 1e20 @@ -70,6 +209,8 @@ def run_grid(m: int, k: int, parts: torch.tensor, methods): m=m, k=k, parts=parts, + nbooks=nbooks, + bits=bits, method=method, ) @@ -84,7 +225,7 @@ def run_grid(m: int, k: int, parts: torch.tensor, methods): def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor, - method) -> float: + nbooks: int, bits: int, method) -> float: n = parts.sum().item() @@ -92,24 +233,35 @@ def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor, input = torch.randn((1, m, k), dtype=torch.float16, device=device) - codes = torch.randint(-32768, - 32768, - size=(n, k // 8, 1), - dtype=torch.int16, + code_range = (1 << bits) // 2 + ingroups = 8 + + codes = torch.randint(-code_range, + code_range, + size=(n, k // ingroups, nbooks), + dtype=get_int_dtype(bits), device=device) - codebooks = torch.randn(size=(parts.shape[0], 65536, 1, 8), + codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), dtype=torch.float16, device=device) scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device) + # for comparison to just a pytorch mult. 
+ weights = torch.randn((n, k), dtype=torch.float16, device=device) + start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) start_event.record() - for i in range(num_calls): - output = method(input, codes, codebooks, scales, parts, None) + + if method is torch_mult: + for i in range(num_calls): + output = torch_mult(input, weights, scales) + else: + for i in range(num_calls): + output = method(input, codes, codebooks, scales, parts, None) end_event.record() end_event.synchronize() diff --git a/csrc/ops.h b/csrc/ops.h index 8c495ea29f61c..ed0cf7d984ca0 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -93,6 +93,12 @@ torch::Tensor aqlm_gemm( const std::optional& bias ); +torch::Tensor aqlm_dequant( + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& codebook_partition_sizes +); + torch::Tensor awq_gemm( torch::Tensor _in_feats, torch::Tensor _kernel, diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 5f0839653b8b5..c99ae3ff54ab8 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -64,6 +64,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // Quantization ops #ifndef USE_ROCM ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM"); + ops.def("aqlm_dequant", &aqlm_dequant, "Decompression method for AQLM"); ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ"); ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ"); diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 435cb90e69233..7ebfbd7af9fa4 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -44,6 +44,32 @@ void code2x8_matvec_cuda( const int codebook_stride // as int4. ); +void code1x16_dequant( + void* weights, + const void* a, + const void* codebook, + const int a_rows, // code rows in element space, so k + const int a_cols, // code columns in element space, so n + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. + const int codebook_stride // as int4 +); + +void code2x8_dequant( + void* weights, + const void* a, + const void* codebook, + const int a_rows, // code rows in element space, so k + const int a_cols, // code columns in element space, so n + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. + const int codebook_stride // as int4 +); + + +int codebook_stride(const torch::Tensor& codebooks) +{ + return codebooks.stride(0) * codebooks.element_size() / sizeof(int4); +} + void code1x16_matvec( const torch::Tensor& A, const torch::Tensor& B, @@ -63,7 +89,7 @@ void code1x16_matvec( prob_m, prob_k, codebook_a_sizes, - codebook.stride(0) * codebook.element_size() / sizeof(int4) + codebook_stride(codebook) ); } @@ -125,7 +151,7 @@ void code2x8_matvec( prob_m, prob_k, codebook_a_sizes, - 2 * codebook.stride(0) * codebook.element_size() / sizeof(int4) + 2 * codebook_stride(codebook) ); } @@ -169,18 +195,9 @@ torch::Tensor code2x8_matmat( return output; } -torch::Tensor aqlm_gemm( - const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const torch::Tensor& codebook_partition_sizes, - const std::optional& bias -) +// Accumulate the partition sizes. 
+int4 accumulate_sizes (const torch::Tensor& codebook_partition_sizes) { - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); - int const entries = codebooks.size(1); - int4 cumulative_sizes; auto cumulative_size = &cumulative_sizes.x; int i = 0; @@ -196,6 +213,22 @@ torch::Tensor aqlm_gemm( { *cumulative_size = last*10; } + return cumulative_sizes; +} + +torch::Tensor aqlm_gemm( + const torch::Tensor& input, + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales, + const torch::Tensor& codebook_partition_sizes, + const std::optional& bias +) +{ + int4 cumulative_sizes = accumulate_sizes(codebook_partition_sizes); + + int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); + int const entries = codebooks.size(1); if (nbooks == 1 && entries == (1 << 16)) { @@ -209,3 +242,40 @@ torch::Tensor aqlm_gemm( TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") return {}; } + +torch::Tensor aqlm_dequant( + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& codebook_partition_sizes +) +{ + int4 cumulative_sizes = accumulate_sizes(codebook_partition_sizes); + + int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); + int const entries = codebooks.size(1); + + const at::cuda::OptionalCUDAGuard device_guard(device_of(codes)); + int rows = codes.size(1); + int cols = codes.size(0); + + auto weights = torch::empty({cols, rows * 8}, + torch::TensorOptions() + .dtype(codebooks.dtype()) + .device(codebooks.device()) + ); + + if (nbooks == 1 && entries == (1 << 16)) + { + code1x16_dequant(weights.data_ptr(), codes.data_ptr(), codebooks.data_ptr(), rows, cols, cumulative_sizes, codebook_stride(codebooks)); + return weights; + } + + if (nbooks == 2 && entries == (1 << 8)) + { + code2x8_dequant(weights.data_ptr(), codes.data_ptr(), codebooks.data_ptr(), rows, cols, cumulative_sizes, codebook_stride(codebooks)); + return weights; + } + + TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") + return {}; +} diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu index 9ae6a7eeb1587..9e9570ee0b195 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu @@ -188,6 +188,98 @@ __global__ void Code2x8MatVec( } } + +// Dequantizes the code and codebook into weights. +// We span horizontally and do an int4 at a time in an attempt to maximize throughput. +__global__ void Code1x16Dequant( + int4* __restrict__ weights, + const int4* __restrict__ a, + const int4* __restrict__ codebook, + const int a_rows, // code rows in int4 space, so same as stride. + const int a_cols, // code columns (matter?) + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. + const int codebook_stride // as int4 +) { + // Each thread decodes one int4 worth of codebook. + int a_col = blockIdx.x * 32 + threadIdx.x; + int a_row = blockIdx.y * 32 + threadIdx.y; + + // out of range + if (a_row >= a_rows) + return; + + const int weight_stride = a_rows * 8; // as int4 + weights += a_col * weight_stride + a_row * 8; + + // advance to the correct codebook, this easy because we only multiply one column of the codebook. 
+ auto codebook_size = &codebook_a_sizes.x; + while (a_col >= *codebook_size) + { + codebook += codebook_stride; + ++codebook_size; + } + + // do one int4 read and write, hopefully maxing out bandwidth. + int4 code_block = a[a_row + a_col * a_rows]; + const uint16_t* enc = reinterpret_cast(&code_block); + #pragma unroll + for (int i = 0; i < 8; i++) { + weights[i] = codebook[enc[i]]; + } +} + +// Dequantizes the code and codebook for 2x8 +// We span horizontally and do an int4 at a time in an attempt to maximize throughput. +__global__ void Code2x8Dequant( + int4* __restrict__ weights, + const int4* __restrict__ a, + const int4* __restrict__ codebook, + const int a_rows, // code rows in int4 space, so same as stride. + const int a_cols, // code columns (matter?) + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. + const int codebook_stride // as int4 +) { + // Each thread decodes one int4 worth of codebook. + int a_col = blockIdx.x * 32 + threadIdx.x; + int a_row = blockIdx.y * 32 + threadIdx.y; + + // out of range, can happen. + if (a_row >= a_rows) + return; + + const int weight_stride = a_rows * 8; // as int4 + weights += a_col * weight_stride + a_row * 8; + + // advance to the correct codebook, this easy because we only multiply one column of the codebook. + auto codebook_size = &codebook_a_sizes.x; + while (a_col >= *codebook_size) + { + // in pairs of two + codebook += codebook_stride * 2; + ++codebook_size; + } + + // do one int4 read to get it into local memory, hopefully maxing out bandwidth. + int4 code_block = a[a_row + a_col * a_rows]; + const uint8_t* enc = reinterpret_cast(&code_block); + #pragma unroll + for (int i = 0; i < 8; i++) { + int4 code1 = codebook[enc[i*2]]; + int4 code2 = (codebook + codebook_stride)[enc[i*2 + 1]]; + + half2* a = reinterpret_cast(&code1); + half2* b = reinterpret_cast(&code2); + #pragma unroll + for (int j = 0; j < 4; j++) + { + a[j].x = __hadd(a[j].x, b[j].x); + a[j].y = __hadd(a[j].y, b[j].y); + } + weights[i] = code1; + } +} + + inline int ceildiv(int a, int b) { return (a + b - 1) / b; } @@ -265,3 +357,70 @@ void code2x8_matvec_cuda( codebook_stride ); } + + +// Dequantizes the code and codebook into weights. +void code1x16_dequant( + void* __restrict__ weights, + const void* __restrict__ a, + const void* __restrict__ codebook, + const int a_rows, // code rows in element space, so k + const int a_cols, // code columns in element space, so n + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. + const int codebook_stride // as int4 +) { + dim3 threads(32, 32, 1); + + assert(a_cols % 32 == 0); + // each thread does one int4 worth. + assert(a_rows % 8 == 0); + + const int rows = a_rows/8; + + dim3 blocks(ceildiv(a_cols, 32), ceildiv(rows, 32), 1); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + Code1x16Dequant<<>>( + (int4*) weights, + (const int4*) a, + (const int4*) codebook, + rows, // in int4 space. + a_cols, + codebook_a_sizes, + codebook_stride + ); +} + +// Dequantizes the code and codebook into weights. +void code2x8_dequant( + void* __restrict__ weights, + const void* __restrict__ a, + const void* __restrict__ codebook, + const int a_rows, // code rows in element space, so k + const int a_cols, // code columns in element space, so n + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. 
+ const int codebook_stride // as int4 +) { + dim3 threads(32, 32, 1); + + assert(a_cols % 32 == 0); + // each thread does one int4 worth. + assert(a_rows % 8 == 0); + + const int rows = a_rows/8; + + dim3 blocks(ceildiv(a_cols, 32), ceildiv(rows, 32), 1); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + Code2x8Dequant<<>>( + (int4*) weights, + (const int4*) a, + (const int4*) codebook, + rows, // in int4 space. + a_cols, + codebook_a_sizes, + codebook_stride + ); +} + + diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index f53f9be6ad599..76f6d15c70ddb 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -151,7 +151,8 @@ class ColumnParallelLinear(torch.nn.Module): skip adding bias but instead return it. params_dtype: Data type for the parameters. linear_method: (Maybe quantized) linear method. - output_sizes: list of output sizes packed into one output, like for QKV the list would be size 3. + output_sizes: list of output sizes packed into one output, like for QKV + the list would be size 3. """ def __init__( diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 13b9751d79898..b49f7736684d5 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -1,4 +1,5 @@ -# Supports AQLM compression, see https://github.com/Vahe1994/AQLM and https://arxiv.org/pdf/2401.06118.pdf +# Supports AQLM compression, see https://github.com/Vahe1994/AQLM +# and https://arxiv.org/pdf/2401.06118.pdf from typing import Any, Dict, List, Optional @@ -9,7 +10,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) def get_int_dtype(nbits: int) -> torch.dtype: @@ -34,10 +36,15 @@ def dequantize_weight(codes: torch.Tensor, scales: Optional[torch.Tensor] = None) -> torch.Tensor: """ Decode float weights from quantization codes. Differentiable. 
- :param codes: tensor of integer quantization codes, shape [*dims, num_out_groups, num_in_groups, num_codebooks] - :param codebooks: tensor of vectors for each quantization code, [num_codebooks, codebook_size, out_group_size, in_group_size] - :param scales: weight will be multiplied by this factor, must be broadcastble with [*dims, out_groups, num_in_groups, out_group_size, in_group_size] - :return: reconstructed weight tensor of shape [*dims, num_in_groups*group_size] + :param codes: tensor of integer quantization codes, shape + [*dims, num_out_groups, num_in_groups, num_codebooks] + :param codebooks: tensor of vectors for each quantization code, + [num_codebooks, codebook_size, out_group_size, in_group_size] + :param scales: weight will be multiplied by this factor, must be + broadcastble with + [*dims, out_groups, num_in_groups, out_group_size, in_group_size] + :return: reconstructed weight tensor of shape + [*dims, num_in_groups*group_size] """ num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:] num_codebooks, codebook_size, out_group_size, in_group_size = codebooks.shape @@ -50,7 +57,8 @@ def dequantize_weight(codes: torch.Tensor, codes.flatten(0, -2) + codebook_offsets, codebooks.flatten(0, 1).flatten(-2, -1), mode="sum" - ) # [prod(dims) * num_out_groups * num_in_groups, out_group_size * in_group_size] + ) # [prod(dims) * num_out_groups * num_in_groups, out_group_size + # * in_group_size] reconstructed_weight_groupwise = reconstructed_weight_flat.view( list(codes.shape[:-3]) + @@ -78,7 +86,8 @@ def dequantize_gemm( return F.linear(input, dequantized_weight, bias) -def dequantize_partioned_gemm( +# Generic dequantization, slow but flexible. +def generic_dequantize_gemm( input: torch.Tensor, # [..., in_features] codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] codebooks: torch. @@ -92,7 +101,8 @@ def dequantize_partioned_gemm( num_outputs = len(output_partition_sizes) # break the inputs and codebooks apart then combine the outputs. - # Surprisingly (to me) this is faster than doing 3 de-quants and 1 big multiply at the end. + # Surprisingly (to me) this is faster than doing 3 de-quants and 1 big + # multiply at the end. num_codebooks = codebooks.shape[0] // num_outputs assert (scales.shape[0] == codes.shape[0]) assert (sum(output_partition_sizes) == scales.shape[0]) @@ -113,6 +123,35 @@ def dequantize_partioned_gemm( return output +# Optimized dequnantize/decompression kernels, supports 1x16 and 2x8 +# at 6 and 9 times faster than the generic version above, respectively. +def optimized_dequantize_gemm( + input: torch.Tensor, # [..., in_features] + codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] + codebooks: torch. + Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] + scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) + + if bias is None: + # scaling the output is fastest, so we do that when possible. 
+ output = F.linear(input, weights, bias) + orig_shape = output.shape + flattened_output = output.view(-1, output.size(-1)) + f_scales = scales.view(-1, scales.shape[0]) + b_scales = f_scales.expand(flattened_output.shape[0], -1) + flattened_output *= b_scales + return output.view(orig_shape) + else: + b_scales = scales.view(scales.shape[:-3] + (-1, )).expand( + -1, weights.shape[1]) + weights *= b_scales + return F.linear(input, weights, bias) + + class AQLMConfig(QuantizationConfig): """Config class for AQLM. @@ -211,10 +250,10 @@ def create_weights( codes = Parameter( torch.empty( - # There could actually be two pack factors, one along input and one along output, - # but we don't currently support out_group_size, - # and only the one along output needs to be marked with "packed_dim". - # in order for QKVLinear to work. + # There could actually be two pack factors, one along input and + # one along output, but we don't currently support + # out_group_size, and only the one along output needs to be + # marked with "packed_dim" in order for QKVLinear to work. output_size_per_partition, input_size_per_partition // self.quant_config.pack_factor, self.quant_config.num_codebooks, @@ -294,17 +333,38 @@ def apply_weights( output_partition_sizes = getattr(codebooks, "output_partition_sizes", None) - use_gemv = math.prod( - x.shape[:-1]) <= 32 or output_partition_sizes is None - - output = ops.aqlm_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) if use_gemv else dequantize_partioned_gemm( + nbooks = codes.shape[2] + ingroups = codebooks.shape[3] + outgroups = codebooks.shape[2] + bits = codebooks.shape[1] + + # We support these formats with dedicated gemm and decompression + # kernels. + if ingroups == 8 and outgroups == 1 and ( + (bits == 256 and nbooks == 2) or (bits == 65536 and nbooks == 1)): + + # thresholds determined by timings on an A6000 + m_threshold = 8 if bits == 65536 else 12 + use_gemv = math.prod(x.shape[:-1]) <= m_threshold + + return ops.aqlm_gemm( + x, + codes, + codebooks, + scales, + output_partition_sizes, + bias, + ) if use_gemv else optimized_dequantize_gemm( + x, + codes, + codebooks, + scales, + output_partition_sizes, + bias, + ) + + # fall back all unoptimized formats + return generic_dequantize_gemm( x, codes, codebooks, @@ -312,5 +372,3 @@ def apply_weights( output_partition_sizes, bias, ) - - return output From 39ca4a03b9b4d8605ba65ededcb978f33659b8dd Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 17:44:58 +0000 Subject: [PATCH 77/96] fix format --- benchmarks/kernels/benchmark_aqlm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index 8f2323c695830..37d75ff6d020f 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -258,10 +258,10 @@ def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor, if method is torch_mult: for i in range(num_calls): - output = torch_mult(input, weights, scales) + torch_mult(input, weights, scales) else: for i in range(num_calls): - output = method(input, codes, codebooks, scales, parts, None) + method(input, codes, codebooks, scales, parts, None) end_event.record() end_event.synchronize() From 522f99021d76dd64e77a1f6e74a4904fe51b914f Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 17:45:41 +0000 Subject: [PATCH 78/96] formatA --- benchmarks/kernels/benchmark_aqlm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index 37d75ff6d020f..9ec8f20e4d8cd 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -2,8 +2,6 @@ import sys from typing import Optional -os.environ['CUDA_VISIBLE_DEVICES'] = '0' - from vllm.model_executor.layers.quantization.aqlm import ( generic_dequantize_gemm, optimized_dequantize_gemm, dequantize_weight, get_int_dtype) @@ -12,6 +10,8 @@ import torch import torch.nn.functional as F +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + def torch_mult( input: torch.Tensor, # [..., in_features] From d2ac6b2ec9688d10c0fa2716180ff9cc64b92068 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 17:48:37 +0000 Subject: [PATCH 79/96] some format fixes --- tests/models/test_aqlm.py | 3 ++- vllm/model_executor/layers/quantization/aqlm.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 2464e7e20aa70..088c995e0c149 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -5,7 +5,8 @@ import pytest import torch -from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY +from vllm.model_executor.layers.quantization import ( + _QUANTIZATION_CONFIG_REGISTRY) capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index b49f7736684d5..12e198d3daa7e 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -47,7 +47,8 @@ def dequantize_weight(codes: torch.Tensor, [*dims, num_in_groups*group_size] """ num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:] - num_codebooks, codebook_size, out_group_size, in_group_size = codebooks.shape + num_codebooks, codebook_size, out_group_size, in_group_size = \ + codebooks.shape out_features = num_out_groups * out_group_size in_features = num_in_groups * in_group_size codebook_offsets = torch.arange( From bb66e3cc414a061b151522b5f8265c3cd8d7bb3f Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 17:51:53 +0000 Subject: [PATCH 80/96] formatting --- tests/models/test_aqlm.py | 53 ++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 088c995e0c149..5e1d57f5c3b43 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -15,22 +15,31 @@ # In this test we hardcode prompts and generations for the model so we don't need to require the AQLM package as a dependency example_prompts = [ - 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.\n', - 'Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.\n', - 'Compare and contrast artificial intelligence with human intelligence in terms of processing information.\n', - 'Describe the basic components of a neural network and how it can be trained.\n', + 'vLLM is a high-throughput and memory-efficient inference and serving ' + 'engine for LLMs.\n', + 'Briefly describe the major milestones in the development of artificial ' + 'intelligence from 1950 to 2020.\n', + 'Compare and contrast artificial intelligence with human intelligence in ' + 'terms of processing information.\n', + 'Describe the basic components of a neural 
network and how it can be ' + 'trained.\n', 'Write a short story about a robot that dreams for the first time.\n', - 'Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.\n', - 'Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.\n', - "Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'\n" + 'Analyze the impact of the COVID-19 pandemic on global economic structures ' + 'and future business models.\n', + 'Explain the cultural significance of the Mona Lisa painting, and how its ' + 'perception might vary in Western versus Eastern societies.\n', + "Translate the following English sentence into Japanese, French, and " + "Swahili: 'The early bird catches the worm.'\n" ] -# These ground truth generations were generated using `transformers==4.38.1 aqlm==1.1.0 torch==2.2.0` +# These ground truth generations were generated using `transformers==4.38.1 +# aqlm==1.1.0 torch==2.2.0` # and the below code: # ```python # from transformers import AutoTokenizer, AutoModelForCausalLM # model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf" -# quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="cuda").cuda() +# quantized_model = AutoModelForCausalLM.from_pretrained(model_id, +# torch_dtype="auto", device_map="cuda").cuda() # tokenizer = AutoTokenizer.from_pretrained(model_id) # outputs = [] # for prompt in example_prompts: @@ -39,14 +48,24 @@ # outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:])) # ``` ground_truth_generations = [ - '\n### Features\n\n- **High-throughput**: vLLM is designed to be memory-efficient and high-throughput. It', - 'The major milestones in the development of artificial intelligence from 1950 to 2020 are as follows:\n1950', - 'Compare and contrast artificial intelligence with human intelligence in terms of processing information. The processing of information is a key component of artificial intelligence. The processing of information is', - 'Explain the difference between supervised and unsupervised learning.\nExplain the difference between feedforward and recurrent neural networks.\nExplain the difference', - 'Write a short story about a robot that dreams for the first time. The story should be about 1000 words.\nThe story should be', - 'Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. The COVID-19 pandemic has had a', - 'The Mona Lisa is a painting by Leonardo da Vinci, and it is considered to be one of the most famous paintings in the world. The', - 'The early bird catches the worm.\nThe early bird catches the worm. (Japanese)\nLe petit oiseau attrait' + '\n### Features\n\n- **High-throughput**: vLLM is designed to be ' + 'memory-efficient and high-throughput. It', + 'The major milestones in the development of artificial intelligence from ' + '1950 to 2020 are as follows:\n1950', + 'Compare and contrast artificial intelligence with human intelligence in ' + 'terms of processing information. The processing of information is a key ' + 'component of artificial intelligence. The processing of information is', + 'Explain the difference between supervised and unsupervised ' + 'learning.\nExplain the difference between feedforward and recurrent ' + 'neural networks.\nExplain the difference', + 'Write a short story about a robot that dreams for the first time. 
The ' + 'story should be about 1000 words.\nThe story should be', + 'Analyze the impact of the COVID-19 pandemic on global economic structures ' + 'and future business models. The COVID-19 pandemic has had a', + 'The Mona Lisa is a painting by Leonardo da Vinci, and it is considered to ' + 'be one of the most famous paintings in the world. The', + 'The early bird catches the worm.\nThe early bird catches the worm. ' + '(Japanese)\nLe petit oiseau attrait' ] From 11c7950e226c8696f7398b4e8f6520098f6d4923 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 17:52:24 +0000 Subject: [PATCH 81/96] format --- tests/models/test_aqlm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 5e1d57f5c3b43..aedd707406e75 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -13,7 +13,8 @@ aqlm_not_supported = ( capability < _QUANTIZATION_CONFIG_REGISTRY["aqlm"].get_min_capability()) -# In this test we hardcode prompts and generations for the model so we don't need to require the AQLM package as a dependency +# In this test we hardcode prompts and generations for the model so we don't +# need to require the AQLM package as a dependency example_prompts = [ 'vLLM is a high-throughput and memory-efficient inference and serving ' 'engine for LLMs.\n', From fb78b9504b2f3bf2e46805376ddcbfb7cb741d40 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 17:54:52 +0000 Subject: [PATCH 82/96] remove dead space --- tests/models/test_aqlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index aedd707406e75..47c8f3db6ea33 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -13,7 +13,7 @@ aqlm_not_supported = ( capability < _QUANTIZATION_CONFIG_REGISTRY["aqlm"].get_min_capability()) -# In this test we hardcode prompts and generations for the model so we don't +# In this test we hardcode prompts and generations for the model so we don't # need to require the AQLM package as a dependency example_prompts = [ 'vLLM is a high-throughput and memory-efficient inference and serving ' From d73a92beb3ed37f3bba8100953a40dc8c769abd7 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 20:31:30 +0000 Subject: [PATCH 83/96] niceties for aqlm benchmark --- benchmarks/kernels/benchmark_aqlm.py | 44 +++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index 9ec8f20e4d8cd..e9383a8f1fc1a 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,5 +1,6 @@ import os import sys +import argparse from typing import Optional from vllm.model_executor.layers.quantization.aqlm import ( @@ -82,7 +83,9 @@ def dequant_no_scale( return F.linear(input, weights, bias) -# Compare my kernel against the gold standard. +# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against +# the generic pytorch version. +# Just visual comparison. def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: n = parts.sum().item() @@ -116,7 +119,7 @@ def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: codes[0, i, book] = i codes[0, -i, book] = i - weights = dequantize_weight(codes, codebooks, None) # TODO Scales. 
+ weights = dequantize_weight(codes, codebooks, None) weights2 = ops.aqlm_dequant(codes, codebooks, parts) print("weights shape:", weights.shape) @@ -134,12 +137,36 @@ def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: def main(): - nbooks = 2 - bits = 8 - - dequant_test(4096, torch.tensor((4096, )), nbooks, bits) - return - + parser = argparse.ArgumentParser(description="Benchmark aqlm performance.") + + # Add arguments + parser.add_argument("--nbooks", + type=int, + default=1, + help="Number of codebooks (default: 1)") + parser.add_argument("--bits", + type=int, + default=16, + help="Number of bits per code element (default: 16)") + parser.add_argument( + "--test", + type=bool, + default=False, + help="Run the decompression/dequant tester rather than benchmarking " + "(default: False)") + + # Parse the arguments + args = parser.parse_args() + + # Extract values + nbooks = args.nbooks + bits = args.bits + + if args.test: + dequant_test(4096, torch.tensor((4096, )), nbooks, bits) + return + + # Otherwise, benchmark. methods = [ ops.aqlm_gemm, dequant_out_scale, @@ -180,6 +207,7 @@ def main(): def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, methods): + # I didn't see visible improvements from increasing these, but feel free :) num_warmup_trials = 1 num_trials = 1 From 44065550693fe99f4eec2d8a1f65c5421bb65697 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 22 Mar 2024 17:19:23 +0000 Subject: [PATCH 84/96] update the test file --- examples/aqlm_example.py | 7 ++++--- tests/models/test_aqlm.py | 10 +++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 766fc93809bac..bacf68fac401f 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -32,9 +32,10 @@ def main(): "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", ] - model = LLM(args.model if args.model is not None else models[args.choice], - gpu_memory_utilization=.85, - tensor_parallel_size=args.tensor_parallel_size) + model = LLM( + args.model if args.model is not None else models[args.choice], + #gpu_memory_utilization=.85, + tensor_parallel_size=args.tensor_parallel_size) sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = model.generate("Hello my name is", diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 47c8f3db6ea33..c814b9e70711d 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -57,16 +57,16 @@ 'terms of processing information. The processing of information is a key ' 'component of artificial intelligence. The processing of information is', 'Explain the difference between supervised and unsupervised ' - 'learning.\nExplain the difference between feedforward and recurrent ' - 'neural networks.\nExplain the difference', + 'learning.\nExplain the difference between a feedforward neural network ' + 'and a recurrent neural network.\n', 'Write a short story about a robot that dreams for the first time. The ' 'story should be about 1000 words.\nThe story should be', 'Analyze the impact of the COVID-19 pandemic on global economic structures ' 'and future business models. The COVID-19 pandemic has had a', 'The Mona Lisa is a painting by Leonardo da Vinci, and it is considered to ' 'be one of the most famous paintings in the world. The', - 'The early bird catches the worm.\nThe early bird catches the worm. 
' - '(Japanese)\nLe petit oiseau attrait' + "Translate the following English sentence into Japanese, French, and " + "Swahili: 'The early bird catches the worm.'\nThe early bird catches" ] @@ -85,7 +85,7 @@ def test_models( num_logprobs: int, ) -> None: - vllm_model = vllm_runner(model, dtype=dtype, enforce_eager=True) + vllm_model = vllm_runner(model, dtype=dtype) vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, max_tokens, num_logprobs) From 36223428d0f73d59314eb5374015f6232e0e7468 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 22 Mar 2024 17:20:13 +0000 Subject: [PATCH 85/96] remove gpu_memory_utilization reduction --- examples/aqlm_example.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index bacf68fac401f..f32605420d0b0 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -34,7 +34,6 @@ def main(): model = LLM( args.model if args.model is not None else models[args.choice], - #gpu_memory_utilization=.85, tensor_parallel_size=args.tensor_parallel_size) sampling_params = SamplingParams(max_tokens=100, temperature=0) From e2b3529e1c0159cebd58e55ec5ab2b5d1db9e3ae Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 26 Mar 2024 14:24:11 +0000 Subject: [PATCH 86/96] port over better dequant kernels from aqlm --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 63 +++-- csrc/quantization/aqlm/aqlm_cuda_kernel.cu | 253 ++++++++++++--------- examples/aqlm_example.py | 5 +- 3 files changed, 190 insertions(+), 131 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 7ebfbd7af9fa4..683488a2bb4ef 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -44,22 +44,22 @@ void code2x8_matvec_cuda( const int codebook_stride // as int4. ); -void code1x16_dequant( - void* weights, - const void* a, +void code1x16_dequant_cuda( + const void* A, + void* C, const void* codebook, - const int a_rows, // code rows in element space, so k - const int a_cols, // code columns in element space, so n - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. - const int codebook_stride // as int4 + int prob_m, + int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. + const int codebook_stride // as int4. ); -void code2x8_dequant( - void* weights, - const void* a, +void code2x8_dequant_cuda( + const void* A, + void* C, const void* codebook, - const int a_rows, // code rows in element space, so k - const int a_cols, // code columns in element space, so n + int prob_m, + int prob_k, const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. const int codebook_stride // as int4 ); @@ -196,7 +196,7 @@ torch::Tensor code2x8_matmat( } // Accumulate the partition sizes. 
-int4 accumulate_sizes (const torch::Tensor& codebook_partition_sizes) +int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes) { int4 cumulative_sizes; auto cumulative_size = &cumulative_sizes.x; @@ -258,21 +258,48 @@ torch::Tensor aqlm_dequant( int rows = codes.size(1); int cols = codes.size(0); - auto weights = torch::empty({cols, rows * 8}, + auto in_features = codes.size(1) * 8; + auto out_features = codes.size(0); + + assert(out_features = codebook_partition_sizes.sum().item()); + + auto weights = torch::empty({out_features, in_features}, torch::TensorOptions() .dtype(codebooks.dtype()) .device(codebooks.device()) ); if (nbooks == 1 && entries == (1 << 16)) - { - code1x16_dequant(weights.data_ptr(), codes.data_ptr(), codebooks.data_ptr(), rows, cols, cumulative_sizes, codebook_stride(codebooks)); + { + code1x16_dequant_cuda( + codes.data_ptr(), + weights.data_ptr(), + codebooks.data_ptr(), + out_features, + in_features, + cumulative_sizes, + codebook_stride(codebooks)); + + // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation.) + // weights *= scales.index({"...", 0, 0}); + return weights; } if (nbooks == 2 && entries == (1 << 8)) - { - code2x8_dequant(weights.data_ptr(), codes.data_ptr(), codebooks.data_ptr(), rows, cols, cumulative_sizes, codebook_stride(codebooks)); + { + code2x8_dequant_cuda( + codes.data_ptr(), + weights.data_ptr(), + codebooks.data_ptr(), + out_features, + in_features, + cumulative_sizes, + codebook_stride(codebooks)); + + // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation) + // weights *= scales.index({"...", 0, 0}); + return weights; } diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu index 9e9570ee0b195..d2e950f0d24c2 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu @@ -189,97 +189,127 @@ __global__ void Code2x8MatVec( } -// Dequantizes the code and codebook into weights. -// We span horizontally and do an int4 at a time in an attempt to maximize throughput. __global__ void Code1x16Dequant( - int4* __restrict__ weights, - const int4* __restrict__ a, + const int4* __restrict__ A, + int4* __restrict__ C, const int4* __restrict__ codebook, - const int a_rows, // code rows in int4 space, so same as stride. - const int a_cols, // code columns (matter?) + int prob_m, + int prob_k, const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. const int codebook_stride // as int4 ) { - // Each thread decodes one int4 worth of codebook. - int a_col = blockIdx.x * 32 + threadIdx.x; - int a_row = blockIdx.y * 32 + threadIdx.y; - - // out of range - if (a_row >= a_rows) - return; - - const int weight_stride = a_rows * 8; // as int4 - weights += a_col * weight_stride + a_row * 8; + int a_gl_stride = prob_k / 8 / 8; + int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; - // advance to the correct codebook, this easy because we only multiply one column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_col >= *codebook_size) + if (pred) { - codebook += codebook_stride; - ++codebook_size; + // advance to the correct codebook, this easy because we only multiply one column of the codebook. 
+ auto codebook_size = &codebook_a_sizes.x; + while (a_gl_rd >= *codebook_size) + { + codebook += codebook_stride; + ++codebook_size; + } } - // do one int4 read and write, hopefully maxing out bandwidth. - int4 code_block = a[a_row + a_col * a_rows]; - const uint16_t* enc = reinterpret_cast(&code_block); - #pragma unroll - for (int i = 0; i < 8; i++) { - weights[i] = codebook[enc[i]]; + a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; + int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; + + int c_gl_stride = prob_k / 8; + int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8; + + int iters = (prob_k / 8 - 1) / (8 * 32) + 1; + while (iters--) { + if (pred && a_gl_rd < a_gl_end) { + const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); + #pragma unroll + for (int i = 0; i < 8; i++) { + int4 chunk; + auto dec = reinterpret_cast(&chunk); + // We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't + // actually help us; this brings > 2x speedup. + asm volatile ( + "ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) + : "l"((void*) &codebook[enc[i]]) + ); + + C[a_gl_rd * 8 + i] = chunk; + } + } + a_gl_rd += 32; } } -// Dequantizes the code and codebook for 2x8 -// We span horizontally and do an int4 at a time in an attempt to maximize throughput. + __global__ void Code2x8Dequant( - int4* __restrict__ weights, - const int4* __restrict__ a, + const int4* __restrict__ A, + int4* __restrict__ C, const int4* __restrict__ codebook, - const int a_rows, // code rows in int4 space, so same as stride. - const int a_cols, // code columns (matter?) - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. + int prob_m, + int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. const int codebook_stride // as int4 ) { - // Each thread decodes one int4 worth of codebook. - int a_col = blockIdx.x * 32 + threadIdx.x; - int a_row = blockIdx.y * 32 + threadIdx.y; + int a_gl_stride = prob_k / 8 / 8; + int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; - // out of range, can happen. - if (a_row >= a_rows) - return; + if (pred) + { + // advance to the correct codebook, this easy because we only multiply one column of the codebook. + auto codebook_size = &codebook_a_sizes.x; + while (a_gl_rd >= *codebook_size) + { + codebook += codebook_stride; + ++codebook_size; + } + } - const int weight_stride = a_rows * 8; // as int4 - weights += a_col * weight_stride + a_row * 8; + a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; + int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; + int lane = threadIdx.x % 8; - // advance to the correct codebook, this easy because we only multiply one column of the codebook. 
- auto codebook_size = &codebook_a_sizes.x; - while (a_col >= *codebook_size) - { - // in pairs of two - codebook += codebook_stride * 2; - ++codebook_size; + int c_gl_stride = prob_k / 8; + int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8; + + extern __shared__ int4 sh[]; + int4* sh_code = sh; + int4* sh_code0 = sh_code; + int4* sh_code1 = sh_code + 256 * 8; + + for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { + int4 dec = codebook[i]; + #pragma unroll + for (int j = 0; j < 8; j++) + sh_code[8 * i + (j + lane) % 8] = dec; } + __syncthreads(); - // do one int4 read to get it into local memory, hopefully maxing out bandwidth. - int4 code_block = a[a_row + a_col * a_rows]; - const uint8_t* enc = reinterpret_cast(&code_block); - #pragma unroll - for (int i = 0; i < 8; i++) { - int4 code1 = codebook[enc[i*2]]; - int4 code2 = (codebook + codebook_stride)[enc[i*2 + 1]]; + float res = 0; - half2* a = reinterpret_cast(&code1); - half2* b = reinterpret_cast(&code2); + int iters = (prob_k / 8 - 1) / (8 * 32) + 1; + while (iters--) { + if (pred && a_gl_rd < a_gl_end) { + const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); #pragma unroll - for (int j = 0; j < 4; j++) - { - a[j].x = __hadd(a[j].x, b[j].x); - a[j].y = __hadd(a[j].y, b[j].y); + for (int i = 0; i < 8; i++) { + int4 chunk; + half2* a0 = reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); + half2* a1 = reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); + #pragma unroll + for (int j = 0; j < 4; j++) + reinterpret_cast(&chunk)[j] = __hadd2(a0[j], a1[j]); + C[a_gl_rd * 8 + i] = chunk; } - weights[i] = code1; + } + a_gl_rd += 32; } } - inline int ceildiv(int a, int b) { return (a + b - 1) / b; } @@ -358,69 +388,72 @@ void code2x8_matvec_cuda( ); } - -// Dequantizes the code and codebook into weights. -void code1x16_dequant( - void* __restrict__ weights, - const void* __restrict__ a, +void code1x16_dequant_cuda( + const void* __restrict__ A, + void* __restrict__ C, const void* __restrict__ codebook, - const int a_rows, // code rows in element space, so k - const int a_cols, // code columns in element space, so n - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. - const int codebook_stride // as int4 + int prob_m, + int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. + const int codebook_stride // as int4. ) { - dim3 threads(32, 32, 1); - - assert(a_cols % 32 == 0); - // each thread does one int4 worth. - assert(a_rows % 8 == 0); - - const int rows = a_rows/8; - - dim3 blocks(ceildiv(a_cols, 32), ceildiv(rows, 32), 1); + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); + int waves = 0; + int thread_m; + do { + waves++; + thread_m = ceildiv(prob_m, waves * sms); + } while (thread_m > THREAD_M); + int blocks = ceildiv(prob_m, thread_m); + int threads = 32 * thread_m; cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); Code1x16Dequant<<>>( - (int4*) weights, - (const int4*) a, + (const int4*) A, + (int4*) C, (const int4*) codebook, - rows, // in int4 space. - a_cols, - codebook_a_sizes, - codebook_stride + prob_m, + prob_k, + codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. + codebook_stride // as int4. ); } // Dequantizes the code and codebook into weights. 
-void code2x8_dequant( - void* __restrict__ weights, - const void* __restrict__ a, +void code2x8_dequant_cuda( + const void* __restrict__ A, + void* __restrict__ C, const void* __restrict__ codebook, - const int a_rows, // code rows in element space, so k - const int a_cols, // code columns in element space, so n - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. + int prob_m, + int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. const int codebook_stride // as int4 ) { - dim3 threads(32, 32, 1); - - assert(a_cols % 32 == 0); - // each thread does one int4 worth. - assert(a_rows % 8 == 0); - - const int rows = a_rows/8; - - dim3 blocks(ceildiv(a_cols, 32), ceildiv(rows, 32), 1); + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); + int waves = 0; + int thread_m; + do { + waves++; + thread_m = ceildiv(prob_m, waves * sms); + } while (thread_m > THREAD_M); + int blocks = ceildiv(prob_m, thread_m); + int threads = 32 * thread_m; + int shared = 16 * (2 * 256 * 8 + 32 * 9); cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code2x8Dequant<<>>( - (int4*) weights, - (const int4*) a, + + cudaFuncSetAttribute( + Code2x8Dequant, cudaFuncAttributeMaxDynamicSharedMemorySize, shared + ); + Code2x8Dequant<<>>( + (const int4*) A, + (int4*) C, (const int4*) codebook, - rows, // in int4 space. - a_cols, + prob_m, + prob_k, codebook_a_sizes, codebook_stride ); } - - diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index f32605420d0b0..d290bfdefd4ae 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -32,9 +32,8 @@ def main(): "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", ] - model = LLM( - args.model if args.model is not None else models[args.choice], - tensor_parallel_size=args.tensor_parallel_size) + model = LLM(args.model if args.model is not None else models[args.choice], + tensor_parallel_size=args.tensor_parallel_size) sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = model.generate("Hello my name is", From 3d65a48ac0f465d3ccc1222f8391b7d82213b3d8 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 26 Mar 2024 16:20:42 +0000 Subject: [PATCH 87/96] better threshold for aqlm --- vllm/model_executor/layers/quantization/aqlm.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 12e198d3daa7e..193c86b2acfd2 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -344,9 +344,8 @@ def apply_weights( if ingroups == 8 and outgroups == 1 and ( (bits == 256 and nbooks == 2) or (bits == 65536 and nbooks == 1)): - # thresholds determined by timings on an A6000 - m_threshold = 8 if bits == 65536 else 12 - use_gemv = math.prod(x.shape[:-1]) <= m_threshold + # thresholds determined by timings on an A6000, one GPU + use_gemv = math.prod(x.shape[:-1]) <= 6 return ops.aqlm_gemm( x, From d033c85d63c8e398f8fb75d8212709e433d274a0 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 26 Mar 2024 16:27:13 +0000 Subject: [PATCH 88/96] format --- benchmarks/kernels/benchmark_aqlm.py | 12 ++++++------ examples/aqlm_example.py | 3 ++- tests/models/test_aqlm.py | 1 + vllm/model_executor/layers/quantization/aqlm.py | 7 ++++--- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git 
a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index e9383a8f1fc1a..9602d20bcbc74 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,16 +1,16 @@ +import argparse import os import sys -import argparse from typing import Optional -from vllm.model_executor.layers.quantization.aqlm import ( - generic_dequantize_gemm, optimized_dequantize_gemm, dequantize_weight, - get_int_dtype) -from vllm._C import ops - import torch import torch.nn.functional as F +from vllm._C import ops +from vllm.model_executor.layers.quantization.aqlm import ( + dequantize_weight, generic_dequantize_gemm, get_int_dtype, + optimized_dequantize_gemm) + os.environ['CUDA_VISIBLE_DEVICES'] = '0' diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index d290bfdefd4ae..e7c17fa0362ae 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -1,6 +1,7 @@ -from vllm import LLM, SamplingParams import argparse +from vllm import LLM, SamplingParams + def main(): diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index c814b9e70711d..380a8ee67e1f8 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -5,6 +5,7 @@ import pytest import torch + from vllm.model_executor.layers.quantization import ( _QUANTIZATION_CONFIG_REGISTRY) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 193c86b2acfd2..272d2c2fa2694 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -1,15 +1,16 @@ # Supports AQLM compression, see https://github.com/Vahe1994/AQLM # and https://arxiv.org/pdf/2401.06118.pdf +import math from typing import Any, Dict, List, Optional -import math import torch -from torch.nn.parameter import Parameter import torch.nn.functional as F +from torch.nn.parameter import Parameter from vllm._C import ops -from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs +from vllm.model_executor.layers.linear import (LinearMethodBase, + set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) From 92206de9a8440d7fc3f59a43340b15f39680abcb Mon Sep 17 00:00:00 2001 From: mgoin Date: Tue, 9 Apr 2024 14:25:14 +0000 Subject: [PATCH 89/96] Update test point --- tests/models/test_aqlm.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 380a8ee67e1f8..e0a6c9e697dbc 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -48,6 +48,7 @@ # input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda") # hf_outputs = quantized_model.generate(input_ids, max_new_tokens=32) # outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:])) +# print(outputs) # ``` ground_truth_generations = [ '\n### Features\n\n- **High-throughput**: vLLM is designed to be ' @@ -57,17 +58,17 @@ 'Compare and contrast artificial intelligence with human intelligence in ' 'terms of processing information. The processing of information is a key ' 'component of artificial intelligence. 
The processing of information is', - 'Explain the difference between supervised and unsupervised ' - 'learning.\nExplain the difference between a feedforward neural network ' - 'and a recurrent neural network.\n', + 'Explain the difference between supervised and unsupervised learning.\n' + 'Explain the difference between feedforward and recurrent neural networks.' + '\nExplain the difference', 'Write a short story about a robot that dreams for the first time. The ' 'story should be about 1000 words.\nThe story should be', - 'Analyze the impact of the COVID-19 pandemic on global economic structures ' - 'and future business models. The COVID-19 pandemic has had a', - 'The Mona Lisa is a painting by Leonardo da Vinci, and it is considered to ' - 'be one of the most famous paintings in the world. The', - "Translate the following English sentence into Japanese, French, and " - "Swahili: 'The early bird catches the worm.'\nThe early bird catches" + 'Analyze the impact of the COVID-19 pandemic on global economic structures' + ' and future business models. The COVID-19 pandemic has had a', + 'The Mona Lisa is a painting by Leonardo da Vinci, and it is considered ' + 'to be one of the most famous paintings in the world. The', + 'The early bird catches the worm.\nThe early bird catches the worm. ' + '(Japanese)\nLe petit oiseau attrait' ] From 811e2cc156a1869f75e245a62a3fe9d5ec52379f Mon Sep 17 00:00:00 2001 From: mgoin Date: Tue, 9 Apr 2024 15:53:10 +0000 Subject: [PATCH 90/96] Poke test again --- tests/models/test_aqlm.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index e0a6c9e697dbc..020897aaaf9c8 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -1,6 +1,6 @@ """Compare the outputs of a AQLM model between vLLM and HF Transformers -Run `pytest tests/models/test_aqlm.py --forked`. +Run `pytest tests/models/test_aqlm.py`. """ import pytest @@ -51,24 +51,17 @@ # print(outputs) # ``` ground_truth_generations = [ - '\n### Features\n\n- **High-throughput**: vLLM is designed to be ' - 'memory-efficient and high-throughput. It', + '\n### Features\n\n- **High-throughput**: v', 'The major milestones in the development of artificial intelligence from ' - '1950 to 2020 are as follows:\n1950', + '195', 'Compare and contrast artificial intelligence with human intelligence in ' - 'terms of processing information. The processing of information is a key ' - 'component of artificial intelligence. The processing of information is', - 'Explain the difference between supervised and unsupervised learning.\n' - 'Explain the difference between feedforward and recurrent neural networks.' - '\nExplain the difference', - 'Write a short story about a robot that dreams for the first time. The ' - 'story should be about 1000 words.\nThe story should be', - 'Analyze the impact of the COVID-19 pandemic on global economic structures' - ' and future business models. The COVID-19 pandemic has had a', - 'The Mona Lisa is a painting by Leonardo da Vinci, and it is considered ' - 'to be one of the most famous paintings in the world. The', - 'The early bird catches the worm.\nThe early bird catches the worm. ' - '(Japanese)\nLe petit oiseau attrait' + 'terms of processing information. The', + 'Explain the difference between supervised and unsupervised learning.' + '\nExplain', + 'Write a short story about a robot that dreams for the first time. 
The', + 'Analyze the impact of the COVID-19 pandemic on global economic', + 'The Mona Lisa is a painting by Leonardo da Vinci, and it', + 'The early bird catches the worm.\nThe early bird catches the' ] @@ -76,8 +69,8 @@ reason="AQLM is not supported on this GPU type.") @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) @pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [3]) +@pytest.mark.parametrize("max_tokens", [16]) +@pytest.mark.parametrize("num_logprobs", [1]) def test_models( vllm_runner, example_prompts, @@ -97,4 +90,5 @@ def test_models( vllm_output_ids, vllm_output_str, vllm_logprobs = vllm_outputs[ prompt_idx] + print("Output generation:", repr(vllm_output_str)) assert vllm_output_str == ground_truth_generations[prompt_idx] From d0e8d0cc1af5637d4e5f9f5904da28f97cbc9e9b Mon Sep 17 00:00:00 2001 From: mgoin Date: Mon, 15 Apr 2024 19:55:32 +0000 Subject: [PATCH 91/96] Resolve create_weights updates --- .../layers/quantization/aqlm.py | 32 +++++++++---------- .../layers/quantization/marlin.py | 3 +- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 272d2c2fa2694..6115b1de679ad 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -224,14 +224,11 @@ class AQLMLinearMethod(LinearMethodBase): def __init__(self, quant_config: AQLMConfig): self.quant_config = quant_config - def create_weights( - self, - input_size_per_partition: int, - output_partition_sizes: List[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - ) -> Dict[str, Any]: + def create_weights(self, layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + **extra_weight_attrs): del output_size # Unused. del input_size # Unused. 
@@ -317,21 +314,22 @@ def create_weights( }, ) - return { - "codes": codes, - "codebooks": codebooks, - "scales": scales, - } + layer.register_parameter("codes", codes) + set_weight_attrs(codes, extra_weight_attrs) + layer.register_parameter("codebooks", codebooks) + set_weight_attrs(codebooks, extra_weight_attrs) + layer.register_parameter("scales", scales) + set_weight_attrs(scales, extra_weight_attrs) def apply_weights( self, - weights: Dict[str, Any], + layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - codebooks = weights["codebooks"] - codes = weights["codes"] - scales = weights["scales"] + codebooks = layer.codebooks + codes = layer.codes + scales = layer.scales output_partition_sizes = getattr(codebooks, "output_partition_sizes", None) diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index bf0500f1155a1..00c3c404c2d7a 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -93,7 +93,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_size_per_partition: int, + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, @@ -106,6 +106,7 @@ def create_weights( f"The params dtype must be float16, but got {params_dtype}") # Validate output_size_per_partition + output_size_per_partition = sum(output_partition_sizes) if output_size_per_partition % self.quant_config.min_n_threads != 0: raise ValueError( f"Weight output_size_per_partition = " From 6bb89c00d4e7cb87b23cdb0de404586007d1c19e Mon Sep 17 00:00:00 2001 From: mgoin Date: Tue, 16 Apr 2024 14:59:40 +0000 Subject: [PATCH 92/96] Better test debug output (manually tested TP) --- tests/models/test_aqlm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 020897aaaf9c8..f653d340fc18f 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -90,5 +90,7 @@ def test_models( vllm_output_ids, vllm_output_str, vllm_logprobs = vllm_outputs[ prompt_idx] - print("Output generation:", repr(vllm_output_str)) + print("Prompt: ", repr(example_prompts[prompt_idx])) + print("Reference output:", repr(ground_truth_generations[prompt_idx])) + print("Output output: ", repr(vllm_output_str)) assert vllm_output_str == ground_truth_generations[prompt_idx] From 4d46f1810e57dc994486d7e85e87696ef761f158 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 18 Apr 2024 10:57:18 -0400 Subject: [PATCH 93/96] Delete csrc/quantization/aqlm/LICENSE --- csrc/quantization/aqlm/LICENSE | 203 --------------------------------- 1 file changed, 203 deletions(-) delete mode 100644 csrc/quantization/aqlm/LICENSE diff --git a/csrc/quantization/aqlm/LICENSE b/csrc/quantization/aqlm/LICENSE deleted file mode 100644 index bfa740da977e9..0000000000000 --- a/csrc/quantization/aqlm/LICENSE +++ /dev/null @@ -1,203 +0,0 @@ -Contains code from https://github.com/Vahe1994/AQLM - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. 
- - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [2024] [AQLM authors] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
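
Patch 91 above moves AQLMLinearMethod onto the newer create_weights/apply_weights interface, where the quantized tensors are registered on the layer module itself rather than returned in a dict. The sketch below only illustrates that pattern under simplified assumptions: the function names, shapes and dtypes are placeholders for this note, not code taken from the series.

    import torch
    import torch.nn.functional as F
    from torch.nn.parameter import Parameter

    def create_weights_sketch(layer: torch.nn.Module, out_features: int,
                              in_features: int) -> None:
        # Register the quantized tensors on the layer so later code can read
        # them back as attributes (layer.codes, layer.codebooks, layer.scales).
        codes = Parameter(torch.zeros(out_features, in_features // 8, 1,
                                      dtype=torch.int16),
                          requires_grad=False)
        codebooks = Parameter(torch.zeros(1, 2**16, 1, 8,
                                          dtype=torch.float16),
                              requires_grad=False)
        scales = Parameter(torch.ones(out_features, 1, 1, 1,
                                      dtype=torch.float16),
                           requires_grad=False)
        layer.register_parameter("codes", codes)
        layer.register_parameter("codebooks", codebooks)
        layer.register_parameter("scales", scales)

    def apply_weights_sketch(layer: torch.nn.Module, x: torch.Tensor,
                             bias=None) -> torch.Tensor:
        # Mirrors the refactored (layer, x, bias) signature: weights come off
        # the layer instead of a weights dict. The compute path below is a
        # placeholder; the real method dispatches to the AQLM CUDA kernels.
        codes, codebooks, scales = layer.codes, layer.codebooks, layer.scales
        weight = torch.zeros(codes.shape[0], x.shape[-1], dtype=x.dtype)
        out = F.linear(x, weight) * scales.flatten().to(x.dtype)
        if bias is not None:
            out = out + bias
        return out

The AQLMLinearMethod changes in patch 91 follow this shape: create_weights registers codes, codebooks and scales (with set_weight_attrs for the loader), and apply_weights reads them back from the layer before choosing between the gemv and dequant kernels.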
From a29008d3e1725d17709fc6d17442963fac8d17ae Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Apr 2024 15:43:56 +0000 Subject: [PATCH 94/96] Address comments --- CMakeLists.txt | 4 ++-- .../aqlm/{aqlm_cuda_entry.cpp => cuda_entry.cpp} | 0 .../aqlm/{aqlm_cuda_kernel.cu => gemm_kernels.cu} | 0 vllm/model_executor/layers/linear.py | 11 ++++++----- 4 files changed, 8 insertions(+), 7 deletions(-) rename csrc/quantization/aqlm/{aqlm_cuda_entry.cpp => cuda_entry.cpp} (100%) rename csrc/quantization/aqlm/{aqlm_cuda_kernel.cu => gemm_kernels.cu} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 10a5179666487..6e8e371764150 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,8 +173,8 @@ set(VLLM_EXT_SRC if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC - "csrc/quantization/aqlm/aqlm_cuda_entry.cpp" - "csrc/quantization/aqlm/aqlm_cuda_kernel.cu" + "csrc/quantization/aqlm/cuda_entry.cpp" + "csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/marlin/marlin_cuda_kernel.cu" "csrc/custom_all_reduce.cu") diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/cuda_entry.cpp similarity index 100% rename from csrc/quantization/aqlm/aqlm_cuda_entry.cpp rename to csrc/quantization/aqlm/cuda_entry.cpp diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/gemm_kernels.cu similarity index 100% rename from csrc/quantization/aqlm/aqlm_cuda_kernel.cu rename to csrc/quantization/aqlm/gemm_kernels.cu diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 0ec448cc6ab8b..d471a43fe822b 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -399,13 +399,14 @@ def __init__( input_size = self.hidden_size output_size = (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size + output_sizes = [ + self.num_heads * tp_size * self.head_size, + self.num_kv_heads * tp_size * self.head_size, + self.num_kv_heads * tp_size * self.head_size + ] super().__init__(input_size, output_size, bias, False, skip_bias_add, - params_dtype, linear_method, [ - self.num_heads * tp_size * self.head_size, - self.num_kv_heads * tp_size * self.head_size, - self.num_kv_heads * tp_size * self.head_size - ]) + params_dtype, linear_method, output_sizes) def weight_loader(self, param: Parameter, From 385211568061b29d34c179d07dd64d52fbbefbfb Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Apr 2024 15:52:31 +0000 Subject: [PATCH 95/96] Update test --- tests/models/test_aqlm.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index f653d340fc18f..a7abc011f57d7 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -6,13 +6,12 @@ import pytest import torch -from vllm.model_executor.layers.quantization import ( - _QUANTIZATION_CONFIG_REGISTRY) +from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] -aqlm_not_supported = ( - capability < _QUANTIZATION_CONFIG_REGISTRY["aqlm"].get_min_capability()) +aqlm_not_supported = (capability < + QUANTIZATION_METHODS["aqlm"].get_min_capability()) # In this test we hardcode prompts and generations for the model so we don't # need to require the AQLM package as a dependency From d3678950196c38fd773be14955ed52d6979c3d98 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Apr 2024 18:56:14 +0000 Subject: 
[PATCH 96/96] Cleanup namespaces --- CMakeLists.txt | 1 - csrc/quantization/aqlm/cuda_entry.cpp | 308 ------------------------- csrc/quantization/aqlm/gemm_kernels.cu | 253 ++++++++++++++++++++ 3 files changed, 253 insertions(+), 309 deletions(-) delete mode 100644 csrc/quantization/aqlm/cuda_entry.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e8e371764150..b2d0cf3e568b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,7 +173,6 @@ set(VLLM_EXT_SRC if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC - "csrc/quantization/aqlm/cuda_entry.cpp" "csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/marlin/marlin_cuda_kernel.cu" diff --git a/csrc/quantization/aqlm/cuda_entry.cpp b/csrc/quantization/aqlm/cuda_entry.cpp deleted file mode 100644 index 683488a2bb4ef..0000000000000 --- a/csrc/quantization/aqlm/cuda_entry.cpp +++ /dev/null @@ -1,308 +0,0 @@ -/* - * Modified by Neural Magic - * Adapted from https://github.com/Vahe1994/AQLM - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include - -void code1x16_matvec_cuda( - const void* A, - const void* B, - void* C, - const void* codebook, - int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. - const int codebook_stride // as int4. -); - -void code2x8_matvec_cuda( - const void* A, - const void* B, - void* C, - const void* codebook, - int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. - const int codebook_stride // as int4. -); - -void code1x16_dequant_cuda( - const void* A, - void* C, - const void* codebook, - int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. - const int codebook_stride // as int4. -); - -void code2x8_dequant_cuda( - const void* A, - void* C, - const void* codebook, - int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. - const int codebook_stride // as int4 -); - - -int codebook_stride(const torch::Tensor& codebooks) -{ - return codebooks.stride(0) * codebooks.element_size() / sizeof(int4); -} - -void code1x16_matvec( - const torch::Tensor& A, - const torch::Tensor& B, - torch::Tensor& C, - const torch::Tensor& codebook, - const int4 codebook_a_sizes // cumulative sizes of A spanning each codebook, at most 3 long. 
-) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); - int prob_m = C.size(0); - int prob_k = B.size(0); - - code1x16_matvec_cuda( - A.data_ptr(), - B.data_ptr(), - C.data_ptr(), - codebook.data_ptr(), - prob_m, - prob_k, - codebook_a_sizes, - codebook_stride(codebook) - ); -} - -torch::Tensor code1x16_matmat( - const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const int4 codebook_a_sizes, - const std::optional& bias) { - auto input_sizes = input.sizes(); - auto out_features = codes.size(0) * codebooks.size(2); - auto flat_input = input.reshape({-1, input.size(-1)}); - auto flat_output = torch::empty({flat_input.size(0), out_features}, - torch::TensorOptions() - .dtype(input.dtype()) - .device(input.device()) - ); - - for (int i = 0; i < flat_input.size(0); ++i) { - auto input_vec = flat_input.index({i}); - auto output_vec = flat_output.index({i}); - code1x16_matvec( - codes.squeeze(2), - input_vec, - output_vec, - codebooks, - codebook_a_sizes - ); - } - flat_output *= scales.flatten().unsqueeze(0); - - if (bias.has_value()) { - flat_output += bias->unsqueeze(0); - } - - auto output_sizes = input_sizes.vec(); - output_sizes.pop_back(); - output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes); - return output; -} - -void code2x8_matvec( - const torch::Tensor& A, - const torch::Tensor& B, - torch::Tensor& C, - const torch::Tensor& codebook, - const int4 codebook_a_sizes -) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); - int prob_m = C.size(0); - int prob_k = B.size(0); - code2x8_matvec_cuda( - A.data_ptr(), - B.data_ptr(), - C.data_ptr(), - codebook.data_ptr(), - prob_m, - prob_k, - codebook_a_sizes, - 2 * codebook_stride(codebook) - ); -} - -torch::Tensor code2x8_matmat( - const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const int4 codebook_a_sizes, - const std::optional& bias -) { - auto input_sizes = input.sizes(); - auto out_features = codes.size(0) * codebooks.size(2); - auto flat_input = input.reshape({-1, input.size(-1)}); - auto flat_output = torch::empty({flat_input.size(0), out_features}, - torch::TensorOptions() - .dtype(input.dtype()) - .device(input.device()) - ); - - for (int i = 0; i < flat_input.size(0); ++i) { - auto input_vec = flat_input.index({i}); - auto output_vec = flat_output.index({i}); - code2x8_matvec( - codes.squeeze(2), - input_vec, - output_vec, - codebooks, - codebook_a_sizes - ); - } - flat_output *= scales.flatten().unsqueeze(0); - if (bias.has_value()) { - flat_output += bias->unsqueeze(0); - } - - auto output_sizes = input_sizes.vec(); - output_sizes.pop_back(); - output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes); - return output; -} - -// Accumulate the partition sizes. -int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes) -{ - int4 cumulative_sizes; - auto cumulative_size = &cumulative_sizes.x; - int i = 0; - int last = 0; - assert(codebook_partition_sizes.size(0) <= 4); - for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) - { - *cumulative_size = codebook_partition_sizes[i].item() + last; - last = *cumulative_size; - } - // fill in the rest with unreachable. 
- for (; i < 4; ++i, ++cumulative_size) - { - *cumulative_size = last*10; - } - return cumulative_sizes; -} - -torch::Tensor aqlm_gemm( - const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const torch::Tensor& codebook_partition_sizes, - const std::optional& bias -) -{ - int4 cumulative_sizes = accumulate_sizes(codebook_partition_sizes); - - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); - int const entries = codebooks.size(1); - - if (nbooks == 1 && entries == (1 << 16)) - { - return code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); - } - if (nbooks == 2 && entries == (1 << 8)) - { - return code2x8_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); - } - - TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") - return {}; -} - -torch::Tensor aqlm_dequant( - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& codebook_partition_sizes -) -{ - int4 cumulative_sizes = accumulate_sizes(codebook_partition_sizes); - - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); - int const entries = codebooks.size(1); - - const at::cuda::OptionalCUDAGuard device_guard(device_of(codes)); - int rows = codes.size(1); - int cols = codes.size(0); - - auto in_features = codes.size(1) * 8; - auto out_features = codes.size(0); - - assert(out_features = codebook_partition_sizes.sum().item()); - - auto weights = torch::empty({out_features, in_features}, - torch::TensorOptions() - .dtype(codebooks.dtype()) - .device(codebooks.device()) - ); - - if (nbooks == 1 && entries == (1 << 16)) - { - code1x16_dequant_cuda( - codes.data_ptr(), - weights.data_ptr(), - codebooks.data_ptr(), - out_features, - in_features, - cumulative_sizes, - codebook_stride(codebooks)); - - // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation.) - // weights *= scales.index({"...", 0, 0}); - - return weights; - } - - if (nbooks == 2 && entries == (1 << 8)) - { - code2x8_dequant_cuda( - codes.data_ptr(), - weights.data_ptr(), - codebooks.data_ptr(), - out_features, - in_features, - cumulative_sizes, - codebook_stride(codebooks)); - - // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation) - // weights *= scales.index({"...", 0, 0}); - - return weights; - } - - TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") - return {}; -} diff --git a/csrc/quantization/aqlm/gemm_kernels.cu b/csrc/quantization/aqlm/gemm_kernels.cu index d2e950f0d24c2..4415316e1e8cd 100644 --- a/csrc/quantization/aqlm/gemm_kernels.cu +++ b/csrc/quantization/aqlm/gemm_kernels.cu @@ -18,9 +18,16 @@ #include #include #include +#include #include +#include #include +#include + + +namespace vllm { +namespace aqlm { __global__ void Code1x16MatVec( const int4* __restrict__ A, @@ -457,3 +464,249 @@ void code2x8_dequant_cuda( codebook_stride ); } + +int codebook_stride(const torch::Tensor& codebooks) +{ + return codebooks.stride(0) * codebooks.element_size() / sizeof(int4); +} + +void code1x16_matvec( + const torch::Tensor& A, + const torch::Tensor& B, + torch::Tensor& C, + const torch::Tensor& codebook, + const int4 codebook_a_sizes // cumulative sizes of A spanning each codebook, at most 3 long. 
+) {
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
+  int prob_m = C.size(0);
+  int prob_k = B.size(0);
+
+  code1x16_matvec_cuda(
+    A.data_ptr(),
+    B.data_ptr(),
+    C.data_ptr(),
+    codebook.data_ptr(),
+    prob_m,
+    prob_k,
+    codebook_a_sizes,
+    codebook_stride(codebook)
+  );
+}
+
+torch::Tensor code1x16_matmat(
+  const torch::Tensor& input,
+  const torch::Tensor& codes,
+  const torch::Tensor& codebooks,
+  const torch::Tensor& scales,
+  const int4 codebook_a_sizes,
+  const std::optional<torch::Tensor>& bias) {
+  auto input_sizes = input.sizes();
+  auto out_features = codes.size(0) * codebooks.size(2);
+  auto flat_input = input.reshape({-1, input.size(-1)});
+  auto flat_output = torch::empty({flat_input.size(0), out_features},
+    torch::TensorOptions()
+      .dtype(input.dtype())
+      .device(input.device())
+  );
+
+  for (int i = 0; i < flat_input.size(0); ++i) {
+    auto input_vec = flat_input.index({i});
+    auto output_vec = flat_output.index({i});
+    code1x16_matvec(
+      codes.squeeze(2),
+      input_vec,
+      output_vec,
+      codebooks,
+      codebook_a_sizes
+    );
+  }
+  flat_output *= scales.flatten().unsqueeze(0);
+
+  if (bias.has_value()) {
+    flat_output += bias->unsqueeze(0);
+  }
+
+  auto output_sizes = input_sizes.vec();
+  output_sizes.pop_back();
+  output_sizes.push_back(-1);
+  auto output = flat_output.reshape(output_sizes);
+  return output;
+}
+
+void code2x8_matvec(
+  const torch::Tensor& A,
+  const torch::Tensor& B,
+  torch::Tensor& C,
+  const torch::Tensor& codebook,
+  const int4 codebook_a_sizes
+) {
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
+  int prob_m = C.size(0);
+  int prob_k = B.size(0);
+  code2x8_matvec_cuda(
+    A.data_ptr(),
+    B.data_ptr(),
+    C.data_ptr(),
+    codebook.data_ptr(),
+    prob_m,
+    prob_k,
+    codebook_a_sizes,
+    2 * codebook_stride(codebook)
+  );
+}
+
+torch::Tensor code2x8_matmat(
+  const torch::Tensor& input,
+  const torch::Tensor& codes,
+  const torch::Tensor& codebooks,
+  const torch::Tensor& scales,
+  const int4 codebook_a_sizes,
+  const std::optional<torch::Tensor>& bias
+) {
+  auto input_sizes = input.sizes();
+  auto out_features = codes.size(0) * codebooks.size(2);
+  auto flat_input = input.reshape({-1, input.size(-1)});
+  auto flat_output = torch::empty({flat_input.size(0), out_features},
+    torch::TensorOptions()
+      .dtype(input.dtype())
+      .device(input.device())
+  );
+
+  for (int i = 0; i < flat_input.size(0); ++i) {
+    auto input_vec = flat_input.index({i});
+    auto output_vec = flat_output.index({i});
+    code2x8_matvec(
+      codes.squeeze(2),
+      input_vec,
+      output_vec,
+      codebooks,
+      codebook_a_sizes
+    );
+  }
+  flat_output *= scales.flatten().unsqueeze(0);
+  if (bias.has_value()) {
+    flat_output += bias->unsqueeze(0);
+  }
+
+  auto output_sizes = input_sizes.vec();
+  output_sizes.pop_back();
+  output_sizes.push_back(-1);
+  auto output = flat_output.reshape(output_sizes);
+  return output;
+}
+
+// Accumulate the partition sizes.
+int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes)
+{
+  int4 cumulative_sizes;
+  auto cumulative_size = &cumulative_sizes.x;
+  int i = 0;
+  int last = 0;
+  assert(codebook_partition_sizes.size(0) <= 4);
+  for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size)
+  {
+    *cumulative_size = codebook_partition_sizes[i].item<int>() + last;
+    last = *cumulative_size;
+  }
+  // fill in the rest with unreachable.
+  for (; i < 4; ++i, ++cumulative_size)
+  {
+    *cumulative_size = last*10;
+  }
+  return cumulative_sizes;
+}
+
+} // namespace aqlm
+} // namespace vllm
+
+
+torch::Tensor aqlm_gemm(
+  const torch::Tensor& input,
+  const torch::Tensor& codes,
+  const torch::Tensor& codebooks,
+  const torch::Tensor& scales,
+  const torch::Tensor& codebook_partition_sizes,
+  const std::optional<torch::Tensor>& bias
+)
+{
+  int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
+
+  int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
+  int const entries = codebooks.size(1);
+
+  if (nbooks == 1 && entries == (1 << 16))
+  {
+    return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias);
+  }
+  if (nbooks == 2 && entries == (1 << 8))
+  {
+    return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, cumulative_sizes, bias);
+  }
+
+  TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.")
+  return {};
+}
+
+torch::Tensor aqlm_dequant(
+  const torch::Tensor& codes,
+  const torch::Tensor& codebooks,
+  const torch::Tensor& codebook_partition_sizes
+)
+{
+  int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
+
+  int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
+  int const entries = codebooks.size(1);
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(codes));
+  int rows = codes.size(1);
+  int cols = codes.size(0);
+
+  auto in_features = codes.size(1) * 8;
+  auto out_features = codes.size(0);
+
+  assert(out_features == codebook_partition_sizes.sum().item<int>());
+
+  auto weights = torch::empty({out_features, in_features},
+    torch::TensorOptions()
+      .dtype(codebooks.dtype())
+      .device(codebooks.device())
+  );
+
+  if (nbooks == 1 && entries == (1 << 16))
+  {
+    vllm::aqlm::code1x16_dequant_cuda(
+      codes.data_ptr(),
+      weights.data_ptr(),
+      codebooks.data_ptr(),
+      out_features,
+      in_features,
+      cumulative_sizes,
+      vllm::aqlm::codebook_stride(codebooks));
+
+    // To scale the weights here instead, uncomment the line below (roughly 30% slower and not consistent with the gemv implementation).
+    // weights *= scales.index({"...", 0, 0});
+
+    return weights;
+  }
+
+  if (nbooks == 2 && entries == (1 << 8))
+  {
+    vllm::aqlm::code2x8_dequant_cuda(
+      codes.data_ptr(),
+      weights.data_ptr(),
+      codebooks.data_ptr(),
+      out_features,
+      in_features,
+      cumulative_sizes,
+      vllm::aqlm::codebook_stride(codebooks));
+
+    // To scale the weights here instead, uncomment the line below (roughly 30% slower and not consistent with the gemv implementation).
+    // weights *= scales.index({"...", 0, 0});
+
+    return weights;
+  }
+
+  TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.")
+  return {};
+}
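For reference, the cumulative-size packing consumed by aqlm_gemm and aqlm_dequant can be exercised on its own. The sketch below is illustrative only: it mirrors accumulate_sizes in plain host C++ (std::array stands in for CUDA's int4, and the partition sizes in main are hypothetical), showing how the running prefix sum over codebook partitions is built and how unused slots are padded with a value no row index can reach.

// Standalone mirror of accumulate_sizes (illustrative only; std::array replaces int4).
#include <array>
#include <cassert>
#include <cstdio>
#include <vector>

std::array<int, 4> accumulate_sizes_ref(const std::vector<int>& partition_sizes) {
  assert(partition_sizes.size() <= 4);
  std::array<int, 4> cumulative{};
  int last = 0;
  size_t i = 0;
  for (; i < partition_sizes.size(); ++i) {
    cumulative[i] = partition_sizes[i] + last;  // running prefix sum, as in the kernel helper
    last = cumulative[i];
  }
  for (; i < 4; ++i) {
    cumulative[i] = last * 10;                  // unreachable padding for unused codebook slots
  }
  return cumulative;
}

int main() {
  // Hypothetical partition sizes, e.g. fused projections packed into one weight.
  auto c = accumulate_sizes_ref({1024, 1024, 4096});
  std::printf("%d %d %d %d\n", c[0], c[1], c[2], c[3]);  // prints: 1024 2048 6144 61440
  return 0;
}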