From 079cba53b7fc41d646b951feee845d59fdb363e4 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Mon, 26 Feb 2024 14:47:05 -0500 Subject: [PATCH 01/96] actual add kernel --- csrc/ops.h | 9 + csrc/quantization/aqlm/aqlm_cuda_kernel.cu | 213 ++++++++++++++++++ .../layers/quantization/aqlm.py | 211 +++++++++++++++++ 3 files changed, 433 insertions(+) create mode 100644 csrc/quantization/aqlm/aqlm_cuda_kernel.cu create mode 100644 vllm/model_executor/layers/quantization/aqlm.py diff --git a/csrc/ops.h b/csrc/ops.h index dbdd2c2c57945..ebd7b7a03a352 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -100,6 +100,15 @@ torch::Tensor gptq_gemm( torch::Tensor b_g_idx, bool use_exllama); +torch::Tensor aqlm_gemm( + torch::Tensor a, + torch::Tensor b_q_weight, + torch::Tensor b_gptq_qzeros, + torch::Tensor b_gptq_scales, + torch::Tensor b_g_idx, + bool use_exllama); + + void gptq_shuffle( torch::Tensor q_weight, torch::Tensor q_perm); diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu new file mode 100644 index 0000000000000..0f97e93d678e6 --- /dev/null +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu @@ -0,0 +1,213 @@ +#include +#include +#include +#include + +#include + +__global__ void Code1x16MatVec( + const int4* __restrict__ A, + const int4* __restrict__ B, + int4* __restrict__ C, + const int4* __restrict__ codebook, + int prob_m, + int prob_k +) { + int a_gl_stride = prob_k / 8 / 8; + int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; + int b_gl_rd = 0; + int c_gl_wr = a_gl_rd; + a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; + int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; + + __shared__ int4 sh_b[32 * 9]; + float res = 0; + + int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32); + while (iters--) { + // We pad shared memory to avoid bank conflicts during reads + __syncthreads(); + for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { + if (b_gl_rd + i < prob_k / 8) + sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; + } + __syncthreads(); + b_gl_rd += 32 * 8; + + int b_sh_rd = 9 * (threadIdx.x % 32); + if (pred && a_gl_rd < a_gl_end) { + const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); + #pragma unroll + for (int i = 0; i < 8; i++) { + uint32_t dec[4]; + // We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't + // actually help us; this brings > 2x speedup. 
+ asm volatile ( + "ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) + : "l"((void*) &codebook[enc[i]]) + ); + half2* a = reinterpret_cast(&dec); + half2* b = reinterpret_cast(&sh_b[b_sh_rd]); + half2 res2 = {}; + #pragma unroll + for (int j = 0; j < 4; j++) + res2 = __hfma2(a[j], b[j], res2); + res += __half2float(res2.x) + __half2float(res2.y); + b_sh_rd++; + } + a_gl_rd += 32; + } + } + + if (pred) { + #pragma unroll + for (int i = 16; i > 0; i /= 2) + res += __shfl_down_sync(0xffffffff, res, i); + if (threadIdx.x % 32 == 0) + reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); + } +} + +__global__ void Code2x8MatVec( + const int4* __restrict__ A, + const int4* __restrict__ B, + int4* __restrict__ C, + const int4* __restrict__ codebook, + int prob_m, + int prob_k +) { + int a_gl_stride = prob_k / 8 / 8; + int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; + int b_gl_rd = 0; + int c_gl_wr = a_gl_rd; + a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; + int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; + int lane = threadIdx.x % 8; + + extern __shared__ int4 sh[]; + int4* sh_b = sh; + int4* sh_code = sh_b + 32 * 9; + int4* sh_code0 = sh_code; + int4* sh_code1 = sh_code + 256 * 8; + + for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { + int4 dec = codebook[i]; + #pragma unroll + for (int j = 0; j < 8; j++) + sh_code[8 * i + (j + lane) % 8] = dec; + } + __syncthreads(); + + float res = 0; + + int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32); + while (iters--) { + // We pad shared memory to avoid bank conflicts during reads + __syncthreads(); + for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { + if (b_gl_rd + i < prob_k / 8) + sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; + } + __syncthreads(); + b_gl_rd += 32 * 8; + + int b_sh_rd = 9 * (threadIdx.x % 32); + if (pred && a_gl_rd < a_gl_end) { + const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); + #pragma unroll + for (int i = 0; i < 8; i++) { + half2* a0 = reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); + half2* a1 = reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); + half2* b = reinterpret_cast(&sh_b[b_sh_rd]); + half2 res2 = {}; + #pragma unroll + for (int j = 0; j < 4; j++) + res2 = __hfma2(__hadd2(a0[j], a1[j]), b[j], res2); + res += __half2float(res2.x) + __half2float(res2.y); + b_sh_rd++; + } + a_gl_rd += 32; + } + } + + if (pred) { + #pragma unroll + for (int i = 16; i > 0; i /= 2) + res += __shfl_down_sync(0xffffffff, res, i); + if (threadIdx.x % 32 == 0) + reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); + } +} + +inline int ceildiv(int a, int b) { + return (a + b - 1) / b; +} + +const int THREAD_M = 16; + +void code1x16_matvec_cuda( + const void* __restrict__ A, + const void* __restrict__ B, + void* __restrict__ C, + const void* __restrict__ codebook, + int prob_m, + int prob_k +) { + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); + int waves = 0; + int thread_m; + do { + waves++; + thread_m = ceildiv(prob_m, waves * sms); + } while (thread_m > THREAD_M); + + int blocks = ceildiv(prob_m, thread_m); + int threads = 32 * thread_m; + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + Code1x16MatVec<<>>( + (const int4*) A, + (const int4*) B, + (int4*) C, + (const int4*) codebook, + prob_m, + prob_k + ); +} + +void code2x8_matvec_cuda( + const void* __restrict__ A, + const void* __restrict__ B, + void* __restrict__ C, + const void* 
__restrict__ codebook, + int prob_m, + int prob_k +) { + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); + int waves = 0; + int thread_m; + do { + waves++; + thread_m = ceildiv(prob_m, waves * sms); + } while (thread_m > THREAD_M); + + int blocks = ceildiv(prob_m, thread_m); + int threads = 32 * thread_m; + int shared = 16 * (2 * 256 * 8 + 32 * 9); + cudaFuncSetAttribute( + Code2x8MatVec, cudaFuncAttributeMaxDynamicSharedMemorySize, shared + ); + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + Code2x8MatVec<<>>( + (const int4*) A, + (const int4*) B, + (int4*) C, + (const int4*) codebook, + prob_m, + prob_k + ); +} diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py new file mode 100644 index 0000000000000..7218760fbe55d --- /dev/null +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -0,0 +1,211 @@ +import enum +from enum import Enum +from typing import Any, Dict, List, Optional + +import torch +from torch.nn.parameter import Parameter + +from vllm._C import ops +from vllm.model_executor.layers.linear import (LinearMethodBase, + set_weight_attrs) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + + +class GPTQConfig(QuantizationConfig): + """Config class for GPTQ. + + Reference: https://arxiv.org/abs/2210.17323 + """ + + def __init__( + self, + weight_bits: int, + group_size: int, + desc_act: bool, + ) -> None: + self.weight_bits = weight_bits + self.group_size = group_size + self.desc_act = desc_act + self.pack_factor = 32 // self.weight_bits + # exllama kernel v1 only supports 4 bit + if self.weight_bits != 4: + raise ValueError( + "Currently, only 4-bit weight quantization is supported for " + f"GPTQ, but got {self.weight_bits} bits.") + + def __repr__(self) -> str: + return (f"GPTQConfig(weight_bits={self.weight_bits}, " + f"group_size={self.group_size}, " + f"desc_act={self.desc_act})") + + @classmethod + def get_name(cls) -> str: + return "gptq" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.half] + + @classmethod + # Need to figure it out + def get_min_capability(cls) -> int: + return 60 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "GPTQConfig": + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"]) + desc_act = cls.get_from_keys(config, ["desc_act"]) + return cls(weight_bits, group_size, desc_act) + + def get_linear_method(self) -> "GPTQLinearMethod": + return GPTQLinearMethod(self) + + def get_scaled_act_names(self) -> List[str]: + return [] + + +class ExllamaState(Enum): + + UNUSED = enum.auto() + UNINITIALIZED = enum.auto() + READY = enum.auto() + + +class GPTQLinearMethod(LinearMethodBase): + """Linear method for GPTQ. + + Args: + quant_config: The GPTQ quantization config. + """ + + def __init__(self, quant_config: GPTQConfig): + self.quant_config = quant_config + + def create_weights( + self, + input_size_per_partition: int, + output_size_per_partition: int, + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: + del output_size # Unused. + if input_size_per_partition % self.quant_config.group_size != 0: + raise ValueError( + "The input size is not aligned with the quantized " + "weight shape. 
This can be caused by too large " + "tensor parallel size.") + if output_size_per_partition % self.quant_config.pack_factor != 0: + raise ValueError( + "The output size is not aligned with the quantized " + "weight shape. This can be caused by too large " + "tensor parallel size.") + + if self.quant_config.group_size != -1: + group_size = self.quant_config.group_size + else: + group_size = input_size + exllama_state = ExllamaState.UNINITIALIZED + scale_and_zero_size = input_size // group_size + scale_and_zero_input_dim = None + if input_size != input_size_per_partition and self.quant_config.group_size != -1: + # For act-order models, we cannot use Exllama for row parallel layer + if self.quant_config.desc_act: + exllama_state = ExllamaState.UNUSED + else: + # we need to partition qzeros and scales for exllama kernel + scale_and_zero_size = input_size_per_partition // group_size + scale_and_zero_input_dim = 0 + + qweight = Parameter( + torch.empty( + input_size_per_partition // self.quant_config.pack_factor, + output_size_per_partition, + dtype=torch.int32, + ), + requires_grad=False, + ) + set_weight_attrs( + qweight, { + "input_dim": 0, + "output_dim": 1, + "packed_dim": 0, + "pack_factor": self.quant_config.pack_factor, + }) + g_idx = Parameter( + torch.tensor( + [ + i // self.quant_config.group_size + for i in range(input_size_per_partition) + ], + dtype=torch.int32, + ), + requires_grad=False, + ) + # Ignore warning from fused linear layers such as QKVParallelLinear. + set_weight_attrs(g_idx, {"input_dim": 0, "ignore_warning": True}) + qzeros = Parameter( + torch.empty( + scale_and_zero_size, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + requires_grad=False, + ) + set_weight_attrs( + qzeros, { + "input_dim": scale_and_zero_input_dim, + "output_dim": 1, + "packed_dim": 1, + "pack_factor": self.quant_config.pack_factor, + }) + scales = Parameter( + torch.empty( + scale_and_zero_size, + output_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + set_weight_attrs(scales, { + "input_dim": scale_and_zero_input_dim, + "output_dim": 1, + }) + return { + "qweight": qweight, + "g_idx": g_idx, + "qzeros": qzeros, + "scales": scales, + "exllama_state": exllama_state, + } + + def apply_weights(self, + weights: Dict[str, Any], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + qweight = weights["qweight"] + out_shape = x.shape[:-1] + (qweight.shape[-1], ) + reshaped_x = x.reshape(-1, x.shape[-1]) + # exllama needs to shuffle the weight after the weight is loaded + # here we do the shuffle on first forward pass + if weights["exllama_state"] == ExllamaState.UNINITIALIZED: + if self.quant_config.desc_act: + weights["g_idx"] = torch.argsort(weights["g_idx"]).to( + torch.int) + else: + weights["g_idx"] = torch.empty((1, 1), device="meta") + weights["exllama_state"] = ExllamaState.READY + ops.gptq_shuffle(weights["qweight"], weights["g_idx"]) + output = ops.gptq_gemm(reshaped_x, weights["qweight"], + weights["qzeros"], weights["scales"], + weights["g_idx"], + weights["exllama_state"] == ExllamaState.READY) + if bias is not None: + output = output + bias + return output.reshape(out_shape) From 23c3f7727be8f73c6cb42e6b168942e6d4118bd5 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Mon, 26 Feb 2024 16:20:06 -0500 Subject: [PATCH 02/96] getting serious --- csrc/ops.h | 17 +- vllm/config.py | 181 +++++++++++------- .../layers/quantization/__init__.py | 2 + .../layers/quantization/aqlm.py | 179 +++++++++-------- 
vllm/model_executor/weight_utils.py | 67 +++---- 5 files changed, 252 insertions(+), 194 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index ebd7b7a03a352..351c4cade7a09 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -69,6 +69,14 @@ void gelu_fast( torch::Tensor& out, torch::Tensor& input); +torch::Tensor aqlm_gemm( + torch::Tensor a, + torch::Tensor b_q_weight, + torch::Tensor b_gptq_qzeros, + torch::Tensor b_gptq_scales, + torch::Tensor b_g_idx, + bool use_exllama); + #ifndef USE_ROCM torch::Tensor awq_gemm( torch::Tensor _in_feats, @@ -100,15 +108,6 @@ torch::Tensor gptq_gemm( torch::Tensor b_g_idx, bool use_exllama); -torch::Tensor aqlm_gemm( - torch::Tensor a, - torch::Tensor b_q_weight, - torch::Tensor b_gptq_qzeros, - torch::Tensor b_gptq_scales, - torch::Tensor b_g_idx, - bool use_exllama); - - void gptq_shuffle( torch::Tensor q_weight, torch::Tensor q_perm); diff --git a/vllm/config.py b/vllm/config.py index bd0dc89b585f7..f2452baf8796c 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,4 +1,4 @@ -from typing import Optional, Union, ClassVar +from typing import Any, Optional, Union, ClassVar from dataclasses import dataclass import os from packaging.version import Version @@ -45,7 +45,7 @@ class ModelConfig: a tag name, or a commit id. If unspecified, will use the default version. code_revision: The specific revision to use for the model code on - Hugging Face Hub. It can be a branch name, a tag name, or a + Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. tokenizer_revision: The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use @@ -98,52 +98,55 @@ def __init__( # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C + if not os.path.exists(model): - model_path = snapshot_download(model_id=model, - cache_dir=download_dir, - revision=revision) + model_path = snapshot_download( + model_id=model, cache_dir=download_dir, revision=revision + ) else: model_path = model self.model = model_path self.download_dir = model_path self.tokenizer = model_path - self.hf_config = get_config(self.model, trust_remote_code, revision, - code_revision) + self.hf_config = get_config( + self.model, trust_remote_code, revision, code_revision + ) self.dtype = _get_and_verify_dtype(self.hf_config, dtype) - self.max_model_len = _get_and_verify_max_len(self.hf_config, - max_model_len) + self.max_model_len = _get_and_verify_max_len(self.hf_config, max_model_len) self._verify_load_format() self._verify_tokenizer_mode() - self._verify_quantization() + self.hf_quant_config = self._get_and_verify_quantization() self._verify_cuda_graph() def _verify_load_format(self) -> None: load_format = self.load_format.lower() - supported_load_format = [ - "auto", "pt", "safetensors", "npcache", "dummy" - ] + supported_load_format = ["auto", "pt", "safetensors", "npcache", "dummy"] rocm_not_supported_load_format = [] if load_format not in supported_load_format: raise ValueError( f"Unknown load format: {self.load_format}. Must be one of " - "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'.") + "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'." 
+ ) if is_hip() and load_format in rocm_not_supported_load_format: rocm_supported_load_format = [ - f for f in supported_load_format + f + for f in supported_load_format if (f not in rocm_not_supported_load_format) ] raise ValueError( - f"load format \'{load_format}\' is not supported in ROCm. " + f"load format '{load_format}' is not supported in ROCm. " f"Supported load format are " - f"{rocm_supported_load_format}") + f"{rocm_supported_load_format}" + ) # TODO: Remove this check once HF updates the pt weights of Mixtral. architectures = getattr(self.hf_config, "architectures", []) if "MixtralForCausalLM" in architectures and load_format == "pt": raise ValueError( "Currently, the 'pt' format is not supported for Mixtral. " - "Please use the 'safetensors' format instead. ") + "Please use the 'safetensors' format instead. " + ) self.load_format = load_format def _verify_tokenizer_mode(self) -> None: @@ -151,47 +154,63 @@ def _verify_tokenizer_mode(self) -> None: if tokenizer_mode not in ["auto", "slow"]: raise ValueError( f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be " - "either 'auto' or 'slow'.") + "either 'auto' or 'slow'." + ) self.tokenizer_mode = tokenizer_mode - def _verify_quantization(self) -> None: - supported_quantization = ["awq", "gptq", "squeezellm"] + def _get_and_verify_quantization(self) -> Any | None: + supported_quantization = ["aqlm", "awq", "gptq", "squeezellm"] rocm_not_supported_quantization = ["awq"] if self.quantization is not None: self.quantization = self.quantization.lower() # Parse quantization method from the HF model config, if available. + hf_quant_method = None hf_quant_config = getattr(self.hf_config, "quantization_config", None) if hf_quant_config is not None: hf_quant_method = str(hf_quant_config["quant_method"]).lower() - if self.quantization is None: - self.quantization = hf_quant_method - elif self.quantization != hf_quant_method: - raise ValueError( - "Quantization method specified in the model config " - f"({hf_quant_method}) does not match the quantization " - f"method specified in the `quantization` argument " - f"({self.quantization}).") + else: + # HF models such as https://huggingface.co/BlackSamorez/Llama-2-70b-AQLM-4Bit-2x16-hf/blob/main/config.json + # only have an aqlm block, no quantization_config block. + hf_quant_config = getattr(self.hf_config, "aqlm", None) + if hf_quant_config is not None: + hf_quant_method = "aqlm" + + if hf_quant_method is not None and self.quantization is None: + self.quantization = hf_quant_method + elif self.quantization != hf_quant_method: + raise ValueError( + "Quantization method specified in the model config " + f"({hf_quant_method}) does not match the quantization " + f"method specified in the `quantization` argument " + f"({self.quantization})." + ) if self.quantization is not None: if self.quantization not in supported_quantization: raise ValueError( f"Unknown quantization method: {self.quantization}. Must " - f"be one of {supported_quantization}.") - if is_hip( - ) and self.quantization in rocm_not_supported_quantization: + f"be one of {supported_quantization}." + ) + if is_hip() and self.quantization in rocm_not_supported_quantization: raise ValueError( f"{self.quantization} quantization is currently not supported " - f"in ROCm.") - logger.warning(f"{self.quantization} quantization is not fully " - "optimized yet. The speed can be slower than " - "non-quantized models.") + f"in ROCm." + ) + logger.warning( + f"{self.quantization} quantization is not fully " + "optimized yet. 
The speed can be slower than " + "non-quantized models." + ) + + return hf_quant_config def _verify_cuda_graph(self) -> None: if self.max_context_len_to_capture is None: self.max_context_len_to_capture = self.max_model_len - self.max_context_len_to_capture = min(self.max_context_len_to_capture, - self.max_model_len) + self.max_context_len_to_capture = min( + self.max_context_len_to_capture, self.max_model_len + ) def verify_with_parallel_config( self, @@ -203,7 +222,8 @@ def verify_with_parallel_config( raise ValueError( f"Total number of attention heads ({total_num_attention_heads})" " must be divisible by tensor parallel size " - f"({tensor_parallel_size}).") + f"({tensor_parallel_size})." + ) total_num_hidden_layers = self.hf_config.num_hidden_layers pipeline_parallel_size = parallel_config.pipeline_parallel_size @@ -211,7 +231,8 @@ def verify_with_parallel_config( raise ValueError( f"Total number of hidden layers ({total_num_hidden_layers}) " "must be divisible by pipeline parallel size " - f"({pipeline_parallel_size}).") + f"({pipeline_parallel_size})." + ) def get_sliding_window(self) -> Optional[int]: return getattr(self.hf_config, "sliding_window", None) @@ -237,9 +258,11 @@ def get_total_num_kv_heads(self) -> int: falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] new_decoder_arch_falcon = ( self.hf_config.model_type in falcon_model_types - and getattr(self.hf_config, "new_decoder_architecture", False)) - if not new_decoder_arch_falcon and getattr(self.hf_config, - "multi_query", False): + and getattr(self.hf_config, "new_decoder_architecture", False) + ) + if not new_decoder_arch_falcon and getattr( + self.hf_config, "multi_query", False + ): # Multi-query attention, only one KV head. # Currently, tensor parallelism is not supported in this case. return 1 @@ -269,8 +292,7 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: # the tensor parallel size. We will replicate the KV heads in the # case where the number of KV heads is smaller than the tensor # parallel size so each GPU has at least one KV head. - return max(1, - total_num_kv_heads // parallel_config.tensor_parallel_size) + return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size) def get_num_layers(self, parallel_config: "ParallelConfig") -> int: total_num_hidden_layers = self.hf_config.num_hidden_layers @@ -312,7 +334,8 @@ def _verify_args(self) -> None: if self.gpu_memory_utilization > 1.0: raise ValueError( "GPU memory utilization must be less than 1.0. Got " - f"{self.gpu_memory_utilization}.") + f"{self.gpu_memory_utilization}." + ) def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": @@ -326,13 +349,15 @@ def _verify_cache_dtype(self) -> None: device_name = torch.cuda.get_device_name() if "AMD" in device_name: raise NotImplementedError( - "FP8_E5M2 KV Cache on AMD GPU has not been supported yet.") + "FP8_E5M2 KV Cache on AMD GPU has not been supported yet." + ) logger.info( "Using fp8_e5m2 data type to store kv cache. It reduces " "the GPU memory footprint and boosts the performance. " "But it may cause slight accuracy drop. " "Currently we only support fp8 without scaling factors and " - "make e5m2 as a default format.") + "make e5m2 as a default format." 
+ ) else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") @@ -346,9 +371,11 @@ def verify_with_parallel_config( num_gpus_per_node = parallel_config.tensor_parallel_size cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node - msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of " - f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is " - "allocated for the swap space.") + msg = ( + f"{cpu_memory_usage / _GB:.2f} GiB out of " + f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is " + "allocated for the swap space." + ) if cpu_memory_usage > 0.7 * total_cpu_memory: raise ValueError("Too large swap space. " + msg) elif cpu_memory_usage > 0.4 * total_cpu_memory: @@ -392,19 +419,20 @@ def __init__( def _verify_args(self) -> None: if self.pipeline_parallel_size > 1: - raise NotImplementedError( - "Pipeline parallelism is not supported yet.") + raise NotImplementedError("Pipeline parallelism is not supported yet.") if not self.disable_custom_all_reduce and self.world_size > 1: if is_hip(): self.disable_custom_all_reduce = True logger.info( "Disabled the custom all-reduce kernel because it is not " - "supported on AMD GPUs.") + "supported on AMD GPUs." + ) elif self.pipeline_parallel_size > 1: self.disable_custom_all_reduce = True logger.info( "Disabled the custom all-reduce kernel because it is not " - "supported with pipeline parallelism.") + "supported with pipeline parallelism." + ) # FIXME(woosuk): Fix the stability issues and re-enable the custom # all-reduce kernel. @@ -413,7 +441,8 @@ def _verify_args(self) -> None: logger.info( "Custom all-reduce kernels are temporarily disabled due to " "stability issues. We will re-enable them once the issues are " - "resolved.") + "resolved." + ) class SchedulerConfig: @@ -455,16 +484,17 @@ def _verify_args(self) -> None: "This effectively limits the maximum sequence length to " "max_num_batched_tokens and makes vLLM reject longer " "sequences. Please increase max_num_batched_tokens or " - "decrease max_model_len.") + "decrease max_model_len." + ) if self.max_num_batched_tokens < self.max_num_seqs: raise ValueError( f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " "be greater than or equal to max_num_seqs " - f"({self.max_num_seqs}).") + f"({self.max_num_seqs})." + ) class DeviceConfig: - def __init__(self, device: str = "cuda") -> None: self.device = torch.device(device) @@ -486,11 +516,13 @@ def __post_init__(self): if self.max_lora_rank not in possible_max_ranks: raise ValueError( f"max_lora_rank ({self.max_lora_rank}) must be one of " - f"{possible_max_ranks}.") + f"{possible_max_ranks}." + ) if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size: raise ValueError( f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) " - f"must be one of {possible_lora_extra_vocab_size}.") + f"must be one of {possible_lora_extra_vocab_size}." 
+ ) if self.max_loras < 1: raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.") if self.max_cpu_loras is None: @@ -498,7 +530,8 @@ def __post_init__(self): elif self.max_cpu_loras < self.max_loras: raise ValueError( f"max_cpu_loras ({self.max_cpu_loras}) must be >= " - f"max_loras ({self.max_loras})") + f"max_loras ({self.max_loras})" + ) def verify_with_model_config(self, model_config: ModelConfig): if self.lora_dtype in (None, "auto"): @@ -506,15 +539,15 @@ def verify_with_model_config(self, model_config: ModelConfig): elif isinstance(self.lora_dtype, str): self.lora_dtype = getattr(torch, self.lora_dtype) if model_config.quantization is not None: - raise ValueError( - "LoRA is not supported with quantized models yet.") + raise ValueError("LoRA is not supported with quantized models yet.") def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): if scheduler_config.max_num_batched_tokens > 65528: raise ValueError( "Due to limitations of the custom LoRA CUDA kernel, " "max_num_batched_tokens must be <= 65528 when " - "LoRA is enabled.") + "LoRA is enabled." + ) _STR_DTYPE_TO_TORCH_DTYPE = { @@ -558,11 +591,14 @@ def _get_and_verify_dtype( if is_hip() and torch_dtype == torch.float32: rocm_supported_dtypes = [ - k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() + k + for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() if (k not in _ROCM_NOT_SUPPORTED_DTYPE) ] - raise ValueError(f"dtype \'{dtype}\' is not supported in ROCm. " - f"Supported dtypes are {rocm_supported_dtypes}") + raise ValueError( + f"dtype '{dtype}' is not supported in ROCm. " + f"Supported dtypes are {rocm_supported_dtypes}" + ) # Verify the dtype. if torch_dtype != config_dtype: @@ -613,7 +649,8 @@ def _get_and_verify_max_len( "The model's config.json does not contain any of the following " "keys to determine the original maximum length of the model: " f"{possible_keys}. Assuming the model's maximum length is " - f"{default_max_len}.") + f"{default_max_len}." + ) derived_max_model_len = default_max_len rope_scaling = getattr(hf_config, "rope_scaling", None) @@ -621,8 +658,7 @@ def _get_and_verify_max_len( assert "factor" in rope_scaling scaling_factor = rope_scaling["factor"] if rope_scaling["type"] == "yarn": - derived_max_model_len = rope_scaling[ - "original_max_position_embeddings"] + derived_max_model_len = rope_scaling["original_max_position_embeddings"] derived_max_model_len *= scaling_factor if max_model_len is None: @@ -633,5 +669,6 @@ def _get_and_verify_max_len( f"the derived max_model_len ({max_len_key}={derived_max_model_len}" " in model's config.json). This may lead to incorrect model " "outputs or CUDA errors. Make sure the value is correct and " - "within the model context size.") + "within the model context size." 
+ ) return int(max_model_len) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index b3449eaff0e35..98d9351785a36 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -1,11 +1,13 @@ from typing import Type from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.aqlm import AQLMConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig _QUANTIZATION_CONFIG_REGISTRY = { + "aqlm": AQLMConfig, "awq": AWQConfig, "gptq": GPTQConfig, "squeezellm": SqueezeLLMConfig, diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 7218760fbe55d..2a0b0c794c43c 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -6,42 +6,46 @@ from torch.nn.parameter import Parameter from vllm._C import ops -from vllm.model_executor.layers.linear import (LinearMethodBase, - set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) +from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig -class GPTQConfig(QuantizationConfig): - """Config class for GPTQ. +class AQLMConfig(QuantizationConfig): + """Config class for AQLM. - Reference: https://arxiv.org/abs/2210.17323 + Reference: https://github.com/Vahe1994/AQLM """ def __init__( self, - weight_bits: int, - group_size: int, - desc_act: bool, + in_group_size: int, + nbits_per_codebook: int, + num_codebooks: int, + out_group_size: int, ) -> None: - self.weight_bits = weight_bits - self.group_size = group_size - self.desc_act = desc_act - self.pack_factor = 32 // self.weight_bits + self.in_group_size = in_group_size + self.nbits_per_codebook = nbits_per_codebook + self.num_codebooks = num_codebooks + self.out_group_size = out_group_size + # self.pack_factor = 32 // self.weight_bits # exllama kernel v1 only supports 4 bit - if self.weight_bits != 4: - raise ValueError( - "Currently, only 4-bit weight quantization is supported for " - f"GPTQ, but got {self.weight_bits} bits.") + # if self.weight_bits != 4: + # raise ValueError( + # "Currently, only 4-bit weight quantization is supported for " + # f"GPTQ, but got {self.weight_bits} bits." + # ) def __repr__(self) -> str: - return (f"GPTQConfig(weight_bits={self.weight_bits}, " - f"group_size={self.group_size}, " - f"desc_act={self.desc_act})") + return ( + f"AQLMConfig(in_group_size={self.in_group_size}, " + f"nbits_per_codebook={self.nbits_per_codebook}, " + f"num_codebooks={self.num_codebooks}, " + f"out_group_size={self.out_group_size})" + ) @classmethod def get_name(cls) -> str: - return "gptq" + return "aqlm" @classmethod def get_supported_act_dtypes(cls) -> List[torch.dtype]: @@ -52,39 +56,58 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]: def get_min_capability(cls) -> int: return 60 + # such as. 
(This one looks correct) + # https://huggingface.co/BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf/blob/main/config.json + # + # "quantization_config": { + # "in_group_size": 8, + # "nbits_per_codebook": 16, + # "num_codebooks": 1, + # "out_group_size": 1, + + # "linear_weights_not_to_quantize": [ <--- hmmm ???? + # "model.embed_tokens.weight", + # "lm_head.weight" + + # "quant_method": "aqlm" duh <- shows it's aqlm. Do we auto-detect? How? + # }, + + # this one looks non-standard, has no quantization_config, just an AQLM block. + # https://huggingface.co/BlackSamorez/Llama-2-70b-AQLM-4Bit-2x16-hf/blob/main/config.json + # "aqlm": { + # "in_group_size": 8, + # "nbits_per_codebook": 16, + # "num_codebooks": 2, + # " "out_group_size": 1 + @classmethod def get_config_filenames(cls) -> List[str]: - return ["quantize_config.json"] + return [] # no extra configs. @classmethod - def from_config(cls, config: Dict[str, Any]) -> "GPTQConfig": - weight_bits = cls.get_from_keys(config, ["bits"]) - group_size = cls.get_from_keys(config, ["group_size"]) - desc_act = cls.get_from_keys(config, ["desc_act"]) - return cls(weight_bits, group_size, desc_act) + def from_config(cls, config: Dict[str, Any]) -> "AQLMConfig": + in_group_size = cls.get_from_keys(config, ["in_group_size"]) + nbits_per_codebook = cls.get_from_keys(config, ["nbits_per_codebook"]) + num_code_books = cls.get_from_keys(config, ["num_codebooks"]) + out_group_size = cls.get_from_keys(config, ["out_group_size"]) + # TODO linear_weights_not_to_quantize ? + return cls(in_group_size, nbits_per_codebook, num_code_books, out_group_size) - def get_linear_method(self) -> "GPTQLinearMethod": - return GPTQLinearMethod(self) + def get_linear_method(self) -> "AQLMLinearMethod": + return AQLMLinearMethod(self) def get_scaled_act_names(self) -> List[str]: return [] -class ExllamaState(Enum): - - UNUSED = enum.auto() - UNINITIALIZED = enum.auto() - READY = enum.auto() - - -class GPTQLinearMethod(LinearMethodBase): - """Linear method for GPTQ. +class AQLMLinearMethod(LinearMethodBase): + """Linear method for AQLM. Args: - quant_config: The GPTQ quantization config. + quant_config: The AQLM quantization config. """ - def __init__(self, quant_config: GPTQConfig): + def __init__(self, quant_config: AQLMConfig): self.quant_config = quant_config def create_weights( @@ -100,28 +123,21 @@ def create_weights( raise ValueError( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " - "tensor parallel size.") + "tensor parallel size." + ) if output_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( "The output size is not aligned with the quantized " "weight shape. This can be caused by too large " - "tensor parallel size.") + "tensor parallel size." 
+ ) if self.quant_config.group_size != -1: group_size = self.quant_config.group_size else: group_size = input_size - exllama_state = ExllamaState.UNINITIALIZED scale_and_zero_size = input_size // group_size scale_and_zero_input_dim = None - if input_size != input_size_per_partition and self.quant_config.group_size != -1: - # For act-order models, we cannot use Exllama for row parallel layer - if self.quant_config.desc_act: - exllama_state = ExllamaState.UNUSED - else: - # we need to partition qzeros and scales for exllama kernel - scale_and_zero_size = input_size_per_partition // group_size - scale_and_zero_input_dim = 0 qweight = Parameter( torch.empty( @@ -132,12 +148,14 @@ def create_weights( requires_grad=False, ) set_weight_attrs( - qweight, { + qweight, + { "input_dim": 0, "output_dim": 1, "packed_dim": 0, "pack_factor": self.quant_config.pack_factor, - }) + }, + ) g_idx = Parameter( torch.tensor( [ @@ -159,12 +177,14 @@ def create_weights( requires_grad=False, ) set_weight_attrs( - qzeros, { + qzeros, + { "input_dim": scale_and_zero_input_dim, "output_dim": 1, "packed_dim": 1, "pack_factor": self.quant_config.pack_factor, - }) + }, + ) scales = Parameter( torch.empty( scale_and_zero_size, @@ -173,39 +193,36 @@ def create_weights( ), requires_grad=False, ) - set_weight_attrs(scales, { - "input_dim": scale_and_zero_input_dim, - "output_dim": 1, - }) + set_weight_attrs( + scales, + { + "input_dim": scale_and_zero_input_dim, + "output_dim": 1, + }, + ) return { "qweight": qweight, "g_idx": g_idx, "qzeros": qzeros, - "scales": scales, - "exllama_state": exllama_state, + "scales": scales } - def apply_weights(self, - weights: Dict[str, Any], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + def apply_weights( + self, + weights: Dict[str, Any], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: qweight = weights["qweight"] - out_shape = x.shape[:-1] + (qweight.shape[-1], ) + out_shape = x.shape[:-1] + (qweight.shape[-1],) reshaped_x = x.reshape(-1, x.shape[-1]) - # exllama needs to shuffle the weight after the weight is loaded - # here we do the shuffle on first forward pass - if weights["exllama_state"] == ExllamaState.UNINITIALIZED: - if self.quant_config.desc_act: - weights["g_idx"] = torch.argsort(weights["g_idx"]).to( - torch.int) - else: - weights["g_idx"] = torch.empty((1, 1), device="meta") - weights["exllama_state"] = ExllamaState.READY - ops.gptq_shuffle(weights["qweight"], weights["g_idx"]) - output = ops.gptq_gemm(reshaped_x, weights["qweight"], - weights["qzeros"], weights["scales"], - weights["g_idx"], - weights["exllama_state"] == ExllamaState.READY) + output = ops.aqlm_gemm( + reshaped_x, + weights["qweight"], + weights["qzeros"], + weights["scales"], + weights["g_idx"], + ) if bias is not None: output = output + bias return output.reshape(out_shape) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 3570366887e78..37c9725033d49 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -15,14 +15,15 @@ from vllm.config import ModelConfig from vllm.logger import init_logger -from vllm.model_executor.layers.quantization import (get_quantization_config, - QuantizationConfig) +from vllm.model_executor.layers.quantization import ( + get_quantization_config, + QuantizationConfig, +) logger = init_logger(__name__) class Disabledtqdm(tqdm): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs, disable=True) @@ -68,10 +69,12 @@ def 
convert_bin_to_safetensor_file( sf_size = os.stat(sf_filename).st_size pt_size = os.stat(pt_filename).st_size if (sf_size - pt_size) / pt_size > 0.01: - raise RuntimeError(f"""The file size different is more than 1%: + raise RuntimeError( + f"""The file size different is more than 1%: - {sf_filename}: {sf_size} - {pt_filename}: {pt_size} - """) + """ + ) # check if the tensors are the same reloaded = load_file(sf_filename) @@ -85,36 +88,36 @@ def convert_bin_to_safetensor_file( # TODO(woosuk): Move this to other place. def get_quant_config(model_config: ModelConfig) -> QuantizationConfig: quant_cls = get_quantization_config(model_config.quantization) - # Read the quantization config from the HF model config, if available. - hf_quant_config = getattr(model_config.hf_config, "quantization_config", - None) - if hf_quant_config is not None: - return quant_cls.from_config(hf_quant_config) + if model_config.hf_quant_config is not None: + return quant_cls.from_config(model_config.hf_quant_config) model_name_or_path = model_config.model is_local = os.path.isdir(model_name_or_path) if not is_local: # Download the config files. with get_lock(model_name_or_path, model_config.download_dir): - hf_folder = snapshot_download(model_name_or_path, - revision=model_config.revision, - allow_patterns="*.json", - cache_dir=model_config.download_dir, - tqdm_class=Disabledtqdm) + hf_folder = snapshot_download( + model_name_or_path, + revision=model_config.revision, + allow_patterns="*.json", + cache_dir=model_config.download_dir, + tqdm_class=Disabledtqdm, + ) else: hf_folder = model_name_or_path config_files = glob.glob(os.path.join(hf_folder, "*.json")) quant_config_files = [ - f for f in config_files if any( - f.endswith(x) for x in quant_cls.get_config_filenames()) + f + for f in config_files + if any(f.endswith(x) for x in quant_cls.get_config_filenames()) ] if len(quant_config_files) == 0: - raise ValueError( - f"Cannot find the config file for {model_config.quantization}") + raise ValueError(f"Cannot find the config file for {model_config.quantization}") if len(quant_config_files) > 1: raise ValueError( f"Found multiple config files for {model_config.quantization}: " - f"{quant_config_files}") + f"{quant_config_files}" + ) quant_config_file = quant_config_files[0] with open(quant_config_file, "r") as f: @@ -164,11 +167,13 @@ def prepare_hf_model_weights( # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. 
with get_lock(model_name_or_path, cache_dir): - hf_folder = snapshot_download(model_name_or_path, - allow_patterns=allow_patterns, - cache_dir=cache_dir, - tqdm_class=Disabledtqdm, - revision=revision) + hf_folder = snapshot_download( + model_name_or_path, + allow_patterns=allow_patterns, + cache_dir=cache_dir, + tqdm_class=Disabledtqdm, + revision=revision, + ) else: hf_folder = model_name_or_path hf_weights_files: List[str] = [] @@ -189,13 +194,11 @@ def prepare_hf_model_weights( "scaler.pt", ] hf_weights_files = [ - f for f in hf_weights_files - if not any(f.endswith(x) for x in blacklist) + f for f in hf_weights_files if not any(f.endswith(x) for x in blacklist) ] if len(hf_weights_files) == 0: - raise RuntimeError( - f"Cannot find any model weights with `{model_name_or_path}`") + raise RuntimeError(f"Cannot find any model weights with `{model_name_or_path}`") return hf_folder, hf_weights_files, use_safetensors @@ -212,7 +215,8 @@ def hf_model_weights_iterator( cache_dir=cache_dir, load_format=load_format, fall_back_to_pt=fall_back_to_pt, - revision=revision) + revision=revision, + ) if load_format == "npcache": # Currently np_cache only support *.bin checkpoints @@ -276,8 +280,7 @@ def convert_pyslice_to_tensor(x: Any) -> torch.Tensor: return x -def default_weight_loader(param: torch.Tensor, - loaded_weight: torch.Tensor) -> None: +def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None: """Default weight loader.""" assert param.size() == loaded_weight.size() param.data.copy_(loaded_weight) From 20a71fdb941b641276220be6752246a74a5e8efa Mon Sep 17 00:00:00 2001 From: James Fleming Date: Mon, 26 Feb 2024 17:56:17 -0500 Subject: [PATCH 03/96] adding in mat mat, need to move the pytorch stuff, maybe add some aqlm prefixes. 
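The code1x16_matmat / code2x8_matmat wrappers added below reduce the matrix-matrix product to one CUDA mat-vec launch per row of the flattened input, then apply the per-output-channel scales and the optional bias. In terms of what gets computed, the 1x16 path is equivalent to dequantizing the codebook weights and doing an ordinary matmul; a rough PyTorch sketch follows (the 4-D codes/codebooks/scales layout is assumed from the AQLM reference implementation and is illustrative only, not something this patch defines):

    import torch

    def aqlm_1x16_matmat_reference(x, codes, codebooks, scales, bias=None):
        # Assumed AQLM 1x16 layout (one 2**16-entry codebook, in_group_size = 8,
        # out_group_size = 1); shapes follow the AQLM reference code, not this patch:
        #   codes:     [out_features, in_features // 8, 1]   int16 (bit patterns of uint16 indices)
        #   codebooks: [1, 2**16, 1, 8]                       fp16 codebook entries
        #   scales:    [out_features, 1, 1, 1]                per-output-channel scales
        codebook = codebooks[0, :, 0, :]                  # [65536, 8]
        idx = codes[..., 0].to(torch.int64) & 0xFFFF      # reinterpret int16 codes as unsigned
        weight = codebook[idx].flatten(1)                 # dequantized weight, [out_features, in_features]
        weight = weight * scales.flatten().unsqueeze(-1)  # scale each output row
        out = x @ weight.t()
        return out if bias is None else out + bias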
--- csrc/quantization/aqlm/aqlm_cuda_kernel.cpp | 142 ++++++++++++++++++ .../layers/quantization/aqlm.py | 2 + 2 files changed, 144 insertions(+) create mode 100644 csrc/quantization/aqlm/aqlm_cuda_kernel.cpp diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cpp b/csrc/quantization/aqlm/aqlm_cuda_kernel.cpp new file mode 100644 index 0000000000000..301e8439b24ae --- /dev/null +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cpp @@ -0,0 +1,142 @@ +#include +#include +#include + +void code1x16_matvec_cuda( + const void* A, + const void* B, + void* C, + const void* codebook, + int prob_m, + int prob_k +); + +void code2x8_matvec_cuda( + const void* A, + const void* B, + void* C, + const void* codebook, + int prob_m, + int prob_k +); + +void code1x16_matvec( + const torch::Tensor& A, + const torch::Tensor& B, + torch::Tensor& C, + const torch::Tensor& codebook +) { + const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); + int prob_m = C.size(0); + int prob_k = B.size(0); + code1x16_matvec_cuda( + A.data_ptr(), + B.data_ptr(), + C.data_ptr(), + codebook.data_ptr(), + prob_m, + prob_k + ); +} + +torch::Tensor code1x16_matmat( + const torch::Tensor& input, + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales, + const std::optional& bias +) { + auto input_sizes = input.sizes(); + auto out_features = codes.size(0) * codebooks.size(2); + auto flat_input = input.reshape({-1, input.size(-1)}); + auto flat_output = torch::empty({flat_input.size(0), out_features}, + torch::TensorOptions() + .dtype(input.dtype()) + .device(input.device()) + ); + + for (int i = 0; i < flat_input.size(0); ++i) { + auto input_vec = flat_input.index({i}); + auto output_vec = flat_output.index({i}); + code1x16_matvec( + codes.squeeze(2), + input_vec, + output_vec, + codebooks + ); + } + flat_output *= scales.flatten().unsqueeze(0); + if (bias.has_value()) { + flat_output += bias->unsqueeze(0); + } + + auto output_sizes = input_sizes.vec(); + output_sizes.pop_back(); + output_sizes.push_back(-1); + auto output = flat_output.reshape(output_sizes).clone(); + return output; +} + +void code2x8_matvec( + const torch::Tensor& A, + const torch::Tensor& B, + torch::Tensor& C, + const torch::Tensor& codebook +) { + const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); + int prob_m = C.size(0); + int prob_k = B.size(0); + code2x8_matvec_cuda( + A.data_ptr(), + B.data_ptr(), + C.data_ptr(), + codebook.data_ptr(), + prob_m, + prob_k + ); +} + +torch::Tensor code2x8_matmat( + const torch::Tensor& input, + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales, + const std::optional& bias +) { + auto input_sizes = input.sizes(); + auto out_features = codes.size(0) * codebooks.size(2); + auto flat_input = input.reshape({-1, input.size(-1)}); + auto flat_output = torch::empty({flat_input.size(0), out_features}, + torch::TensorOptions() + .dtype(input.dtype()) + .device(input.device()) + ); + + for (int i = 0; i < flat_input.size(0); ++i) { + auto input_vec = flat_input.index({i}); + auto output_vec = flat_output.index({i}); + code2x8_matvec( + codes.squeeze(2), + input_vec, + output_vec, + codebooks + ); + } + flat_output *= scales.flatten().unsqueeze(0); + if (bias.has_value()) { + flat_output += bias->unsqueeze(0); + } + + auto output_sizes = input_sizes.vec(); + output_sizes.pop_back(); + output_sizes.push_back(-1); + auto output = flat_output.reshape(output_sizes).clone(); + return output; +} + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + 
m.def("code1x16_matmat", &code1x16_matmat, "1x16 (2bit) codebook matrix-matrix product."); + m.def("code2x8_matmat", &code2x8_matmat, "2x8 (2bit) codebook matrix-matrix product."); +} + diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 2a0b0c794c43c..5745487067227 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -72,6 +72,8 @@ def get_min_capability(cls) -> int: # "quant_method": "aqlm" duh <- shows it's aqlm. Do we auto-detect? How? # }, + #https://huggingface.co/meta-llama/Llama-2-7b-hf + # this one looks non-standard, has no quantization_config, just an AQLM block. # https://huggingface.co/BlackSamorez/Llama-2-70b-AQLM-4Bit-2x16-hf/blob/main/config.json # "aqlm": { From d0cf25a850218914e1c3c9abf509eb36c0df1c1e Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 27 Feb 2024 14:49:45 -0500 Subject: [PATCH 04/96] load the codebooks, codes, and scales. --- vllm/model_executor/layers/linear.py | 245 +++++++++++------- .../layers/quantization/aqlm.py | 123 +++++---- vllm/model_executor/models/llama.py | 136 +++++----- 3 files changed, 295 insertions(+), 209 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 55d38b763b2b5..cd9a17b7ef864 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -6,11 +6,14 @@ from torch.nn.parameter import Parameter from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) from vllm.model_executor.parallel_utils.communication_op import ( - tensor_model_parallel_all_reduce, tensor_model_parallel_all_gather) -from vllm.model_executor.parallel_utils.utils import ( - divide, split_tensor_along_last_dim) + tensor_model_parallel_all_reduce, + tensor_model_parallel_all_gather, +) +from vllm.model_executor.parallel_utils.utils import divide, split_tensor_along_last_dim from vllm.model_executor.utils import set_weight_attrs from vllm.logger import init_logger @@ -21,18 +24,24 @@ class LinearMethodBase(ABC): """Base class for different (maybe quantized) linear methods.""" @abstractmethod - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: + def create_weights( + self, + input_size_per_partition: int, + output_size_per_partition: int, + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: """Create weights for a linear layer.""" raise NotImplementedError @abstractmethod - def apply_weights(self, - weights: Dict[str, torch.Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + def apply_weights( + self, + weights: Dict[str, torch.Tensor], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: """Apply the weights to the input tensor.""" raise NotImplementedError @@ -48,21 +57,29 @@ class UnquantizedLinearMethod(LinearMethodBase): def __init__(self, separate_bias_add: bool = False): self.separate_bias_add = separate_bias_add - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: - weight = Parameter(torch.empty(output_size_per_partition, - 
input_size_per_partition, - dtype=params_dtype), - requires_grad=False) + def create_weights( + self, + input_size_per_partition: int, + output_size_per_partition: int, + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: + weight = Parameter( + torch.empty( + output_size_per_partition, input_size_per_partition, dtype=params_dtype + ), + requires_grad=False, + ) set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) return {"weight": weight} - def apply_weights(self, - weights: Dict[str, torch.Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + def apply_weights( + self, + weights: Dict[str, torch.Tensor], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: weight = weights["weight"] if self.separate_bias_add: if bias: @@ -105,14 +122,19 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size, self.output_size, self.input_size, - self.output_size, self.params_dtype) + self.input_size, + self.output_size, + self.input_size, + self.output_size, + self.params_dtype, + ) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) if bias: self.bias = Parameter( - torch.empty(self.output_size, dtype=self.params_dtype)) + torch.empty(self.output_size, dtype=self.params_dtype) + ) set_weight_attrs(self.bias, {"output_dim": 0}) else: self.register_parameter("bias", None) @@ -171,20 +193,27 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size, self.output_size_per_partition, self.input_size, - self.output_size, self.params_dtype) + self.input_size, + self.output_size_per_partition, + self.input_size, + self.output_size, + self.params_dtype, + ) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) set_weight_attrs(weight, {"weight_loader": self.weight_loader}) if bias: self.bias = Parameter( - torch.empty(self.output_size_per_partition, - dtype=params_dtype)) - set_weight_attrs(self.bias, { - "output_dim": 0, - "weight_loader": self.weight_loader, - }) + torch.empty(self.output_size_per_partition, dtype=params_dtype) + ) + set_weight_attrs( + self.bias, + { + "output_dim": 0, + "weight_loader": self.weight_loader, + }, + ) else: self.register_parameter("bias", None) @@ -195,8 +224,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): if output_dim is not None: shard_size = param_data.shape[output_dim] start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -205,7 +233,8 @@ def forward(self, input_): # Matrix multiply. output_parallel = self.linear_method.apply_weights( - self.linear_weights, input_, bias) + self.linear_weights, input_, bias + ) if self.gather_output: # All-gather across the partitions. 
output = tensor_model_parallel_all_gather(output_parallel) @@ -249,13 +278,22 @@ def __init__( self.output_sizes = output_sizes tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) - super().__init__(input_size, sum(output_sizes), bias, gather_output, - skip_bias_add, params_dtype, linear_method) - - def weight_loader(self, - param: Parameter, - loaded_weight: torch.Tensor, - loaded_shard_id: Optional[int] = None): + super().__init__( + input_size, + sum(output_sizes), + bias, + gather_output, + skip_bias_add, + params_dtype, + linear_method, + ) + + def weight_loader( + self, + param: Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[int] = None, + ): param_data = param.data output_dim = getattr(param, "output_dim", None) if loaded_shard_id is None: @@ -277,7 +315,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor loaded_weight_shard = loaded_weight.narrow( - output_dim, shard_offset, shard_size) + output_dim, shard_offset, shard_size + ) self.weight_loader(param, loaded_weight_shard, shard_id) return @@ -293,18 +332,17 @@ def weight_loader(self, if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - param_data = param_data.narrow(output_dim, shard_offset, - shard_size) + param_data = param_data.narrow(output_dim, shard_offset, shard_size) start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: logger.warning( "Loading a weight without `output_dim` attribute in " "MergedColumnParallelLinear, assume the weight is " - "the same for all partitions.") + "the same for all partitions." 
+ ) assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -355,21 +393,30 @@ def __init__( self.num_heads = divide(self.total_num_heads, tp_size) if tp_size >= self.total_num_kv_heads: self.num_kv_heads = 1 - self.num_kv_head_replicas = divide(tp_size, - self.total_num_kv_heads) + self.num_kv_head_replicas = divide(tp_size, self.total_num_kv_heads) else: self.num_kv_heads = divide(self.total_num_kv_heads, tp_size) self.num_kv_head_replicas = 1 input_size = self.hidden_size - output_size = (self.num_heads + - 2 * self.num_kv_heads) * tp_size * self.head_size - super().__init__(input_size, output_size, bias, False, skip_bias_add, - params_dtype, linear_method) - - def weight_loader(self, - param: Parameter, - loaded_weight: torch.Tensor, - loaded_shard_id: Optional[str] = None): + output_size = ( + (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size + ) + super().__init__( + input_size, + output_size, + bias, + False, + skip_bias_add, + params_dtype, + linear_method, + ) + + def weight_loader( + self, + param: Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[str] = None, + ): param_data = param.data output_dim = getattr(param, "output_dim", None) if loaded_shard_id is None: @@ -381,10 +428,16 @@ def weight_loader(self, shard_offsets = [ # (shard_id, shard_offset, shard_size) ("q", 0, self.total_num_heads * self.head_size), - ("k", self.total_num_heads * self.head_size, - self.total_num_kv_heads * self.head_size), - ("v", (self.total_num_heads + self.total_num_kv_heads) * - self.head_size, self.total_num_kv_heads * self.head_size), + ( + "k", + self.total_num_heads * self.head_size, + self.total_num_kv_heads * self.head_size, + ), + ( + "v", + (self.total_num_heads + self.total_num_kv_heads) * self.head_size, + self.total_num_kv_heads * self.head_size, + ), ] packed_dim = getattr(param, "packed_dim", None) for shard_id, shard_offset, shard_size in shard_offsets: @@ -394,7 +447,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor loaded_weight_shard = loaded_weight.narrow( - output_dim, shard_offset, shard_size) + output_dim, shard_offset, shard_size + ) self.weight_loader(param, loaded_weight_shard, shard_id) return @@ -408,8 +462,7 @@ def weight_loader(self, shard_offset = self.num_heads * self.head_size shard_size = self.num_kv_heads * self.head_size elif loaded_shard_id == "v": - shard_offset = (self.num_heads + - self.num_kv_heads) * self.head_size + shard_offset = (self.num_heads + self.num_kv_heads) * self.head_size shard_size = self.num_kv_heads * self.head_size # If quantized, we need to adjust the offset and size to account # for the packing. 
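The packed-dim adjustment made in the hunk just below converts shard offsets counted in logical output channels into packed storage elements. A small worked example with illustrative numbers (pack_factor = 32 // 4 = 8 matches the 4-bit, int32-packed GPTQ layout defined earlier in this series):

    pack_factor = 32 // 4                    # 8 quantized values per int32 word
    output_dim, packed_dim = 1, 1            # e.g. GPTQ qzeros, packed along its output dim
    shard_offset, shard_size = 4096, 1024    # this shard, in logical output channels
    if packed_dim == output_dim:
        shard_offset //= pack_factor         # 512 packed elements into the tensor
        shard_size //= pack_factor           # 128 packed elements to narrow and copy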
@@ -417,22 +470,21 @@ def weight_loader(self, if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - param_data = param_data.narrow(output_dim, shard_offset, - shard_size) + param_data = param_data.narrow(output_dim, shard_offset, shard_size) if loaded_shard_id == "q": shard_id = tp_rank else: shard_id = tp_rank // self.num_kv_head_replicas start_idx = shard_id * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: logger.warning( "Loading a weight without `output_dim` attribute in " "QKVParallelLinear, assume the weight is the same " - "for all partitions.") + "for all partitions." + ) assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -492,24 +544,32 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size_per_partition, self.output_size, self.input_size, - self.output_size, self.params_dtype) + self.input_size_per_partition, + self.output_size, + self.input_size, + self.output_size, + self.params_dtype, + ) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) set_weight_attrs(weight, {"weight_loader": self.weight_loader}) if not reduce_results and (bias and not skip_bias_add): - raise ValueError("When not reduce the results, adding bias to the " - "results can lead to incorrect results") + raise ValueError( + "When not reduce the results, adding bias to the " + "results can lead to incorrect results" + ) if bias: - self.bias = Parameter( - torch.empty(self.output_size, dtype=params_dtype)) - set_weight_attrs(self.bias, { - "output_dim": 0, - "weight_loader": self.weight_loader, - }) + self.bias = Parameter(torch.empty(self.output_size, dtype=params_dtype)) + set_weight_attrs( + self.bias, + { + "output_dim": 0, + "weight_loader": self.weight_loader, + }, + ) else: self.register_parameter("bias", None) @@ -517,12 +577,19 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): tp_rank = get_tensor_model_parallel_rank() input_dim = getattr(param, "input_dim", None) param_data = param.data + + # TEST + print("param data shape is ", param_data.shape) + print("loaded_weight is ", loaded_weight.shape) + if input_dim is not None: shard_size = param_data.shape[input_dim] start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(input_dim, start_idx, - shard_size) + loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) + print("sharded loaded_weight is ", loaded_weight.shape) + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) def forward(self, input_): @@ -532,12 +599,14 @@ def forward(self, input_): else: tp_rank = get_tensor_model_parallel_rank() splitted_input = split_tensor_along_last_dim( - input_, num_partitions=self.tp_size) + input_, num_partitions=self.tp_size + ) input_parallel = splitted_input[tp_rank].contiguous() # Matrix multiply. 
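        # Editor's note (sketch, not part of the patch): in the row-parallel layout each
        # TP rank holds an [output_size, input_size // tp_size] weight shard and multiplies
        # it with its slice of the input; the per-rank partial products are then summed by
        # tensor_model_parallel_all_reduce, roughly
        #     output = sum over ranks r of (input_parallel_r @ weight_shard_r.T) + bias
        # Adding the bias on every rank before such a sum would count it tp_size times,
        # which is presumably why bias without reduce_results is rejected in __init__ above.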
output_parallel = self.linear_method.apply_weights( - self.linear_weights, input_parallel) + self.linear_weights, input_parallel + ) if self.reduce_results and self.tp_size > 1: output_ = tensor_model_parallel_all_reduce(output_parallel) else: diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 5745487067227..561375c23b62b 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -10,6 +10,18 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +def get_int_dtype(nbits: int) -> torch.dtype: + if nbits <= 8: + return torch.int8 + if nbits <= 16: + return torch.int16 + if nbits <= 32: + return torch.int32 + if nbits <= 64: + return torch.int64 + raise ValueError(f"No dtype available for {nbits}-bit codebooks") + + class AQLMConfig(QuantizationConfig): """Config class for AQLM. @@ -27,13 +39,13 @@ def __init__( self.nbits_per_codebook = nbits_per_codebook self.num_codebooks = num_codebooks self.out_group_size = out_group_size - # self.pack_factor = 32 // self.weight_bits - # exllama kernel v1 only supports 4 bit - # if self.weight_bits != 4: - # raise ValueError( - # "Currently, only 4-bit weight quantization is supported for " - # f"GPTQ, but got {self.weight_bits} bits." - # ) + + # I think pack factor is *probably* how many elements fit into one quantized tensor element. + # though out group size makes it interesting, because really we are doing 2D blocks, potentially. + # maybe this is vllms first 2D packing? Arg. + self.pack_factor = ( + self.in_group_size * self.out_group_size // self.num_codebooks + ) def __repr__(self) -> str: return ( @@ -64,23 +76,21 @@ def get_min_capability(cls) -> int: # "nbits_per_codebook": 16, # "num_codebooks": 1, # "out_group_size": 1, - + # "quant_method": "aqlm" # "linear_weights_not_to_quantize": [ <--- hmmm ???? # "model.embed_tokens.weight", # "lm_head.weight" - - # "quant_method": "aqlm" duh <- shows it's aqlm. Do we auto-detect? How? # }, - #https://huggingface.co/meta-llama/Llama-2-7b-hf + # https://huggingface.co/meta-llama/Llama-2-7b-hf <- can't see it, locked behind meta. - # this one looks non-standard, has no quantization_config, just an AQLM block. + # this is no-standard, has no "quantization_config", just an "aqlm" block. # https://huggingface.co/BlackSamorez/Llama-2-70b-AQLM-4Bit-2x16-hf/blob/main/config.json # "aqlm": { # "in_group_size": 8, # "nbits_per_codebook": 16, # "num_codebooks": 2, - # " "out_group_size": 1 + # "out_group_size": 1 @classmethod def get_config_filenames(cls) -> List[str]: @@ -121,76 +131,65 @@ def create_weights( params_dtype: torch.dtype, ) -> Dict[str, Any]: del output_size # Unused. - if input_size_per_partition % self.quant_config.group_size != 0: + del input_size # Unused. + + if params_dtype != torch.half: + raise ValueError("Only half is currently supported by aqlm") + if input_size_per_partition % self.quant_config.in_group_size != 0: raise ValueError( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size." ) - if output_size_per_partition % self.quant_config.pack_factor != 0: + if output_size_per_partition % self.quant_config.out_group_size != 0: raise ValueError( "The output size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size." 
) - if self.quant_config.group_size != -1: - group_size = self.quant_config.group_size - else: - group_size = input_size - scale_and_zero_size = input_size // group_size - scale_and_zero_input_dim = None - - qweight = Parameter( + # or does this need more dimensions and use the correct nbits_per_codebook as an int type. Does that pack them? + codes = Parameter( torch.empty( + output_size_per_partition, # not entirely sure what to do with out groups, if we need this pack factor. input_size_per_partition // self.quant_config.pack_factor, - output_size_per_partition, - dtype=torch.int32, + 1, + dtype=get_int_dtype(self.quant_config.nbits_per_codebook), ), requires_grad=False, ) + set_weight_attrs( - qweight, + codes, { - "input_dim": 0, - "output_dim": 1, - "packed_dim": 0, + "input_dim": 1, + "output_dim": 0, + "packed_dim": 1, "pack_factor": self.quant_config.pack_factor, }, ) - g_idx = Parameter( - torch.tensor( - [ - i // self.quant_config.group_size - for i in range(input_size_per_partition) - ], - dtype=torch.int32, - ), - requires_grad=False, - ) - # Ignore warning from fused linear layers such as QKVParallelLinear. - set_weight_attrs(g_idx, {"input_dim": 0, "ignore_warning": True}) - qzeros = Parameter( + + codebooks = Parameter( torch.empty( - scale_and_zero_size, - output_size_per_partition // self.quant_config.pack_factor, - dtype=torch.int32, + self.quant_config.num_codebooks, + 2**self.quant_config.nbits_per_codebook, + self.quant_config.out_group_size, + self.quant_config.in_group_size, + dtype=params_dtype, ), requires_grad=False, ) - set_weight_attrs( - qzeros, - { - "input_dim": scale_and_zero_input_dim, - "output_dim": 1, - "packed_dim": 1, - "pack_factor": self.quant_config.pack_factor, - }, - ) + # no attributes? It's fixed size, no input or output dim, need the whole thing. + # looks like named attributes are for sharding so it knows how to split something up. + scales = Parameter( torch.empty( - scale_and_zero_size, - output_size_per_partition, + ( + output_size_per_partition // self.quant_config.out_group_size, + 1, # do we really need these other dimensions? They don't count, or? + 1, + 1, + ), dtype=params_dtype, ), requires_grad=False, @@ -198,15 +197,15 @@ def create_weights( set_weight_attrs( scales, { - "input_dim": scale_and_zero_input_dim, - "output_dim": 1, + "output_dim": 0, + # "pack_factor": self.quant_config.pack_factor, I guess not really a pack factor, just smaller? 
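                # Editor's note (worked example, not part of the patch): with the 1x16
                # settings quoted in the config comments above (in_group_size=8,
                # out_group_size=1, num_codebooks=1, nbits_per_codebook=16) the tensors
                # built in this method come out as
                #   pack_factor = 8 * 1 // 1 = 8
                #   codes:     (out_features, in_features // 8, 1)   dtype torch.int16
                #   codebooks: (1, 2**16, 1, 8)                      dtype params_dtype
                #   scales:    (out_features, 1, 1, 1)               dtype params_dtype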
}, ) + return { - "qweight": qweight, - "g_idx": g_idx, - "qzeros": qzeros, - "scales": scales + "codes": codes, + "codebooks": codebooks, + "scales": scales, } def apply_weights( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index b7f6b8f3ec374..217edb20049ce 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -31,19 +31,27 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import PagedAttention from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) +from vllm.model_executor.layers.linear import ( + LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) + VocabParallelEmbedding, + ParallelLMHead, + DEFAULT_VOCAB_PADDING_SIZE, +) from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) + get_tensor_model_parallel_world_size, +) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) +from vllm.model_executor.weight_utils import ( + default_weight_loader, + hf_model_weights_iterator, +) from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig @@ -51,7 +59,6 @@ class LlamaMLP(nn.Module): - def __init__( self, hidden_size: int, @@ -61,16 +68,19 @@ def __init__( ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, + hidden_size, + [intermediate_size] * 2, bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) + linear_method=linear_method, + ) + self.down_proj = RowParallelLinear( + intermediate_size, hidden_size, bias=False, linear_method=linear_method + ) if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") + raise ValueError( + f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now." 
+ ) self.act_fn = SiluAndMul() def forward(self, x): @@ -81,7 +91,6 @@ def forward(self, x): class LlamaAttention(nn.Module): - def __init__( self, hidden_size: int, @@ -139,11 +148,13 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=sliding_window) + self.attn = PagedAttention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=sliding_window, + ) def forward( self, @@ -162,7 +173,6 @@ def forward( class LlamaDecoderLayer(nn.Module): - def __init__( self, config: LlamaConfig, @@ -172,14 +182,14 @@ def __init__( self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) + max_position_embeddings = getattr(config, "max_position_embeddings", 8192) sliding_window = getattr(config, "sliding_window", None) self.self_attn = LlamaAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, - num_kv_heads=getattr(config, "num_key_value_heads", - config.num_attention_heads), + num_kv_heads=getattr( + config, "num_key_value_heads", config.num_attention_heads + ), rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, @@ -193,10 +203,10 @@ def __init__( hidden_act=config.hidden_act, linear_method=linear_method, ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) def forward( self, @@ -211,8 +221,7 @@ def forward( residual = hidden_states hidden_states = self.input_layernorm(hidden_states) else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) + hidden_states, residual = self.input_layernorm(hidden_states, residual) hidden_states = self.self_attn( positions=positions, hidden_states=hidden_states, @@ -221,14 +230,12 @@ def forward( ) # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) hidden_states = self.mlp(hidden_states) return hidden_states, residual class LlamaModel(nn.Module): - def __init__( self, config: LlamaConfig, @@ -238,8 +245,11 @@ def __init__( super().__init__() self.config = config self.padding_idx = config.pad_token_id - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 + lora_vocab = ( + (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) + if lora_config + else 0 + ) self.vocab_size = config.vocab_size + lora_vocab self.org_vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( @@ -247,10 +257,12 @@ def __init__( config.hidden_size, org_num_embeddings=config.vocab_size, ) - self.layers = nn.ModuleList([ - LlamaDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) + self.layers = nn.ModuleList( + [ + LlamaDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ] + ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) def forward( @@ -323,7 +335,8 @@ def __init__( 
padding_size=DEFAULT_VOCAB_PADDING_SIZE # We need bigger padding if using lora for kernel # compatibility - if not lora_config else lora_config.lora_vocab_padding_size, + if not lora_config + else lora_config.lora_vocab_padding_size, ) self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size) @@ -334,8 +347,7 @@ def forward( kv_caches: List[KVCache], input_metadata: InputMetadata, ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) + hidden_states = self.model(input_ids, positions, kv_caches, input_metadata) return hidden_states def sample( @@ -343,15 +355,18 @@ def sample( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) + next_tokens = self.sampler( + self.lm_head.weight, hidden_states, sampling_metadata + ) return next_tokens - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): + def load_weights( + self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None, + ): stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -362,15 +377,15 @@ def load_weights(self, ] params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): + model_name_or_path, cache_dir, load_format, revision + ): if "rotary_emb.inv_freq" in name: continue - if ("rotary_emb.cos_cached" in name - or "rotary_emb.sin_cached" in name): + if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. 
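                # Editor's note on the stacked-parameter loop below (not part of the
                # patch): a checkpoint tensor named e.g.
                # "model.layers.0.self_attn.q_proj.weight" has "q_proj" rewritten to
                # "qkv_proj" and is passed to that fused parameter's weight_loader with
                # shard_id "q"; gate_proj/up_proj are folded into gate_up_proj the same
                # way. Names that match no mapping entry fall through unchanged to
                # default_weight_loader (or the param's own weight_loader).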
continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue name = name.replace(weight_name, param_name) @@ -379,6 +394,8 @@ def load_weights(self, continue param = params_dict[name] weight_loader = param.weight_loader + # TEST + print("loading ", name) weight_loader(param, loaded_weight, shard_id) break else: @@ -386,6 +403,7 @@ def load_weights(self, if name.endswith(".bias") and name not in params_dict: continue param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) + weight_loader = getattr(param, "weight_loader", default_weight_loader) + # TEST + print("loading ", name) weight_loader(param, loaded_weight) From 40463e3e1e424a26a03586a00eff97d7a0164ac1 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 27 Feb 2024 15:43:55 -0500 Subject: [PATCH 05/96] try to bind cpp aqlm entry point to python --- csrc/ops.h | 14 +- csrc/pybind.cpp | 2 + ...lm_cuda_kernel.cpp => aqlm_cuda_entry.cpp} | 9 +- setup.py | 126 +++++++++++------- .../layers/quantization/aqlm.py | 21 +-- 5 files changed, 105 insertions(+), 67 deletions(-) rename csrc/quantization/aqlm/{aqlm_cuda_kernel.cpp => aqlm_cuda_entry.cpp} (96%) diff --git a/csrc/ops.h b/csrc/ops.h index 351c4cade7a09..5ff16e0a27393 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -69,13 +69,13 @@ void gelu_fast( torch::Tensor& out, torch::Tensor& input); -torch::Tensor aqlm_gemm( - torch::Tensor a, - torch::Tensor b_q_weight, - torch::Tensor b_gptq_qzeros, - torch::Tensor b_gptq_scales, - torch::Tensor b_g_idx, - bool use_exllama); +torch::Tensor code1x16_matmat( + const torch::Tensor& input, + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales + //const std::optional& bias +); #ifndef USE_ROCM torch::Tensor awq_gemm( diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 24c22020131e8..d1410071d3afe 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -53,6 +53,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); // Quantization ops + ops.def("aqlm_gemm", &code1x16_matmat, "Quantized GEMM for AQLM"); + #ifndef USE_ROCM ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ"); diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp similarity index 96% rename from csrc/quantization/aqlm/aqlm_cuda_kernel.cpp rename to csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 301e8439b24ae..66d452df82424 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_kernel.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -43,8 +43,8 @@ torch::Tensor code1x16_matmat( const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, - const torch::Tensor& scales, - const std::optional& bias + const torch::Tensor& scales + //const std::optional& bias ) { auto input_sizes = input.sizes(); auto out_features = codes.size(0) * codebooks.size(2); @@ -66,9 +66,12 @@ torch::Tensor code1x16_matmat( ); } flat_output *= scales.flatten().unsqueeze(0); + +/* not sure how to bridge this yet. 
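   Editor's note (reference sketch, not part of the patch): for the 1x16 case the kernel
   calls above are equivalent to, in PyTorch terms, with W the dequantized
   (out_features, in_features) weight,

       W[o, 8*k : 8*k + 8] = codebooks[0, codes[o, k, 0], 0]   # codes read as uint16
       out = (input @ W.t()) * scales.flatten()

   The wrapper already flattens the activation to 2D and restores the leading
   dimensions through output_sizes below, so the Python caller should not need to
   reshape. The bias can be bridged as an optional tensor argument (a later commit in
   this series does exactly that), at which point the block below is simply applied
   after the scale.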
if (bias.has_value()) { flat_output += bias->unsqueeze(0); } + */ auto output_sizes = input_sizes.vec(); output_sizes.pop_back(); @@ -135,8 +138,10 @@ torch::Tensor code2x8_matmat( } +/* PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("code1x16_matmat", &code1x16_matmat, "1x16 (2bit) codebook matrix-matrix product."); m.def("code2x8_matmat", &code2x8_matmat, "2x8 (2bit) codebook matrix-matrix product."); } +*/ diff --git a/setup.py b/setup.py index 8fcb86394f76d..ba00f1b220add 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,12 @@ import setuptools import torch import torch.utils.cpp_extension as torch_cpp_ext -from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME +from torch.utils.cpp_extension import ( + BuildExtension, + CUDAExtension, + CUDA_HOME, + ROCM_HOME, +) ROOT_DIR = os.path.dirname(__file__) @@ -61,7 +66,8 @@ def _is_cuda() -> bool: if _is_cuda() and CUDA_HOME is None: raise RuntimeError( - "Cannot find CUDA_HOME. CUDA must be available to build the package.") + "Cannot find CUDA_HOME. CUDA must be available to build the package." + ) ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] @@ -70,10 +76,12 @@ def _is_cuda() -> bool: def get_hipcc_rocm_version(): # Run the hipcc --version command - result = subprocess.run(['hipcc', '--version'], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True) + result = subprocess.run( + ["hipcc", "--version"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) # Check if the command was executed successfully if result.returncode != 0: @@ -81,7 +89,7 @@ def get_hipcc_rocm_version(): return None # Extract the version using a regular expression - match = re.search(r'HIP version: (\S+)', result.stdout) + match = re.search(r"HIP version: (\S+)", result.stdout) if match: # Return the version string return match.group(1) @@ -97,9 +105,9 @@ def glob(pattern: str): def get_neuronxcc_version(): import sysconfig + site_dir = sysconfig.get_paths()["purelib"] - version_file = os.path.join(site_dir, "neuronxcc", "version", - "__init__.py") + version_file = os.path.join(site_dir, "neuronxcc", "version", "__init__.py") # Check if the command was executed successfully with open(version_file, "rt") as fp: @@ -119,8 +127,9 @@ def get_nvcc_cuda_version(cuda_dir: str) -> Version: Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py """ - nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], - universal_newlines=True) + nvcc_output = subprocess.check_output( + [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True + ) output = nvcc_output.split() release_idx = output.index("release") + 1 nvcc_cuda_version = parse(output[release_idx].split(",")[0]) @@ -142,8 +151,12 @@ def get_pytorch_rocm_arch() -> Set[str]: # If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator if env_arch_list is None: command = "rocm_agent_enumerator" - env_arch_list = subprocess.check_output([command]).decode('utf-8')\ - .strip().replace("\n", ";") + env_arch_list = ( + subprocess.check_output([command]) + .decode("utf-8") + .strip() + .replace("\n", ";") + ) arch_source_str = "rocm_agent_enumerator" else: arch_source_str = "PYTORCH_ROCM_ARCH env variable" @@ -159,7 +172,8 @@ def get_pytorch_rocm_arch() -> Set[str]: raise RuntimeError( f"None of the ROCM architectures in {arch_source_str} " f"({env_arch_list}) is supported. 
" - f"Supported ROCM architectures are: {ROCM_SUPPORTED_ARCHS}.") + f"Supported ROCM architectures are: {ROCM_SUPPORTED_ARCHS}." + ) invalid_arch_list = pytorch_rocm_arch - ROCM_SUPPORTED_ARCHS if invalid_arch_list: warnings.warn( @@ -167,7 +181,8 @@ def get_pytorch_rocm_arch() -> Set[str]: f"excluded from the {arch_source_str} output " f"({env_arch_list}). Supported ROCM architectures are: " f"{ROCM_SUPPORTED_ARCHS}.", - stacklevel=2) + stacklevel=2, + ) return arch_list @@ -189,15 +204,16 @@ def get_torch_arch_list() -> Set[str]: # Filter out the invalid architectures and print a warning. valid_archs = NVIDIA_SUPPORTED_ARCHS.union( - {s + "+PTX" - for s in NVIDIA_SUPPORTED_ARCHS}) + {s + "+PTX" for s in NVIDIA_SUPPORTED_ARCHS} + ) arch_list = torch_arch_list.intersection(valid_archs) # If none of the specified architectures are valid, raise an error. if not arch_list: raise RuntimeError( "None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env " f"variable ({env_arch_list}) is supported. " - f"Supported CUDA architectures are: {valid_archs}.") + f"Supported CUDA architectures are: {valid_archs}." + ) invalid_arch_list = torch_arch_list - valid_archs if invalid_arch_list: warnings.warn( @@ -205,7 +221,8 @@ def get_torch_arch_list() -> Set[str]: "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " f"({env_arch_list}). Supported CUDA architectures are: " f"{valid_archs}.", - stacklevel=2) + stacklevel=2, + ) return arch_list @@ -224,7 +241,8 @@ def get_torch_arch_list() -> Set[str]: major, minor = torch.cuda.get_device_capability(i) if major < 7: raise RuntimeError( - "GPUs with compute capability below 7.0 are not supported.") + "GPUs with compute capability below 7.0 are not supported." + ) compute_capabilities.add(f"{major}.{minor}") ext_modules = [] @@ -242,12 +260,13 @@ def get_torch_arch_list() -> Set[str]: compute_capabilities.remove("9.0") # Validate the NVCC CUDA version. if nvcc_cuda_version < Version("11.0"): + raise RuntimeError("CUDA 11.0 or higher is required to build the package.") + if nvcc_cuda_version < Version("11.1") and any( + cc.startswith("8.6") for cc in compute_capabilities + ): raise RuntimeError( - "CUDA 11.0 or higher is required to build the package.") - if (nvcc_cuda_version < Version("11.1") - and any(cc.startswith("8.6") for cc in compute_capabilities)): - raise RuntimeError( - "CUDA 11.1 or higher is required for compute capability 8.6.") + "CUDA 11.1 or higher is required for compute capability 8.6." + ) if nvcc_cuda_version < Version("11.8"): if any(cc.startswith("8.9") for cc in compute_capabilities): # CUDA 11.8 is required to generate the code targeting compute capability 8.9. @@ -258,13 +277,16 @@ def get_torch_arch_list() -> Set[str]: warnings.warn( "CUDA 11.8 or higher is required for compute capability 8.9. " "Targeting compute capability 8.0 instead.", - stacklevel=2) - compute_capabilities = set(cc for cc in compute_capabilities - if not cc.startswith("8.9")) + stacklevel=2, + ) + compute_capabilities = set( + cc for cc in compute_capabilities if not cc.startswith("8.9") + ) compute_capabilities.add("8.0+PTX") if any(cc.startswith("9.0") for cc in compute_capabilities): raise RuntimeError( - "CUDA 11.8 or higher is required for compute capability 9.0.") + "CUDA 11.8 or higher is required for compute capability 9.0." 
+ ) NVCC_FLAGS_PUNICA = NVCC_FLAGS.copy() @@ -273,16 +295,13 @@ def get_torch_arch_list() -> Set[str]: num = capability[0] + capability[2] NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] if capability.endswith("+PTX"): - NVCC_FLAGS += [ - "-gencode", f"arch=compute_{num},code=compute_{num}" - ] + NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=compute_{num}"] if int(capability[0]) >= 8: - NVCC_FLAGS_PUNICA += [ - "-gencode", f"arch=compute_{num},code=sm_{num}" - ] + NVCC_FLAGS_PUNICA += ["-gencode", f"arch=compute_{num},code=sm_{num}"] if capability.endswith("+PTX"): NVCC_FLAGS_PUNICA += [ - "-gencode", f"arch=compute_{num},code=compute_{num}" + "-gencode", + f"arch=compute_{num},code=compute_{num}", ] # Use NVCC threads to parallelize the build. @@ -297,10 +316,10 @@ def get_torch_arch_list() -> Set[str]: # changes for punica kernels NVCC_FLAGS += torch_cpp_ext.COMMON_NVCC_FLAGS REMOVE_NVCC_FLAGS = [ - '-D__CUDA_NO_HALF_OPERATORS__', - '-D__CUDA_NO_HALF_CONVERSIONS__', - '-D__CUDA_NO_BFLOAT16_CONVERSIONS__', - '-D__CUDA_NO_HALF2_OPERATORS__', + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", ] for flag in REMOVE_NVCC_FLAGS: with contextlib.suppress(ValueError): @@ -317,13 +336,13 @@ def get_torch_arch_list() -> Set[str]: ext_modules.append( CUDAExtension( name="vllm._punica_C", - sources=["csrc/punica/punica_ops.cc"] + - glob("csrc/punica/bgmv/*.cu"), + sources=["csrc/punica/punica_ops.cc"] + glob("csrc/punica/bgmv/*.cu"), extra_compile_args={ "cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS_PUNICA, }, - )) + ) + ) elif _is_neuron(): neuronxcc_version = get_neuronxcc_version() @@ -333,6 +352,8 @@ def get_torch_arch_list() -> Set[str]: "csrc/pos_encoding_kernels.cu", "csrc/activation_kernels.cu", "csrc/layernorm_kernels.cu", + "csrc/quantization/aqlm/aqlm_cuda_entry.cpp", + "csrc/quantization/aqlm/aqlm_cuda_kernal.cu", "csrc/quantization/squeezellm/quant_cuda_kernel.cu", "csrc/quantization/gptq/q_gemm.cu", "csrc/cuda_utils_kernels.cu", @@ -353,7 +374,8 @@ def get_torch_arch_list() -> Set[str]: "cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS, }, - )) + ) + ) if not _is_neuron(): vllm_extension = CUDAExtension( @@ -378,8 +400,9 @@ def find_version(filepath: str) -> str: Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py """ with open(filepath) as fp: - version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", - fp.read(), re.M) + version_match = re.search( + r"^__version__ = ['\"]([^'\"]*)['\"]", fp.read(), re.M + ) if version_match: return version_match.group(1) raise RuntimeError("Unable to find version string.") @@ -442,8 +465,10 @@ def get_requirements() -> List[str]: version=get_vllm_version(), author="vLLM Team", license="Apache 2.0", - description=("A high-throughput and memory-efficient inference and " - "serving engine for LLMs"), + description=( + "A high-throughput and memory-efficient inference and " + "serving engine for LLMs" + ), long_description=read_readme(), long_description_content_type="text/markdown", url="https://github.com/vllm-project/vllm", @@ -459,8 +484,9 @@ def get_requirements() -> List[str]: "License :: OSI Approved :: Apache Software License", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], - packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs", - "examples", "tests")), + packages=setuptools.find_packages( + exclude=("benchmarks", "csrc", "docs", "examples", "tests") + 
), python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 561375c23b62b..ed559de1cfa72 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -214,16 +214,21 @@ def apply_weights( x: torch.Tensor, bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - qweight = weights["qweight"] - out_shape = x.shape[:-1] + (qweight.shape[-1],) - reshaped_x = x.reshape(-1, x.shape[-1]) + # qweight = weights["qweight"] do I need the same flattening? + # out_shape = x.shape[:-1] + (qweight.shape[-1],) + # reshaped_x = x.reshape(-1, x.shape[-1]) # + + print("input shape is ", x) + output = ops.aqlm_gemm( - reshaped_x, - weights["qweight"], - weights["qzeros"], + x, # hmm, reshape? + weights["codes"], + weights["codebooks"], weights["scales"], - weights["g_idx"], ) + + print("output shape is ", output) + if bias is not None: output = output + bias - return output.reshape(out_shape) + return output # .reshape(out_shape) ??? From 0e03c2315cb1ca19f7dffdcc9f12c802bf98ba1e Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 27 Feb 2024 15:52:10 -0500 Subject: [PATCH 06/96] add aqlm --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ba00f1b220add..c2b972b54b1b7 100644 --- a/setup.py +++ b/setup.py @@ -352,8 +352,6 @@ def get_torch_arch_list() -> Set[str]: "csrc/pos_encoding_kernels.cu", "csrc/activation_kernels.cu", "csrc/layernorm_kernels.cu", - "csrc/quantization/aqlm/aqlm_cuda_entry.cpp", - "csrc/quantization/aqlm/aqlm_cuda_kernal.cu", "csrc/quantization/squeezellm/quant_cuda_kernel.cu", "csrc/quantization/gptq/q_gemm.cu", "csrc/cuda_utils_kernels.cu", @@ -362,6 +360,8 @@ def get_torch_arch_list() -> Set[str]: ] if _is_cuda(): + vllm_extension_sources.append("csrc/quantization/aqlm/aqlm_cuda_entry.cpp") + vllm_extension_sources.append("csrc/quantization/aqlm/aqlm_cuda_kernel.cu") vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") vllm_extension_sources.append("csrc/custom_all_reduce.cu") From 26f8d8318433d899223e0b32be46be809c03ae70 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 27 Feb 2024 16:09:31 -0500 Subject: [PATCH 07/96] fix print statements --- examples/aqlm_test.py | 8 ++++++++ vllm/model_executor/layers/quantization/aqlm.py | 6 +++--- 2 files changed, 11 insertions(+), 3 deletions(-) create mode 100644 examples/aqlm_test.py diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py new file mode 100644 index 0000000000000..f745c236236a5 --- /dev/null +++ b/examples/aqlm_test.py @@ -0,0 +1,8 @@ +from vllm import LLM, SamplingParams + +#model = LLM("nm-testing/llama2.c-stories110M-pruned2.4") +model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf") + +sampling_params = SamplingParams(max_tokens=100, temperature=0) +outputs = model.generate("Hello my name is", sampling_params=sampling_params) +print(outputs[0].outputs[0].text) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index ed559de1cfa72..0f9672023989f 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -218,7 +218,7 @@ def apply_weights( # out_shape = x.shape[:-1] + (qweight.shape[-1],) # reshaped_x = x.reshape(-1, x.shape[-1]) # - print("input shape is ", x) + print("input shape is ", x.shape) output = ops.aqlm_gemm( x, 
# hmm, reshape? @@ -227,8 +227,8 @@ def apply_weights( weights["scales"], ) - print("output shape is ", output) - + print("output shape is ", output.shape) + if bias is not None: output = output + bias return output # .reshape(out_shape) ??? From dad66ce052a003c3d5999898aa0e3ad3cc4bcd00 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Wed, 28 Feb 2024 18:34:09 +0000 Subject: [PATCH 08/96] add comment --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 66d452df82424..4b4b6bfc69a01 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -67,7 +67,7 @@ torch::Tensor code1x16_matmat( } flat_output *= scales.flatten().unsqueeze(0); -/* not sure how to bridge this yet. +/* not sure how to bridge this yet. may not need to. if (bias.has_value()) { flat_output += bias->unsqueeze(0); } From 77a89136448e1d30fe5540c81da453894bb38195 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Wed, 28 Feb 2024 19:18:34 +0000 Subject: [PATCH 09/96] remove unused enum --- vllm/model_executor/layers/quantization/aqlm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 0f9672023989f..58dad5aaa5b4e 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -1,5 +1,3 @@ -import enum -from enum import Enum from typing import Any, Dict, List, Optional import torch From 2bb6871febf8e394ad94e9c3c860782fbddcc6bc Mon Sep 17 00:00:00 2001 From: James Fleming Date: Wed, 28 Feb 2024 21:39:28 +0000 Subject: [PATCH 10/96] add a bunch of prints, add bias --- csrc/ops.h | 4 +-- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 6 ++--- examples/aqlm_test.py | 10 +++++--- vllm/model_executor/layers/linear.py | 17 ++++++++++--- .../layers/quantization/aqlm.py | 25 ++++++++++++------- 5 files changed, 41 insertions(+), 21 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index 5ff16e0a27393..246862ee048f0 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -73,8 +73,8 @@ torch::Tensor code1x16_matmat( const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, - const torch::Tensor& scales - //const std::optional& bias + const torch::Tensor& scales, + const std::optional& bias ); #ifndef USE_ROCM diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 4b4b6bfc69a01..6e4aa751c113d 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -43,8 +43,8 @@ torch::Tensor code1x16_matmat( const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, - const torch::Tensor& scales - //const std::optional& bias + const torch::Tensor& scales, + const std::optional& bias ) { auto input_sizes = input.sizes(); auto out_features = codes.size(0) * codebooks.size(2); @@ -67,11 +67,9 @@ torch::Tensor code1x16_matmat( } flat_output *= scales.flatten().unsqueeze(0); -/* not sure how to bridge this yet. may not need to. 
if (bias.has_value()) { flat_output += bias->unsqueeze(0); } - */ auto output_sizes = input_sizes.vec(); output_sizes.pop_back(); diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index f745c236236a5..fcb64223ca2a5 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -1,8 +1,12 @@ from vllm import LLM, SamplingParams #model = LLM("nm-testing/llama2.c-stories110M-pruned2.4") -model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf") +#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) -sampling_params = SamplingParams(max_tokens=100, temperature=0) -outputs = model.generate("Hello my name is", sampling_params=sampling_params) +model = LLM("/home/jaemz/.cache/huggingface/hub/models--BlackSamorez--Llama-2-7b-AQLM-2Bit-1x16-hf/snapshots/1756949fed7fe691c00f015e2ebd18503a975f3b", enforce_eager=True) + +sampling_params = SamplingParams(max_tokens=200, temperature=0) +outputs = model.generate("How are you ", sampling_params=sampling_params) +print("generated!") print(outputs[0].outputs[0].text) +print("output above!") diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index cd9a17b7ef864..de5b52f7e7f0c 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -326,6 +326,12 @@ def weight_loader( if output_dim is not None: shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size shard_size = self.output_sizes[loaded_shard_id] // tp_size + #TEST + if loaded_shard_id > 0: + print(" loading a shard ", loaded_shard_id) + print(" param_data shape ", param_data.shape) + print(" loaded_weight shape ", loaded_weight.shape) + # If quantized, we need to adjust the offset and size to account # for the packing. packed_dim = getattr(param, "packed_dim", None) @@ -579,14 +585,19 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): param_data = param.data # TEST - print("param data shape is ", param_data.shape) - print("loaded_weight is ", loaded_weight.shape) + print(" param data shape is ", param_data.shape) + print(" loaded_weight is ", loaded_weight.shape) if input_dim is not None: shard_size = param_data.shape[input_dim] start_idx = tp_rank * shard_size + print(" loaded_weight dtype is ", loaded_weight.dtype) + print(" data_param dtype is ", param_data.dtype) + #TEST + assert(start_idx == 0 and shard_size == loaded_weight.shape[input_dim]) + loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) - print("sharded loaded_weight is ", loaded_weight.shape) + print( "sharded loaded_weight is ", loaded_weight.shape) assert param_data.shape == loaded_weight.shape diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 58dad5aaa5b4e..d6bc67bd24873 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -128,6 +128,9 @@ def create_weights( output_size: int, params_dtype: torch.dtype, ) -> Dict[str, Any]: + #TEST + assert(output_size == output_size_per_partition) + assert(input_size == input_size_per_partition) del output_size # Unused. del input_size # Unused. @@ -149,14 +152,16 @@ def create_weights( # or does this need more dimensions and use the correct nbits_per_codebook as an int type. Does that pack them? codes = Parameter( torch.empty( - output_size_per_partition, # not entirely sure what to do with out groups, if we need this pack factor. 
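                # Editor's note (assumption, not from the patch): in the reference AQLM
                # layout codes are (out_features // out_group_size,
                # in_features // in_group_size, num_codebooks), so the trailing 1 below
                # presumably stands for num_codebooks and the division is really by
                # in_group_size; for the 1x16 models these coincide with the pack_factor
                # (8) and a single codebook, which is why this shape works here.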
+ output_size_per_partition, # not entirely sure what to do with num_out_groups, if we need this pack factor. input_size_per_partition // self.quant_config.pack_factor, - 1, + 1, # probably should be num codebooks. dtype=get_int_dtype(self.quant_config.nbits_per_codebook), ), requires_grad=False, ) + print(codes.shape) + set_weight_attrs( codes, { @@ -177,14 +182,12 @@ def create_weights( ), requires_grad=False, ) - # no attributes? It's fixed size, no input or output dim, need the whole thing. - # looks like named attributes are for sharding so it knows how to split something up. scales = Parameter( torch.empty( ( output_size_per_partition // self.quant_config.out_group_size, - 1, # do we really need these other dimensions? They don't count, or? + 1, 1, 1, ), @@ -218,15 +221,19 @@ def apply_weights( print("input shape is ", x.shape) + if (x.shape[1] == 5) : + print("codes shape is ", weights["codes"].shape) + print("codebooks shape is ", weights["codebooks"].shape) + print("scales shape is ", weights["scales"].shape) + print("x is ", x) + output = ops.aqlm_gemm( x, # hmm, reshape? weights["codes"], weights["codebooks"], weights["scales"], + bias, ) print("output shape is ", output.shape) - - if bias is not None: - output = output + bias - return output # .reshape(out_shape) ??? + return output From 5f0c319b01d8b7a1902b7d8e8721816623180c52 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Wed, 28 Feb 2024 21:45:15 +0000 Subject: [PATCH 11/96] minor fix for scales --- vllm/model_executor/layers/quantization/aqlm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index d6bc67bd24873..bb75d76e47e6d 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -199,7 +199,8 @@ def create_weights( scales, { "output_dim": 0, - # "pack_factor": self.quant_config.pack_factor, I guess not really a pack factor, just smaller? 
+ "packed_dim": 0, + "pack_factor": self.quant_config.out_group_size }, ) From 024b54ca48c5bf41dba258f0984a74d254289eea Mon Sep 17 00:00:00 2001 From: James Fleming Date: Wed, 28 Feb 2024 21:51:50 +0000 Subject: [PATCH 12/96] change --- examples/aqlm_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index fcb64223ca2a5..47936b19c9dd3 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -2,7 +2,6 @@ #model = LLM("nm-testing/llama2.c-stories110M-pruned2.4") #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) - model = LLM("/home/jaemz/.cache/huggingface/hub/models--BlackSamorez--Llama-2-7b-AQLM-2Bit-1x16-hf/snapshots/1756949fed7fe691c00f015e2ebd18503a975f3b", enforce_eager=True) sampling_params = SamplingParams(max_tokens=200, temperature=0) From 84c2e2a178e3d798d37ebbf27a963a1135a11e69 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 15:26:11 +0000 Subject: [PATCH 13/96] format --- examples/aqlm_test.py | 4 +- setup.py | 82 +++++----- vllm/config.py | 144 +++++++----------- vllm/model_executor/layers/linear.py | 82 +++++----- .../layers/quantization/aqlm.py | 35 ++--- vllm/model_executor/models/llama.py | 76 +++++---- vllm/model_executor/weight_utils.py | 27 ++-- 7 files changed, 205 insertions(+), 245 deletions(-) diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index 47936b19c9dd3..26d90584a7858 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -2,7 +2,9 @@ #model = LLM("nm-testing/llama2.c-stories110M-pruned2.4") #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) -model = LLM("/home/jaemz/.cache/huggingface/hub/models--BlackSamorez--Llama-2-7b-AQLM-2Bit-1x16-hf/snapshots/1756949fed7fe691c00f015e2ebd18503a975f3b", enforce_eager=True) +model = LLM( + "/home/jaemz/.cache/huggingface/hub/models--BlackSamorez--Llama-2-7b-AQLM-2Bit-1x16-hf/snapshots/1756949fed7fe691c00f015e2ebd18503a975f3b", + enforce_eager=True) sampling_params = SamplingParams(max_tokens=200, temperature=0) outputs = model.generate("How are you ", sampling_params=sampling_params) diff --git a/setup.py b/setup.py index c2b972b54b1b7..ba351402f7b16 100644 --- a/setup.py +++ b/setup.py @@ -66,8 +66,7 @@ def _is_cuda() -> bool: if _is_cuda() and CUDA_HOME is None: raise RuntimeError( - "Cannot find CUDA_HOME. CUDA must be available to build the package." - ) + "Cannot find CUDA_HOME. 
CUDA must be available to build the package.") ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] @@ -107,7 +106,8 @@ def get_neuronxcc_version(): import sysconfig site_dir = sysconfig.get_paths()["purelib"] - version_file = os.path.join(site_dir, "neuronxcc", "version", "__init__.py") + version_file = os.path.join(site_dir, "neuronxcc", "version", + "__init__.py") # Check if the command was executed successfully with open(version_file, "rt") as fp: @@ -127,9 +127,8 @@ def get_nvcc_cuda_version(cuda_dir: str) -> Version: Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py """ - nvcc_output = subprocess.check_output( - [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True - ) + nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], + universal_newlines=True) output = nvcc_output.split() release_idx = output.index("release") + 1 nvcc_cuda_version = parse(output[release_idx].split(",")[0]) @@ -151,12 +150,8 @@ def get_pytorch_rocm_arch() -> Set[str]: # If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator if env_arch_list is None: command = "rocm_agent_enumerator" - env_arch_list = ( - subprocess.check_output([command]) - .decode("utf-8") - .strip() - .replace("\n", ";") - ) + env_arch_list = (subprocess.check_output( + [command]).decode("utf-8").strip().replace("\n", ";")) arch_source_str = "rocm_agent_enumerator" else: arch_source_str = "PYTORCH_ROCM_ARCH env variable" @@ -172,8 +167,7 @@ def get_pytorch_rocm_arch() -> Set[str]: raise RuntimeError( f"None of the ROCM architectures in {arch_source_str} " f"({env_arch_list}) is supported. " - f"Supported ROCM architectures are: {ROCM_SUPPORTED_ARCHS}." - ) + f"Supported ROCM architectures are: {ROCM_SUPPORTED_ARCHS}.") invalid_arch_list = pytorch_rocm_arch - ROCM_SUPPORTED_ARCHS if invalid_arch_list: warnings.warn( @@ -204,16 +198,15 @@ def get_torch_arch_list() -> Set[str]: # Filter out the invalid architectures and print a warning. valid_archs = NVIDIA_SUPPORTED_ARCHS.union( - {s + "+PTX" for s in NVIDIA_SUPPORTED_ARCHS} - ) + {s + "+PTX" + for s in NVIDIA_SUPPORTED_ARCHS}) arch_list = torch_arch_list.intersection(valid_archs) # If none of the specified architectures are valid, raise an error. if not arch_list: raise RuntimeError( "None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env " f"variable ({env_arch_list}) is supported. " - f"Supported CUDA architectures are: {valid_archs}." - ) + f"Supported CUDA architectures are: {valid_archs}.") invalid_arch_list = torch_arch_list - valid_archs if invalid_arch_list: warnings.warn( @@ -241,8 +234,7 @@ def get_torch_arch_list() -> Set[str]: major, minor = torch.cuda.get_device_capability(i) if major < 7: raise RuntimeError( - "GPUs with compute capability below 7.0 are not supported." - ) + "GPUs with compute capability below 7.0 are not supported.") compute_capabilities.add(f"{major}.{minor}") ext_modules = [] @@ -260,13 +252,12 @@ def get_torch_arch_list() -> Set[str]: compute_capabilities.remove("9.0") # Validate the NVCC CUDA version. 
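# Editor's note (worked example, device values assumed, not part of the patch): an
# RTX 3090 reports compute capability "8.6", so the checks below require nvcc >= 11.1,
# and the gencode loop further down turns it into
# "-gencode arch=compute_86,code=sm_86" (plus "code=compute_86" if "+PTX" was requested).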
if nvcc_cuda_version < Version("11.0"): - raise RuntimeError("CUDA 11.0 or higher is required to build the package.") + raise RuntimeError( + "CUDA 11.0 or higher is required to build the package.") if nvcc_cuda_version < Version("11.1") and any( - cc.startswith("8.6") for cc in compute_capabilities - ): + cc.startswith("8.6") for cc in compute_capabilities): raise RuntimeError( - "CUDA 11.1 or higher is required for compute capability 8.6." - ) + "CUDA 11.1 or higher is required for compute capability 8.6.") if nvcc_cuda_version < Version("11.8"): if any(cc.startswith("8.9") for cc in compute_capabilities): # CUDA 11.8 is required to generate the code targeting compute capability 8.9. @@ -279,14 +270,12 @@ def get_torch_arch_list() -> Set[str]: "Targeting compute capability 8.0 instead.", stacklevel=2, ) - compute_capabilities = set( - cc for cc in compute_capabilities if not cc.startswith("8.9") - ) + compute_capabilities = set(cc for cc in compute_capabilities + if not cc.startswith("8.9")) compute_capabilities.add("8.0+PTX") if any(cc.startswith("9.0") for cc in compute_capabilities): raise RuntimeError( - "CUDA 11.8 or higher is required for compute capability 9.0." - ) + "CUDA 11.8 or higher is required for compute capability 9.0.") NVCC_FLAGS_PUNICA = NVCC_FLAGS.copy() @@ -295,9 +284,13 @@ def get_torch_arch_list() -> Set[str]: num = capability[0] + capability[2] NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] if capability.endswith("+PTX"): - NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=compute_{num}"] + NVCC_FLAGS += [ + "-gencode", f"arch=compute_{num},code=compute_{num}" + ] if int(capability[0]) >= 8: - NVCC_FLAGS_PUNICA += ["-gencode", f"arch=compute_{num},code=sm_{num}"] + NVCC_FLAGS_PUNICA += [ + "-gencode", f"arch=compute_{num},code=sm_{num}" + ] if capability.endswith("+PTX"): NVCC_FLAGS_PUNICA += [ "-gencode", @@ -336,13 +329,13 @@ def get_torch_arch_list() -> Set[str]: ext_modules.append( CUDAExtension( name="vllm._punica_C", - sources=["csrc/punica/punica_ops.cc"] + glob("csrc/punica/bgmv/*.cu"), + sources=["csrc/punica/punica_ops.cc"] + + glob("csrc/punica/bgmv/*.cu"), extra_compile_args={ "cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS_PUNICA, }, - ) - ) + )) elif _is_neuron(): neuronxcc_version = get_neuronxcc_version() @@ -374,8 +367,7 @@ def get_torch_arch_list() -> Set[str]: "cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS, }, - ) - ) + )) if not _is_neuron(): vllm_extension = CUDAExtension( @@ -400,9 +392,8 @@ def find_version(filepath: str) -> str: Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py """ with open(filepath) as fp: - version_match = re.search( - r"^__version__ = ['\"]([^'\"]*)['\"]", fp.read(), re.M - ) + version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", + fp.read(), re.M) if version_match: return version_match.group(1) raise RuntimeError("Unable to find version string.") @@ -465,10 +456,8 @@ def get_requirements() -> List[str]: version=get_vllm_version(), author="vLLM Team", license="Apache 2.0", - description=( - "A high-throughput and memory-efficient inference and " - "serving engine for LLMs" - ), + description=("A high-throughput and memory-efficient inference and " + "serving engine for LLMs"), long_description=read_readme(), long_description_content_type="text/markdown", url="https://github.com/vllm-project/vllm", @@ -484,9 +473,8 @@ def get_requirements() -> List[str]: "License :: OSI Approved :: Apache Software License", "Topic :: Scientific/Engineering :: 
Artificial Intelligence", ], - packages=setuptools.find_packages( - exclude=("benchmarks", "csrc", "docs", "examples", "tests") - ), + packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs", + "examples", "tests")), python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, diff --git a/vllm/config.py b/vllm/config.py index f2452baf8796c..19f1c0e27b103 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -100,20 +100,20 @@ def __init__( from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C if not os.path.exists(model): - model_path = snapshot_download( - model_id=model, cache_dir=download_dir, revision=revision - ) + model_path = snapshot_download(model_id=model, + cache_dir=download_dir, + revision=revision) else: model_path = model self.model = model_path self.download_dir = model_path self.tokenizer = model_path - self.hf_config = get_config( - self.model, trust_remote_code, revision, code_revision - ) + self.hf_config = get_config(self.model, trust_remote_code, revision, + code_revision) self.dtype = _get_and_verify_dtype(self.hf_config, dtype) - self.max_model_len = _get_and_verify_max_len(self.hf_config, max_model_len) + self.max_model_len = _get_and_verify_max_len(self.hf_config, + max_model_len) self._verify_load_format() self._verify_tokenizer_mode() self.hf_quant_config = self._get_and_verify_quantization() @@ -121,32 +121,30 @@ def __init__( def _verify_load_format(self) -> None: load_format = self.load_format.lower() - supported_load_format = ["auto", "pt", "safetensors", "npcache", "dummy"] + supported_load_format = [ + "auto", "pt", "safetensors", "npcache", "dummy" + ] rocm_not_supported_load_format = [] if load_format not in supported_load_format: raise ValueError( f"Unknown load format: {self.load_format}. Must be one of " - "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'." - ) + "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'.") if is_hip() and load_format in rocm_not_supported_load_format: rocm_supported_load_format = [ - f - for f in supported_load_format + f for f in supported_load_format if (f not in rocm_not_supported_load_format) ] raise ValueError( f"load format '{load_format}' is not supported in ROCm. " f"Supported load format are " - f"{rocm_supported_load_format}" - ) + f"{rocm_supported_load_format}") # TODO: Remove this check once HF updates the pt weights of Mixtral. architectures = getattr(self.hf_config, "architectures", []) if "MixtralForCausalLM" in architectures and load_format == "pt": raise ValueError( "Currently, the 'pt' format is not supported for Mixtral. " - "Please use the 'safetensors' format instead. " - ) + "Please use the 'safetensors' format instead. ") self.load_format = load_format def _verify_tokenizer_mode(self) -> None: @@ -154,8 +152,7 @@ def _verify_tokenizer_mode(self) -> None: if tokenizer_mode not in ["auto", "slow"]: raise ValueError( f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be " - "either 'auto' or 'slow'." - ) + "either 'auto' or 'slow'.") self.tokenizer_mode = tokenizer_mode def _get_and_verify_quantization(self) -> Any | None: @@ -183,34 +180,29 @@ def _get_and_verify_quantization(self) -> Any | None: "Quantization method specified in the model config " f"({hf_quant_method}) does not match the quantization " f"method specified in the `quantization` argument " - f"({self.quantization})." 
- ) + f"({self.quantization}).") if self.quantization is not None: if self.quantization not in supported_quantization: raise ValueError( f"Unknown quantization method: {self.quantization}. Must " - f"be one of {supported_quantization}." - ) - if is_hip() and self.quantization in rocm_not_supported_quantization: + f"be one of {supported_quantization}.") + if is_hip( + ) and self.quantization in rocm_not_supported_quantization: raise ValueError( f"{self.quantization} quantization is currently not supported " - f"in ROCm." - ) - logger.warning( - f"{self.quantization} quantization is not fully " - "optimized yet. The speed can be slower than " - "non-quantized models." - ) + f"in ROCm.") + logger.warning(f"{self.quantization} quantization is not fully " + "optimized yet. The speed can be slower than " + "non-quantized models.") return hf_quant_config def _verify_cuda_graph(self) -> None: if self.max_context_len_to_capture is None: self.max_context_len_to_capture = self.max_model_len - self.max_context_len_to_capture = min( - self.max_context_len_to_capture, self.max_model_len - ) + self.max_context_len_to_capture = min(self.max_context_len_to_capture, + self.max_model_len) def verify_with_parallel_config( self, @@ -222,8 +214,7 @@ def verify_with_parallel_config( raise ValueError( f"Total number of attention heads ({total_num_attention_heads})" " must be divisible by tensor parallel size " - f"({tensor_parallel_size})." - ) + f"({tensor_parallel_size}).") total_num_hidden_layers = self.hf_config.num_hidden_layers pipeline_parallel_size = parallel_config.pipeline_parallel_size @@ -231,8 +222,7 @@ def verify_with_parallel_config( raise ValueError( f"Total number of hidden layers ({total_num_hidden_layers}) " "must be divisible by pipeline parallel size " - f"({pipeline_parallel_size})." - ) + f"({pipeline_parallel_size}).") def get_sliding_window(self) -> Optional[int]: return getattr(self.hf_config, "sliding_window", None) @@ -258,11 +248,9 @@ def get_total_num_kv_heads(self) -> int: falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] new_decoder_arch_falcon = ( self.hf_config.model_type in falcon_model_types - and getattr(self.hf_config, "new_decoder_architecture", False) - ) - if not new_decoder_arch_falcon and getattr( - self.hf_config, "multi_query", False - ): + and getattr(self.hf_config, "new_decoder_architecture", False)) + if not new_decoder_arch_falcon and getattr(self.hf_config, + "multi_query", False): # Multi-query attention, only one KV head. # Currently, tensor parallelism is not supported in this case. return 1 @@ -292,7 +280,8 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: # the tensor parallel size. We will replicate the KV heads in the # case where the number of KV heads is smaller than the tensor # parallel size so each GPU has at least one KV head. - return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size) + return max(1, + total_num_kv_heads // parallel_config.tensor_parallel_size) def get_num_layers(self, parallel_config: "ParallelConfig") -> int: total_num_hidden_layers = self.hf_config.num_hidden_layers @@ -334,8 +323,7 @@ def _verify_args(self) -> None: if self.gpu_memory_utilization > 1.0: raise ValueError( "GPU memory utilization must be less than 1.0. Got " - f"{self.gpu_memory_utilization}." 
- ) + f"{self.gpu_memory_utilization}.") def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": @@ -349,15 +337,13 @@ def _verify_cache_dtype(self) -> None: device_name = torch.cuda.get_device_name() if "AMD" in device_name: raise NotImplementedError( - "FP8_E5M2 KV Cache on AMD GPU has not been supported yet." - ) + "FP8_E5M2 KV Cache on AMD GPU has not been supported yet.") logger.info( "Using fp8_e5m2 data type to store kv cache. It reduces " "the GPU memory footprint and boosts the performance. " "But it may cause slight accuracy drop. " "Currently we only support fp8 without scaling factors and " - "make e5m2 as a default format." - ) + "make e5m2 as a default format.") else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") @@ -371,11 +357,9 @@ def verify_with_parallel_config( num_gpus_per_node = parallel_config.tensor_parallel_size cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node - msg = ( - f"{cpu_memory_usage / _GB:.2f} GiB out of " - f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is " - "allocated for the swap space." - ) + msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of " + f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is " + "allocated for the swap space.") if cpu_memory_usage > 0.7 * total_cpu_memory: raise ValueError("Too large swap space. " + msg) elif cpu_memory_usage > 0.4 * total_cpu_memory: @@ -419,20 +403,19 @@ def __init__( def _verify_args(self) -> None: if self.pipeline_parallel_size > 1: - raise NotImplementedError("Pipeline parallelism is not supported yet.") + raise NotImplementedError( + "Pipeline parallelism is not supported yet.") if not self.disable_custom_all_reduce and self.world_size > 1: if is_hip(): self.disable_custom_all_reduce = True logger.info( "Disabled the custom all-reduce kernel because it is not " - "supported on AMD GPUs." - ) + "supported on AMD GPUs.") elif self.pipeline_parallel_size > 1: self.disable_custom_all_reduce = True logger.info( "Disabled the custom all-reduce kernel because it is not " - "supported with pipeline parallelism." - ) + "supported with pipeline parallelism.") # FIXME(woosuk): Fix the stability issues and re-enable the custom # all-reduce kernel. @@ -441,8 +424,7 @@ def _verify_args(self) -> None: logger.info( "Custom all-reduce kernels are temporarily disabled due to " "stability issues. We will re-enable them once the issues are " - "resolved." - ) + "resolved.") class SchedulerConfig: @@ -484,17 +466,16 @@ def _verify_args(self) -> None: "This effectively limits the maximum sequence length to " "max_num_batched_tokens and makes vLLM reject longer " "sequences. Please increase max_num_batched_tokens or " - "decrease max_model_len." - ) + "decrease max_model_len.") if self.max_num_batched_tokens < self.max_num_seqs: raise ValueError( f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " "be greater than or equal to max_num_seqs " - f"({self.max_num_seqs})." - ) + f"({self.max_num_seqs}).") class DeviceConfig: + def __init__(self, device: str = "cuda") -> None: self.device = torch.device(device) @@ -516,13 +497,11 @@ def __post_init__(self): if self.max_lora_rank not in possible_max_ranks: raise ValueError( f"max_lora_rank ({self.max_lora_rank}) must be one of " - f"{possible_max_ranks}." - ) + f"{possible_max_ranks}.") if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size: raise ValueError( f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) " - f"must be one of {possible_lora_extra_vocab_size}." 
- ) + f"must be one of {possible_lora_extra_vocab_size}.") if self.max_loras < 1: raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.") if self.max_cpu_loras is None: @@ -530,8 +509,7 @@ def __post_init__(self): elif self.max_cpu_loras < self.max_loras: raise ValueError( f"max_cpu_loras ({self.max_cpu_loras}) must be >= " - f"max_loras ({self.max_loras})" - ) + f"max_loras ({self.max_loras})") def verify_with_model_config(self, model_config: ModelConfig): if self.lora_dtype in (None, "auto"): @@ -539,15 +517,15 @@ def verify_with_model_config(self, model_config: ModelConfig): elif isinstance(self.lora_dtype, str): self.lora_dtype = getattr(torch, self.lora_dtype) if model_config.quantization is not None: - raise ValueError("LoRA is not supported with quantized models yet.") + raise ValueError( + "LoRA is not supported with quantized models yet.") def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): if scheduler_config.max_num_batched_tokens > 65528: raise ValueError( "Due to limitations of the custom LoRA CUDA kernel, " "max_num_batched_tokens must be <= 65528 when " - "LoRA is enabled." - ) + "LoRA is enabled.") _STR_DTYPE_TO_TORCH_DTYPE = { @@ -591,14 +569,11 @@ def _get_and_verify_dtype( if is_hip() and torch_dtype == torch.float32: rocm_supported_dtypes = [ - k - for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() + k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() if (k not in _ROCM_NOT_SUPPORTED_DTYPE) ] - raise ValueError( - f"dtype '{dtype}' is not supported in ROCm. " - f"Supported dtypes are {rocm_supported_dtypes}" - ) + raise ValueError(f"dtype '{dtype}' is not supported in ROCm. " + f"Supported dtypes are {rocm_supported_dtypes}") # Verify the dtype. if torch_dtype != config_dtype: @@ -649,8 +624,7 @@ def _get_and_verify_max_len( "The model's config.json does not contain any of the following " "keys to determine the original maximum length of the model: " f"{possible_keys}. Assuming the model's maximum length is " - f"{default_max_len}." - ) + f"{default_max_len}.") derived_max_model_len = default_max_len rope_scaling = getattr(hf_config, "rope_scaling", None) @@ -658,7 +632,8 @@ def _get_and_verify_max_len( assert "factor" in rope_scaling scaling_factor = rope_scaling["factor"] if rope_scaling["type"] == "yarn": - derived_max_model_len = rope_scaling["original_max_position_embeddings"] + derived_max_model_len = rope_scaling[ + "original_max_position_embeddings"] derived_max_model_len *= scaling_factor if max_model_len is None: @@ -669,6 +644,5 @@ def _get_and_verify_max_len( f"the derived max_model_len ({max_len_key}={derived_max_model_len}" " in model's config.json). This may lead to incorrect model " "outputs or CUDA errors. Make sure the value is correct and " - "within the model context size." 
- ) + "within the model context size.") return int(max_model_len) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index de5b52f7e7f0c..abbdf7c649557 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -66,9 +66,9 @@ def create_weights( params_dtype: torch.dtype, ) -> Dict[str, Any]: weight = Parameter( - torch.empty( - output_size_per_partition, input_size_per_partition, dtype=params_dtype - ), + torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=params_dtype), requires_grad=False, ) set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) @@ -133,8 +133,7 @@ def __init__( self.register_parameter(name, weight) if bias: self.bias = Parameter( - torch.empty(self.output_size, dtype=self.params_dtype) - ) + torch.empty(self.output_size, dtype=self.params_dtype)) set_weight_attrs(self.bias, {"output_dim": 0}) else: self.register_parameter("bias", None) @@ -205,8 +204,8 @@ def __init__( set_weight_attrs(weight, {"weight_loader": self.weight_loader}) if bias: self.bias = Parameter( - torch.empty(self.output_size_per_partition, dtype=params_dtype) - ) + torch.empty(self.output_size_per_partition, + dtype=params_dtype)) set_weight_attrs( self.bias, { @@ -224,7 +223,8 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): if output_dim is not None: shard_size = param_data.shape[output_dim] start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -233,8 +233,7 @@ def forward(self, input_): # Matrix multiply. output_parallel = self.linear_method.apply_weights( - self.linear_weights, input_, bias - ) + self.linear_weights, input_, bias) if self.gather_output: # All-gather across the partitions. output = tensor_model_parallel_all_gather(output_parallel) @@ -315,8 +314,7 @@ def weight_loader( shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor loaded_weight_shard = loaded_weight.narrow( - output_dim, shard_offset, shard_size - ) + output_dim, shard_offset, shard_size) self.weight_loader(param, loaded_weight_shard, shard_id) return @@ -338,17 +336,18 @@ def weight_loader( if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - param_data = param_data.narrow(output_dim, shard_offset, shard_size) + param_data = param_data.narrow(output_dim, shard_offset, + shard_size) start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: logger.warning( "Loading a weight without `output_dim` attribute in " "MergedColumnParallelLinear, assume the weight is " - "the same for all partitions." 
- ) + "the same for all partitions.") assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -399,14 +398,14 @@ def __init__( self.num_heads = divide(self.total_num_heads, tp_size) if tp_size >= self.total_num_kv_heads: self.num_kv_heads = 1 - self.num_kv_head_replicas = divide(tp_size, self.total_num_kv_heads) + self.num_kv_head_replicas = divide(tp_size, + self.total_num_kv_heads) else: self.num_kv_heads = divide(self.total_num_kv_heads, tp_size) self.num_kv_head_replicas = 1 input_size = self.hidden_size - output_size = ( - (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size - ) + output_size = ((self.num_heads + 2 * self.num_kv_heads) * tp_size * + self.head_size) super().__init__( input_size, output_size, @@ -441,7 +440,8 @@ def weight_loader( ), ( "v", - (self.total_num_heads + self.total_num_kv_heads) * self.head_size, + (self.total_num_heads + self.total_num_kv_heads) * + self.head_size, self.total_num_kv_heads * self.head_size, ), ] @@ -453,8 +453,7 @@ def weight_loader( shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor loaded_weight_shard = loaded_weight.narrow( - output_dim, shard_offset, shard_size - ) + output_dim, shard_offset, shard_size) self.weight_loader(param, loaded_weight_shard, shard_id) return @@ -468,7 +467,8 @@ def weight_loader( shard_offset = self.num_heads * self.head_size shard_size = self.num_kv_heads * self.head_size elif loaded_shard_id == "v": - shard_offset = (self.num_heads + self.num_kv_heads) * self.head_size + shard_offset = (self.num_heads + + self.num_kv_heads) * self.head_size shard_size = self.num_kv_heads * self.head_size # If quantized, we need to adjust the offset and size to account # for the packing. @@ -476,21 +476,22 @@ def weight_loader( if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - param_data = param_data.narrow(output_dim, shard_offset, shard_size) + param_data = param_data.narrow(output_dim, shard_offset, + shard_size) if loaded_shard_id == "q": shard_id = tp_rank else: shard_id = tp_rank // self.num_kv_head_replicas start_idx = shard_id * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: logger.warning( "Loading a weight without `output_dim` attribute in " "QKVParallelLinear, assume the weight is the same " - "for all partitions." 
- ) + "for all partitions.") assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -562,13 +563,12 @@ def __init__( set_weight_attrs(weight, {"weight_loader": self.weight_loader}) if not reduce_results and (bias and not skip_bias_add): - raise ValueError( - "When not reduce the results, adding bias to the " - "results can lead to incorrect results" - ) + raise ValueError("When not reduce the results, adding bias to the " + "results can lead to incorrect results") if bias: - self.bias = Parameter(torch.empty(self.output_size, dtype=params_dtype)) + self.bias = Parameter( + torch.empty(self.output_size, dtype=params_dtype)) set_weight_attrs( self.bias, { @@ -593,11 +593,13 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): start_idx = tp_rank * shard_size print(" loaded_weight dtype is ", loaded_weight.dtype) print(" data_param dtype is ", param_data.dtype) - #TEST - assert(start_idx == 0 and shard_size == loaded_weight.shape[input_dim]) + #TEST + assert (start_idx == 0 + and shard_size == loaded_weight.shape[input_dim]) - loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) - print( "sharded loaded_weight is ", loaded_weight.shape) + loaded_weight = loaded_weight.narrow(input_dim, start_idx, + shard_size) + print("sharded loaded_weight is ", loaded_weight.shape) assert param_data.shape == loaded_weight.shape @@ -610,14 +612,12 @@ def forward(self, input_): else: tp_rank = get_tensor_model_parallel_rank() splitted_input = split_tensor_along_last_dim( - input_, num_partitions=self.tp_size - ) + input_, num_partitions=self.tp_size) input_parallel = splitted_input[tp_rank].contiguous() # Matrix multiply. output_parallel = self.linear_method.apply_weights( - self.linear_weights, input_parallel - ) + self.linear_weights, input_parallel) if self.reduce_results and self.tp_size > 1: output_ = tensor_model_parallel_all_reduce(output_parallel) else: diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index bb75d76e47e6d..0f299d770b4ee 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -41,17 +41,14 @@ def __init__( # I think pack factor is *probably* how many elements fit into one quantized tensor element. # though out group size makes it interesting, because really we are doing 2D blocks, potentially. # maybe this is vllms first 2D packing? Arg. - self.pack_factor = ( - self.in_group_size * self.out_group_size // self.num_codebooks - ) + self.pack_factor = (self.in_group_size * self.out_group_size // + self.num_codebooks) def __repr__(self) -> str: - return ( - f"AQLMConfig(in_group_size={self.in_group_size}, " - f"nbits_per_codebook={self.nbits_per_codebook}, " - f"num_codebooks={self.num_codebooks}, " - f"out_group_size={self.out_group_size})" - ) + return (f"AQLMConfig(in_group_size={self.in_group_size}, " + f"nbits_per_codebook={self.nbits_per_codebook}, " + f"num_codebooks={self.num_codebooks}, " + f"out_group_size={self.out_group_size})") @classmethod def get_name(cls) -> str: @@ -101,7 +98,8 @@ def from_config(cls, config: Dict[str, Any]) -> "AQLMConfig": num_code_books = cls.get_from_keys(config, ["num_codebooks"]) out_group_size = cls.get_from_keys(config, ["out_group_size"]) # TODO linear_weights_not_to_quantize ? 
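The pack-factor comment above can be made concrete with a little arithmetic. For a 1x16, 2-bit checkpoint like the one exercised in examples/aqlm_test.py, the parameters are typically num_codebooks=1, nbits_per_codebook=16, in_group_size=8, out_group_size=1 (illustrative values, not read from the checkpoint here), so each stored 16-bit code selects one 1x8 codebook entry and therefore stands in for 8 weights. A minimal sketch of how that pack factor relates the codes tensor to the dequantized weight shape, mirroring the shapes used in create_weights:

def aqlm_shapes(out_features: int, in_features: int,
                num_codebooks: int = 1, nbits_per_codebook: int = 16,
                in_group_size: int = 8, out_group_size: int = 1):
    # Same formula as AQLMConfig: how many weight elements one code index covers.
    pack_factor = in_group_size * out_group_size // num_codebooks
    # codes: one index per input group per codebook;
    # codebooks: 2**nbits entries of out_group_size x in_group_size each.
    codes = (out_features, in_features // pack_factor, num_codebooks)
    codebooks = (num_codebooks, 2 ** nbits_per_codebook,
                 out_group_size, in_group_size)
    return pack_factor, codes, codebooks

print(aqlm_shapes(4096, 4096))
# (8, (4096, 512, 1), (1, 65536, 1, 8))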
- return cls(in_group_size, nbits_per_codebook, num_code_books, out_group_size) + return cls(in_group_size, nbits_per_codebook, num_code_books, + out_group_size) def get_linear_method(self) -> "AQLMLinearMethod": return AQLMLinearMethod(self) @@ -129,8 +127,8 @@ def create_weights( params_dtype: torch.dtype, ) -> Dict[str, Any]: #TEST - assert(output_size == output_size_per_partition) - assert(input_size == input_size_per_partition) + assert (output_size == output_size_per_partition) + assert (input_size == input_size_per_partition) del output_size # Unused. del input_size # Unused. @@ -140,21 +138,19 @@ def create_weights( raise ValueError( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " - "tensor parallel size." - ) + "tensor parallel size.") if output_size_per_partition % self.quant_config.out_group_size != 0: raise ValueError( "The output size is not aligned with the quantized " "weight shape. This can be caused by too large " - "tensor parallel size." - ) + "tensor parallel size.") # or does this need more dimensions and use the correct nbits_per_codebook as an int type. Does that pack them? codes = Parameter( torch.empty( output_size_per_partition, # not entirely sure what to do with num_out_groups, if we need this pack factor. input_size_per_partition // self.quant_config.pack_factor, - 1, # probably should be num codebooks. + 1, # probably should be num codebooks. dtype=get_int_dtype(self.quant_config.nbits_per_codebook), ), requires_grad=False, @@ -186,7 +182,8 @@ def create_weights( scales = Parameter( torch.empty( ( - output_size_per_partition // self.quant_config.out_group_size, + output_size_per_partition // + self.quant_config.out_group_size, 1, 1, 1, @@ -222,7 +219,7 @@ def apply_weights( print("input shape is ", x.shape) - if (x.shape[1] == 5) : + if (x.shape[1] == 5): print("codes shape is ", weights["codes"].shape) print("codebooks shape is ", weights["codebooks"].shape) print("scales shape is ", weights["scales"].shape) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 217edb20049ce..17ffc56ff42cf 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -45,8 +45,7 @@ DEFAULT_VOCAB_PADDING_SIZE, ) from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size, -) + get_tensor_model_parallel_world_size, ) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.weight_utils import ( default_weight_loader, @@ -59,6 +58,7 @@ class LlamaMLP(nn.Module): + def __init__( self, hidden_size: int, @@ -73,14 +73,13 @@ def __init__( bias=False, linear_method=linear_method, ) - self.down_proj = RowParallelLinear( - intermediate_size, hidden_size, bias=False, linear_method=linear_method - ) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) if hidden_act != "silu": - raise ValueError( - f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now." - ) + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") self.act_fn = SiluAndMul() def forward(self, x): @@ -91,6 +90,7 @@ def forward(self, x): class LlamaAttention(nn.Module): + def __init__( self, hidden_size: int, @@ -173,6 +173,7 @@ def forward( class LlamaDecoderLayer(nn.Module): + def __init__( self, config: LlamaConfig, @@ -182,14 +183,14 @@ def __init__( self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) sliding_window = getattr(config, "sliding_window", None) self.self_attn = LlamaAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, - num_kv_heads=getattr( - config, "num_key_value_heads", config.num_attention_heads - ), + num_kv_heads=getattr(config, "num_key_value_heads", + config.num_attention_heads), rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, @@ -203,10 +204,10 @@ def __init__( hidden_act=config.hidden_act, linear_method=linear_method, ) - self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) def forward( self, @@ -221,7 +222,8 @@ def forward( residual = hidden_states hidden_states = self.input_layernorm(hidden_states) else: - hidden_states, residual = self.input_layernorm(hidden_states, residual) + hidden_states, residual = self.input_layernorm( + hidden_states, residual) hidden_states = self.self_attn( positions=positions, hidden_states=hidden_states, @@ -230,12 +232,14 @@ def forward( ) # Fully Connected - hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) hidden_states = self.mlp(hidden_states) return hidden_states, residual class LlamaModel(nn.Module): + def __init__( self, config: LlamaConfig, @@ -245,11 +249,8 @@ def __init__( super().__init__() self.config = config self.padding_idx = config.pad_token_id - lora_vocab = ( - (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) - if lora_config - else 0 - ) + lora_vocab = ((lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0) self.vocab_size = config.vocab_size + lora_vocab self.org_vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( @@ -257,12 +258,10 @@ def __init__( config.hidden_size, org_num_embeddings=config.vocab_size, ) - self.layers = nn.ModuleList( - [ - LlamaDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ] - ) + self.layers = nn.ModuleList([ + LlamaDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) def forward( @@ -335,8 +334,7 @@ def __init__( padding_size=DEFAULT_VOCAB_PADDING_SIZE # We need bigger padding if using lora for kernel # compatibility - if not lora_config - else lora_config.lora_vocab_padding_size, + if not lora_config else lora_config.lora_vocab_padding_size, ) self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size) @@ -347,7 +345,8 @@ def forward( kv_caches: List[KVCache], 
input_metadata: InputMetadata, ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, input_metadata) + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) return hidden_states def sample( @@ -355,9 +354,8 @@ def sample( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[SamplerOutput]: - next_tokens = self.sampler( - self.lm_head.weight, hidden_states, sampling_metadata - ) + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) return next_tokens def load_weights( @@ -377,8 +375,7 @@ def load_weights( ] params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision - ): + model_name_or_path, cache_dir, load_format, revision): if "rotary_emb.inv_freq" in name: continue if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: @@ -403,7 +400,8 @@ def load_weights( if name.endswith(".bias") and name not in params_dict: continue param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) # TEST print("loading ", name) weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 37c9725033d49..48900a8b02271 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -24,6 +24,7 @@ class Disabledtqdm(tqdm): + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs, disable=True) @@ -69,12 +70,10 @@ def convert_bin_to_safetensor_file( sf_size = os.stat(sf_filename).st_size pt_size = os.stat(pt_filename).st_size if (sf_size - pt_size) / pt_size > 0.01: - raise RuntimeError( - f"""The file size different is more than 1%: + raise RuntimeError(f"""The file size different is more than 1%: - {sf_filename}: {sf_size} - {pt_filename}: {pt_size} - """ - ) + """) # check if the tensors are the same reloaded = load_file(sf_filename) @@ -107,17 +106,16 @@ def get_quant_config(model_config: ModelConfig) -> QuantizationConfig: config_files = glob.glob(os.path.join(hf_folder, "*.json")) quant_config_files = [ - f - for f in config_files - if any(f.endswith(x) for x in quant_cls.get_config_filenames()) + f for f in config_files if any( + f.endswith(x) for x in quant_cls.get_config_filenames()) ] if len(quant_config_files) == 0: - raise ValueError(f"Cannot find the config file for {model_config.quantization}") + raise ValueError( + f"Cannot find the config file for {model_config.quantization}") if len(quant_config_files) > 1: raise ValueError( f"Found multiple config files for {model_config.quantization}: " - f"{quant_config_files}" - ) + f"{quant_config_files}") quant_config_file = quant_config_files[0] with open(quant_config_file, "r") as f: @@ -194,11 +192,13 @@ def prepare_hf_model_weights( "scaler.pt", ] hf_weights_files = [ - f for f in hf_weights_files if not any(f.endswith(x) for x in blacklist) + f for f in hf_weights_files + if not any(f.endswith(x) for x in blacklist) ] if len(hf_weights_files) == 0: - raise RuntimeError(f"Cannot find any model weights with `{model_name_or_path}`") + raise RuntimeError( + f"Cannot find any model weights with `{model_name_or_path}`") return hf_folder, hf_weights_files, use_safetensors @@ -280,7 +280,8 @@ def convert_pyslice_to_tensor(x: Any) -> torch.Tensor: return x -def default_weight_loader(param: torch.Tensor, loaded_weight: 
torch.Tensor) -> None: +def default_weight_loader(param: torch.Tensor, + loaded_weight: torch.Tensor) -> None: """Default weight loader.""" assert param.size() == loaded_weight.size() param.data.copy_(loaded_weight) From 8ea4d9d458d30f897693b3e4e2136ddb101c9b34 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 15:31:29 +0000 Subject: [PATCH 14/96] try reversing some formatting changes --- setup.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/setup.py b/setup.py index ba351402f7b16..56864a91b3a3b 100644 --- a/setup.py +++ b/setup.py @@ -11,12 +11,7 @@ import setuptools import torch import torch.utils.cpp_extension as torch_cpp_ext -from torch.utils.cpp_extension import ( - BuildExtension, - CUDAExtension, - CUDA_HOME, - ROCM_HOME, -) +from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME ROOT_DIR = os.path.dirname(__file__) From b993971ade28f4abaa8ad7339931e1a9866681d5 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 15:34:28 +0000 Subject: [PATCH 15/96] restored --- setup.py | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/setup.py b/setup.py index 56864a91b3a3b..8fcb86394f76d 100644 --- a/setup.py +++ b/setup.py @@ -70,12 +70,10 @@ def _is_cuda() -> bool: def get_hipcc_rocm_version(): # Run the hipcc --version command - result = subprocess.run( - ["hipcc", "--version"], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - ) + result = subprocess.run(['hipcc', '--version'], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True) # Check if the command was executed successfully if result.returncode != 0: @@ -83,7 +81,7 @@ def get_hipcc_rocm_version(): return None # Extract the version using a regular expression - match = re.search(r"HIP version: (\S+)", result.stdout) + match = re.search(r'HIP version: (\S+)', result.stdout) if match: # Return the version string return match.group(1) @@ -99,7 +97,6 @@ def glob(pattern: str): def get_neuronxcc_version(): import sysconfig - site_dir = sysconfig.get_paths()["purelib"] version_file = os.path.join(site_dir, "neuronxcc", "version", "__init__.py") @@ -145,8 +142,8 @@ def get_pytorch_rocm_arch() -> Set[str]: # If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator if env_arch_list is None: command = "rocm_agent_enumerator" - env_arch_list = (subprocess.check_output( - [command]).decode("utf-8").strip().replace("\n", ";")) + env_arch_list = subprocess.check_output([command]).decode('utf-8')\ + .strip().replace("\n", ";") arch_source_str = "rocm_agent_enumerator" else: arch_source_str = "PYTORCH_ROCM_ARCH env variable" @@ -170,8 +167,7 @@ def get_pytorch_rocm_arch() -> Set[str]: f"excluded from the {arch_source_str} output " f"({env_arch_list}). Supported ROCM architectures are: " f"{ROCM_SUPPORTED_ARCHS}.", - stacklevel=2, - ) + stacklevel=2) return arch_list @@ -209,8 +205,7 @@ def get_torch_arch_list() -> Set[str]: "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " f"({env_arch_list}). 
Supported CUDA architectures are: " f"{valid_archs}.", - stacklevel=2, - ) + stacklevel=2) return arch_list @@ -249,8 +244,8 @@ def get_torch_arch_list() -> Set[str]: if nvcc_cuda_version < Version("11.0"): raise RuntimeError( "CUDA 11.0 or higher is required to build the package.") - if nvcc_cuda_version < Version("11.1") and any( - cc.startswith("8.6") for cc in compute_capabilities): + if (nvcc_cuda_version < Version("11.1") + and any(cc.startswith("8.6") for cc in compute_capabilities)): raise RuntimeError( "CUDA 11.1 or higher is required for compute capability 8.6.") if nvcc_cuda_version < Version("11.8"): @@ -263,8 +258,7 @@ def get_torch_arch_list() -> Set[str]: warnings.warn( "CUDA 11.8 or higher is required for compute capability 8.9. " "Targeting compute capability 8.0 instead.", - stacklevel=2, - ) + stacklevel=2) compute_capabilities = set(cc for cc in compute_capabilities if not cc.startswith("8.9")) compute_capabilities.add("8.0+PTX") @@ -288,8 +282,7 @@ def get_torch_arch_list() -> Set[str]: ] if capability.endswith("+PTX"): NVCC_FLAGS_PUNICA += [ - "-gencode", - f"arch=compute_{num},code=compute_{num}", + "-gencode", f"arch=compute_{num},code=compute_{num}" ] # Use NVCC threads to parallelize the build. @@ -304,10 +297,10 @@ def get_torch_arch_list() -> Set[str]: # changes for punica kernels NVCC_FLAGS += torch_cpp_ext.COMMON_NVCC_FLAGS REMOVE_NVCC_FLAGS = [ - "-D__CUDA_NO_HALF_OPERATORS__", - "-D__CUDA_NO_HALF_CONVERSIONS__", - "-D__CUDA_NO_BFLOAT16_CONVERSIONS__", - "-D__CUDA_NO_HALF2_OPERATORS__", + '-D__CUDA_NO_HALF_OPERATORS__', + '-D__CUDA_NO_HALF_CONVERSIONS__', + '-D__CUDA_NO_BFLOAT16_CONVERSIONS__', + '-D__CUDA_NO_HALF2_OPERATORS__', ] for flag in REMOVE_NVCC_FLAGS: with contextlib.suppress(ValueError): @@ -348,8 +341,6 @@ def get_torch_arch_list() -> Set[str]: ] if _is_cuda(): - vllm_extension_sources.append("csrc/quantization/aqlm/aqlm_cuda_entry.cpp") - vllm_extension_sources.append("csrc/quantization/aqlm/aqlm_cuda_kernel.cu") vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") vllm_extension_sources.append("csrc/custom_all_reduce.cu") From 17668866afc1471c57e91f245fd987f661e4304a Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 15:38:36 +0000 Subject: [PATCH 16/96] add aqlm_cuda --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8fcb86394f76d..d177a6e1d8d4d 100644 --- a/setup.py +++ b/setup.py @@ -343,7 +343,8 @@ def get_torch_arch_list() -> Set[str]: if _is_cuda(): vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") vllm_extension_sources.append("csrc/custom_all_reduce.cu") - + vllm_extension_sources.append("csrc/quantization/aqlm/aqlm_cuda_entry.cpp") + vllm_extension_sources.append("csrc/quantization/aqlm/aqlm_cuda_kernel.cu") # Add MoE kernels. ext_modules.append( CUDAExtension( From b673f4791d9a3b71659f7df3f270bd61c64eea76 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 15:49:46 +0000 Subject: [PATCH 17/96] restore formatting --- vllm/config.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 19f1c0e27b103..70a5f3b77eba1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -98,7 +98,6 @@ def __init__( # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. 
from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C - if not os.path.exists(model): model_path = snapshot_download(model_id=model, cache_dir=download_dir, @@ -135,7 +134,7 @@ def _verify_load_format(self) -> None: if (f not in rocm_not_supported_load_format) ] raise ValueError( - f"load format '{load_format}' is not supported in ROCm. " + f"load format \'{load_format}\' is not supported in ROCm. " f"Supported load format are " f"{rocm_supported_load_format}") @@ -572,7 +571,7 @@ def _get_and_verify_dtype( k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() if (k not in _ROCM_NOT_SUPPORTED_DTYPE) ] - raise ValueError(f"dtype '{dtype}' is not supported in ROCm. " + raise ValueError(f"dtype \'{dtype}\' is not supported in ROCm. " f"Supported dtypes are {rocm_supported_dtypes}") # Verify the dtype. From 4e7d39808f0df1c9f0b51499debbecc0ed84c3f4 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 15:52:50 +0000 Subject: [PATCH 18/96] restore format --- vllm/model_executor/layers/linear.py | 42 ++++++++++------------------ 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index abbdf7c649557..edec583e2585c 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -6,13 +6,9 @@ from torch.nn.parameter import Parameter from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, -) + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.parallel_utils.communication_op import ( - tensor_model_parallel_all_reduce, - tensor_model_parallel_all_gather, -) + tensor_model_parallel_all_reduce, tensor_model_parallel_all_gather) from vllm.model_executor.parallel_utils.utils import divide, split_tensor_along_last_dim from vllm.model_executor.utils import set_weight_attrs from vllm.logger import init_logger @@ -24,24 +20,18 @@ class LinearMethodBase(ABC): """Base class for different (maybe quantized) linear methods.""" @abstractmethod - def create_weights( - self, - input_size_per_partition: int, - output_size_per_partition: int, - input_size: int, - output_size: int, - params_dtype: torch.dtype, - ) -> Dict[str, Any]: + def create_weights(self, input_size_per_partition: int, + output_size_per_partition: int, input_size: int, + output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: """Create weights for a linear layer.""" raise NotImplementedError @abstractmethod - def apply_weights( - self, - weights: Dict[str, torch.Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + def apply_weights(self, + weights: Dict[str, torch.Tensor], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: """Apply the weights to the input tensor.""" raise NotImplementedError @@ -57,14 +47,10 @@ class UnquantizedLinearMethod(LinearMethodBase): def __init__(self, separate_bias_add: bool = False): self.separate_bias_add = separate_bias_add - def create_weights( - self, - input_size_per_partition: int, - output_size_per_partition: int, - input_size: int, - output_size: int, - params_dtype: torch.dtype, - ) -> Dict[str, Any]: + def create_weights(self, input_size_per_partition: int, + output_size_per_partition: int, input_size: int, + output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: weight = Parameter( torch.empty(output_size_per_partition, input_size_per_partition, From 
4fc1426e0d5699d34ccce30f82591a2689dbbad0 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 15:56:11 +0000 Subject: [PATCH 19/96] more formatting --- vllm/model_executor/layers/linear.py | 42 +++++++++++----------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index edec583e2585c..c43508b6f86c7 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -9,7 +9,8 @@ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.parallel_utils.communication_op import ( tensor_model_parallel_all_reduce, tensor_model_parallel_all_gather) -from vllm.model_executor.parallel_utils.utils import divide, split_tensor_along_last_dim +from vllm.model_executor.parallel_utils.utils import ( + divide, split_tensor_along_last_dim) from vllm.model_executor.utils import set_weight_attrs from vllm.logger import init_logger @@ -51,21 +52,17 @@ def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype) -> Dict[str, Any]: - weight = Parameter( - torch.empty(output_size_per_partition, - input_size_per_partition, - dtype=params_dtype), - requires_grad=False, - ) + weight = Parameter(torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=params_dtype), + requires_grad=False) set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) return {"weight": weight} - def apply_weights( - self, - weights: Dict[str, torch.Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + def apply_weights(self, + weights: Dict[str, torch.Tensor], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: weight = weights["weight"] if self.separate_bias_add: if bias: @@ -108,12 +105,8 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size, - self.output_size, - self.input_size, - self.output_size, - self.params_dtype, - ) + self.input_size, self.output_size, self.input_size, + self.output_size, self.params_dtype) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -192,13 +185,10 @@ def __init__( self.bias = Parameter( torch.empty(self.output_size_per_partition, dtype=params_dtype)) - set_weight_attrs( - self.bias, - { - "output_dim": 0, - "weight_loader": self.weight_loader, - }, - ) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) else: self.register_parameter("bias", None) From ac2ef816258c775d0c44a08b3002432270905ea6 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 16:46:13 +0000 Subject: [PATCH 20/96] format --- vllm/model_executor/layers/linear.py | 44 ++++++++-------------------- 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index c43508b6f86c7..90e2e7ca0c15e 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -171,12 +171,8 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size, - self.output_size_per_partition, - self.input_size, - self.output_size, - self.params_dtype, - ) + 
self.input_size, self.output_size_per_partition, self.input_size, + self.output_size, self.params_dtype) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -253,22 +249,13 @@ def __init__( self.output_sizes = output_sizes tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) - super().__init__( - input_size, - sum(output_sizes), - bias, - gather_output, - skip_bias_add, - params_dtype, - linear_method, - ) + super().__init__(input_size, sum(output_sizes), bias, gather_output, + skip_bias_add, params_dtype, linear_method) - def weight_loader( - self, - param: Parameter, - loaded_weight: torch.Tensor, - loaded_shard_id: Optional[int] = None, - ): + def weight_loader(self, + param: Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[int] = None): param_data = param.data output_dim = getattr(param, "output_dim", None) if loaded_shard_id is None: @@ -380,17 +367,10 @@ def __init__( self.num_kv_heads = divide(self.total_num_kv_heads, tp_size) self.num_kv_head_replicas = 1 input_size = self.hidden_size - output_size = ((self.num_heads + 2 * self.num_kv_heads) * tp_size * - self.head_size) - super().__init__( - input_size, - output_size, - bias, - False, - skip_bias_add, - params_dtype, - linear_method, - ) + output_size = (self.num_heads + + 2 * self.num_kv_heads) * tp_size * self.head_size + super().__init__(input_size, output_size, bias, False, skip_bias_add, + params_dtype, linear_method) def weight_loader( self, From 30d2d42550e5a2a5b03943899651829266ef8ce6 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 16:52:51 +0000 Subject: [PATCH 21/96] restore formatting --- vllm/model_executor/layers/linear.py | 44 +++++++-------------- vllm/model_executor/models/llama.py | 57 +++++++++++----------------- 2 files changed, 36 insertions(+), 65 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 90e2e7ca0c15e..e086b735ca8ca 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -372,12 +372,10 @@ def __init__( super().__init__(input_size, output_size, bias, False, skip_bias_add, params_dtype, linear_method) - def weight_loader( - self, - param: Parameter, - loaded_weight: torch.Tensor, - loaded_shard_id: Optional[str] = None, - ): + def weight_loader(self, + param: Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[str] = None): param_data = param.data output_dim = getattr(param, "output_dim", None) if loaded_shard_id is None: @@ -389,17 +387,10 @@ def weight_loader( shard_offsets = [ # (shard_id, shard_offset, shard_size) ("q", 0, self.total_num_heads * self.head_size), - ( - "k", - self.total_num_heads * self.head_size, - self.total_num_kv_heads * self.head_size, - ), - ( - "v", - (self.total_num_heads + self.total_num_kv_heads) * - self.head_size, - self.total_num_kv_heads * self.head_size, - ), + ("k", self.total_num_heads * self.head_size, + self.total_num_kv_heads * self.head_size), + ("v", (self.total_num_heads + self.total_num_kv_heads) * + self.head_size, self.total_num_kv_heads * self.head_size), ] packed_dim = getattr(param, "packed_dim", None) for shard_id, shard_offset, shard_size in shard_offsets: @@ -507,12 +498,8 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size_per_partition, - 
self.output_size, - self.input_size, - self.output_size, - self.params_dtype, - ) + self.input_size_per_partition, self.output_size, self.input_size, + self.output_size, self.params_dtype) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -525,13 +512,10 @@ def __init__( if bias: self.bias = Parameter( torch.empty(self.output_size, dtype=params_dtype)) - set_weight_attrs( - self.bias, - { - "output_dim": 0, - "weight_loader": self.weight_loader, - }, - ) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) else: self.register_parameter("bias", None) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 17ffc56ff42cf..e928ff1d83fd4 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -31,26 +31,19 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import PagedAttention from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import ( - LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear, -) +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, - ParallelLMHead, - DEFAULT_VOCAB_PADDING_SIZE, -) + VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size, ) + get_tensor_model_parallel_world_size) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import ( - default_weight_loader, - hf_model_weights_iterator, -) +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig @@ -68,11 +61,9 @@ def __init__( ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, - [intermediate_size] * 2, + hidden_size, [intermediate_size] * 2, bias=False, - linear_method=linear_method, - ) + linear_method=linear_method) self.down_proj = RowParallelLinear(intermediate_size, hidden_size, bias=False, @@ -148,13 +139,11 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=sliding_window, - ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=sliding_window) def forward( self, @@ -249,8 +238,8 @@ def __init__( super().__init__() self.config = config self.padding_idx = config.pad_token_id - lora_vocab = ((lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0) + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 self.vocab_size = config.vocab_size + lora_vocab self.org_vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( @@ -358,13 +347,11 @@ def sample( sampling_metadata) return next_tokens - def load_weights( - self, - model_name_or_path: str, - cache_dir: 
Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None, - ): + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), From 3fcb9446deeda3201fd96f0145bedc614e8801d9 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 16:54:27 +0000 Subject: [PATCH 22/96] restore formatting --- vllm/model_executor/models/llama.py | 5 +++-- vllm/model_executor/weight_utils.py | 5 +---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e928ff1d83fd4..1ce2223ed2de4 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -365,11 +365,12 @@ def load_weights(self, model_name_or_path, cache_dir, load_format, revision): if "rotary_emb.inv_freq" in name: continue - if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - for param_name, weight_name, shard_id in stacked_params_mapping: + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue name = name.replace(weight_name, param_name) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 48900a8b02271..6c32ac9125d45 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -15,10 +15,7 @@ from vllm.config import ModelConfig from vllm.logger import init_logger -from vllm.model_executor.layers.quantization import ( - get_quantization_config, - QuantizationConfig, -) +from vllm.model_executor.layers.quantization import (get_quantization_config, QuantizationConfig) logger = init_logger(__name__) From 4e7291aebe80fbba81f6f79f0d041e8e75a2696a Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 16:56:40 +0000 Subject: [PATCH 23/96] formta --- vllm/model_executor/weight_utils.py | 30 +++++++++++++---------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 6c32ac9125d45..bdcb9d5976576 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -15,7 +15,8 @@ from vllm.config import ModelConfig from vllm.logger import init_logger -from vllm.model_executor.layers.quantization import (get_quantization_config, QuantizationConfig) +from vllm.model_executor.layers.quantization import (get_quantization_config, + QuantizationConfig) logger = init_logger(__name__) @@ -91,13 +92,11 @@ def get_quant_config(model_config: ModelConfig) -> QuantizationConfig: if not is_local: # Download the config files. 
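For context on what get_quant_config ends up locating here: an AQLM checkpoint carries its parameters in the quantization_config block of config.json, and from_config above consumes only the four group/codebook keys (the linear_weights_not_to_quantize list is still a TODO). A hedged sketch of the expected shape of that block, with illustrative values for a 2-bit 1x16 model rather than values copied from a real file:

# Hypothetical quantization_config block; the key names are the ones
# AQLMConfig.from_config reads, the values are illustrative.
quantization_config = {
    "quant_method": "aqlm",   # assumed tag used to select the AQLM backend
    "in_group_size": 8,
    "out_group_size": 1,
    "num_codebooks": 1,
    "nbits_per_codebook": 16,
    # "linear_weights_not_to_quantize": [...]  # present in real configs, unused here
}

# from_config(quantization_config) would then build
# AQLMConfig(in_group_size=8, nbits_per_codebook=16,
#            num_codebooks=1, out_group_size=1).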
with get_lock(model_name_or_path, model_config.download_dir): - hf_folder = snapshot_download( - model_name_or_path, - revision=model_config.revision, - allow_patterns="*.json", - cache_dir=model_config.download_dir, - tqdm_class=Disabledtqdm, - ) + hf_folder = snapshot_download(model_name_or_path, + revision=model_config.revision, + allow_patterns="*.json", + cache_dir=model_config.download_dir, + tqdm_class=Disabledtqdm) else: hf_folder = model_name_or_path config_files = glob.glob(os.path.join(hf_folder, "*.json")) @@ -162,13 +161,11 @@ def prepare_hf_model_weights( # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. with get_lock(model_name_or_path, cache_dir): - hf_folder = snapshot_download( - model_name_or_path, - allow_patterns=allow_patterns, - cache_dir=cache_dir, - tqdm_class=Disabledtqdm, - revision=revision, - ) + hf_folder = snapshot_download(model_name_or_path, + allow_patterns=allow_patterns, + cache_dir=cache_dir, + tqdm_class=Disabledtqdm, + revision=revision) else: hf_folder = model_name_or_path hf_weights_files: List[str] = [] @@ -212,8 +209,7 @@ def hf_model_weights_iterator( cache_dir=cache_dir, load_format=load_format, fall_back_to_pt=fall_back_to_pt, - revision=revision, - ) + revision=revision) if load_format == "npcache": # Currently np_cache only support *.bin checkpoints From 39abbc0c18322602d03e9ce2f0678c5afdc8f479 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 20:27:14 +0000 Subject: [PATCH 24/96] first working aqlm --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 9 --- examples/aqlm_test.py | 14 ++-- vllm/model_executor/layers/linear.py | 59 ++++++++------ .../layers/quantization/aqlm.py | 77 ++++++++++++------- .../model_executor/layers/quantization/awq.py | 4 +- .../layers/quantization/gptq.py | 12 +-- .../layers/quantization/squeezellm.py | 4 +- vllm/model_executor/models/llama.py | 5 +- 8 files changed, 98 insertions(+), 86 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 6e4aa751c113d..fb4c9d54efdee 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -134,12 +134,3 @@ torch::Tensor code2x8_matmat( auto output = flat_output.reshape(output_sizes).clone(); return output; } - - -/* -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("code1x16_matmat", &code1x16_matmat, "1x16 (2bit) codebook matrix-matrix product."); - m.def("code2x8_matmat", &code2x8_matmat, "2x8 (2bit) codebook matrix-matrix product."); -} -*/ - diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index 26d90584a7858..cbc9b37857452 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -1,13 +1,11 @@ from vllm import LLM, SamplingParams #model = LLM("nm-testing/llama2.c-stories110M-pruned2.4") -#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) -model = LLM( - "/home/jaemz/.cache/huggingface/hub/models--BlackSamorez--Llama-2-7b-AQLM-2Bit-1x16-hf/snapshots/1756949fed7fe691c00f015e2ebd18503a975f3b", - enforce_eager=True) +model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) +#model = LLM( +# "/home/jaemz/.cache/huggingface/hub/models--BlackSamorez--Llama-2-7b-AQLM-2Bit-1x16-hf/snapshots/1756949fed7fe691c00f015e2ebd18503a975f3b", +# enforce_eager=True) -sampling_params = SamplingParams(max_tokens=200, temperature=0) -outputs = model.generate("How are you ", sampling_params=sampling_params) -print("generated!") +sampling_params = 
SamplingParams(max_tokens=100, temperature=0) +outputs = model.generate("Hello my name is", sampling_params=sampling_params) print(outputs[0].outputs[0].text) -print("output above!") diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index e086b735ca8ca..c20d28054c29a 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -23,8 +23,8 @@ class LinearMethodBase(ABC): @abstractmethod def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: + output_size: int, params_dtype: torch.dtype, + shards: int) -> Dict[str, Any]: """Create weights for a linear layer.""" raise NotImplementedError @@ -50,8 +50,8 @@ def __init__(self, separate_bias_add: bool = False): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: + output_size: int, params_dtype: torch.dtype, + shards: int) -> Dict[str, Any]: weight = Parameter(torch.empty(output_size_per_partition, input_size_per_partition, dtype=params_dtype), @@ -106,7 +106,7 @@ def __init__( self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( self.input_size, self.output_size, self.input_size, - self.output_size, self.params_dtype) + self.output_size, self.params_dtype, 1) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -142,6 +142,7 @@ class ColumnParallelLinear(torch.nn.Module): skip adding bias but instead return it. params_dtype: Data type for the parameters. linear_method: (Maybe quantized) linear method. + shards: Number of packed shards, like for QKV this would be 3 """ def __init__( @@ -153,6 +154,7 @@ def __init__( skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, linear_method: Optional[LinearMethodBase] = None, + shards: int = 1, ): super().__init__() @@ -160,6 +162,7 @@ def __init__( self.input_size = input_size self.output_size = output_size self.gather_output = gather_output + self.shards = shards # Divide the weight matrix along the last dimension. tp_size = get_tensor_model_parallel_world_size() self.output_size_per_partition = divide(output_size, tp_size) @@ -172,7 +175,7 @@ def __init__( self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( self.input_size, self.output_size_per_partition, self.input_size, - self.output_size, self.params_dtype) + self.output_size, self.params_dtype, self.shards) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -250,14 +253,17 @@ def __init__( tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) super().__init__(input_size, sum(output_sizes), bias, gather_output, - skip_bias_add, params_dtype, linear_method) + skip_bias_add, params_dtype, linear_method, + len(self.output_sizes)) def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor, loaded_shard_id: Optional[int] = None): + param_data = param.data output_dim = getattr(param, "output_dim", None) + shard_dim = getattr(param, "shard_dim", None) if loaded_shard_id is None: # Loaded weight is already packed. 
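The shards argument threaded through create_weights in these hunks exists because merged layers (QKVParallelLinear with 3 projections, MergedColumnParallelLinear with len(output_sizes)) each bring their own AQLM codebooks; those are stacked along dim 0 of a single codebooks tensor, and the weight loader then picks the right slice via the shard_dim attribute. A rough sketch of that layout, assuming a QKV layer with one codebook per projection (shapes illustrative):

import torch

# Stacked codebooks for a merged QKV layer: shards=3, num_codebooks=1 each.
num_codebooks, shards = 1, 3
nbits, out_group, in_group = 16, 1, 8
codebooks = torch.empty(num_codebooks * shards, 2 ** nbits, out_group, in_group)

def load_codebook_shard(stacked: torch.Tensor, shard_index: int,
                        loaded: torch.Tensor) -> None:
    # Mirrors the shard_dim branch of the loader: narrow to this projection's
    # slice along dim 0 and copy the checkpoint tensor into it.
    shard_size = loaded.shape[0]
    stacked.narrow(0, shard_index * shard_size, shard_size).copy_(loaded)

# "q" -> index 0, "k" -> index 1, "v" -> index 2
load_codebook_shard(codebooks, 1,
                    torch.randn(num_codebooks, 2 ** nbits, out_group, in_group))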
if output_dim is None: @@ -287,12 +293,6 @@ def weight_loader(self, if output_dim is not None: shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size shard_size = self.output_sizes[loaded_shard_id] // tp_size - #TEST - if loaded_shard_id > 0: - print(" loading a shard ", loaded_shard_id) - print(" param_data shape ", param_data.shape) - print(" loaded_weight shape ", loaded_weight.shape) - # If quantized, we need to adjust the offset and size to account # for the packing. packed_dim = getattr(param, "packed_dim", None) @@ -304,6 +304,13 @@ def weight_loader(self, start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + elif shard_dim is not None: + shard_size = loaded_weight.shape[shard_dim] + shard_offset = loaded_shard_id * shard_size + param_data = param_data.narrow(shard_dim, shard_offset, shard_size) + # TODO what is up with this TP rank? + #start_idx = tp_rank * shard_size + #loaded_weight = loaded_weight.narrow(output_dim, start_idx,shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -370,14 +377,17 @@ def __init__( output_size = (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size super().__init__(input_size, output_size, bias, False, skip_bias_add, - params_dtype, linear_method) + params_dtype, linear_method, 3) def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor, loaded_shard_id: Optional[str] = None): param_data = param.data + output_dim = getattr(param, "output_dim", None) + shard_dim = getattr(param, "shard_dim", None) + if loaded_shard_id is None: # Loaded weight is already packed. if output_dim is None: @@ -432,6 +442,16 @@ def weight_loader(self, start_idx = shard_id * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + elif shard_dim is not None: + shard_size = loaded_weight.shape[shard_dim] + if loaded_shard_id == "q": + shard_index = 0 + elif loaded_shard_id == "k": + shard_index = 1 + elif loaded_shard_id == "v": + shard_index = 2 + param_data = param_data.narrow(shard_dim, shard_index * shard_size, + shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -499,7 +519,7 @@ def __init__( self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( self.input_size_per_partition, self.output_size, self.input_size, - self.output_size, self.params_dtype) + self.output_size, self.params_dtype, 1) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -524,23 +544,14 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): input_dim = getattr(param, "input_dim", None) param_data = param.data - # TEST - print(" param data shape is ", param_data.shape) - print(" loaded_weight is ", loaded_weight.shape) - if input_dim is not None: shard_size = param_data.shape[input_dim] start_idx = tp_rank * shard_size - print(" loaded_weight dtype is ", loaded_weight.dtype) - print(" data_param dtype is ", param_data.dtype) - #TEST assert (start_idx == 0 and shard_size == loaded_weight.shape[input_dim]) loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) - print("sharded loaded_weight is ", loaded_weight.shape) - assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 0f299d770b4ee..daa7f88a3adef 
100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -118,15 +118,10 @@ class AQLMLinearMethod(LinearMethodBase): def __init__(self, quant_config: AQLMConfig): self.quant_config = quant_config - def create_weights( - self, - input_size_per_partition: int, - output_size_per_partition: int, - input_size: int, - output_size: int, - params_dtype: torch.dtype, - ) -> Dict[str, Any]: - #TEST + def create_weights(self, input_size_per_partition: int, + output_size_per_partition: int, input_size: int, + output_size: int, params_dtype: torch.dtype, + shards: int) -> Dict[str, Any]: assert (output_size == output_size_per_partition) assert (input_size == input_size_per_partition) del output_size # Unused. @@ -145,19 +140,16 @@ def create_weights( "weight shape. This can be caused by too large " "tensor parallel size.") - # or does this need more dimensions and use the correct nbits_per_codebook as an int type. Does that pack them? codes = Parameter( torch.empty( output_size_per_partition, # not entirely sure what to do with num_out_groups, if we need this pack factor. input_size_per_partition // self.quant_config.pack_factor, - 1, # probably should be num codebooks. + 1, # probably should be num codebooks and change pack factor? dtype=get_int_dtype(self.quant_config.nbits_per_codebook), ), requires_grad=False, ) - print(codes.shape) - set_weight_attrs( codes, { @@ -170,7 +162,7 @@ def create_weights( codebooks = Parameter( torch.empty( - self.quant_config.num_codebooks, + self.quant_config.num_codebooks * shards, 2**self.quant_config.nbits_per_codebook, self.quant_config.out_group_size, self.quant_config.in_group_size, @@ -178,6 +170,13 @@ def create_weights( ), requires_grad=False, ) + set_weight_attrs( + codebooks, + { + "shard_dim": 0, + "shards": shards + }, + ) scales = Parameter( torch.empty( @@ -213,25 +212,45 @@ def apply_weights( x: torch.Tensor, bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - # qweight = weights["qweight"] do I need the same flattening? - # out_shape = x.shape[:-1] + (qweight.shape[-1],) - # reshaped_x = x.reshape(-1, x.shape[-1]) # - - print("input shape is ", x.shape) - if (x.shape[1] == 5): - print("codes shape is ", weights["codes"].shape) - print("codebooks shape is ", weights["codebooks"].shape) - print("scales shape is ", weights["scales"].shape) - print("x is ", x) + codebooks = weights["codebooks"] + codes = weights["codes"] + scales = weights["scales"] + + shard_dim = getattr(codebooks, "shard_dim", None) + if shard_dim is not None: + output_shape = x.shape[:-1] + (scales.shape[0], ) + output = torch.empty(output_shape, dtype=x.dtype, device=x.device) + shards = getattr(codebooks, "shards", None) + # break the shards apart and combine them. + assert (shard_dim == 0) + num_codebooks = codebooks.shape[shard_dim] // shards + + assert (scales.shape[0] == codes.shape[0]) + assert (scales.shape[0] % shards == 0) + base_size = scales.shape[0] // shards + + for shard_id in range(shards): + shard_output = ops.aqlm_gemm( + x, codes.narrow(0, shard_id * base_size, base_size), + codebooks.narrow(shard_dim, shard_id * num_codebooks, + num_codebooks), + scales.narrow(0, shard_id * base_size, base_size), + None if bias is None else bias.narrow( + 0, shard_id * base_size, base_size)) + + output_slice = output.narrow(-1, shard_id * base_size, + base_size) + assert (output_slice.shape == shard_output.shape) + output_slice.copy_(shard_output) + return output output = ops.aqlm_gemm( - x, # hmm, reshape? 
- weights["codes"], - weights["codebooks"], - weights["scales"], + x, + codes, + codebooks, + scales, bias, ) - print("output shape is ", output.shape) return output diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 3e1c814dd233c..a3623ae5b0417 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -79,8 +79,8 @@ def __init__(self, quant_config: AWQConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: + output_size: int, params_dtype: torch.dtype, + shards: int) -> Dict[str, Any]: if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( "The input size is not aligned with the quantized " diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 7218760fbe55d..45b06947f3799 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -87,14 +87,10 @@ class GPTQLinearMethod(LinearMethodBase): def __init__(self, quant_config: GPTQConfig): self.quant_config = quant_config - def create_weights( - self, - input_size_per_partition: int, - output_size_per_partition: int, - input_size: int, - output_size: int, - params_dtype: torch.dtype, - ) -> Dict[str, Any]: + def create_weights(self, input_size_per_partition: int, + output_size_per_partition: int, input_size: int, + output_size: int, params_dtype: torch.dtype, + shards: int) -> Dict[str, Any]: del output_size # Unused. if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index 9244e88552756..091ff22b9b095 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -69,8 +69,8 @@ def __init__(self, quant_config: SqueezeLLMConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: + output_size: int, params_dtype: torch.dtype, + shards: int) -> Dict[str, Any]: if input_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( "The input size is not aligned with the quantized " diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 1ce2223ed2de4..4c07fb19c490a 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -360,6 +360,7 @@ def load_weights(self, ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] + params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): @@ -379,8 +380,6 @@ def load_weights(self, continue param = params_dict[name] weight_loader = param.weight_loader - # TEST - print("loading ", name) weight_loader(param, loaded_weight, shard_id) break else: @@ -390,6 +389,4 @@ def load_weights(self, param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - # TEST - print("loading ", name) weight_loader(param, loaded_weight) From 8d7fa9669f419efad1f77e08dfac8b3595b383d8 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 20:32:36 +0000 Subject: [PATCH 
25/96] some improvements --- vllm/model_executor/layers/linear.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index c20d28054c29a..35da806a9097d 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -263,6 +263,7 @@ def weight_loader(self, param_data = param.data output_dim = getattr(param, "output_dim", None) + # shard_dim indicates fixed size concatenated at shard_id shard_dim = getattr(param, "shard_dim", None) if loaded_shard_id is None: # Loaded weight is already packed. @@ -308,9 +309,6 @@ def weight_loader(self, shard_size = loaded_weight.shape[shard_dim] shard_offset = loaded_shard_id * shard_size param_data = param_data.narrow(shard_dim, shard_offset, shard_size) - # TODO what is up with this TP rank? - #start_idx = tp_rank * shard_size - #loaded_weight = loaded_weight.narrow(output_dim, start_idx,shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -444,12 +442,7 @@ def weight_loader(self, shard_size) elif shard_dim is not None: shard_size = loaded_weight.shape[shard_dim] - if loaded_shard_id == "q": - shard_index = 0 - elif loaded_shard_id == "k": - shard_index = 1 - elif loaded_shard_id == "v": - shard_index = 2 + shard_index = ["q", "k", "v"].index(loaded_shard_id) param_data = param_data.narrow(shard_dim, shard_index * shard_size, shard_size) else: From 9a3dbe1daf4a0a640b77d827e238df0ea3bb7726 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 20:37:07 +0000 Subject: [PATCH 26/96] restore format --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 2 +- vllm/model_executor/models/llama.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index fb4c9d54efdee..991e59f5022a7 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -74,7 +74,7 @@ torch::Tensor code1x16_matmat( auto output_sizes = input_sizes.vec(); output_sizes.pop_back(); output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes).clone(); + auto output = flat_output.reshape(output_sizes); // .clone(); return output; } diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 4c07fb19c490a..b7f6b8f3ec374 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -360,7 +360,6 @@ def load_weights(self, ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] - params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): From e7c2601efcedfeb2c3bbd39d5f6d994919d32b74 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 21:42:03 +0000 Subject: [PATCH 27/96] make a central c++ aqlm entry point --- csrc/ops.h | 2 +- csrc/pybind.cpp | 2 +- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 27 ++++++++++++++++++++-- examples/aqlm_test.py | 8 +++---- 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index 246862ee048f0..c70a04e1a8694 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -69,7 +69,7 @@ void gelu_fast( torch::Tensor& out, torch::Tensor& input); -torch::Tensor code1x16_matmat( +torch::Tensor aqlm_gemm( const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, diff --git a/csrc/pybind.cpp 
b/csrc/pybind.cpp index d1410071d3afe..51664eeb6b461 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -53,7 +53,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); // Quantization ops - ops.def("aqlm_gemm", &code1x16_matmat, "Quantized GEMM for AQLM"); + ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM"); #ifndef USE_ROCM ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 991e59f5022a7..ac620e9361854 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -74,7 +74,7 @@ torch::Tensor code1x16_matmat( auto output_sizes = input_sizes.vec(); output_sizes.pop_back(); output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes); // .clone(); + auto output = flat_output.reshape(output_sizes); return output; } @@ -131,6 +131,29 @@ torch::Tensor code2x8_matmat( auto output_sizes = input_sizes.vec(); output_sizes.pop_back(); output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes).clone(); + auto output = flat_output.reshape(output_sizes); return output; } + +torch::Tensor aqlm_gemm( + const torch::Tensor& input, + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales, + const std::optional& bias +) +{ + int const nbooks = codebooks.size(0); + int const entries = codebooks.size(1); + + if (nbooks == 1 && entries == (1 << 16)) + { + return code1x16_matmat(input, codes, codebooks, scales, bias); + } + if (nbooks == 2 && entries == (1 << 8)) + { + return code2x8_matmat(input, codes, codebooks, scales, bias); + } + // TODO error somehow. + return {}; +} diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index cbc9b37857452..1ca5400db6065 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -1,10 +1,10 @@ from vllm import LLM, SamplingParams -#model = LLM("nm-testing/llama2.c-stories110M-pruned2.4") +# 1x16 model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) -#model = LLM( -# "/home/jaemz/.cache/huggingface/hub/models--BlackSamorez--Llama-2-7b-AQLM-2Bit-1x16-hf/snapshots/1756949fed7fe691c00f015e2ebd18503a975f3b", -# enforce_eager=True) + +# 2 x 8 +#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf", enforce_eager=True) sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = model.generate("Hello my name is", sampling_params=sampling_params) From 6eba0357701bf259389825432fc192a9fff2996a Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 29 Feb 2024 22:03:14 +0000 Subject: [PATCH 28/96] add support for 2x8, worked shockingly easily --- examples/aqlm_test.py | 4 ++-- vllm/model_executor/layers/quantization/aqlm.py | 6 ++---- vllm/model_executor/models/llama.py | 1 + 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index 1ca5400db6065..a2fd5a9b7a7c5 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -1,10 +1,10 @@ from vllm import LLM, SamplingParams # 1x16 -model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) +#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) # 2 x 8 -#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf", enforce_eager=True) +model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf", enforce_eager=True) sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = 
model.generate("Hello my name is", sampling_params=sampling_params) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index daa7f88a3adef..848efcbfe5d97 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -41,8 +41,7 @@ def __init__( # I think pack factor is *probably* how many elements fit into one quantized tensor element. # though out group size makes it interesting, because really we are doing 2D blocks, potentially. # maybe this is vllms first 2D packing? Arg. - self.pack_factor = (self.in_group_size * self.out_group_size // - self.num_codebooks) + self.pack_factor = (self.in_group_size * self.out_group_size) def __repr__(self) -> str: return (f"AQLMConfig(in_group_size={self.in_group_size}, " @@ -144,7 +143,7 @@ def create_weights(self, input_size_per_partition: int, torch.empty( output_size_per_partition, # not entirely sure what to do with num_out_groups, if we need this pack factor. input_size_per_partition // self.quant_config.pack_factor, - 1, # probably should be num codebooks and change pack factor? + self.quant_config.num_codebooks, dtype=get_int_dtype(self.quant_config.nbits_per_codebook), ), requires_grad=False, @@ -212,7 +211,6 @@ def apply_weights( x: torch.Tensor, bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - codebooks = weights["codebooks"] codes = weights["codes"] scales = weights["scales"] diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index b7f6b8f3ec374..88aea8de02845 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -388,4 +388,5 @@ def load_weights(self, param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) From 604f66fef9b5329f7276046a2692f39a727d2a03 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 18:05:57 +0000 Subject: [PATCH 29/96] support more than one model --- examples/aqlm_test.py | 10 ++++-- vllm/model_executor/layers/linear.py | 25 +++++++------- .../layers/quantization/aqlm.py | 33 +++++++++++-------- .../model_executor/layers/quantization/awq.py | 2 +- .../layers/quantization/gptq.py | 2 +- .../layers/quantization/squeezellm.py | 2 +- vllm/model_executor/models/llama.py | 4 ++- 7 files changed, 46 insertions(+), 32 deletions(-) diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index a2fd5a9b7a7c5..5a4bd4cc7572e 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -1,10 +1,14 @@ from vllm import LLM, SamplingParams -# 1x16 #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) -# 2 x 8 -model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf", enforce_eager=True) +#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf", enforce_eager=True) + +model = LLM("BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", enforce_eager=True) + +# These have custom code and the old format, and puzzling and conflicting stats, which probably I shouldn't even try to support. 
+#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) +#model = LLM("BlackSamorez/Llama-2-13b-AQLM-2Bit-1x16-hf", enforce_eager=True, trust_remote_code=True) sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = model.generate("Hello my name is", sampling_params=sampling_params) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 35da806a9097d..4c7d246ca519e 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -24,7 +24,7 @@ class LinearMethodBase(ABC): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - shards: int) -> Dict[str, Any]: + output_sizes: List[int]) -> Dict[str, Any]: """Create weights for a linear layer.""" raise NotImplementedError @@ -51,7 +51,7 @@ def __init__(self, separate_bias_add: bool = False): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - shards: int) -> Dict[str, Any]: + output_sizes: List[int]) -> Dict[str, Any]: weight = Parameter(torch.empty(output_size_per_partition, input_size_per_partition, dtype=params_dtype), @@ -106,7 +106,7 @@ def __init__( self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( self.input_size, self.output_size, self.input_size, - self.output_size, self.params_dtype, 1) + self.output_size, self.params_dtype, [self.output_size]) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -142,7 +142,7 @@ class ColumnParallelLinear(torch.nn.Module): skip adding bias but instead return it. params_dtype: Data type for the parameters. linear_method: (Maybe quantized) linear method. - shards: Number of packed shards, like for QKV this would be 3 + output_sizes: list of output sizes packed into one output, like for QKV the list would be size 3. """ def __init__( @@ -154,7 +154,7 @@ def __init__( skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, linear_method: Optional[LinearMethodBase] = None, - shards: int = 1, + output_sizes: List[int] = [0], ): super().__init__() @@ -162,7 +162,6 @@ def __init__( self.input_size = input_size self.output_size = output_size self.gather_output = gather_output - self.shards = shards # Divide the weight matrix along the last dimension. 
tp_size = get_tensor_model_parallel_world_size() self.output_size_per_partition = divide(output_size, tp_size) @@ -175,7 +174,7 @@ def __init__( self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( self.input_size, self.output_size_per_partition, self.input_size, - self.output_size, self.params_dtype, self.shards) + self.output_size, self.params_dtype, output_sizes) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -254,7 +253,7 @@ def __init__( assert all(output_size % tp_size == 0 for output_size in output_sizes) super().__init__(input_size, sum(output_sizes), bias, gather_output, skip_bias_add, params_dtype, linear_method, - len(self.output_sizes)) + self.output_sizes) def weight_loader(self, param: Parameter, @@ -374,15 +373,19 @@ def __init__( input_size = self.hidden_size output_size = (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size + super().__init__(input_size, output_size, bias, False, skip_bias_add, - params_dtype, linear_method, 3) + params_dtype, linear_method, [ + self.num_heads * tp_size * self.head_size, + self.num_kv_heads * tp_size * self.head_size, + self.num_kv_heads * tp_size * self.head_size + ]) def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor, loaded_shard_id: Optional[str] = None): param_data = param.data - output_dim = getattr(param, "output_dim", None) shard_dim = getattr(param, "shard_dim", None) @@ -512,7 +515,7 @@ def __init__( self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( self.input_size_per_partition, self.output_size, self.input_size, - self.output_size, self.params_dtype, 1) + self.output_size, self.params_dtype, [self.output_size]) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 848efcbfe5d97..a9732257e462d 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -120,7 +120,7 @@ def __init__(self, quant_config: AQLMConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - shards: int) -> Dict[str, Any]: + output_sizes: List[int]) -> Dict[str, Any]: assert (output_size == output_size_per_partition) assert (input_size == input_size_per_partition) del output_size # Unused. @@ -156,12 +156,13 @@ def create_weights(self, input_size_per_partition: int, "output_dim": 0, "packed_dim": 1, "pack_factor": self.quant_config.pack_factor, + "output_sizes": output_sizes }, ) codebooks = Parameter( torch.empty( - self.quant_config.num_codebooks * shards, + self.quant_config.num_codebooks * len(output_sizes), 2**self.quant_config.nbits_per_codebook, self.quant_config.out_group_size, self.quant_config.in_group_size, @@ -173,7 +174,6 @@ def create_weights(self, input_size_per_partition: int, codebooks, { "shard_dim": 0, - "shards": shards }, ) @@ -219,28 +219,33 @@ def apply_weights( if shard_dim is not None: output_shape = x.shape[:-1] + (scales.shape[0], ) output = torch.empty(output_shape, dtype=x.dtype, device=x.device) - shards = getattr(codebooks, "shards", None) + output_sizes = getattr(codes, "output_sizes", None) + outputs = len(output_sizes) + # break the shards apart and combine them. 
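Note: the loop that follows runs one GEMM per packed output partition and writes each result into its slice of the fused output. The same pattern with plain dense matmuls standing in for aqlm_gemm, using hypothetical sizes:

import torch

def fused_matmul_by_partition(x, partition_weights, output_partition_sizes):
    out = torch.empty(*x.shape[:-1], sum(output_partition_sizes))
    offset = 0
    for w, size in zip(partition_weights, output_partition_sizes):
        # One matmul per partition, copied into its slice of the fused output.
        out.narrow(-1, offset, size).copy_(x @ w.t())
        offset += size
    return out

x = torch.randn(2, 16)
ws = [torch.randn(8, 16), torch.randn(4, 16), torch.randn(4, 16)]  # q/k/v-like partitions
y = fused_matmul_by_partition(x, ws, [8, 4, 4])
assert torch.allclose(y, x @ torch.cat(ws).t())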
assert (shard_dim == 0) - num_codebooks = codebooks.shape[shard_dim] // shards + num_codebooks = codebooks.shape[shard_dim] // outputs assert (scales.shape[0] == codes.shape[0]) - assert (scales.shape[0] % shards == 0) - base_size = scales.shape[0] // shards + assert (scales.shape[0] == sum(output_sizes)) - for shard_id in range(shards): + output_offset = 0 + codebooks_offset = 0 + for output_size in output_sizes: shard_output = ops.aqlm_gemm( - x, codes.narrow(0, shard_id * base_size, base_size), - codebooks.narrow(shard_dim, shard_id * num_codebooks, + x, codes.narrow(0, output_offset, output_size), + codebooks.narrow(shard_dim, codebooks_offset, num_codebooks), - scales.narrow(0, shard_id * base_size, base_size), + scales.narrow(0, output_offset, output_size), None if bias is None else bias.narrow( - 0, shard_id * base_size, base_size)) + 0, output_offset, output_size)) - output_slice = output.narrow(-1, shard_id * base_size, - base_size) + output_slice = output.narrow(-1, output_offset, output_size) assert (output_slice.shape == shard_output.shape) output_slice.copy_(shard_output) + output_offset += output_size + codebooks_offset += num_codebooks + return output output = ops.aqlm_gemm( diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index a3623ae5b0417..60afacea9c2af 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -80,7 +80,7 @@ def __init__(self, quant_config: AWQConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - shards: int) -> Dict[str, Any]: + output_sizes: List[int]) -> Dict[str, Any]: if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( "The input size is not aligned with the quantized " diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 45b06947f3799..ae2929dcd22f8 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -90,7 +90,7 @@ def __init__(self, quant_config: GPTQConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - shards: int) -> Dict[str, Any]: + output_sizes: List[int]) -> Dict[str, Any]: del output_size # Unused. 
if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index 091ff22b9b095..0ec5be06abbd6 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -70,7 +70,7 @@ def __init__(self, quant_config: SqueezeLLMConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - shards: int) -> Dict[str, Any]: + output_sizes: List[int]) -> Dict[str, Any]: if input_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( "The input size is not aligned with the quantized " diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 88aea8de02845..d5e61a6e1ed33 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -360,6 +360,7 @@ def load_weights(self, ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] + params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): @@ -378,6 +379,7 @@ def load_weights(self, if name.endswith(".bias") and name not in params_dict: continue param = params_dict[name] + weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -388,5 +390,5 @@ def load_weights(self, param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - + weight_loader(param, loaded_weight) From ce639374af6a6d7f69247b8a9ecbcc5fbad0fb96 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 18:23:45 +0000 Subject: [PATCH 30/96] formatting --- vllm/model_executor/layers/linear.py | 5 ----- vllm/model_executor/models/llama.py | 3 --- 2 files changed, 8 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 4c7d246ca519e..1a27fb35ee5e9 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -539,17 +539,12 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): tp_rank = get_tensor_model_parallel_rank() input_dim = getattr(param, "input_dim", None) param_data = param.data - if input_dim is not None: shard_size = param_data.shape[input_dim] start_idx = tp_rank * shard_size - assert (start_idx == 0 - and shard_size == loaded_weight.shape[input_dim]) - loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) assert param_data.shape == loaded_weight.shape - param_data.copy_(loaded_weight) def forward(self, input_): diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d5e61a6e1ed33..b7f6b8f3ec374 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -360,7 +360,6 @@ def load_weights(self, ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] - params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): @@ -379,7 +378,6 @@ def load_weights(self, if name.endswith(".bias") and name not in params_dict: continue param = params_dict[name] - weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -390,5 +388,4 @@ def load_weights(self, param = params_dict[name] weight_loader = 
getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) From 6cbdff7c972418b6dd9a476957ca2280f23a9d9a Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 18:29:27 +0000 Subject: [PATCH 31/96] remove secondary aqlm loading --- vllm/config.py | 23 ++++++++--------------- vllm/model_executor/weight_utils.py | 7 +++++-- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 70a5f3b77eba1..bf972b53e5c6c 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -165,21 +165,14 @@ def _get_and_verify_quantization(self) -> Any | None: hf_quant_config = getattr(self.hf_config, "quantization_config", None) if hf_quant_config is not None: hf_quant_method = str(hf_quant_config["quant_method"]).lower() - else: - # HF models such as https://huggingface.co/BlackSamorez/Llama-2-70b-AQLM-4Bit-2x16-hf/blob/main/config.json - # only have an aqlm block, no quantization_config block. - hf_quant_config = getattr(self.hf_config, "aqlm", None) - if hf_quant_config is not None: - hf_quant_method = "aqlm" - - if hf_quant_method is not None and self.quantization is None: - self.quantization = hf_quant_method - elif self.quantization != hf_quant_method: - raise ValueError( - "Quantization method specified in the model config " - f"({hf_quant_method}) does not match the quantization " - f"method specified in the `quantization` argument " - f"({self.quantization}).") + if self.quantization is None: + self.quantization = hf_quant_method + elif self.quantization != hf_quant_method: + raise ValueError( + "Quantization method specified in the model config " + f"({hf_quant_method}) does not match the quantization " + f"method specified in the `quantization` argument " + f"({self.quantization}).") if self.quantization is not None: if self.quantization not in supported_quantization: diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index bdcb9d5976576..3570366887e78 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -85,8 +85,11 @@ def convert_bin_to_safetensor_file( # TODO(woosuk): Move this to other place. def get_quant_config(model_config: ModelConfig) -> QuantizationConfig: quant_cls = get_quantization_config(model_config.quantization) - if model_config.hf_quant_config is not None: - return quant_cls.from_config(model_config.hf_quant_config) + # Read the quantization config from the HF model config, if available. + hf_quant_config = getattr(model_config.hf_config, "quantization_config", + None) + if hf_quant_config is not None: + return quant_cls.from_config(hf_quant_config) model_name_or_path = model_config.model is_local = os.path.isdir(model_name_or_path) if not is_local: From a58d369c987671c80ab8eccdb8367ca5990472c4 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 18:30:41 +0000 Subject: [PATCH 32/96] restore trailing space --- vllm/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index bf972b53e5c6c..b6ef7a69471b7 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -45,7 +45,7 @@ class ModelConfig: a tag name, or a commit id. If unspecified, will use the default version. code_revision: The specific revision to use for the model code on - Hugging Face Hub. It can be a branch name, a tag name, or a + Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. 
tokenizer_revision: The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use @@ -166,7 +166,7 @@ def _get_and_verify_quantization(self) -> Any | None: if hf_quant_config is not None: hf_quant_method = str(hf_quant_config["quant_method"]).lower() if self.quantization is None: - self.quantization = hf_quant_method + self.quantization = hf_quant_method elif self.quantization != hf_quant_method: raise ValueError( "Quantization method specified in the model config " From 31f0ddc3899abda11a62a833bc99b2d1869d99b4 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 18:32:13 +0000 Subject: [PATCH 33/96] remove some code --- vllm/config.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index b6ef7a69471b7..4448ee9dac017 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -115,7 +115,7 @@ def __init__( max_model_len) self._verify_load_format() self._verify_tokenizer_mode() - self.hf_quant_config = self._get_and_verify_quantization() + self._verify_quantization() self._verify_cuda_graph() def _verify_load_format(self) -> None: @@ -154,14 +154,13 @@ def _verify_tokenizer_mode(self) -> None: "either 'auto' or 'slow'.") self.tokenizer_mode = tokenizer_mode - def _get_and_verify_quantization(self) -> Any | None: + def _verify_quantization(self) -> None: supported_quantization = ["aqlm", "awq", "gptq", "squeezellm"] rocm_not_supported_quantization = ["awq"] if self.quantization is not None: self.quantization = self.quantization.lower() # Parse quantization method from the HF model config, if available. - hf_quant_method = None hf_quant_config = getattr(self.hf_config, "quantization_config", None) if hf_quant_config is not None: hf_quant_method = str(hf_quant_config["quant_method"]).lower() @@ -188,8 +187,6 @@ def _get_and_verify_quantization(self) -> Any | None: "optimized yet. The speed can be slower than " "non-quantized models.") - return hf_quant_config - def _verify_cuda_graph(self) -> None: if self.max_context_len_to_capture is None: self.max_context_len_to_capture = self.max_model_len From edc80c61e05d70be213abebba995633e8a7f0fe0 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 18:35:49 +0000 Subject: [PATCH 34/96] remove some comments --- .../layers/quantization/aqlm.py | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index a9732257e462d..744f1b7cb13ac 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -62,30 +62,6 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]: def get_min_capability(cls) -> int: return 60 - # such as. (This one looks correct) - # https://huggingface.co/BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf/blob/main/config.json - # - # "quantization_config": { - # "in_group_size": 8, - # "nbits_per_codebook": 16, - # "num_codebooks": 1, - # "out_group_size": 1, - # "quant_method": "aqlm" - # "linear_weights_not_to_quantize": [ <--- hmmm ???? - # "model.embed_tokens.weight", - # "lm_head.weight" - # }, - - # https://huggingface.co/meta-llama/Llama-2-7b-hf <- can't see it, locked behind meta. - - # this is no-standard, has no "quantization_config", just an "aqlm" block. 
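Note: condensing the removed comment above into a runnable snippet, the supported checkpoints carry a quantization_config block shaped roughly like this, which from_config() turns into an AQLMConfig; values are copied from the 1x16 example and shown only for illustration:

from vllm.model_executor.layers.quantization.aqlm import AQLMConfig

# Example quantization_config block for a 1x16 AQLM checkpoint.
hf_quantization_config = {
    "in_group_size": 8,
    "nbits_per_codebook": 16,
    "num_codebooks": 1,
    "out_group_size": 1,
    "quant_method": "aqlm",
}
config = AQLMConfig.from_config(hf_quantization_config)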
- # https://huggingface.co/BlackSamorez/Llama-2-70b-AQLM-4Bit-2x16-hf/blob/main/config.json - # "aqlm": { - # "in_group_size": 8, - # "nbits_per_codebook": 16, - # "num_codebooks": 2, - # "out_group_size": 1 - @classmethod def get_config_filenames(cls) -> List[str]: return [] # no extra configs. @@ -96,7 +72,6 @@ def from_config(cls, config: Dict[str, Any]) -> "AQLMConfig": nbits_per_codebook = cls.get_from_keys(config, ["nbits_per_codebook"]) num_code_books = cls.get_from_keys(config, ["num_codebooks"]) out_group_size = cls.get_from_keys(config, ["out_group_size"]) - # TODO linear_weights_not_to_quantize ? return cls(in_group_size, nbits_per_codebook, num_code_books, out_group_size) From 3253dc77dfe53318870b90833d5503024dee688b Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 18:38:43 +0000 Subject: [PATCH 35/96] add some attributions --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 2 ++ csrc/quantization/aqlm/aqlm_cuda_kernel.cu | 2 ++ vllm/model_executor/layers/quantization/aqlm.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index ac620e9361854..729b6f854e6dc 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -1,3 +1,5 @@ +// Adapted from https://github.com/Vahe1994/AQLM + #include #include #include diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu index 0f97e93d678e6..52d4b2e960cea 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu @@ -1,3 +1,5 @@ +// Adapted from https://github.com/Vahe1994/AQLM + #include #include #include diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 744f1b7cb13ac..aedae6faeb5f7 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -1,3 +1,5 @@ +# Supports AQLM compression, see https://github.com/Vahe1994/AQLM and https://arxiv.org/pdf/2401.06118.pdf + from typing import Any, Dict, List, Optional import torch From fefe1c874ac404cb7ef98d91829ea6362f3f7c4e Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 19:18:38 +0000 Subject: [PATCH 36/96] support 2 tp --- examples/aqlm_test.py | 2 +- vllm/model_executor/layers/quantization/aqlm.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index 5a4bd4cc7572e..7995a2db5a328 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -4,7 +4,7 @@ #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf", enforce_eager=True) -model = LLM("BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", enforce_eager=True) +model = LLM("BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", enforce_eager=True, tensor_parallel_size=2) # These have custom code and the old format, and puzzling and conflicting stats, which probably I shouldn't even try to support. 
#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index aedae6faeb5f7..38528b6f13df0 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -98,8 +98,6 @@ def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, output_sizes: List[int]) -> Dict[str, Any]: - assert (output_size == output_size_per_partition) - assert (input_size == input_size_per_partition) del output_size # Unused. del input_size # Unused. @@ -202,13 +200,13 @@ def apply_weights( # break the shards apart and combine them. assert (shard_dim == 0) num_codebooks = codebooks.shape[shard_dim] // outputs - assert (scales.shape[0] == codes.shape[0]) - assert (scales.shape[0] == sum(output_sizes)) - + assert (sum(output_sizes) % scales.shape[0] == 0) + out_tp = sum(output_sizes) // scales.shape[0] output_offset = 0 codebooks_offset = 0 for output_size in output_sizes: + output_size //= out_tp shard_output = ops.aqlm_gemm( x, codes.narrow(0, output_offset, output_size), codebooks.narrow(shard_dim, codebooks_offset, From 4b12ed62c3221c348ef1ef0b51408190a8d73413 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 19:34:01 +0000 Subject: [PATCH 37/96] better tp support --- vllm/model_executor/layers/linear.py | 8 ++++--- .../layers/quantization/aqlm.py | 21 +++++++++---------- .../model_executor/layers/quantization/awq.py | 2 +- .../layers/quantization/gptq.py | 2 +- .../layers/quantization/squeezellm.py | 2 +- 5 files changed, 18 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 1a27fb35ee5e9..3b880709733e2 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -24,7 +24,7 @@ class LinearMethodBase(ABC): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - output_sizes: List[int]) -> Dict[str, Any]: + output_partition_sizes: List[int]) -> Dict[str, Any]: """Create weights for a linear layer.""" raise NotImplementedError @@ -51,7 +51,7 @@ def __init__(self, separate_bias_add: bool = False): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - output_sizes: List[int]) -> Dict[str, Any]: + output_partition_sizes: List[int]) -> Dict[str, Any]: weight = Parameter(torch.empty(output_size_per_partition, input_size_per_partition, dtype=params_dtype), @@ -174,7 +174,9 @@ def __init__( self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( self.input_size, self.output_size_per_partition, self.input_size, - self.output_size, self.params_dtype, output_sizes) + self.output_size, self.params_dtype, + [x // tp_size for x in output_sizes]) + for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 38528b6f13df0..d7180c0226c88 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -97,7 +97,7 @@ def __init__(self, quant_config: AQLMConfig): def 
create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - output_sizes: List[int]) -> Dict[str, Any]: + output_partition_sizes: List[int]) -> Dict[str, Any]: del output_size # Unused. del input_size # Unused. @@ -131,13 +131,13 @@ def create_weights(self, input_size_per_partition: int, "output_dim": 0, "packed_dim": 1, "pack_factor": self.quant_config.pack_factor, - "output_sizes": output_sizes + "output_partition_sizes": output_partition_sizes }, ) codebooks = Parameter( torch.empty( - self.quant_config.num_codebooks * len(output_sizes), + self.quant_config.num_codebooks * len(output_partition_sizes), 2**self.quant_config.nbits_per_codebook, self.quant_config.out_group_size, self.quant_config.in_group_size, @@ -194,19 +194,18 @@ def apply_weights( if shard_dim is not None: output_shape = x.shape[:-1] + (scales.shape[0], ) output = torch.empty(output_shape, dtype=x.dtype, device=x.device) - output_sizes = getattr(codes, "output_sizes", None) - outputs = len(output_sizes) + output_partition_sizes = getattr(codes, "output_partition_sizes", + None) + num_outputs = len(output_partition_sizes) - # break the shards apart and combine them. + # break the inputs and codebooks apart then combine the outputs. assert (shard_dim == 0) - num_codebooks = codebooks.shape[shard_dim] // outputs + num_codebooks = codebooks.shape[shard_dim] // num_outputs assert (scales.shape[0] == codes.shape[0]) - assert (sum(output_sizes) % scales.shape[0] == 0) - out_tp = sum(output_sizes) // scales.shape[0] + assert (sum(output_partition_sizes) == scales.shape[0]) output_offset = 0 codebooks_offset = 0 - for output_size in output_sizes: - output_size //= out_tp + for output_size in output_partition_sizes: shard_output = ops.aqlm_gemm( x, codes.narrow(0, output_offset, output_size), codebooks.narrow(shard_dim, codebooks_offset, diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 60afacea9c2af..e6c7c658e1e19 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -80,7 +80,7 @@ def __init__(self, quant_config: AWQConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - output_sizes: List[int]) -> Dict[str, Any]: + output_partition_sizes: List[int]) -> Dict[str, Any]: if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( "The input size is not aligned with the quantized " diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index ae2929dcd22f8..7ca29b941eeb0 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -90,7 +90,7 @@ def __init__(self, quant_config: GPTQConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - output_sizes: List[int]) -> Dict[str, Any]: + output_partition_sizes: List[int]) -> Dict[str, Any]: del output_size # Unused. 
if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index 0ec5be06abbd6..e266e8a74af69 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -70,7 +70,7 @@ def __init__(self, quant_config: SqueezeLLMConfig): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype, - output_sizes: List[int]) -> Dict[str, Any]: + output_partition_sizes: List[int]) -> Dict[str, Any]: if input_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( "The input size is not aligned with the quantized " From e5c2010fb50d99152576a882d195cab37f9f5fa5 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 19:36:44 +0000 Subject: [PATCH 38/96] format --- vllm/model_executor/layers/quantization/gptq.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 7ca29b941eeb0..36a60990a8049 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -87,10 +87,15 @@ class GPTQLinearMethod(LinearMethodBase): def __init__(self, quant_config: GPTQConfig): self.quant_config = quant_config - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, params_dtype: torch.dtype, - output_partition_sizes: List[int]) -> Dict[str, Any]: + def create_weights( + self, + input_size_per_partition: int, + output_size_per_partition: int, + input_size: int, + output_size: int, + params_dtype: torch.dtype, + output_partition_sizes: List[int], + ) -> Dict[str, Any]: del output_size # Unused. if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( From eef729fb79c6e079b01ae91390af8454d0f5c7b9 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 21:06:21 +0000 Subject: [PATCH 39/96] comments --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 7 ++++++- examples/aqlm_test.py | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 729b6f854e6dc..2e58d13d82295 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -4,6 +4,9 @@ #include #include +#include +#include + void code1x16_matvec_cuda( const void* A, const void* B, @@ -156,6 +159,8 @@ torch::Tensor aqlm_gemm( { return code2x8_matmat(input, codes, codebooks, scales, bias); } - // TODO error somehow. + + std::cerr << "AQLM does not support " << nbooks << " codebooks with " << entries << " entries"; + std::abort(); return {}; } diff --git a/examples/aqlm_test.py b/examples/aqlm_test.py index 7995a2db5a328..35289103b2e70 100644 --- a/examples/aqlm_test.py +++ b/examples/aqlm_test.py @@ -6,9 +6,14 @@ model = LLM("BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", enforce_eager=True, tensor_parallel_size=2) -# These have custom code and the old format, and puzzling and conflicting stats, which probably I shouldn't even try to support. +# this has the codes 0 and 1 transposed. 
#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) +# this model hangs +#model = LLM("BlackSamorez/Mixtral-8x7B-Instruct-v0_1-AQLM-2Bit-1x16-hf", enforce_eager=True) + +# These have custom code and the old format, and puzzling and conflicting stats, which probably I shouldn't even try to support. #model = LLM("BlackSamorez/Llama-2-13b-AQLM-2Bit-1x16-hf", enforce_eager=True, trust_remote_code=True) +#model = LLM("BlackSamorez/Mixtral-8x7b-AQLM-2Bit-1x16-hf", enforce_eager=True) sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = model.generate("Hello my name is", sampling_params=sampling_params) From d31241b8744a88fc6fc38f4e809f03ff32c53923 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 21:38:24 +0000 Subject: [PATCH 40/96] comments --- vllm/model_executor/layers/quantization/aqlm.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index d7180c0226c88..9dcf36ef6ef5b 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -40,9 +40,8 @@ def __init__( self.num_codebooks = num_codebooks self.out_group_size = out_group_size - # I think pack factor is *probably* how many elements fit into one quantized tensor element. - # though out group size makes it interesting, because really we are doing 2D blocks, potentially. - # maybe this is vllms first 2D packing? Arg. + # out_group_size > 1 is untested, and probably won't work as-is. + assert(self.out_group_size == 1) self.pack_factor = (self.in_group_size * self.out_group_size) def __repr__(self) -> str: @@ -116,7 +115,11 @@ def create_weights(self, input_size_per_partition: int, codes = Parameter( torch.empty( - output_size_per_partition, # not entirely sure what to do with num_out_groups, if we need this pack factor. + # There could actually be two pack factors, one along input and one along output, + # but we don't currently support out_group_size, + # and only the one along output needs to be marked with "packed_dim". + # in order for QKVLinear to work. + output_size_per_partition, input_size_per_partition // self.quant_config.pack_factor, self.quant_config.num_codebooks, dtype=get_int_dtype(self.quant_config.nbits_per_codebook), From ba3c1256b7135bab0fa6a6fcb4b61240445d3288 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 21:41:42 +0000 Subject: [PATCH 41/96] rename aqlm_test --- examples/{aqlm_test.py => aqlm_example.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/{aqlm_test.py => aqlm_example.py} (100%) diff --git a/examples/aqlm_test.py b/examples/aqlm_example.py similarity index 100% rename from examples/aqlm_test.py rename to examples/aqlm_example.py From 703fa798254da2012501f0205e84d595744934e0 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 22:02:52 +0000 Subject: [PATCH 42/96] better comments --- examples/aqlm_example.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 35289103b2e70..ef548a22d5893 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -6,12 +6,12 @@ model = LLM("BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", enforce_eager=True, tensor_parallel_size=2) -# this has the codes 0 and 1 transposed. +# this has the dimensions 0 and 1 transposed for the codes, and we don't currently support 8x8 anyway. 
#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) -# this model hangs +# this model hangs, need to investigate. #model = LLM("BlackSamorez/Mixtral-8x7B-Instruct-v0_1-AQLM-2Bit-1x16-hf", enforce_eager=True) -# These have custom code and the old format, and puzzling and conflicting stats, which probably I shouldn't even try to support. +# These have custom code and no quantization_config block. #model = LLM("BlackSamorez/Llama-2-13b-AQLM-2Bit-1x16-hf", enforce_eager=True, trust_remote_code=True) #model = LLM("BlackSamorez/Mixtral-8x7b-AQLM-2Bit-1x16-hf", enforce_eager=True) From 6e47ff649cbb1c4d8867d4191c13c706582e10f9 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 1 Mar 2024 22:05:41 +0000 Subject: [PATCH 43/96] better comment --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 2e58d13d82295..35aeb67046070 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -160,7 +160,7 @@ torch::Tensor aqlm_gemm( return code2x8_matmat(input, codes, codebooks, scales, bias); } - std::cerr << "AQLM does not support " << nbooks << " codebooks with " << entries << " entries"; + std::cerr << "AQLM with " << nbooks << " codebooks and " << entries << " entries is not currently supported."; std::abort(); return {}; } From 556178f134e9c705251c4e01330e1d22416d4417 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Mon, 4 Mar 2024 20:25:56 +0000 Subject: [PATCH 44/96] first attempt --- csrc/ops.h | 1 + csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 47 ++++++++++++++++--- csrc/quantization/aqlm/aqlm_cuda_kernel.cu | 23 +++++++-- .../layers/quantization/aqlm.py | 43 +++++------------ 4 files changed, 71 insertions(+), 43 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index c70a04e1a8694..ec7ee30bf5015 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -74,6 +74,7 @@ torch::Tensor aqlm_gemm( const torch::Tensor& codes, const torch::Tensor& codebooks, const torch::Tensor& scales, + const torch::Tensor& codebook_partition_sizes, const std::optional& bias ); diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 35aeb67046070..e534c911b2b7e 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -13,7 +13,9 @@ void code1x16_matvec_cuda( void* C, const void* codebook, int prob_m, - int prob_k + int prob_k, + const int codebook_a_sizes[4], // cumulative sizes of A spanning each codebook, at most 3 long. + const int codebook_stride // as int4. ); void code2x8_matvec_cuda( @@ -29,8 +31,19 @@ void code1x16_matvec( const torch::Tensor& A, const torch::Tensor& B, torch::Tensor& C, - const torch::Tensor& codebook + const torch::Tensor& codebook, + const int codebook_a_sizes[4] // cumulative sizes of A spanning each codebook, at most 3 long. 
) { + + // @TEST + int stride = codebook.stride(0) * codebook.element_size() / sizeof(int4); + printf("codebook rank is %ld: %ld %ld %ld %ld", codebook.dim(),codebook.size(0),codebook.size(1),codebook.size(2),codebook.size(3)); + std::cout << "codebook element size is " << codebook.element_size() << "\n"; + std::cout << "sizeof int4 is " << sizeof(int4) << "\n"; + std::cout << "stride is " << stride << "\n"; + //std::cout << "codebook dtype is " << codebook.dtype << "\n"; + assert(false); + const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); int prob_m = C.size(0); int prob_k = B.size(0); @@ -40,7 +53,9 @@ void code1x16_matvec( C.data_ptr(), codebook.data_ptr(), prob_m, - prob_k + prob_k, + codebook_a_sizes, + codebook.stride(0) * codebook.element_size() / sizeof(int4) ); } @@ -49,8 +64,8 @@ torch::Tensor code1x16_matmat( const torch::Tensor& codes, const torch::Tensor& codebooks, const torch::Tensor& scales, - const std::optional& bias -) { + const int codebook_a_sizes[4], + const std::optional& bias) { auto input_sizes = input.sizes(); auto out_features = codes.size(0) * codebooks.size(2); auto flat_input = input.reshape({-1, input.size(-1)}); @@ -67,7 +82,8 @@ torch::Tensor code1x16_matmat( codes.squeeze(2), input_vec, output_vec, - codebooks + codebooks, + codebook_a_sizes ); } flat_output *= scales.flatten().unsqueeze(0); @@ -145,6 +161,7 @@ torch::Tensor aqlm_gemm( const torch::Tensor& codes, const torch::Tensor& codebooks, const torch::Tensor& scales, + const torch::Tensor& codebook_partition_sizes, const std::optional& bias ) { @@ -153,7 +170,23 @@ torch::Tensor aqlm_gemm( if (nbooks == 1 && entries == (1 << 16)) { - return code1x16_matmat(input, codes, codebooks, scales, bias); + int cumulative_sizes[4]; + int i =0; + int last = 0; + for (; i < codebook_partition_sizes.size(0); ++i) + { + cumulative_sizes[i] = codebook_partition_sizes[i] + last; + printf("cum size %d is %d", i, cumulative_sizes[i]); + last = cumulative_sizes[i]; + } + // just fill in the rest with unreachable. + for (; i < 4; ++i) + { + cumulative_sizes[i] = last*10; + printf("cum size %d is %d", i, cumulative_sizes[i]); + } + + return code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); } if (nbooks == 2 && entries == (1 << 8)) { diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu index 52d4b2e960cea..2bd6edd5f03e7 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu @@ -12,11 +12,22 @@ __global__ void Code1x16MatVec( const int4* __restrict__ B, int4* __restrict__ C, const int4* __restrict__ codebook, - int prob_m, - int prob_k + const int prob_m, + const int prob_k, + const int codebook_a_sizes[4], // cumulative sizes of A spanning each codebook, at most 3 long. + const int codebook_stride // as int4. ) { int a_gl_stride = prob_k / 8 / 8; int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + + // advance to the codebook we are in, this easy because we only multiply one column of the codebook. 
+ int codebook_index = 0; + while (a_gl_rd >= codebook_a_sizes[codebook_index]) + { + codebook += codebook_stride; + ++codebook_index; + } + bool pred = a_gl_rd < prob_m; int b_gl_rd = 0; int c_gl_wr = a_gl_rd; @@ -156,7 +167,9 @@ void code1x16_matvec_cuda( void* __restrict__ C, const void* __restrict__ codebook, int prob_m, - int prob_k + int prob_k, + const int codebook_a_sizes[4], + const int codebook_stride ) { int sms; cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); @@ -176,7 +189,9 @@ void code1x16_matvec_cuda( (int4*) C, (const int4*) codebook, prob_m, - prob_k + prob_k, + codebook_a_sizes, + codebook_stride ); } diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 9dcf36ef6ef5b..b115ea7d37b0d 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -134,7 +134,6 @@ def create_weights(self, input_size_per_partition: int, "output_dim": 0, "packed_dim": 1, "pack_factor": self.quant_config.pack_factor, - "output_partition_sizes": output_partition_sizes }, ) @@ -152,6 +151,7 @@ def create_weights(self, input_size_per_partition: int, codebooks, { "shard_dim": 0, + "output_partition_sizes": output_partition_sizes, }, ) @@ -192,45 +192,24 @@ def apply_weights( codebooks = weights["codebooks"] codes = weights["codes"] scales = weights["scales"] + output_partition_sizes = getattr(codebooks, "output_partition_sizes", + None) - shard_dim = getattr(codebooks, "shard_dim", None) - if shard_dim is not None: - output_shape = x.shape[:-1] + (scales.shape[0], ) - output = torch.empty(output_shape, dtype=x.dtype, device=x.device) - output_partition_sizes = getattr(codes, "output_partition_sizes", - None) - num_outputs = len(output_partition_sizes) - - # break the inputs and codebooks apart then combine the outputs. 
- assert (shard_dim == 0) - num_codebooks = codebooks.shape[shard_dim] // num_outputs - assert (scales.shape[0] == codes.shape[0]) - assert (sum(output_partition_sizes) == scales.shape[0]) - output_offset = 0 - codebooks_offset = 0 - for output_size in output_partition_sizes: - shard_output = ops.aqlm_gemm( - x, codes.narrow(0, output_offset, output_size), - codebooks.narrow(shard_dim, codebooks_offset, - num_codebooks), - scales.narrow(0, output_offset, output_size), - None if bias is None else bias.narrow( - 0, output_offset, output_size)) - - output_slice = output.narrow(-1, output_offset, output_size) - assert (output_slice.shape == shard_output.shape) - output_slice.copy_(shard_output) - output_offset += output_size - codebooks_offset += num_codebooks - - return output + #test + print("codes shape", codes.shape) + print("code books shape", codebooks.shape) + print("partition sizes", output_partition_sizes) + print("input shape", x.shape) output = ops.aqlm_gemm( x, codes, codebooks, scales, + output_partition_sizes, bias, ) + print("output shape", output.shape) + return output From e23f1cd5bab0abcc8c8ab4d734f2955902cce2bb Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 16:21:40 +0000 Subject: [PATCH 45/96] got it working --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 37 +++++++------------ csrc/quantization/aqlm/aqlm_cuda_kernel.cu | 12 +++--- examples/aqlm_example.py | 4 +- vllm/model_executor/layers/linear.py | 2 +- .../layers/quantization/aqlm.py | 2 +- 5 files changed, 24 insertions(+), 33 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index e534c911b2b7e..ec271435fe15d 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -14,7 +14,7 @@ void code1x16_matvec_cuda( const void* codebook, int prob_m, int prob_k, - const int codebook_a_sizes[4], // cumulative sizes of A spanning each codebook, at most 3 long. + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. const int codebook_stride // as int4. ); @@ -32,21 +32,12 @@ void code1x16_matvec( const torch::Tensor& B, torch::Tensor& C, const torch::Tensor& codebook, - const int codebook_a_sizes[4] // cumulative sizes of A spanning each codebook, at most 3 long. + const int4 codebook_a_sizes // cumulative sizes of A spanning each codebook, at most 3 long. 
) { - - // @TEST - int stride = codebook.stride(0) * codebook.element_size() / sizeof(int4); - printf("codebook rank is %ld: %ld %ld %ld %ld", codebook.dim(),codebook.size(0),codebook.size(1),codebook.size(2),codebook.size(3)); - std::cout << "codebook element size is " << codebook.element_size() << "\n"; - std::cout << "sizeof int4 is " << sizeof(int4) << "\n"; - std::cout << "stride is " << stride << "\n"; - //std::cout << "codebook dtype is " << codebook.dtype << "\n"; - assert(false); - const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); int prob_m = C.size(0); int prob_k = B.size(0); + code1x16_matvec_cuda( A.data_ptr(), B.data_ptr(), @@ -64,7 +55,7 @@ torch::Tensor code1x16_matmat( const torch::Tensor& codes, const torch::Tensor& codebooks, const torch::Tensor& scales, - const int codebook_a_sizes[4], + const int4 codebook_a_sizes, const std::optional& bias) { auto input_sizes = input.sizes(); auto out_features = codes.size(0) * codebooks.size(2); @@ -165,25 +156,25 @@ torch::Tensor aqlm_gemm( const std::optional& bias ) { - int const nbooks = codebooks.size(0); + int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); int const entries = codebooks.size(1); if (nbooks == 1 && entries == (1 << 16)) { - int cumulative_sizes[4]; + int4 cumulative_sizes; + auto cumulative_size = &cumulative_sizes.x; int i =0; int last = 0; - for (; i < codebook_partition_sizes.size(0); ++i) + assert(codebook_partition_sizes.size(0) <= 4); + for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) { - cumulative_sizes[i] = codebook_partition_sizes[i] + last; - printf("cum size %d is %d", i, cumulative_sizes[i]); - last = cumulative_sizes[i]; + *cumulative_size = codebook_partition_sizes[i].item() + last; + last = *cumulative_size; } - // just fill in the rest with unreachable. - for (; i < 4; ++i) + // fill in the rest with unreachable. + for (; i < 4; ++i, ++cumulative_size) { - cumulative_sizes[i] = last*10; - printf("cum size %d is %d", i, cumulative_sizes[i]); + *cumulative_size = last*10; } return code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu index 2bd6edd5f03e7..4d8bdc6e47861 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu @@ -14,18 +14,18 @@ __global__ void Code1x16MatVec( const int4* __restrict__ codebook, const int prob_m, const int prob_k, - const int codebook_a_sizes[4], // cumulative sizes of A spanning each codebook, at most 3 long. + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. const int codebook_stride // as int4. ) { int a_gl_stride = prob_k / 8 / 8; int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - // advance to the codebook we are in, this easy because we only multiply one column of the codebook. - int codebook_index = 0; - while (a_gl_rd >= codebook_a_sizes[codebook_index]) + // advance to the correct codebook, this easy because we only multiply one column of the codebook. 
+ auto codebook_size = &codebook_a_sizes.x; + while (a_gl_rd >= *codebook_size) { codebook += codebook_stride; - ++codebook_index; + ++codebook_size; } bool pred = a_gl_rd < prob_m; @@ -168,7 +168,7 @@ void code1x16_matvec_cuda( const void* __restrict__ codebook, int prob_m, int prob_k, - const int codebook_a_sizes[4], + const int4 codebook_a_sizes, const int codebook_stride ) { int sms; diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index ef548a22d5893..4ba4b4e16a9e5 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -1,10 +1,10 @@ from vllm import LLM, SamplingParams -#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) +model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf", enforce_eager=True) -model = LLM("BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", enforce_eager=True, tensor_parallel_size=2) +#model = LLM("BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", enforce_eager=True) # this has the dimensions 0 and 1 transposed for the codes, and we don't currently support 8x8 anyway. #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 3b880709733e2..4825dcff138c9 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -264,7 +264,7 @@ def weight_loader(self, param_data = param.data output_dim = getattr(param, "output_dim", None) - # shard_dim indicates fixed size concatenated at shard_id + # shard_dim indicates fixed size concatenated along shard_id shard_dim = getattr(param, "shard_dim", None) if loaded_shard_id is None: # Loaded weight is already packed. 
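The bookkeeping introduced in the entry point and kernel above is easier to see outside CUDA: aqlm_gemm folds the per-partition output sizes into an int4 of cumulative row boundaries, padded with unreachable sentinels, and each warp walks that short list to find the codebook its output row belongs to. The following is a rough Python sketch of that logic only; it is not part of the patch, and the function names and example partition sizes are illustrative.

def cumulative_codebook_boundaries(partition_sizes, max_entries=4):
    # Mirrors the C++ side: cumulative sizes of the output partitions,
    # padded with values no output row can reach (the kernel reads at
    # most four entries, packed into an int4).
    assert len(partition_sizes) <= max_entries
    boundaries, last = [], 0
    for size in partition_sizes:
        last += size
        boundaries.append(last)
    boundaries += [last * 10] * (max_entries - len(boundaries))
    return boundaries

def codebook_index_for_row(row, boundaries):
    # Mirrors the kernel's walk: advance while the row is past a boundary.
    index = 0
    while row >= boundaries[index]:
        index += 1
    return index

# Example: a merged projection with three output partitions.
sizes = [4096, 1024, 1024]
bounds = cumulative_codebook_boundaries(sizes)    # [4096, 5120, 6144, 61440]
assert codebook_index_for_row(0, bounds) == 0     # first partition's codebook
assert codebook_index_for_row(5000, bounds) == 1  # second partition's codebook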
diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index b115ea7d37b0d..91b153a5bb8b3 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -151,7 +151,7 @@ def create_weights(self, input_size_per_partition: int, codebooks, { "shard_dim": 0, - "output_partition_sizes": output_partition_sizes, + "output_partition_sizes": torch.tensor(output_partition_sizes, device='cpu'), }, ) From 6253807fbad681d6998a1c883097ed398d27dc8e Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 16:23:40 +0000 Subject: [PATCH 46/96] remove prints --- vllm/model_executor/layers/quantization/aqlm.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 91b153a5bb8b3..6ca0e7d0cafeb 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -195,12 +195,6 @@ def apply_weights( output_partition_sizes = getattr(codebooks, "output_partition_sizes", None) - #test - print("codes shape", codes.shape) - print("code books shape", codebooks.shape) - print("partition sizes", output_partition_sizes) - print("input shape", x.shape) - output = ops.aqlm_gemm( x, codes, @@ -210,6 +204,4 @@ def apply_weights( bias, ) - print("output shape", output.shape) - return output From 05ccd5072a98daa865d750c0c5defdeee0fc9383 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 17:22:37 +0000 Subject: [PATCH 47/96] add arguments and options --- examples/aqlm_example.py | 41 +++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index ef548a22d5893..65b73c26a6080 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -1,20 +1,35 @@ from vllm import LLM, SamplingParams +import argparse -#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", enforce_eager=True) +def main(): -#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf", enforce_eager=True) + # Create argument parser + parser = argparse.ArgumentParser(description='Example script with command-line arguments') -model = LLM("BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", enforce_eager=True, tensor_parallel_size=2) + # Add arguments + parser.add_argument('--model', '-m', type=int, default=0, help='Model ID [0-2]') + parser.add_argument('--tensor_parallel_size', '-t', type=int, default=1, help='tensor parallel size') -# this has the dimensions 0 and 1 transposed for the codes, and we don't currently support 8x8 anyway. -#model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) -# this model hangs, need to investigate. -#model = LLM("BlackSamorez/Mixtral-8x7B-Instruct-v0_1-AQLM-2Bit-1x16-hf", enforce_eager=True) + # Parse the command-line arguments + args = parser.parse_args() -# These have custom code and no quantization_config block. -#model = LLM("BlackSamorez/Llama-2-13b-AQLM-2Bit-1x16-hf", enforce_eager=True, trust_remote_code=True) -#model = LLM("BlackSamorez/Mixtral-8x7b-AQLM-2Bit-1x16-hf", enforce_eager=True) + # These are the verified working models. 
+ models = ["BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", "BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", "BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf"] -sampling_params = SamplingParams(max_tokens=100, temperature=0) -outputs = model.generate("Hello my name is", sampling_params=sampling_params) -print(outputs[0].outputs[0].text) + model = LLM(models[args.model], enforce_eager=True, tensor_parallel_size=args.tensor_parallel_size) + + # this has the dimensions 0 and 1 transposed for the codes, and we don't currently support 8x8 anyway. + #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) + # this model hangs, need to investigate. + #model = LLM("BlackSamorez/Mixtral-8x7B-Instruct-v0_1-AQLM-2Bit-1x16-hf", enforce_eager=True) + + # These have custom code and no quantization_config block. + #model = LLM("BlackSamorez/Llama-2-13b-AQLM-2Bit-1x16-hf", enforce_eager=True, trust_remote_code=True) + #model = LLM("BlackSamorez/Mixtral-8x7b-AQLM-2Bit-1x16-hf", enforce_eager=True) + + sampling_params = SamplingParams(max_tokens=100, temperature=0) + outputs = model.generate("Hello my name is", sampling_params=sampling_params) + print(outputs[0].outputs[0].text) + +if __name__ == '__main__': + main() \ No newline at end of file From 7b67492cc6d01243c198cde9a8d3603de1b923a4 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 17:24:02 +0000 Subject: [PATCH 48/96] rename shard_dim to just bool is_metadata --- vllm/model_executor/layers/linear.py | 20 +++++++++---------- .../layers/quantization/aqlm.py | 12 +++++------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 3b880709733e2..8f66bf6b1e677 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -264,8 +264,7 @@ def weight_loader(self, param_data = param.data output_dim = getattr(param, "output_dim", None) - # shard_dim indicates fixed size concatenated at shard_id - shard_dim = getattr(param, "shard_dim", None) + is_metadata = getattr(param, "is_metadata", False) if loaded_shard_id is None: # Loaded weight is already packed. if output_dim is None: @@ -306,10 +305,11 @@ def weight_loader(self, start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - elif shard_dim is not None: - shard_size = loaded_weight.shape[shard_dim] + elif is_metadata: + # metadata indicates fixed size concatenated along dim 0 + shard_size = loaded_weight.shape[0] shard_offset = loaded_shard_id * shard_size - param_data = param_data.narrow(shard_dim, shard_offset, shard_size) + param_data = param_data.narrow(0, shard_offset, shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -389,8 +389,7 @@ def weight_loader(self, loaded_shard_id: Optional[str] = None): param_data = param.data output_dim = getattr(param, "output_dim", None) - shard_dim = getattr(param, "shard_dim", None) - + is_metadata = getattr(param, "is_metadata", False) if loaded_shard_id is None: # Loaded weight is already packed. 
if output_dim is None: @@ -445,10 +444,11 @@ def weight_loader(self, start_idx = shard_id * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - elif shard_dim is not None: - shard_size = loaded_weight.shape[shard_dim] + elif is_metadata: + # metadata indicates fixed size concatenated along dim 0 + shard_size = loaded_weight.shape[0] shard_index = ["q", "k", "v"].index(loaded_shard_id) - param_data = param_data.narrow(shard_dim, shard_index * shard_size, + param_data = param_data.narrow(0, shard_index * shard_size, shard_size) else: ignore_warning = getattr(param, "ignore_warning", False) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 9dcf36ef6ef5b..b606370909a04 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -151,7 +151,8 @@ def create_weights(self, input_size_per_partition: int, set_weight_attrs( codebooks, { - "shard_dim": 0, + # metadata indicates fixed size concatenated along dim 0 + "is_metadata": True, }, ) @@ -193,8 +194,8 @@ def apply_weights( codes = weights["codes"] scales = weights["scales"] - shard_dim = getattr(codebooks, "shard_dim", None) - if shard_dim is not None: + is_metadata = getattr(codebooks, "is_metadata", False) + if is_metadata: output_shape = x.shape[:-1] + (scales.shape[0], ) output = torch.empty(output_shape, dtype=x.dtype, device=x.device) output_partition_sizes = getattr(codes, "output_partition_sizes", @@ -202,8 +203,7 @@ def apply_weights( num_outputs = len(output_partition_sizes) # break the inputs and codebooks apart then combine the outputs. - assert (shard_dim == 0) - num_codebooks = codebooks.shape[shard_dim] // num_outputs + num_codebooks = codebooks.shape[0] // num_outputs assert (scales.shape[0] == codes.shape[0]) assert (sum(output_partition_sizes) == scales.shape[0]) output_offset = 0 @@ -211,7 +211,7 @@ def apply_weights( for output_size in output_partition_sizes: shard_output = ops.aqlm_gemm( x, codes.narrow(0, output_offset, output_size), - codebooks.narrow(shard_dim, codebooks_offset, + codebooks.narrow(0, codebooks_offset, num_codebooks), scales.narrow(0, output_offset, output_size), None if bias is None else bias.narrow( From 3aafb3cab762e8851ff8ab77d7037fa70c69227a Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 17:39:22 +0000 Subject: [PATCH 49/96] use TORCH_CHECK --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index ec271435fe15d..c7a1d606911d5 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -184,7 +184,6 @@ torch::Tensor aqlm_gemm( return code2x8_matmat(input, codes, codebooks, scales, bias); } - std::cerr << "AQLM with " << nbooks << " codebooks and " << entries << " entries is not currently supported."; - std::abort(); + TORCH_CHECK(False, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") return {}; } From ef608a612c1f98c8c9fa91a8a4ac1018f7cd32c7 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 17:40:39 +0000 Subject: [PATCH 50/96] cleanup aqlm_example --- examples/aqlm_example.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 65b73c26a6080..eee3877d73327 100644 --- a/examples/aqlm_example.py +++ 
b/examples/aqlm_example.py @@ -3,14 +3,11 @@ def main(): - # Create argument parser parser = argparse.ArgumentParser(description='Example script with command-line arguments') - # Add arguments parser.add_argument('--model', '-m', type=int, default=0, help='Model ID [0-2]') parser.add_argument('--tensor_parallel_size', '-t', type=int, default=1, help='tensor parallel size') - # Parse the command-line arguments args = parser.parse_args() # These are the verified working models. From 5bacc9d0fbb8db708220ed19ea0a10afcab0bee9 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 19:10:02 +0000 Subject: [PATCH 51/96] format --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 3 +- examples/aqlm_example.py | 34 ++++++++++++++----- vllm/config.py | 2 +- .../layers/quantization/aqlm.py | 5 ++- 4 files changed, 30 insertions(+), 14 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 35aeb67046070..c2fa92c678df5 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -160,7 +160,6 @@ torch::Tensor aqlm_gemm( return code2x8_matmat(input, codes, codebooks, scales, bias); } - std::cerr << "AQLM with " << nbooks << " codebooks and " << entries << " entries is not currently supported."; - std::abort(); + TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") return {}; } diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index eee3877d73327..6e6dc07a9f7a8 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -1,20 +1,36 @@ from vllm import LLM, SamplingParams import argparse + def main(): - parser = argparse.ArgumentParser(description='Example script with command-line arguments') + parser = argparse.ArgumentParser( + description='Example script with command-line arguments') - parser.add_argument('--model', '-m', type=int, default=0, help='Model ID [0-2]') - parser.add_argument('--tensor_parallel_size', '-t', type=int, default=1, help='tensor parallel size') + parser.add_argument('--model', + '-m', + type=int, + default=0, + help='Model ID [0-2]') + parser.add_argument('--tensor_parallel_size', + '-t', + type=int, + default=1, + help='tensor parallel size') args = parser.parse_args() # These are the verified working models. - models = ["BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", "BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", "BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf"] + models = [ + "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", + "BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", + "BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf" + ] + + model = LLM(models[args.model], + enforce_eager=True, + tensor_parallel_size=args.tensor_parallel_size) - model = LLM(models[args.model], enforce_eager=True, tensor_parallel_size=args.tensor_parallel_size) - # this has the dimensions 0 and 1 transposed for the codes, and we don't currently support 8x8 anyway. #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) # this model hangs, need to investigate. 
@@ -25,8 +41,10 @@ def main(): #model = LLM("BlackSamorez/Mixtral-8x7b-AQLM-2Bit-1x16-hf", enforce_eager=True) sampling_params = SamplingParams(max_tokens=100, temperature=0) - outputs = model.generate("Hello my name is", sampling_params=sampling_params) + outputs = model.generate("Hello my name is", + sampling_params=sampling_params) print(outputs[0].outputs[0].text) + if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/vllm/config.py b/vllm/config.py index 4448ee9dac017..d8825397e0fe8 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, Union, ClassVar +from typing import Optional, Union, ClassVar from dataclasses import dataclass import os from packaging.version import Version diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index b606370909a04..9bea3dfb87e37 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -41,7 +41,7 @@ def __init__( self.out_group_size = out_group_size # out_group_size > 1 is untested, and probably won't work as-is. - assert(self.out_group_size == 1) + assert (self.out_group_size == 1) self.pack_factor = (self.in_group_size * self.out_group_size) def __repr__(self) -> str: @@ -211,8 +211,7 @@ def apply_weights( for output_size in output_partition_sizes: shard_output = ops.aqlm_gemm( x, codes.narrow(0, output_offset, output_size), - codebooks.narrow(0, codebooks_offset, - num_codebooks), + codebooks.narrow(0, codebooks_offset, num_codebooks), scales.narrow(0, output_offset, output_size), None if bias is None else bias.narrow( 0, output_offset, output_size)) From 2def434a0c543c7d3394bcbd0e59d4d3d0a185c5 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 19:17:45 +0000 Subject: [PATCH 52/96] some stuff --- vllm/model_executor/layers/linear.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 8f66bf6b1e677..eb1778b95548e 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -154,7 +154,7 @@ def __init__( skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, linear_method: Optional[LinearMethodBase] = None, - output_sizes: List[int] = [0], + output_sizes: Optional[List[int]] = None, ): super().__init__() @@ -171,6 +171,8 @@ def __init__( self.params_dtype = params_dtype if linear_method is None: linear_method = UnquantizedLinearMethod() + if output_sizes is None: + output_sizes = [output_size] self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( self.input_size, self.output_size_per_partition, self.input_size, From 821ee99ef29be6aba8993d27b08e6d91921e54da Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 19:45:24 +0000 Subject: [PATCH 53/96] change 60 to 70 for min cap --- vllm/model_executor/layers/quantization/aqlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 9bea3dfb87e37..8c0a30c86e3dd 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -61,7 +61,7 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]: @classmethod # Need to figure it out def get_min_capability(cls) -> int: - return 60 + return 70 @classmethod def 
get_config_filenames(cls) -> List[str]: From d0816bf1f53d72cac1ade0f0d65fefae18419bd4 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 19:49:52 +0000 Subject: [PATCH 54/96] format --- vllm/model_executor/layers/quantization/aqlm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index ff0a7763c7e49..ce7824be74b4e 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -151,8 +151,10 @@ def create_weights(self, input_size_per_partition: int, codebooks, { # metadata indicates fixed size concatenated along dim 0 - "is_metadata": True, - "output_partition_sizes": torch.tensor(output_partition_sizes, device='cpu'), + "is_metadata": + True, + "output_partition_sizes": + torch.tensor(output_partition_sizes, device='cpu'), }, ) From 6372c64eeed9bed56ef5a7a4298bb299b02fb820 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 19:59:24 +0000 Subject: [PATCH 55/96] make aqlm not rocm supported --- vllm/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index d8825397e0fe8..7d53d03c1abb6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -156,7 +156,7 @@ def _verify_tokenizer_mode(self) -> None: def _verify_quantization(self) -> None: supported_quantization = ["aqlm", "awq", "gptq", "squeezellm"] - rocm_not_supported_quantization = ["awq"] + rocm_not_supported_quantization = ["aqlm", "awq"] if self.quantization is not None: self.quantization = self.quantization.lower() From 83c207077429c45b96d486a6736663def2488728 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 21:58:00 +0000 Subject: [PATCH 56/96] Add LICENSE file --- csrc/quantization/aqlm/LICENSE | 201 +++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 csrc/quantization/aqlm/LICENSE diff --git a/csrc/quantization/aqlm/LICENSE b/csrc/quantization/aqlm/LICENSE new file mode 100644 index 0000000000000..6d83e5c5d2c26 --- /dev/null +++ b/csrc/quantization/aqlm/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2024] [AQLM authors] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
From 267b3390be9c3424b36660af37a1c164ab5a9f3f Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 21:59:58 +0000 Subject: [PATCH 57/96] add reference --- csrc/quantization/aqlm/LICENSE | 2 ++ 1 file changed, 2 insertions(+) diff --git a/csrc/quantization/aqlm/LICENSE b/csrc/quantization/aqlm/LICENSE index 6d83e5c5d2c26..bfa740da977e9 100644 --- a/csrc/quantization/aqlm/LICENSE +++ b/csrc/quantization/aqlm/LICENSE @@ -1,3 +1,5 @@ +Contains code from https://github.com/Vahe1994/AQLM + Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ From 040878966c7097bd726fce8659dc930232dff997 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 5 Mar 2024 22:02:32 +0000 Subject: [PATCH 58/96] add better license headers --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 17 ++++++++++++++++- csrc/quantization/aqlm/aqlm_cuda_kernel.cu | 17 ++++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 1b5e69893617d..302a0a3fda6fe 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -1,4 +1,19 @@ -// Adapted from https://github.com/Vahe1994/AQLM +/* + * Modified by Neural Magic + * Adapted from https://github.com/Vahe1994/AQLM + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include #include diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu index 4d8bdc6e47861..1d8046eb01c5c 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu @@ -1,4 +1,19 @@ -// Adapted from https://github.com/Vahe1994/AQLM +/* + * Modified by Neural Magic + * Adapted from https://github.com/Vahe1994/AQLM + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ #include #include From 48838b8a822c3c1a4980320d29bd8f46f654ae34 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 7 Mar 2024 17:26:08 +0000 Subject: [PATCH 59/96] add support for 2x8 optimization --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 21 ++++++---- csrc/quantization/aqlm/aqlm_cuda_kernel.cu | 40 ++++++++++++++----- .../layers/quantization/aqlm.py | 1 - 3 files changed, 45 insertions(+), 17 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 302a0a3fda6fe..2e4ed8ac4eb41 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -39,7 +39,9 @@ void code2x8_matvec_cuda( void* C, const void* codebook, int prob_m, - int prob_k + int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. + const int codebook_stride // as int4. ); void code1x16_matvec( @@ -109,7 +111,8 @@ void code2x8_matvec( const torch::Tensor& A, const torch::Tensor& B, torch::Tensor& C, - const torch::Tensor& codebook + const torch::Tensor& codebook, + const int4 codebook_a_sizes ) { const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); int prob_m = C.size(0); @@ -120,7 +123,9 @@ void code2x8_matvec( C.data_ptr(), codebook.data_ptr(), prob_m, - prob_k + prob_k, + codebook_a_sizes, + 2 * codebook.stride(0) * codebook.element_size() / sizeof(int4) ); } @@ -129,6 +134,7 @@ torch::Tensor code2x8_matmat( const torch::Tensor& codes, const torch::Tensor& codebooks, const torch::Tensor& scales, + const int4 codebook_a_sizes, const std::optional& bias ) { auto input_sizes = input.sizes(); @@ -147,7 +153,8 @@ torch::Tensor code2x8_matmat( codes.squeeze(2), input_vec, output_vec, - codebooks + codebooks, + codebook_a_sizes ); } flat_output *= scales.flatten().unsqueeze(0); @@ -174,8 +181,6 @@ torch::Tensor aqlm_gemm( int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); int const entries = codebooks.size(1); - if (nbooks == 1 && entries == (1 << 16)) - { int4 cumulative_sizes; auto cumulative_size = &cumulative_sizes.x; int i =0; @@ -192,11 +197,13 @@ torch::Tensor aqlm_gemm( *cumulative_size = last*10; } + if (nbooks == 1 && entries == (1 << 16)) + { return code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); } if (nbooks == 2 && entries == (1 << 8)) { - return code2x8_matmat(input, codes, codebooks, scales, bias); + return code2x8_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); } TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu index 1d8046eb01c5c..9ae6a7eeb1587 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu @@ -34,16 +34,19 @@ __global__ void Code1x16MatVec( ) { int a_gl_stride = prob_k / 8 / 8; int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; - // advance to the correct codebook, this easy because we only multiply one column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) + if (pred) { - codebook += codebook_stride; - ++codebook_size; + // advance to the correct codebook, this easy because we only multiply one column of the codebook. 
+ auto codebook_size = &codebook_a_sizes.x; + while (a_gl_rd >= *codebook_size) + { + codebook += codebook_stride; + ++codebook_size; + } } - bool pred = a_gl_rd < prob_m; int b_gl_rd = 0; int c_gl_wr = a_gl_rd; a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; @@ -104,11 +107,26 @@ __global__ void Code2x8MatVec( int4* __restrict__ C, const int4* __restrict__ codebook, int prob_m, - int prob_k + int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. + const int codebook_stride // as int4. + ) { int a_gl_stride = prob_k / 8 / 8; int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); bool pred = a_gl_rd < prob_m; + + if (pred) + { + // advance to the correct codebook, this easy because we only multiply one column of the codebook. + auto codebook_size = &codebook_a_sizes.x; + while (a_gl_rd >= *codebook_size) + { + codebook += codebook_stride; + ++codebook_size; + } + } + int b_gl_rd = 0; int c_gl_wr = a_gl_rd; a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; @@ -216,7 +234,9 @@ void code2x8_matvec_cuda( void* __restrict__ C, const void* __restrict__ codebook, int prob_m, - int prob_k + int prob_k, + const int4 codebook_a_sizes, + const int codebook_stride ) { int sms; cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); @@ -240,6 +260,8 @@ void code2x8_matvec_cuda( (int4*) C, (const int4*) codebook, prob_m, - prob_k + prob_k, + codebook_a_sizes, + codebook_stride ); } diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index ce7824be74b4e..f4f95cec91174 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -9,7 +9,6 @@ from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs from vllm.model_executor.layers.quantization.base_config import QuantizationConfig - def get_int_dtype(nbits: int) -> torch.dtype: if nbits <= 8: return torch.int8 From 482262947aa78df4972135bc4c2a890917f1ed1a Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 7 Mar 2024 17:26:23 +0000 Subject: [PATCH 60/96] format --- vllm/model_executor/layers/quantization/aqlm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index f4f95cec91174..ce7824be74b4e 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -9,6 +9,7 @@ from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs from vllm.model_executor.layers.quantization.base_config import QuantizationConfig + def get_int_dtype(nbits: int) -> torch.dtype: if nbits <= 8: return torch.int8 From c255f443885472035b8dafe44e7cb3d3c1d89e94 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 7 Mar 2024 19:10:17 +0000 Subject: [PATCH 61/96] add better example models, and replace output_partition_size with sizes --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 2 +- examples/aqlm_example.py | 2 ++ vllm/model_executor/layers/linear.py | 26 +++++++++---------- .../layers/quantization/aqlm.py | 14 +++++++--- .../model_executor/layers/quantization/awq.py | 14 +++++++--- .../layers/quantization/gptq.py | 4 +-- .../layers/quantization/squeezellm.py | 14 +++++++--- 7 files changed, 48 insertions(+), 28 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 2e4ed8ac4eb41..4cb874b27e698 100644 --- 
a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -198,7 +198,7 @@ torch::Tensor aqlm_gemm( } if (nbooks == 1 && entries == (1 << 16)) - { + { return code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); } if (nbooks == 2 && entries == (1 << 8)) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 6e6dc07a9f7a8..691dbb68f0685 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -22,6 +22,8 @@ def main(): # These are the verified working models. models = [ + "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", + "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf", "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", "BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", "BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf" diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index eb1778b95548e..857367340ec34 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -22,9 +22,9 @@ class LinearMethodBase(ABC): @abstractmethod def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, params_dtype: torch.dtype, - output_partition_sizes: List[int]) -> Dict[str, Any]: + output_partition_sizes: List[int], input_size: int, + output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: """Create weights for a linear layer.""" raise NotImplementedError @@ -49,9 +49,10 @@ def __init__(self, separate_bias_add: bool = False): self.separate_bias_add = separate_bias_add def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, params_dtype: torch.dtype, - output_partition_sizes: List[int]) -> Dict[str, Any]: + output_partition_sizes: List[int], input_size: int, + output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + output_size_per_partition = sum(output_partition_sizes) weight = Parameter(torch.empty(output_size_per_partition, input_size_per_partition, dtype=params_dtype), @@ -105,8 +106,8 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size, self.output_size, self.input_size, - self.output_size, self.params_dtype, [self.output_size]) + self.input_size, [self.output_size], self.input_size, + self.output_size, self.params_dtype) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -175,9 +176,8 @@ def __init__( output_sizes = [output_size] self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size, self.output_size_per_partition, self.input_size, - self.output_size, self.params_dtype, - [x // tp_size for x in output_sizes]) + self.input_size, [x // tp_size for x in output_sizes], + self.input_size, self.output_size, self.params_dtype) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): @@ -518,8 +518,8 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size_per_partition, self.output_size, self.input_size, - self.output_size, self.params_dtype, [self.output_size]) + self.input_size_per_partition, [self.output_size], self.input_size, + self.output_size, self.params_dtype) for name, weight in self.linear_weights.items(): if 
isinstance(weight, torch.Tensor): self.register_parameter(name, weight) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index ce7824be74b4e..e1d33d4ae168d 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -93,10 +93,14 @@ class AQLMLinearMethod(LinearMethodBase): def __init__(self, quant_config: AQLMConfig): self.quant_config = quant_config - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, params_dtype: torch.dtype, - output_partition_sizes: List[int]) -> Dict[str, Any]: + def create_weights( + self, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: del output_size # Unused. del input_size # Unused. @@ -107,6 +111,8 @@ def create_weights(self, input_size_per_partition: int, "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size.") + + output_size_per_partition = sum(output_partition_sizes) if output_size_per_partition % self.quant_config.out_group_size != 0: raise ValueError( "The output size is not aligned with the quantized " diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index e6c7c658e1e19..5751920590bd5 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -77,15 +77,21 @@ class AWQLinearMethod(LinearMethodBase): def __init__(self, quant_config: AWQConfig): self.quant_config = quant_config - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, params_dtype: torch.dtype, - output_partition_sizes: List[int]) -> Dict[str, Any]: + def create_weights( + self, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size.") + + output_size_per_partition = sum(output_partition_sizes) if output_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( "The output size is not aligned with the quantized " diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 36a60990a8049..a044d5f219d3f 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -90,11 +90,10 @@ def __init__(self, quant_config: GPTQConfig): def create_weights( self, input_size_per_partition: int, - output_size_per_partition: int, + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, - output_partition_sizes: List[int], ) -> Dict[str, Any]: del output_size # Unused. if input_size_per_partition % self.quant_config.group_size != 0: @@ -102,6 +101,7 @@ def create_weights( "The input size is not aligned with the quantized " "weight shape. 
This can be caused by too large " "tensor parallel size.") + output_size_per_partition = sum(output_partition_sizes) if output_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( "The output size is not aligned with the quantized " diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index e266e8a74af69..0769cc71e8d0c 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -67,15 +67,21 @@ class SqueezeLLMLinearMethod(LinearMethodBase): def __init__(self, quant_config: SqueezeLLMConfig): self.quant_config = quant_config - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, params_dtype: torch.dtype, - output_partition_sizes: List[int]) -> Dict[str, Any]: + def create_weights( + self, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: if input_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size.") + + output_size_per_partition = sum(output_partition_sizes) qweight = Parameter( torch.empty( input_size_per_partition // self.quant_config.pack_factor, From 15d7206f4dd7452f0eae8d3bf61c1b158abc91f8 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 7 Mar 2024 19:20:05 +0000 Subject: [PATCH 62/96] format --- vllm/config.py | 4 +++- vllm/model_executor/layers/linear.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 598b8cf7a3aef..a62865f954490 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -157,7 +157,9 @@ def _verify_tokenizer_mode(self) -> None: self.tokenizer_mode = tokenizer_mode def _verify_quantization(self) -> None: - supported_quantization = ["aqlm", "awq", "gptq", "squeezellm", "marlin"] + supported_quantization = [ + "aqlm", "awq", "gptq", "squeezellm", "marlin" + ] rocm_not_supported_quantization = ["aqlm", "awq", "marlin"] if self.quantization is not None: self.quantization = self.quantization.lower() diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 8c0e654a09919..3101536c9e0d6 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -410,7 +410,7 @@ def weight_loader(self, param_data = param.data output_dim = getattr(param, "output_dim", None) is_metadata = getattr(param, "is_metadata", False) - + if loaded_shard_id is None: # Loaded weight is already packed. if output_dim is None: From 8df10d974bc8f8a285bd0def150619e7fccc66c3 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 7 Mar 2024 21:11:35 +0000 Subject: [PATCH 63/96] Add test_aqlm.py --- tests/models/test_aqlm.py | 77 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 tests/models/test_aqlm.py diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py new file mode 100644 index 0000000000000..65eeeafe38c56 --- /dev/null +++ b/tests/models/test_aqlm.py @@ -0,0 +1,77 @@ +"""Compare the outputs of a AQLM model between vLLM and HF Transformers + +Run `pytest tests/models/test_aqlm.py --forked`. 
+""" + +import pytest +import torch +from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY + +capability = torch.cuda.get_device_capability() +capability = capability[0] * 10 + capability[1] +aqlm_not_supported = ( + capability < _QUANTIZATION_CONFIG_REGISTRY["aqlm"].get_min_capability()) + +# In this test we hardcode prompts and generations for the model so we don't need to require the AQLM package as a dependency +example_prompts = [ + 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.\n', + 'Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.\n', + 'Compare and contrast artificial intelligence with human intelligence in terms of processing information.\n', + 'Describe the basic components of a neural network and how it can be trained.\n', + 'Write a short story about a robot that dreams for the first time.\n', + 'Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.\n', + 'Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.\n', + "Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'\n" +] + +# These ground truth generations were generated using `transformers==4.38.1 aqlm==1.1.0 torch==2.2.0` +# and the below code: +# ```python +# from transformers import AutoTokenizer, AutoModelForCausalLM +# model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf" +# quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="cuda").cuda() +# tokenizer = AutoTokenizer.from_pretrained(model_id) +# outputs = [] +# for prompt in example_prompts: +# input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda") +# hf_outputs = quantized_model.generate(input_ids, max_new_tokens=32) +# outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:])) +# ``` +ground_truth_generations = [ + '\n### Features\n\n- **High-throughput**: vLLM is designed to be memory-efficient and high-throughput. It', + 'The major milestones in the development of artificial intelligence from 1950 to 2020 are as follows:\n1950', + 'Compare and contrast artificial intelligence with human intelligence in terms of processing information. The processing of information is a key component of artificial intelligence. The processing of information is', + 'Explain the difference between supervised and unsupervised learning.\nExplain the difference between feedforward and recurrent neural networks.\nExplain the difference', + 'Write a short story about a robot that dreams for the first time. The story should be about 1000 words.\nThe story should be', + 'Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. The COVID-19 pandemic has had a', + 'The Mona Lisa is a painting by Leonardo da Vinci, and it is considered to be one of the most famous paintings in the world. The', + 'The early bird catches the worm.\nThe early bird catches the worm. 
(Japanese)\nLe petit oiseau attrait' +] + + +@pytest.mark.skipif(aqlm_not_supported, + reason="AQLM is not supported on this GPU type.") +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [3]) +def test_models( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: + + vllm_model = vllm_runner(model, dtype=dtype, enforce_eager=True) + vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, + max_tokens, + num_logprobs) + + # loop through the prompts to compare against the ground truth generations + for prompt_idx in range(len(example_prompts)): + vllm_output_ids, vllm_output_str, vllm_logprobs = vllm_outputs[ + prompt_idx] + + assert vllm_output_str == ground_truth_generations[prompt_idx] From a3039dd1c86cc66d0340684cb1a934682431cddb Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 7 Mar 2024 23:00:07 +0000 Subject: [PATCH 64/96] remove comments --- examples/aqlm_example.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 691dbb68f0685..8a358e53a485c 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -20,28 +20,18 @@ def main(): args = parser.parse_args() - # These are the verified working models. models = [ "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf", + "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf", + "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf", "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", - "BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf", - "BlackSamorez/Llama-2-7b-AQLM-2Bit-2x8-hf" ] model = LLM(models[args.model], enforce_eager=True, tensor_parallel_size=args.tensor_parallel_size) - # this has the dimensions 0 and 1 transposed for the codes, and we don't currently support 8x8 anyway. - #model = LLM("BlackSamorez/Llama-2-7b-AQLM-2Bit-8x8-hf", enforce_eager=True) - # this model hangs, need to investigate. - #model = LLM("BlackSamorez/Mixtral-8x7B-Instruct-v0_1-AQLM-2Bit-1x16-hf", enforce_eager=True) - - # These have custom code and no quantization_config block. 
- #model = LLM("BlackSamorez/Llama-2-13b-AQLM-2Bit-1x16-hf", enforce_eager=True, trust_remote_code=True) - #model = LLM("BlackSamorez/Mixtral-8x7b-AQLM-2Bit-1x16-hf", enforce_eager=True) - sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = model.generate("Hello my name is", sampling_params=sampling_params) From 2ecce81309035714770b21cbc768872c4b529f8d Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 8 Mar 2024 15:10:33 +0000 Subject: [PATCH 65/96] put aqlm inside rocm block --- csrc/ops.h | 2 +- csrc/pybind.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index 3a81874ff4b24..ea0722adf9621 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -69,6 +69,7 @@ void gelu_fast( torch::Tensor& out, torch::Tensor& input); +#ifndef USE_ROCM torch::Tensor aqlm_gemm( const torch::Tensor& input, const torch::Tensor& codes, @@ -78,7 +79,6 @@ torch::Tensor aqlm_gemm( const std::optional& bias ); -#ifndef USE_ROCM torch::Tensor awq_gemm( torch::Tensor _in_feats, torch::Tensor _kernel, diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 6b23c706e252f..ba2c00147d7f4 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -53,8 +53,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); // Quantization ops - ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM"); #ifndef USE_ROCM + ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM"); ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ"); ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ"); From 5864a00352f00a9686b6bcf1dc75fde33a7d3cac Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 8 Mar 2024 18:39:56 +0000 Subject: [PATCH 66/96] add model to example --- examples/aqlm_example.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 8a358e53a485c..ca014be91405e 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -5,13 +5,18 @@ def main(): parser = argparse.ArgumentParser( - description='Example script with command-line arguments') + description='AQLM examples') parser.add_argument('--model', '-m', + type=str, + default=None, + help='model path, as for HF') + parser.add_argument('--choice', + '-c', type=int, default=0, - help='Model ID [0-2]') + help='known good models by index, [0-4]') parser.add_argument('--tensor_parallel_size', '-t', type=int, @@ -28,7 +33,7 @@ def main(): "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", ] - model = LLM(models[args.model], + model = LLM(args.model if args.model is not None else models[args.choice], enforce_eager=True, tensor_parallel_size=args.tensor_parallel_size) From 58dbb014f326fde320af85fba0545bfc922a0522 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 8 Mar 2024 18:46:27 +0000 Subject: [PATCH 67/96] remove comment --- vllm/model_executor/layers/quantization/aqlm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index e1d33d4ae168d..c069e8c006861 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -59,7 +59,6 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.half] @classmethod - # Need to figure it out def get_min_capability(cls) -> int: return 70 From 
7dc5f83a775ed1a6b97f5689f7c3329f6db83c8d Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 8 Mar 2024 20:23:02 +0000 Subject: [PATCH 68/96] format --- examples/aqlm_example.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index ca014be91405e..468364d935d0c 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -4,8 +4,7 @@ def main(): - parser = argparse.ArgumentParser( - description='AQLM examples') + parser = argparse.ArgumentParser(description='AQLM examples') parser.add_argument('--model', '-m', From 8069375b6303502bdfa7ce819bc2ac18b5984af0 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 8 Mar 2024 20:36:27 +0000 Subject: [PATCH 69/96] fix test --- tests/models/test_aqlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 65eeeafe38c56..2464e7e20aa70 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -51,7 +51,7 @@ @pytest.mark.skipif(aqlm_not_supported, reason="AQLM is not supported on this GPU type.") -@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [3]) From 9891e22a2c95e9dc36d1b42eb1fe10c4b0018454 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 12 Mar 2024 16:57:23 +0000 Subject: [PATCH 70/96] Add dequantization kernel --- .../layers/quantization/aqlm.py | 101 ++++++++++++++++-- 1 file changed, 93 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index c069e8c006861..a192f120cd9b8 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -2,8 +2,10 @@ from typing import Any, Dict, List, Optional +import math import torch from torch.nn.parameter import Parameter +import torch.nn.functional as F from vllm._C import ops from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs @@ -22,6 +24,60 @@ def get_int_dtype(nbits: int) -> torch.dtype: raise ValueError(f"No dtype available for {nbits}-bit codebooks") +@torch.inference_mode() +def unpack_int_data(data: torch.IntTensor, nbits: int) -> torch.IntTensor: + return data.to(torch.int64) % (2**nbits) + + +def dequantize_weight(codes: torch.Tensor, + codebooks: torch.Tensor, + scales: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + Decode float weights from quantization codes. Differentiable. 
+ :param codes: tensor of integer quantization codes, shape [*dims, num_out_groups, num_in_groups, num_codebooks] + :param codebooks: tensor of vectors for each quantization code, [num_codebooks, codebook_size, out_group_size, in_group_size] + :param scales: weight will be multiplied by this factor, must be broadcastble with [*dims, out_groups, num_in_groups, out_group_size, in_group_size] + :return: reconstructed weight tensor of shape [*dims, num_in_groups*group_size] + """ + num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:] + num_codebooks, codebook_size, out_group_size, in_group_size = codebooks.shape + out_features = num_out_groups * out_group_size + in_features = num_in_groups * in_group_size + codebook_offsets = torch.arange( + 0, num_codebooks * codebook_size, codebook_size, + device=codes.device) # shape: [num_codebooks] + reconstructed_weight_flat = F.embedding_bag( + codes.flatten(0, -2) + codebook_offsets, + codebooks.flatten(0, 1).flatten(-2, -1), + mode="sum" + ) # [prod(dims) * num_out_groups * num_in_groups, out_group_size * in_group_size] + + reconstructed_weight_groupwise = reconstructed_weight_flat.view( + list(codes.shape[:-3]) + + [num_out_groups, num_in_groups, out_group_size, in_group_size]) + if scales is not None: + reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul( + scales) + return reconstructed_weight_groupwise.swapaxes( + -3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features]) + + +def dequantize_gemm( + input: torch.Tensor, # [..., in_features] + codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] + codebooks: torch. + Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] + scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + bias: Optional[torch.Tensor], +) -> torch.Tensor: + dequantized_weight = dequantize_weight( + unpack_int_data(codes, codebooks.shape[1].bit_length() - 1), + codebooks, + scales, + ) + return F.linear(input, dequantized_weight, bias) + + class AQLMConfig(QuantizationConfig): """Config class for AQLM. @@ -203,13 +259,42 @@ def apply_weights( output_partition_sizes = getattr(codebooks, "output_partition_sizes", None) - output = ops.aqlm_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) + output = None + + use_gemv = math.prod(x.shape[:-1]) <= 32 + + if not use_gemv: + output_shape = x.shape[:-1] + (scales.shape[0], ) + output = torch.empty(output_shape, dtype=x.dtype, device=x.device) + num_outputs = len(output_partition_sizes) + + # break the inputs and codebooks apart then combine the outputs. 
+ num_codebooks = codebooks.shape[0] // num_outputs + assert (scales.shape[0] == codes.shape[0]) + assert (sum(output_partition_sizes) == scales.shape[0]) + output_offset = 0 + codebooks_offset = 0 + for output_size in output_partition_sizes: + shard_output = dequantize_gemm( + x, codes.narrow(0, output_offset, output_size), + codebooks.narrow(0, codebooks_offset, num_codebooks), + scales.narrow(0, output_offset, output_size), + None if bias is None else bias.narrow( + 0, output_offset, output_size)) + + output_slice = output.narrow(-1, output_offset, output_size) + assert (output_slice.shape == shard_output.shape) + output_slice.copy_(shard_output) + output_offset += output_size + codebooks_offset += num_codebooks + else: + output = ops.aqlm_gemm( + x, + codes, + codebooks, + scales, + output_partition_sizes, + bias, + ) return output From a51192f9debccf25879cdee214ad91e6603207a8 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 12 Mar 2024 14:22:51 -0400 Subject: [PATCH 71/96] Update csrc/quantization/aqlm/aqlm_cuda_entry.cpp --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 30 +++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 4cb874b27e698..81a5c36f5afef 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -181,21 +181,21 @@ torch::Tensor aqlm_gemm( int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); int const entries = codebooks.size(1); - int4 cumulative_sizes; - auto cumulative_size = &cumulative_sizes.x; - int i =0; - int last = 0; - assert(codebook_partition_sizes.size(0) <= 4); - for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) - { - *cumulative_size = codebook_partition_sizes[i].item() + last; - last = *cumulative_size; - } - // fill in the rest with unreachable. - for (; i < 4; ++i, ++cumulative_size) - { - *cumulative_size = last*10; - } + int4 cumulative_sizes; + auto cumulative_size = &cumulative_sizes.x; + int i =0; + int last = 0; + assert(codebook_partition_sizes.size(0) <= 4); + for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) + { + *cumulative_size = codebook_partition_sizes[i].item() + last; + last = *cumulative_size; + } + // fill in the rest with unreachable. 
+ for (; i < 4; ++i, ++cumulative_size) + { + *cumulative_size = last*10; + } if (nbooks == 1 && entries == (1 << 16)) { From 992d5849fbf34e9c69df46759cd316ba31a95159 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 12 Mar 2024 14:23:24 -0400 Subject: [PATCH 72/96] Update csrc/quantization/aqlm/aqlm_cuda_entry.cpp --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 81a5c36f5afef..435cb90e69233 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -183,7 +183,7 @@ torch::Tensor aqlm_gemm( int4 cumulative_sizes; auto cumulative_size = &cumulative_sizes.x; - int i =0; + int i = 0; int last = 0; assert(codebook_partition_sizes.size(0) <= 4); for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) From 9143b453b168a5ecdd6dac0dde2bde1db6142f5f Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 12 Mar 2024 20:56:31 +0000 Subject: [PATCH 73/96] set gpu_memory_utilization --- examples/aqlm_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 468364d935d0c..766fc93809bac 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -33,7 +33,7 @@ def main(): ] model = LLM(args.model if args.model is not None else models[args.choice], - enforce_eager=True, + gpu_memory_utilization=.85, tensor_parallel_size=args.tensor_parallel_size) sampling_params = SamplingParams(max_tokens=100, temperature=0) From 5d24991d3ec3fc1b331648e0b4c46a392e7665a1 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 14 Mar 2024 19:58:51 +0000 Subject: [PATCH 74/96] add benchmark and refactor a bit. --- benchmarks/kernels/benchmark_aqlm.py | 122 ++++++++++++++++++ .../layers/quantization/aqlm.py | 90 +++++++------ 2 files changed, 175 insertions(+), 37 deletions(-) create mode 100644 benchmarks/kernels/benchmark_aqlm.py diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py new file mode 100644 index 0000000000000..f31ab6cbfb489 --- /dev/null +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -0,0 +1,122 @@ +import json +import os +import sys + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +from vllm.model_executor.layers.quantization.aqlm import dequantize_partioned_gemm +from vllm._C import ops + +import torch +import torch.nn.functional as F + +def main(): + methods = [ + dequantize_partioned_gemm, ops.aqlm_gemm + ] + + filename = "./benchmark.csv" + print(f"writing benchmarks to file {filename}") + with open(filename, "a") as f: + sys.stdout = f + + print('m | k | n', end='') + for method in methods: + print(f' | {method.__name__}', end='') + print('') + + # These are reasonable prefill sizes. + ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )), + (4096, (11008, 11008)), (11008, (4096, ))) + + # reasonable ranges for m. + for m in [ + 1, 2, 4, 8, #16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536, + #2048, 3072, 4096 + ]: + print(f'{m}', file=sys.__stdout__) + for ksp in ksandpartions: + run_grid(m, ksp[0], torch.tensor(ksp[1]), methods) + + sys.stdout = sys.__stdout__ + + +def run_grid(m: int, k: int, parts: torch.tensor, methods): + + num_warmup_trials = 1 + num_trials = 1 + + num_calls = 100 + + # warmup. 
+ for method in methods: + for _ in range(num_warmup_trials): + run_timing( + num_calls=num_calls, + m=m, + k=k, + parts=parts, + method=method, + ) + + n = parts.sum().item() + print(f'{m} | {k} | {n}:{parts.tolist()}', end='') + + for method in methods: + best_time_us = 1e20 + for _ in range(num_trials): + kernel_dur_ms = run_timing( + num_calls=num_calls, + m=m, + k=k, + parts=parts, + method=method, + ) + + kernel_dur_us = 1000 * kernel_dur_ms + + if kernel_dur_us < best_time_us: + best_time_us = kernel_dur_us + + print(f' | {kernel_dur_us:.0f}', end='') + + print('') + + +def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor, + method) -> float: + + n = parts.sum().item() + + device = torch.device('cuda:0') + + input = torch.randn((1, m, k), dtype=torch.float16, device=device) + + codes = torch.randint(-32768, + 32768, + size=(n, k // 8, 1), + dtype=torch.int16, + device=device) + + codebooks = torch.randn(size=(parts.shape[0], 65536, 1, 8), + dtype=torch.float16, + device=device) + + scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + start_event.record() + for i in range(num_calls): + output = method(input, codes, codebooks, scales, parts, None) + + end_event.record() + end_event.synchronize() + + dur_ms = start_event.elapsed_time(end_event) / num_calls + return dur_ms + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index a192f120cd9b8..13b9751d79898 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -78,6 +78,41 @@ def dequantize_gemm( return F.linear(input, dequantized_weight, bias) +def dequantize_partioned_gemm( + input: torch.Tensor, # [..., in_features] + codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] + codebooks: torch. + Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] + scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + output_shape = input.shape[:-1] + (scales.shape[0], ) + output = torch.empty(output_shape, dtype=input.dtype, device=input.device) + num_outputs = len(output_partition_sizes) + + # break the inputs and codebooks apart then combine the outputs. + # Surprisingly (to me) this is faster than doing 3 de-quants and 1 big multiply at the end. + num_codebooks = codebooks.shape[0] // num_outputs + assert (scales.shape[0] == codes.shape[0]) + assert (sum(output_partition_sizes) == scales.shape[0]) + output_offset = 0 + codebooks_offset = 0 + for output_size in output_partition_sizes: + shard_output = dequantize_gemm( + input, codes.narrow(0, output_offset, output_size), + codebooks.narrow(0, codebooks_offset, num_codebooks), + scales.narrow(0, output_offset, output_size), None + if bias is None else bias.narrow(0, output_offset, output_size)) + + output_slice = output.narrow(-1, output_offset, output_size) + assert (output_slice.shape == shard_output.shape) + output_slice.copy_(shard_output) + output_offset += output_size + codebooks_offset += num_codebooks + return output + + class AQLMConfig(QuantizationConfig): """Config class for AQLM. 
@@ -259,42 +294,23 @@ def apply_weights( output_partition_sizes = getattr(codebooks, "output_partition_sizes", None) - output = None - - use_gemv = math.prod(x.shape[:-1]) <= 32 - - if not use_gemv: - output_shape = x.shape[:-1] + (scales.shape[0], ) - output = torch.empty(output_shape, dtype=x.dtype, device=x.device) - num_outputs = len(output_partition_sizes) - - # break the inputs and codebooks apart then combine the outputs. - num_codebooks = codebooks.shape[0] // num_outputs - assert (scales.shape[0] == codes.shape[0]) - assert (sum(output_partition_sizes) == scales.shape[0]) - output_offset = 0 - codebooks_offset = 0 - for output_size in output_partition_sizes: - shard_output = dequantize_gemm( - x, codes.narrow(0, output_offset, output_size), - codebooks.narrow(0, codebooks_offset, num_codebooks), - scales.narrow(0, output_offset, output_size), - None if bias is None else bias.narrow( - 0, output_offset, output_size)) - - output_slice = output.narrow(-1, output_offset, output_size) - assert (output_slice.shape == shard_output.shape) - output_slice.copy_(shard_output) - output_offset += output_size - codebooks_offset += num_codebooks - else: - output = ops.aqlm_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) + use_gemv = math.prod( + x.shape[:-1]) <= 32 or output_partition_sizes is None + + output = ops.aqlm_gemm( + x, + codes, + codebooks, + scales, + output_partition_sizes, + bias, + ) if use_gemv else dequantize_partioned_gemm( + x, + codes, + codebooks, + scales, + output_partition_sizes, + bias, + ) return output From d9152e206bfb29c55109acc386e1eaf255a20934 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 17:31:54 +0000 Subject: [PATCH 75/96] add aqlm --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 66842e6845edd..eb1e88eb405b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -156,6 +156,8 @@ set(VLLM_EXT_SRC if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC + "csrc/quantization/aqlm/aqlm_cuda_entry.cpp" + "csrc/quantization/aqlm/aqlm_cuda_kernel.cu" "csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/marlin/marlin_cuda_kernel.cu" "csrc/custom_all_reduce.cu") From 0574dfffd892c9382b5b5f8fb9d4d4bc1cb9e737 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 17:41:32 +0000 Subject: [PATCH 76/96] Add dequant methods --- benchmarks/kernels/benchmark_aqlm.py | 192 ++++++++++++++++-- csrc/ops.h | 6 + csrc/pybind.cpp | 1 + csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 96 +++++++-- csrc/quantization/aqlm/aqlm_cuda_kernel.cu | 159 +++++++++++++++ vllm/model_executor/layers/linear.py | 3 +- .../layers/quantization/aqlm.py | 110 +++++++--- 7 files changed, 507 insertions(+), 60 deletions(-) diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index f31ab6cbfb489..8f2323c695830 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,28 +1,163 @@ -import json import os import sys +from typing import Optional os.environ['CUDA_VISIBLE_DEVICES'] = '0' -from vllm.model_executor.layers.quantization.aqlm import dequantize_partioned_gemm +from vllm.model_executor.layers.quantization.aqlm import ( + generic_dequantize_gemm, optimized_dequantize_gemm, dequantize_weight, + get_int_dtype) from vllm._C import ops import torch import torch.nn.functional as F + +def torch_mult( + input: torch.Tensor, # [..., in_features] + weights: torch.Tensor, + scales: torch.Tensor, # 
[num_out_groups, 1, 1, 1] +) -> torch.Tensor: + output = F.linear(input, weights) + return output + + +def dequant_out_scale( + input: torch.Tensor, # [..., in_features] + codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] + codebooks: torch. + Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] + scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + + weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) + + if bias is None: + output = F.linear(input, weights, bias) + orig_shape = output.shape + flattened_output = output.view(-1, output.size(-1)) + f_scales = scales.view(-1, scales.shape[0]) + b_scales = f_scales.expand(flattened_output.shape[0], -1) + flattened_output *= b_scales + return flattened_output.view(orig_shape) + else: + b_scales = scales.view(scales.shape[:-3] + (-1, )).expand( + -1, weights.shape[1]) + weights *= b_scales + return F.linear(input, weights, bias) + + +def dequant_weight_scale( + input: torch.Tensor, # [..., in_features] + codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] + codebooks: torch. + Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] + scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + + weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) + + b_scales = scales.view(scales.shape[:-3] + (-1, )).expand( + -1, weights.shape[1]) + weights *= b_scales + return F.linear(input, weights, bias) + + +def dequant_no_scale( + input: torch.Tensor, # [..., in_features] + codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] + codebooks: torch. + Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] + scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + + weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) + + return F.linear(input, weights, bias) + + +# Compare my kernel against the gold standard. +def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: + + n = parts.sum().item() + + device = torch.device('cuda:0') + + code_range = (1 << bits) // 2 + ingroups = 8 + + codes = torch.randint(-code_range, + code_range, + size=(n, k // ingroups, nbooks), + dtype=get_int_dtype(bits), + device=device) + + codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), + dtype=torch.float16, + device=device) + + count = 0 + for index in range(16): + for i in range(8): + for book in range(nbooks): + codebooks[book, index, 0, i] = count * (10**book) + count += 1 + + print("codes shape", codes.shape) + + for i in range(16): + for book in range(nbooks): + codes[0, i, book] = i + codes[0, -i, book] = i + + weights = dequantize_weight(codes, codebooks, None) # TODO Scales. 
+ weights2 = ops.aqlm_dequant(codes, codebooks, parts) + + print("weights shape:", weights.shape) + print("weights2 shape:", weights2.shape) + + print("weights are:", weights) + print("weights2 are:", weights2) + + print("first 128 weights are", weights[0, 0:128].to(torch.int32)) + print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32)) + + print("last 128 weights are", weights[0, -128:]) + print("last 128 weights2 are:", weights2[0, -128:]) + + def main(): + + nbooks = 2 + bits = 8 + + dequant_test(4096, torch.tensor((4096, )), nbooks, bits) + return + methods = [ - dequantize_partioned_gemm, ops.aqlm_gemm + ops.aqlm_gemm, + dequant_out_scale, + generic_dequantize_gemm, + optimized_dequantize_gemm, + dequant_weight_scale, + torch_mult, + dequant_no_scale, ] - filename = "./benchmark.csv" + filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv" print(f"writing benchmarks to file {filename}") - with open(filename, "a") as f: + with open(filename, "w") as f: sys.stdout = f - print('m | k | n', end='') + print('m | k | n | n parts', end='') for method in methods: - print(f' | {method.__name__}', end='') + print(f" | {method.__name__.replace('_', ' ')} (µs)", end='') print('') # These are reasonable prefill sizes. @@ -31,17 +166,19 @@ def main(): # reasonable ranges for m. for m in [ - 1, 2, 4, 8, #16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536, - #2048, 3072, 4096 + 1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112, + 128, 256, 512, 1024, 1536, 2048, 3072, 4096 ]: print(f'{m}', file=sys.__stdout__) for ksp in ksandpartions: - run_grid(m, ksp[0], torch.tensor(ksp[1]), methods) + run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, + methods) sys.stdout = sys.__stdout__ -def run_grid(m: int, k: int, parts: torch.tensor, methods): +def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, + methods): num_warmup_trials = 1 num_trials = 1 @@ -56,11 +193,13 @@ def run_grid(m: int, k: int, parts: torch.tensor, methods): m=m, k=k, parts=parts, + nbooks=nbooks, + bits=bits, method=method, ) n = parts.sum().item() - print(f'{m} | {k} | {n}:{parts.tolist()}', end='') + print(f'{m} | {k} | {n} | {parts.tolist()}', end='') for method in methods: best_time_us = 1e20 @@ -70,6 +209,8 @@ def run_grid(m: int, k: int, parts: torch.tensor, methods): m=m, k=k, parts=parts, + nbooks=nbooks, + bits=bits, method=method, ) @@ -84,7 +225,7 @@ def run_grid(m: int, k: int, parts: torch.tensor, methods): def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor, - method) -> float: + nbooks: int, bits: int, method) -> float: n = parts.sum().item() @@ -92,24 +233,35 @@ def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor, input = torch.randn((1, m, k), dtype=torch.float16, device=device) - codes = torch.randint(-32768, - 32768, - size=(n, k // 8, 1), - dtype=torch.int16, + code_range = (1 << bits) // 2 + ingroups = 8 + + codes = torch.randint(-code_range, + code_range, + size=(n, k // ingroups, nbooks), + dtype=get_int_dtype(bits), device=device) - codebooks = torch.randn(size=(parts.shape[0], 65536, 1, 8), + codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), dtype=torch.float16, device=device) scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device) + # for comparison to just a pytorch mult. 
+ weights = torch.randn((n, k), dtype=torch.float16, device=device) + start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) start_event.record() - for i in range(num_calls): - output = method(input, codes, codebooks, scales, parts, None) + + if method is torch_mult: + for i in range(num_calls): + output = torch_mult(input, weights, scales) + else: + for i in range(num_calls): + output = method(input, codes, codebooks, scales, parts, None) end_event.record() end_event.synchronize() diff --git a/csrc/ops.h b/csrc/ops.h index 8c495ea29f61c..ed0cf7d984ca0 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -93,6 +93,12 @@ torch::Tensor aqlm_gemm( const std::optional& bias ); +torch::Tensor aqlm_dequant( + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& codebook_partition_sizes +); + torch::Tensor awq_gemm( torch::Tensor _in_feats, torch::Tensor _kernel, diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 5f0839653b8b5..c99ae3ff54ab8 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -64,6 +64,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // Quantization ops #ifndef USE_ROCM ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM"); + ops.def("aqlm_dequant", &aqlm_dequant, "Decompression method for AQLM"); ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ"); ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ"); diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 435cb90e69233..7ebfbd7af9fa4 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -44,6 +44,32 @@ void code2x8_matvec_cuda( const int codebook_stride // as int4. ); +void code1x16_dequant( + void* weights, + const void* a, + const void* codebook, + const int a_rows, // code rows in element space, so k + const int a_cols, // code columns in element space, so n + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. + const int codebook_stride // as int4 +); + +void code2x8_dequant( + void* weights, + const void* a, + const void* codebook, + const int a_rows, // code rows in element space, so k + const int a_cols, // code columns in element space, so n + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. + const int codebook_stride // as int4 +); + + +int codebook_stride(const torch::Tensor& codebooks) +{ + return codebooks.stride(0) * codebooks.element_size() / sizeof(int4); +} + void code1x16_matvec( const torch::Tensor& A, const torch::Tensor& B, @@ -63,7 +89,7 @@ void code1x16_matvec( prob_m, prob_k, codebook_a_sizes, - codebook.stride(0) * codebook.element_size() / sizeof(int4) + codebook_stride(codebook) ); } @@ -125,7 +151,7 @@ void code2x8_matvec( prob_m, prob_k, codebook_a_sizes, - 2 * codebook.stride(0) * codebook.element_size() / sizeof(int4) + 2 * codebook_stride(codebook) ); } @@ -169,18 +195,9 @@ torch::Tensor code2x8_matmat( return output; } -torch::Tensor aqlm_gemm( - const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const torch::Tensor& codebook_partition_sizes, - const std::optional& bias -) +// Accumulate the partition sizes. 
+int4 accumulate_sizes (const torch::Tensor& codebook_partition_sizes) { - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); - int const entries = codebooks.size(1); - int4 cumulative_sizes; auto cumulative_size = &cumulative_sizes.x; int i = 0; @@ -196,6 +213,22 @@ torch::Tensor aqlm_gemm( { *cumulative_size = last*10; } + return cumulative_sizes; +} + +torch::Tensor aqlm_gemm( + const torch::Tensor& input, + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales, + const torch::Tensor& codebook_partition_sizes, + const std::optional& bias +) +{ + int4 cumulative_sizes = accumulate_sizes(codebook_partition_sizes); + + int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); + int const entries = codebooks.size(1); if (nbooks == 1 && entries == (1 << 16)) { @@ -209,3 +242,40 @@ torch::Tensor aqlm_gemm( TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") return {}; } + +torch::Tensor aqlm_dequant( + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& codebook_partition_sizes +) +{ + int4 cumulative_sizes = accumulate_sizes(codebook_partition_sizes); + + int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); + int const entries = codebooks.size(1); + + const at::cuda::OptionalCUDAGuard device_guard(device_of(codes)); + int rows = codes.size(1); + int cols = codes.size(0); + + auto weights = torch::empty({cols, rows * 8}, + torch::TensorOptions() + .dtype(codebooks.dtype()) + .device(codebooks.device()) + ); + + if (nbooks == 1 && entries == (1 << 16)) + { + code1x16_dequant(weights.data_ptr(), codes.data_ptr(), codebooks.data_ptr(), rows, cols, cumulative_sizes, codebook_stride(codebooks)); + return weights; + } + + if (nbooks == 2 && entries == (1 << 8)) + { + code2x8_dequant(weights.data_ptr(), codes.data_ptr(), codebooks.data_ptr(), rows, cols, cumulative_sizes, codebook_stride(codebooks)); + return weights; + } + + TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") + return {}; +} diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu index 9ae6a7eeb1587..9e9570ee0b195 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu @@ -188,6 +188,98 @@ __global__ void Code2x8MatVec( } } + +// Dequantizes the code and codebook into weights. +// We span horizontally and do an int4 at a time in an attempt to maximize throughput. +__global__ void Code1x16Dequant( + int4* __restrict__ weights, + const int4* __restrict__ a, + const int4* __restrict__ codebook, + const int a_rows, // code rows in int4 space, so same as stride. + const int a_cols, // code columns (matter?) + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. + const int codebook_stride // as int4 +) { + // Each thread decodes one int4 worth of codebook. + int a_col = blockIdx.x * 32 + threadIdx.x; + int a_row = blockIdx.y * 32 + threadIdx.y; + + // out of range + if (a_row >= a_rows) + return; + + const int weight_stride = a_rows * 8; // as int4 + weights += a_col * weight_stride + a_row * 8; + + // advance to the correct codebook, this easy because we only multiply one column of the codebook. 
+ auto codebook_size = &codebook_a_sizes.x; + while (a_col >= *codebook_size) + { + codebook += codebook_stride; + ++codebook_size; + } + + // do one int4 read and write, hopefully maxing out bandwidth. + int4 code_block = a[a_row + a_col * a_rows]; + const uint16_t* enc = reinterpret_cast(&code_block); + #pragma unroll + for (int i = 0; i < 8; i++) { + weights[i] = codebook[enc[i]]; + } +} + +// Dequantizes the code and codebook for 2x8 +// We span horizontally and do an int4 at a time in an attempt to maximize throughput. +__global__ void Code2x8Dequant( + int4* __restrict__ weights, + const int4* __restrict__ a, + const int4* __restrict__ codebook, + const int a_rows, // code rows in int4 space, so same as stride. + const int a_cols, // code columns (matter?) + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. + const int codebook_stride // as int4 +) { + // Each thread decodes one int4 worth of codebook. + int a_col = blockIdx.x * 32 + threadIdx.x; + int a_row = blockIdx.y * 32 + threadIdx.y; + + // out of range, can happen. + if (a_row >= a_rows) + return; + + const int weight_stride = a_rows * 8; // as int4 + weights += a_col * weight_stride + a_row * 8; + + // advance to the correct codebook, this easy because we only multiply one column of the codebook. + auto codebook_size = &codebook_a_sizes.x; + while (a_col >= *codebook_size) + { + // in pairs of two + codebook += codebook_stride * 2; + ++codebook_size; + } + + // do one int4 read to get it into local memory, hopefully maxing out bandwidth. + int4 code_block = a[a_row + a_col * a_rows]; + const uint8_t* enc = reinterpret_cast(&code_block); + #pragma unroll + for (int i = 0; i < 8; i++) { + int4 code1 = codebook[enc[i*2]]; + int4 code2 = (codebook + codebook_stride)[enc[i*2 + 1]]; + + half2* a = reinterpret_cast(&code1); + half2* b = reinterpret_cast(&code2); + #pragma unroll + for (int j = 0; j < 4; j++) + { + a[j].x = __hadd(a[j].x, b[j].x); + a[j].y = __hadd(a[j].y, b[j].y); + } + weights[i] = code1; + } +} + + inline int ceildiv(int a, int b) { return (a + b - 1) / b; } @@ -265,3 +357,70 @@ void code2x8_matvec_cuda( codebook_stride ); } + + +// Dequantizes the code and codebook into weights. +void code1x16_dequant( + void* __restrict__ weights, + const void* __restrict__ a, + const void* __restrict__ codebook, + const int a_rows, // code rows in element space, so k + const int a_cols, // code columns in element space, so n + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. + const int codebook_stride // as int4 +) { + dim3 threads(32, 32, 1); + + assert(a_cols % 32 == 0); + // each thread does one int4 worth. + assert(a_rows % 8 == 0); + + const int rows = a_rows/8; + + dim3 blocks(ceildiv(a_cols, 32), ceildiv(rows, 32), 1); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + Code1x16Dequant<<>>( + (int4*) weights, + (const int4*) a, + (const int4*) codebook, + rows, // in int4 space. + a_cols, + codebook_a_sizes, + codebook_stride + ); +} + +// Dequantizes the code and codebook into weights. +void code2x8_dequant( + void* __restrict__ weights, + const void* __restrict__ a, + const void* __restrict__ codebook, + const int a_rows, // code rows in element space, so k + const int a_cols, // code columns in element space, so n + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. 
+ const int codebook_stride // as int4 +) { + dim3 threads(32, 32, 1); + + assert(a_cols % 32 == 0); + // each thread does one int4 worth. + assert(a_rows % 8 == 0); + + const int rows = a_rows/8; + + dim3 blocks(ceildiv(a_cols, 32), ceildiv(rows, 32), 1); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + Code2x8Dequant<<>>( + (int4*) weights, + (const int4*) a, + (const int4*) codebook, + rows, // in int4 space. + a_cols, + codebook_a_sizes, + codebook_stride + ); +} + + diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index f53f9be6ad599..76f6d15c70ddb 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -151,7 +151,8 @@ class ColumnParallelLinear(torch.nn.Module): skip adding bias but instead return it. params_dtype: Data type for the parameters. linear_method: (Maybe quantized) linear method. - output_sizes: list of output sizes packed into one output, like for QKV the list would be size 3. + output_sizes: list of output sizes packed into one output, like for QKV + the list would be size 3. """ def __init__( diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 13b9751d79898..b49f7736684d5 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -1,4 +1,5 @@ -# Supports AQLM compression, see https://github.com/Vahe1994/AQLM and https://arxiv.org/pdf/2401.06118.pdf +# Supports AQLM compression, see https://github.com/Vahe1994/AQLM +# and https://arxiv.org/pdf/2401.06118.pdf from typing import Any, Dict, List, Optional @@ -9,7 +10,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) def get_int_dtype(nbits: int) -> torch.dtype: @@ -34,10 +36,15 @@ def dequantize_weight(codes: torch.Tensor, scales: Optional[torch.Tensor] = None) -> torch.Tensor: """ Decode float weights from quantization codes. Differentiable. 
- :param codes: tensor of integer quantization codes, shape [*dims, num_out_groups, num_in_groups, num_codebooks] - :param codebooks: tensor of vectors for each quantization code, [num_codebooks, codebook_size, out_group_size, in_group_size] - :param scales: weight will be multiplied by this factor, must be broadcastble with [*dims, out_groups, num_in_groups, out_group_size, in_group_size] - :return: reconstructed weight tensor of shape [*dims, num_in_groups*group_size] + :param codes: tensor of integer quantization codes, shape + [*dims, num_out_groups, num_in_groups, num_codebooks] + :param codebooks: tensor of vectors for each quantization code, + [num_codebooks, codebook_size, out_group_size, in_group_size] + :param scales: weight will be multiplied by this factor, must be + broadcastble with + [*dims, out_groups, num_in_groups, out_group_size, in_group_size] + :return: reconstructed weight tensor of shape + [*dims, num_in_groups*group_size] """ num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:] num_codebooks, codebook_size, out_group_size, in_group_size = codebooks.shape @@ -50,7 +57,8 @@ def dequantize_weight(codes: torch.Tensor, codes.flatten(0, -2) + codebook_offsets, codebooks.flatten(0, 1).flatten(-2, -1), mode="sum" - ) # [prod(dims) * num_out_groups * num_in_groups, out_group_size * in_group_size] + ) # [prod(dims) * num_out_groups * num_in_groups, out_group_size + # * in_group_size] reconstructed_weight_groupwise = reconstructed_weight_flat.view( list(codes.shape[:-3]) + @@ -78,7 +86,8 @@ def dequantize_gemm( return F.linear(input, dequantized_weight, bias) -def dequantize_partioned_gemm( +# Generic dequantization, slow but flexible. +def generic_dequantize_gemm( input: torch.Tensor, # [..., in_features] codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] codebooks: torch. @@ -92,7 +101,8 @@ def dequantize_partioned_gemm( num_outputs = len(output_partition_sizes) # break the inputs and codebooks apart then combine the outputs. - # Surprisingly (to me) this is faster than doing 3 de-quants and 1 big multiply at the end. + # Surprisingly (to me) this is faster than doing 3 de-quants and 1 big + # multiply at the end. num_codebooks = codebooks.shape[0] // num_outputs assert (scales.shape[0] == codes.shape[0]) assert (sum(output_partition_sizes) == scales.shape[0]) @@ -113,6 +123,35 @@ def dequantize_partioned_gemm( return output +# Optimized dequnantize/decompression kernels, supports 1x16 and 2x8 +# at 6 and 9 times faster than the generic version above, respectively. +def optimized_dequantize_gemm( + input: torch.Tensor, # [..., in_features] + codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] + codebooks: torch. + Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] + scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) + + if bias is None: + # scaling the output is fastest, so we do that when possible. 
+ output = F.linear(input, weights, bias) + orig_shape = output.shape + flattened_output = output.view(-1, output.size(-1)) + f_scales = scales.view(-1, scales.shape[0]) + b_scales = f_scales.expand(flattened_output.shape[0], -1) + flattened_output *= b_scales + return output.view(orig_shape) + else: + b_scales = scales.view(scales.shape[:-3] + (-1, )).expand( + -1, weights.shape[1]) + weights *= b_scales + return F.linear(input, weights, bias) + + class AQLMConfig(QuantizationConfig): """Config class for AQLM. @@ -211,10 +250,10 @@ def create_weights( codes = Parameter( torch.empty( - # There could actually be two pack factors, one along input and one along output, - # but we don't currently support out_group_size, - # and only the one along output needs to be marked with "packed_dim". - # in order for QKVLinear to work. + # There could actually be two pack factors, one along input and + # one along output, but we don't currently support + # out_group_size, and only the one along output needs to be + # marked with "packed_dim" in order for QKVLinear to work. output_size_per_partition, input_size_per_partition // self.quant_config.pack_factor, self.quant_config.num_codebooks, @@ -294,17 +333,38 @@ def apply_weights( output_partition_sizes = getattr(codebooks, "output_partition_sizes", None) - use_gemv = math.prod( - x.shape[:-1]) <= 32 or output_partition_sizes is None - - output = ops.aqlm_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) if use_gemv else dequantize_partioned_gemm( + nbooks = codes.shape[2] + ingroups = codebooks.shape[3] + outgroups = codebooks.shape[2] + bits = codebooks.shape[1] + + # We support these formats with dedicated gemm and decompression + # kernels. + if ingroups == 8 and outgroups == 1 and ( + (bits == 256 and nbooks == 2) or (bits == 65536 and nbooks == 1)): + + # thresholds determined by timings on an A6000 + m_threshold = 8 if bits == 65536 else 12 + use_gemv = math.prod(x.shape[:-1]) <= m_threshold + + return ops.aqlm_gemm( + x, + codes, + codebooks, + scales, + output_partition_sizes, + bias, + ) if use_gemv else optimized_dequantize_gemm( + x, + codes, + codebooks, + scales, + output_partition_sizes, + bias, + ) + + # fall back all unoptimized formats + return generic_dequantize_gemm( x, codes, codebooks, @@ -312,5 +372,3 @@ def apply_weights( output_partition_sizes, bias, ) - - return output From 39ca4a03b9b4d8605ba65ededcb978f33659b8dd Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 17:44:58 +0000 Subject: [PATCH 77/96] fix format --- benchmarks/kernels/benchmark_aqlm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index 8f2323c695830..37d75ff6d020f 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -258,10 +258,10 @@ def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor, if method is torch_mult: for i in range(num_calls): - output = torch_mult(input, weights, scales) + torch_mult(input, weights, scales) else: for i in range(num_calls): - output = method(input, codes, codebooks, scales, parts, None) + method(input, codes, codebooks, scales, parts, None) end_event.record() end_event.synchronize() From 522f99021d76dd64e77a1f6e74a4904fe51b914f Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 17:45:41 +0000 Subject: [PATCH 78/96] formatA --- benchmarks/kernels/benchmark_aqlm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index 37d75ff6d020f..9ec8f20e4d8cd 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -2,8 +2,6 @@ import sys from typing import Optional -os.environ['CUDA_VISIBLE_DEVICES'] = '0' - from vllm.model_executor.layers.quantization.aqlm import ( generic_dequantize_gemm, optimized_dequantize_gemm, dequantize_weight, get_int_dtype) @@ -12,6 +10,8 @@ import torch import torch.nn.functional as F +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + def torch_mult( input: torch.Tensor, # [..., in_features] From d2ac6b2ec9688d10c0fa2716180ff9cc64b92068 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 17:48:37 +0000 Subject: [PATCH 79/96] some format fixes --- tests/models/test_aqlm.py | 3 ++- vllm/model_executor/layers/quantization/aqlm.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 2464e7e20aa70..088c995e0c149 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -5,7 +5,8 @@ import pytest import torch -from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY +from vllm.model_executor.layers.quantization import ( + _QUANTIZATION_CONFIG_REGISTRY) capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index b49f7736684d5..12e198d3daa7e 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -47,7 +47,8 @@ def dequantize_weight(codes: torch.Tensor, [*dims, num_in_groups*group_size] """ num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:] - num_codebooks, codebook_size, out_group_size, in_group_size = codebooks.shape + num_codebooks, codebook_size, out_group_size, in_group_size = \ + codebooks.shape out_features = num_out_groups * out_group_size in_features = num_in_groups * in_group_size codebook_offsets = torch.arange( From bb66e3cc414a061b151522b5f8265c3cd8d7bb3f Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 17:51:53 +0000 Subject: [PATCH 80/96] formatting --- tests/models/test_aqlm.py | 53 ++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 088c995e0c149..5e1d57f5c3b43 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -15,22 +15,31 @@ # In this test we hardcode prompts and generations for the model so we don't need to require the AQLM package as a dependency example_prompts = [ - 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.\n', - 'Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.\n', - 'Compare and contrast artificial intelligence with human intelligence in terms of processing information.\n', - 'Describe the basic components of a neural network and how it can be trained.\n', + 'vLLM is a high-throughput and memory-efficient inference and serving ' + 'engine for LLMs.\n', + 'Briefly describe the major milestones in the development of artificial ' + 'intelligence from 1950 to 2020.\n', + 'Compare and contrast artificial intelligence with human intelligence in ' + 'terms of processing information.\n', + 'Describe the basic components of a neural 
network and how it can be ' + 'trained.\n', 'Write a short story about a robot that dreams for the first time.\n', - 'Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.\n', - 'Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.\n', - "Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'\n" + 'Analyze the impact of the COVID-19 pandemic on global economic structures ' + 'and future business models.\n', + 'Explain the cultural significance of the Mona Lisa painting, and how its ' + 'perception might vary in Western versus Eastern societies.\n', + "Translate the following English sentence into Japanese, French, and " + "Swahili: 'The early bird catches the worm.'\n" ] -# These ground truth generations were generated using `transformers==4.38.1 aqlm==1.1.0 torch==2.2.0` +# These ground truth generations were generated using `transformers==4.38.1 +# aqlm==1.1.0 torch==2.2.0` # and the below code: # ```python # from transformers import AutoTokenizer, AutoModelForCausalLM # model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf" -# quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="cuda").cuda() +# quantized_model = AutoModelForCausalLM.from_pretrained(model_id, +# torch_dtype="auto", device_map="cuda").cuda() # tokenizer = AutoTokenizer.from_pretrained(model_id) # outputs = [] # for prompt in example_prompts: @@ -39,14 +48,24 @@ # outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:])) # ``` ground_truth_generations = [ - '\n### Features\n\n- **High-throughput**: vLLM is designed to be memory-efficient and high-throughput. It', - 'The major milestones in the development of artificial intelligence from 1950 to 2020 are as follows:\n1950', - 'Compare and contrast artificial intelligence with human intelligence in terms of processing information. The processing of information is a key component of artificial intelligence. The processing of information is', - 'Explain the difference between supervised and unsupervised learning.\nExplain the difference between feedforward and recurrent neural networks.\nExplain the difference', - 'Write a short story about a robot that dreams for the first time. The story should be about 1000 words.\nThe story should be', - 'Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. The COVID-19 pandemic has had a', - 'The Mona Lisa is a painting by Leonardo da Vinci, and it is considered to be one of the most famous paintings in the world. The', - 'The early bird catches the worm.\nThe early bird catches the worm. (Japanese)\nLe petit oiseau attrait' + '\n### Features\n\n- **High-throughput**: vLLM is designed to be ' + 'memory-efficient and high-throughput. It', + 'The major milestones in the development of artificial intelligence from ' + '1950 to 2020 are as follows:\n1950', + 'Compare and contrast artificial intelligence with human intelligence in ' + 'terms of processing information. The processing of information is a key ' + 'component of artificial intelligence. The processing of information is', + 'Explain the difference between supervised and unsupervised ' + 'learning.\nExplain the difference between feedforward and recurrent ' + 'neural networks.\nExplain the difference', + 'Write a short story about a robot that dreams for the first time. 
The ' + 'story should be about 1000 words.\nThe story should be', + 'Analyze the impact of the COVID-19 pandemic on global economic structures ' + 'and future business models. The COVID-19 pandemic has had a', + 'The Mona Lisa is a painting by Leonardo da Vinci, and it is considered to ' + 'be one of the most famous paintings in the world. The', + 'The early bird catches the worm.\nThe early bird catches the worm. ' + '(Japanese)\nLe petit oiseau attrait' ] From 11c7950e226c8696f7398b4e8f6520098f6d4923 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 17:52:24 +0000 Subject: [PATCH 81/96] format --- tests/models/test_aqlm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 5e1d57f5c3b43..aedd707406e75 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -13,7 +13,8 @@ aqlm_not_supported = ( capability < _QUANTIZATION_CONFIG_REGISTRY["aqlm"].get_min_capability()) -# In this test we hardcode prompts and generations for the model so we don't need to require the AQLM package as a dependency +# In this test we hardcode prompts and generations for the model so we don't +# need to require the AQLM package as a dependency example_prompts = [ 'vLLM is a high-throughput and memory-efficient inference and serving ' 'engine for LLMs.\n', From fb78b9504b2f3bf2e46805376ddcbfb7cb741d40 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 17:54:52 +0000 Subject: [PATCH 82/96] remove dead space --- tests/models/test_aqlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index aedd707406e75..47c8f3db6ea33 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -13,7 +13,7 @@ aqlm_not_supported = ( capability < _QUANTIZATION_CONFIG_REGISTRY["aqlm"].get_min_capability()) -# In this test we hardcode prompts and generations for the model so we don't +# In this test we hardcode prompts and generations for the model so we don't # need to require the AQLM package as a dependency example_prompts = [ 'vLLM is a high-throughput and memory-efficient inference and serving ' From d73a92beb3ed37f3bba8100953a40dc8c769abd7 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Thu, 21 Mar 2024 20:31:30 +0000 Subject: [PATCH 83/96] niceties for aqlm benchmark --- benchmarks/kernels/benchmark_aqlm.py | 44 +++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index 9ec8f20e4d8cd..e9383a8f1fc1a 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,5 +1,6 @@ import os import sys +import argparse from typing import Optional from vllm.model_executor.layers.quantization.aqlm import ( @@ -82,7 +83,9 @@ def dequant_no_scale( return F.linear(input, weights, bias) -# Compare my kernel against the gold standard. +# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against +# the generic pytorch version. +# Just visual comparison. def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: n = parts.sum().item() @@ -116,7 +119,7 @@ def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: codes[0, i, book] = i codes[0, -i, book] = i - weights = dequantize_weight(codes, codebooks, None) # TODO Scales. 
+ weights = dequantize_weight(codes, codebooks, None) weights2 = ops.aqlm_dequant(codes, codebooks, parts) print("weights shape:", weights.shape) @@ -134,12 +137,36 @@ def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: def main(): - nbooks = 2 - bits = 8 - - dequant_test(4096, torch.tensor((4096, )), nbooks, bits) - return - + parser = argparse.ArgumentParser(description="Benchmark aqlm performance.") + + # Add arguments + parser.add_argument("--nbooks", + type=int, + default=1, + help="Number of codebooks (default: 1)") + parser.add_argument("--bits", + type=int, + default=16, + help="Number of bits per code element (default: 16)") + parser.add_argument( + "--test", + type=bool, + default=False, + help="Run the decompression/dequant tester rather than benchmarking " + "(default: False)") + + # Parse the arguments + args = parser.parse_args() + + # Extract values + nbooks = args.nbooks + bits = args.bits + + if args.test: + dequant_test(4096, torch.tensor((4096, )), nbooks, bits) + return + + # Otherwise, benchmark. methods = [ ops.aqlm_gemm, dequant_out_scale, @@ -180,6 +207,7 @@ def main(): def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, methods): + # I didn't see visible improvements from increasing these, but feel free :) num_warmup_trials = 1 num_trials = 1 From 44065550693fe99f4eec2d8a1f65c5421bb65697 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 22 Mar 2024 17:19:23 +0000 Subject: [PATCH 84/96] update the test file --- examples/aqlm_example.py | 7 ++++--- tests/models/test_aqlm.py | 10 +++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 766fc93809bac..bacf68fac401f 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -32,9 +32,10 @@ def main(): "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", ] - model = LLM(args.model if args.model is not None else models[args.choice], - gpu_memory_utilization=.85, - tensor_parallel_size=args.tensor_parallel_size) + model = LLM( + args.model if args.model is not None else models[args.choice], + #gpu_memory_utilization=.85, + tensor_parallel_size=args.tensor_parallel_size) sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = model.generate("Hello my name is", diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 47c8f3db6ea33..c814b9e70711d 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -57,16 +57,16 @@ 'terms of processing information. The processing of information is a key ' 'component of artificial intelligence. The processing of information is', 'Explain the difference between supervised and unsupervised ' - 'learning.\nExplain the difference between feedforward and recurrent ' - 'neural networks.\nExplain the difference', + 'learning.\nExplain the difference between a feedforward neural network ' + 'and a recurrent neural network.\n', 'Write a short story about a robot that dreams for the first time. The ' 'story should be about 1000 words.\nThe story should be', 'Analyze the impact of the COVID-19 pandemic on global economic structures ' 'and future business models. The COVID-19 pandemic has had a', 'The Mona Lisa is a painting by Leonardo da Vinci, and it is considered to ' 'be one of the most famous paintings in the world. The', - 'The early bird catches the worm.\nThe early bird catches the worm. 
' - '(Japanese)\nLe petit oiseau attrait' + "Translate the following English sentence into Japanese, French, and " + "Swahili: 'The early bird catches the worm.'\nThe early bird catches" ] @@ -85,7 +85,7 @@ def test_models( num_logprobs: int, ) -> None: - vllm_model = vllm_runner(model, dtype=dtype, enforce_eager=True) + vllm_model = vllm_runner(model, dtype=dtype) vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, max_tokens, num_logprobs) From 36223428d0f73d59314eb5374015f6232e0e7468 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Fri, 22 Mar 2024 17:20:13 +0000 Subject: [PATCH 85/96] remove gpu_memory_utilization reduction --- examples/aqlm_example.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index bacf68fac401f..f32605420d0b0 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -34,7 +34,6 @@ def main(): model = LLM( args.model if args.model is not None else models[args.choice], - #gpu_memory_utilization=.85, tensor_parallel_size=args.tensor_parallel_size) sampling_params = SamplingParams(max_tokens=100, temperature=0) From e2b3529e1c0159cebd58e55ec5ab2b5d1db9e3ae Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 26 Mar 2024 14:24:11 +0000 Subject: [PATCH 86/96] port over better dequant kernels from aqlm --- csrc/quantization/aqlm/aqlm_cuda_entry.cpp | 63 +++-- csrc/quantization/aqlm/aqlm_cuda_kernel.cu | 253 ++++++++++++--------- examples/aqlm_example.py | 5 +- 3 files changed, 190 insertions(+), 131 deletions(-) diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp index 7ebfbd7af9fa4..683488a2bb4ef 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp +++ b/csrc/quantization/aqlm/aqlm_cuda_entry.cpp @@ -44,22 +44,22 @@ void code2x8_matvec_cuda( const int codebook_stride // as int4. ); -void code1x16_dequant( - void* weights, - const void* a, +void code1x16_dequant_cuda( + const void* A, + void* C, const void* codebook, - const int a_rows, // code rows in element space, so k - const int a_cols, // code columns in element space, so n - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. - const int codebook_stride // as int4 + int prob_m, + int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. + const int codebook_stride // as int4. ); -void code2x8_dequant( - void* weights, - const void* a, +void code2x8_dequant_cuda( + const void* A, + void* C, const void* codebook, - const int a_rows, // code rows in element space, so k - const int a_cols, // code columns in element space, so n + int prob_m, + int prob_k, const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. const int codebook_stride // as int4 ); @@ -196,7 +196,7 @@ torch::Tensor code2x8_matmat( } // Accumulate the partition sizes. 
-int4 accumulate_sizes (const torch::Tensor& codebook_partition_sizes) +int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes) { int4 cumulative_sizes; auto cumulative_size = &cumulative_sizes.x; @@ -258,21 +258,48 @@ torch::Tensor aqlm_dequant( int rows = codes.size(1); int cols = codes.size(0); - auto weights = torch::empty({cols, rows * 8}, + auto in_features = codes.size(1) * 8; + auto out_features = codes.size(0); + + assert(out_features = codebook_partition_sizes.sum().item()); + + auto weights = torch::empty({out_features, in_features}, torch::TensorOptions() .dtype(codebooks.dtype()) .device(codebooks.device()) ); if (nbooks == 1 && entries == (1 << 16)) - { - code1x16_dequant(weights.data_ptr(), codes.data_ptr(), codebooks.data_ptr(), rows, cols, cumulative_sizes, codebook_stride(codebooks)); + { + code1x16_dequant_cuda( + codes.data_ptr(), + weights.data_ptr(), + codebooks.data_ptr(), + out_features, + in_features, + cumulative_sizes, + codebook_stride(codebooks)); + + // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation.) + // weights *= scales.index({"...", 0, 0}); + return weights; } if (nbooks == 2 && entries == (1 << 8)) - { - code2x8_dequant(weights.data_ptr(), codes.data_ptr(), codebooks.data_ptr(), rows, cols, cumulative_sizes, codebook_stride(codebooks)); + { + code2x8_dequant_cuda( + codes.data_ptr(), + weights.data_ptr(), + codebooks.data_ptr(), + out_features, + in_features, + cumulative_sizes, + codebook_stride(codebooks)); + + // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation) + // weights *= scales.index({"...", 0, 0}); + return weights; } diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu index 9e9570ee0b195..d2e950f0d24c2 100644 --- a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu +++ b/csrc/quantization/aqlm/aqlm_cuda_kernel.cu @@ -189,97 +189,127 @@ __global__ void Code2x8MatVec( } -// Dequantizes the code and codebook into weights. -// We span horizontally and do an int4 at a time in an attempt to maximize throughput. __global__ void Code1x16Dequant( - int4* __restrict__ weights, - const int4* __restrict__ a, + const int4* __restrict__ A, + int4* __restrict__ C, const int4* __restrict__ codebook, - const int a_rows, // code rows in int4 space, so same as stride. - const int a_cols, // code columns (matter?) + int prob_m, + int prob_k, const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. const int codebook_stride // as int4 ) { - // Each thread decodes one int4 worth of codebook. - int a_col = blockIdx.x * 32 + threadIdx.x; - int a_row = blockIdx.y * 32 + threadIdx.y; - - // out of range - if (a_row >= a_rows) - return; - - const int weight_stride = a_rows * 8; // as int4 - weights += a_col * weight_stride + a_row * 8; + int a_gl_stride = prob_k / 8 / 8; + int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; - // advance to the correct codebook, this easy because we only multiply one column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_col >= *codebook_size) + if (pred) { - codebook += codebook_stride; - ++codebook_size; + // advance to the correct codebook, this easy because we only multiply one column of the codebook. 
+ auto codebook_size = &codebook_a_sizes.x; + while (a_gl_rd >= *codebook_size) + { + codebook += codebook_stride; + ++codebook_size; + } } - // do one int4 read and write, hopefully maxing out bandwidth. - int4 code_block = a[a_row + a_col * a_rows]; - const uint16_t* enc = reinterpret_cast(&code_block); - #pragma unroll - for (int i = 0; i < 8; i++) { - weights[i] = codebook[enc[i]]; + a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; + int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; + + int c_gl_stride = prob_k / 8; + int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8; + + int iters = (prob_k / 8 - 1) / (8 * 32) + 1; + while (iters--) { + if (pred && a_gl_rd < a_gl_end) { + const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); + #pragma unroll + for (int i = 0; i < 8; i++) { + int4 chunk; + auto dec = reinterpret_cast(&chunk); + // We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't + // actually help us; this brings > 2x speedup. + asm volatile ( + "ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) + : "l"((void*) &codebook[enc[i]]) + ); + + C[a_gl_rd * 8 + i] = chunk; + } + } + a_gl_rd += 32; } } -// Dequantizes the code and codebook for 2x8 -// We span horizontally and do an int4 at a time in an attempt to maximize throughput. + __global__ void Code2x8Dequant( - int4* __restrict__ weights, - const int4* __restrict__ a, + const int4* __restrict__ A, + int4* __restrict__ C, const int4* __restrict__ codebook, - const int a_rows, // code rows in int4 space, so same as stride. - const int a_cols, // code columns (matter?) - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. + int prob_m, + int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. const int codebook_stride // as int4 ) { - // Each thread decodes one int4 worth of codebook. - int a_col = blockIdx.x * 32 + threadIdx.x; - int a_row = blockIdx.y * 32 + threadIdx.y; + int a_gl_stride = prob_k / 8 / 8; + int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; - // out of range, can happen. - if (a_row >= a_rows) - return; + if (pred) + { + // advance to the correct codebook, this easy because we only multiply one column of the codebook. + auto codebook_size = &codebook_a_sizes.x; + while (a_gl_rd >= *codebook_size) + { + codebook += codebook_stride; + ++codebook_size; + } + } - const int weight_stride = a_rows * 8; // as int4 - weights += a_col * weight_stride + a_row * 8; + a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; + int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; + int lane = threadIdx.x % 8; - // advance to the correct codebook, this easy because we only multiply one column of the codebook. 
- auto codebook_size = &codebook_a_sizes.x; - while (a_col >= *codebook_size) - { - // in pairs of two - codebook += codebook_stride * 2; - ++codebook_size; + int c_gl_stride = prob_k / 8; + int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8; + + extern __shared__ int4 sh[]; + int4* sh_code = sh; + int4* sh_code0 = sh_code; + int4* sh_code1 = sh_code + 256 * 8; + + for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { + int4 dec = codebook[i]; + #pragma unroll + for (int j = 0; j < 8; j++) + sh_code[8 * i + (j + lane) % 8] = dec; } + __syncthreads(); - // do one int4 read to get it into local memory, hopefully maxing out bandwidth. - int4 code_block = a[a_row + a_col * a_rows]; - const uint8_t* enc = reinterpret_cast(&code_block); - #pragma unroll - for (int i = 0; i < 8; i++) { - int4 code1 = codebook[enc[i*2]]; - int4 code2 = (codebook + codebook_stride)[enc[i*2 + 1]]; + float res = 0; - half2* a = reinterpret_cast(&code1); - half2* b = reinterpret_cast(&code2); + int iters = (prob_k / 8 - 1) / (8 * 32) + 1; + while (iters--) { + if (pred && a_gl_rd < a_gl_end) { + const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); #pragma unroll - for (int j = 0; j < 4; j++) - { - a[j].x = __hadd(a[j].x, b[j].x); - a[j].y = __hadd(a[j].y, b[j].y); + for (int i = 0; i < 8; i++) { + int4 chunk; + half2* a0 = reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); + half2* a1 = reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); + #pragma unroll + for (int j = 0; j < 4; j++) + reinterpret_cast(&chunk)[j] = __hadd2(a0[j], a1[j]); + C[a_gl_rd * 8 + i] = chunk; } - weights[i] = code1; + } + a_gl_rd += 32; } } - inline int ceildiv(int a, int b) { return (a + b - 1) / b; } @@ -358,69 +388,72 @@ void code2x8_matvec_cuda( ); } - -// Dequantizes the code and codebook into weights. -void code1x16_dequant( - void* __restrict__ weights, - const void* __restrict__ a, +void code1x16_dequant_cuda( + const void* __restrict__ A, + void* __restrict__ C, const void* __restrict__ codebook, - const int a_rows, // code rows in element space, so k - const int a_cols, // code columns in element space, so n - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. - const int codebook_stride // as int4 + int prob_m, + int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. + const int codebook_stride // as int4. ) { - dim3 threads(32, 32, 1); - - assert(a_cols % 32 == 0); - // each thread does one int4 worth. - assert(a_rows % 8 == 0); - - const int rows = a_rows/8; - - dim3 blocks(ceildiv(a_cols, 32), ceildiv(rows, 32), 1); + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); + int waves = 0; + int thread_m; + do { + waves++; + thread_m = ceildiv(prob_m, waves * sms); + } while (thread_m > THREAD_M); + int blocks = ceildiv(prob_m, thread_m); + int threads = 32 * thread_m; cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); Code1x16Dequant<<>>( - (int4*) weights, - (const int4*) a, + (const int4*) A, + (int4*) C, (const int4*) codebook, - rows, // in int4 space. - a_cols, - codebook_a_sizes, - codebook_stride + prob_m, + prob_k, + codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. + codebook_stride // as int4. ); } // Dequantizes the code and codebook into weights. 
-void code2x8_dequant( - void* __restrict__ weights, - const void* __restrict__ a, +void code2x8_dequant_cuda( + const void* __restrict__ A, + void* __restrict__ C, const void* __restrict__ codebook, - const int a_rows, // code rows in element space, so k - const int a_cols, // code columns in element space, so n - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. + int prob_m, + int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. const int codebook_stride // as int4 ) { - dim3 threads(32, 32, 1); - - assert(a_cols % 32 == 0); - // each thread does one int4 worth. - assert(a_rows % 8 == 0); - - const int rows = a_rows/8; - - dim3 blocks(ceildiv(a_cols, 32), ceildiv(rows, 32), 1); + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); + int waves = 0; + int thread_m; + do { + waves++; + thread_m = ceildiv(prob_m, waves * sms); + } while (thread_m > THREAD_M); + int blocks = ceildiv(prob_m, thread_m); + int threads = 32 * thread_m; + int shared = 16 * (2 * 256 * 8 + 32 * 9); cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code2x8Dequant<<>>( - (int4*) weights, - (const int4*) a, + + cudaFuncSetAttribute( + Code2x8Dequant, cudaFuncAttributeMaxDynamicSharedMemorySize, shared + ); + Code2x8Dequant<<>>( + (const int4*) A, + (int4*) C, (const int4*) codebook, - rows, // in int4 space. - a_cols, + prob_m, + prob_k, codebook_a_sizes, codebook_stride ); } - - diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index f32605420d0b0..d290bfdefd4ae 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -32,9 +32,8 @@ def main(): "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", ] - model = LLM( - args.model if args.model is not None else models[args.choice], - tensor_parallel_size=args.tensor_parallel_size) + model = LLM(args.model if args.model is not None else models[args.choice], + tensor_parallel_size=args.tensor_parallel_size) sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = model.generate("Hello my name is", From 3d65a48ac0f465d3ccc1222f8391b7d82213b3d8 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 26 Mar 2024 16:20:42 +0000 Subject: [PATCH 87/96] better threshold for aqlm --- vllm/model_executor/layers/quantization/aqlm.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 12e198d3daa7e..193c86b2acfd2 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -344,9 +344,8 @@ def apply_weights( if ingroups == 8 and outgroups == 1 and ( (bits == 256 and nbooks == 2) or (bits == 65536 and nbooks == 1)): - # thresholds determined by timings on an A6000 - m_threshold = 8 if bits == 65536 else 12 - use_gemv = math.prod(x.shape[:-1]) <= m_threshold + # thresholds determined by timings on an A6000, one GPU + use_gemv = math.prod(x.shape[:-1]) <= 6 return ops.aqlm_gemm( x, From d033c85d63c8e398f8fb75d8212709e433d274a0 Mon Sep 17 00:00:00 2001 From: James Fleming Date: Tue, 26 Mar 2024 16:27:13 +0000 Subject: [PATCH 88/96] format --- benchmarks/kernels/benchmark_aqlm.py | 12 ++++++------ examples/aqlm_example.py | 3 ++- tests/models/test_aqlm.py | 1 + vllm/model_executor/layers/quantization/aqlm.py | 7 ++++--- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git 
a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index e9383a8f1fc1a..9602d20bcbc74 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,16 +1,16 @@ +import argparse import os import sys -import argparse from typing import Optional -from vllm.model_executor.layers.quantization.aqlm import ( - generic_dequantize_gemm, optimized_dequantize_gemm, dequantize_weight, - get_int_dtype) -from vllm._C import ops - import torch import torch.nn.functional as F +from vllm._C import ops +from vllm.model_executor.layers.quantization.aqlm import ( + dequantize_weight, generic_dequantize_gemm, get_int_dtype, + optimized_dequantize_gemm) + os.environ['CUDA_VISIBLE_DEVICES'] = '0' diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index d290bfdefd4ae..e7c17fa0362ae 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -1,6 +1,7 @@ -from vllm import LLM, SamplingParams import argparse +from vllm import LLM, SamplingParams + def main(): diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index c814b9e70711d..380a8ee67e1f8 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -5,6 +5,7 @@ import pytest import torch + from vllm.model_executor.layers.quantization import ( _QUANTIZATION_CONFIG_REGISTRY) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 193c86b2acfd2..272d2c2fa2694 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -1,15 +1,16 @@ # Supports AQLM compression, see https://github.com/Vahe1994/AQLM # and https://arxiv.org/pdf/2401.06118.pdf +import math from typing import Any, Dict, List, Optional -import math import torch -from torch.nn.parameter import Parameter import torch.nn.functional as F +from torch.nn.parameter import Parameter from vllm._C import ops -from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs +from vllm.model_executor.layers.linear import (LinearMethodBase, + set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) From 92206de9a8440d7fc3f59a43340b15f39680abcb Mon Sep 17 00:00:00 2001 From: mgoin Date: Tue, 9 Apr 2024 14:25:14 +0000 Subject: [PATCH 89/96] Update test point --- tests/models/test_aqlm.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 380a8ee67e1f8..e0a6c9e697dbc 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -48,6 +48,7 @@ # input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda") # hf_outputs = quantized_model.generate(input_ids, max_new_tokens=32) # outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:])) +# print(outputs) # ``` ground_truth_generations = [ '\n### Features\n\n- **High-throughput**: vLLM is designed to be ' @@ -57,17 +58,17 @@ 'Compare and contrast artificial intelligence with human intelligence in ' 'terms of processing information. The processing of information is a key ' 'component of artificial intelligence. 
The processing of information is', - 'Explain the difference between supervised and unsupervised ' - 'learning.\nExplain the difference between a feedforward neural network ' - 'and a recurrent neural network.\n', + 'Explain the difference between supervised and unsupervised learning.\n' + 'Explain the difference between feedforward and recurrent neural networks.' + '\nExplain the difference', 'Write a short story about a robot that dreams for the first time. The ' 'story should be about 1000 words.\nThe story should be', - 'Analyze the impact of the COVID-19 pandemic on global economic structures ' - 'and future business models. The COVID-19 pandemic has had a', - 'The Mona Lisa is a painting by Leonardo da Vinci, and it is considered to ' - 'be one of the most famous paintings in the world. The', - "Translate the following English sentence into Japanese, French, and " - "Swahili: 'The early bird catches the worm.'\nThe early bird catches" + 'Analyze the impact of the COVID-19 pandemic on global economic structures' + ' and future business models. The COVID-19 pandemic has had a', + 'The Mona Lisa is a painting by Leonardo da Vinci, and it is considered ' + 'to be one of the most famous paintings in the world. The', + 'The early bird catches the worm.\nThe early bird catches the worm. ' + '(Japanese)\nLe petit oiseau attrait' ] From 811e2cc156a1869f75e245a62a3fe9d5ec52379f Mon Sep 17 00:00:00 2001 From: mgoin Date: Tue, 9 Apr 2024 15:53:10 +0000 Subject: [PATCH 90/96] Poke test again --- tests/models/test_aqlm.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index e0a6c9e697dbc..020897aaaf9c8 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -1,6 +1,6 @@ """Compare the outputs of a AQLM model between vLLM and HF Transformers -Run `pytest tests/models/test_aqlm.py --forked`. +Run `pytest tests/models/test_aqlm.py`. """ import pytest @@ -51,24 +51,17 @@ # print(outputs) # ``` ground_truth_generations = [ - '\n### Features\n\n- **High-throughput**: vLLM is designed to be ' - 'memory-efficient and high-throughput. It', + '\n### Features\n\n- **High-throughput**: v', 'The major milestones in the development of artificial intelligence from ' - '1950 to 2020 are as follows:\n1950', + '195', 'Compare and contrast artificial intelligence with human intelligence in ' - 'terms of processing information. The processing of information is a key ' - 'component of artificial intelligence. The processing of information is', - 'Explain the difference between supervised and unsupervised learning.\n' - 'Explain the difference between feedforward and recurrent neural networks.' - '\nExplain the difference', - 'Write a short story about a robot that dreams for the first time. The ' - 'story should be about 1000 words.\nThe story should be', - 'Analyze the impact of the COVID-19 pandemic on global economic structures' - ' and future business models. The COVID-19 pandemic has had a', - 'The Mona Lisa is a painting by Leonardo da Vinci, and it is considered ' - 'to be one of the most famous paintings in the world. The', - 'The early bird catches the worm.\nThe early bird catches the worm. ' - '(Japanese)\nLe petit oiseau attrait' + 'terms of processing information. The', + 'Explain the difference between supervised and unsupervised learning.' + '\nExplain', + 'Write a short story about a robot that dreams for the first time. 
The', + 'Analyze the impact of the COVID-19 pandemic on global economic', + 'The Mona Lisa is a painting by Leonardo da Vinci, and it', + 'The early bird catches the worm.\nThe early bird catches the' ] @@ -76,8 +69,8 @@ reason="AQLM is not supported on this GPU type.") @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) @pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [3]) +@pytest.mark.parametrize("max_tokens", [16]) +@pytest.mark.parametrize("num_logprobs", [1]) def test_models( vllm_runner, example_prompts, @@ -97,4 +90,5 @@ def test_models( vllm_output_ids, vllm_output_str, vllm_logprobs = vllm_outputs[ prompt_idx] + print("Output generation:", repr(vllm_output_str)) assert vllm_output_str == ground_truth_generations[prompt_idx] From d0e8d0cc1af5637d4e5f9f5904da28f97cbc9e9b Mon Sep 17 00:00:00 2001 From: mgoin Date: Mon, 15 Apr 2024 19:55:32 +0000 Subject: [PATCH 91/96] Resolve create_weights updates --- .../layers/quantization/aqlm.py | 32 +++++++++---------- .../layers/quantization/marlin.py | 3 +- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 272d2c2fa2694..6115b1de679ad 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -224,14 +224,11 @@ class AQLMLinearMethod(LinearMethodBase): def __init__(self, quant_config: AQLMConfig): self.quant_config = quant_config - def create_weights( - self, - input_size_per_partition: int, - output_partition_sizes: List[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - ) -> Dict[str, Any]: + def create_weights(self, layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + **extra_weight_attrs): del output_size # Unused. del input_size # Unused. 
@@ -317,21 +314,22 @@ def create_weights( }, ) - return { - "codes": codes, - "codebooks": codebooks, - "scales": scales, - } + layer.register_parameter("codes", codes) + set_weight_attrs(codes, extra_weight_attrs) + layer.register_parameter("codebooks", codebooks) + set_weight_attrs(codebooks, extra_weight_attrs) + layer.register_parameter("scales", scales) + set_weight_attrs(scales, extra_weight_attrs) def apply_weights( self, - weights: Dict[str, Any], + layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - codebooks = weights["codebooks"] - codes = weights["codes"] - scales = weights["scales"] + codebooks = layer.codebooks + codes = layer.codes + scales = layer.scales output_partition_sizes = getattr(codebooks, "output_partition_sizes", None) diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index bf0500f1155a1..00c3c404c2d7a 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -93,7 +93,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_size_per_partition: int, + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, @@ -106,6 +106,7 @@ def create_weights( f"The params dtype must be float16, but got {params_dtype}") # Validate output_size_per_partition + output_size_per_partition = sum(output_partition_sizes) if output_size_per_partition % self.quant_config.min_n_threads != 0: raise ValueError( f"Weight output_size_per_partition = " From 6bb89c00d4e7cb87b23cdb0de404586007d1c19e Mon Sep 17 00:00:00 2001 From: mgoin Date: Tue, 16 Apr 2024 14:59:40 +0000 Subject: [PATCH 92/96] Better test debug output (manually tested TP) --- tests/models/test_aqlm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 020897aaaf9c8..f653d340fc18f 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -90,5 +90,7 @@ def test_models( vllm_output_ids, vllm_output_str, vllm_logprobs = vllm_outputs[ prompt_idx] - print("Output generation:", repr(vllm_output_str)) + print("Prompt: ", repr(example_prompts[prompt_idx])) + print("Reference output:", repr(ground_truth_generations[prompt_idx])) + print("Output output: ", repr(vllm_output_str)) assert vllm_output_str == ground_truth_generations[prompt_idx] From 4d46f1810e57dc994486d7e85e87696ef761f158 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 18 Apr 2024 10:57:18 -0400 Subject: [PATCH 93/96] Delete csrc/quantization/aqlm/LICENSE --- csrc/quantization/aqlm/LICENSE | 203 --------------------------------- 1 file changed, 203 deletions(-) delete mode 100644 csrc/quantization/aqlm/LICENSE diff --git a/csrc/quantization/aqlm/LICENSE b/csrc/quantization/aqlm/LICENSE deleted file mode 100644 index bfa740da977e9..0000000000000 --- a/csrc/quantization/aqlm/LICENSE +++ /dev/null @@ -1,203 +0,0 @@ -Contains code from https://github.com/Vahe1994/AQLM - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. 
- - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [2024] [AQLM authors] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
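
Patch 91 above moves AQLMLinearMethod onto the newer create_weights/apply_weights interface, where the quantized tensors are registered on the layer module itself rather than returned in a dict. The sketch below only illustrates that pattern under simplified assumptions: the function names, shapes and dtypes are placeholders for this note, not code taken from the series.

    import torch
    import torch.nn.functional as F
    from torch.nn.parameter import Parameter

    def create_weights_sketch(layer: torch.nn.Module, out_features: int,
                              in_features: int) -> None:
        # Register the quantized tensors on the layer so later code can read
        # them back as attributes (layer.codes, layer.codebooks, layer.scales).
        codes = Parameter(torch.zeros(out_features, in_features // 8, 1,
                                      dtype=torch.int16),
                          requires_grad=False)
        codebooks = Parameter(torch.zeros(1, 2**16, 1, 8,
                                          dtype=torch.float16),
                              requires_grad=False)
        scales = Parameter(torch.ones(out_features, 1, 1, 1,
                                      dtype=torch.float16),
                           requires_grad=False)
        layer.register_parameter("codes", codes)
        layer.register_parameter("codebooks", codebooks)
        layer.register_parameter("scales", scales)

    def apply_weights_sketch(layer: torch.nn.Module, x: torch.Tensor,
                             bias=None) -> torch.Tensor:
        # Mirrors the refactored (layer, x, bias) signature: weights come off
        # the layer instead of a weights dict. The compute path below is a
        # placeholder; the real method dispatches to the AQLM CUDA kernels.
        codes, codebooks, scales = layer.codes, layer.codebooks, layer.scales
        weight = torch.zeros(codes.shape[0], x.shape[-1], dtype=x.dtype)
        out = F.linear(x, weight) * scales.flatten().to(x.dtype)
        if bias is not None:
            out = out + bias
        return out

The AQLMLinearMethod changes in patch 91 follow this shape: create_weights registers codes, codebooks and scales (with set_weight_attrs for the loader), and apply_weights reads them back from the layer before choosing between the gemv and dequant kernels.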
From a29008d3e1725d17709fc6d17442963fac8d17ae Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Apr 2024 15:43:56 +0000 Subject: [PATCH 94/96] Address comments --- CMakeLists.txt | 4 ++-- .../aqlm/{aqlm_cuda_entry.cpp => cuda_entry.cpp} | 0 .../aqlm/{aqlm_cuda_kernel.cu => gemm_kernels.cu} | 0 vllm/model_executor/layers/linear.py | 11 ++++++----- 4 files changed, 8 insertions(+), 7 deletions(-) rename csrc/quantization/aqlm/{aqlm_cuda_entry.cpp => cuda_entry.cpp} (100%) rename csrc/quantization/aqlm/{aqlm_cuda_kernel.cu => gemm_kernels.cu} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 10a5179666487..6e8e371764150 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,8 +173,8 @@ set(VLLM_EXT_SRC if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC - "csrc/quantization/aqlm/aqlm_cuda_entry.cpp" - "csrc/quantization/aqlm/aqlm_cuda_kernel.cu" + "csrc/quantization/aqlm/cuda_entry.cpp" + "csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/marlin/marlin_cuda_kernel.cu" "csrc/custom_all_reduce.cu") diff --git a/csrc/quantization/aqlm/aqlm_cuda_entry.cpp b/csrc/quantization/aqlm/cuda_entry.cpp similarity index 100% rename from csrc/quantization/aqlm/aqlm_cuda_entry.cpp rename to csrc/quantization/aqlm/cuda_entry.cpp diff --git a/csrc/quantization/aqlm/aqlm_cuda_kernel.cu b/csrc/quantization/aqlm/gemm_kernels.cu similarity index 100% rename from csrc/quantization/aqlm/aqlm_cuda_kernel.cu rename to csrc/quantization/aqlm/gemm_kernels.cu diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 0ec448cc6ab8b..d471a43fe822b 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -399,13 +399,14 @@ def __init__( input_size = self.hidden_size output_size = (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size + output_sizes = [ + self.num_heads * tp_size * self.head_size, + self.num_kv_heads * tp_size * self.head_size, + self.num_kv_heads * tp_size * self.head_size + ] super().__init__(input_size, output_size, bias, False, skip_bias_add, - params_dtype, linear_method, [ - self.num_heads * tp_size * self.head_size, - self.num_kv_heads * tp_size * self.head_size, - self.num_kv_heads * tp_size * self.head_size - ]) + params_dtype, linear_method, output_sizes) def weight_loader(self, param: Parameter, From 385211568061b29d34c179d07dd64d52fbbefbfb Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Apr 2024 15:52:31 +0000 Subject: [PATCH 95/96] Update test --- tests/models/test_aqlm.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index f653d340fc18f..a7abc011f57d7 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -6,13 +6,12 @@ import pytest import torch -from vllm.model_executor.layers.quantization import ( - _QUANTIZATION_CONFIG_REGISTRY) +from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] -aqlm_not_supported = ( - capability < _QUANTIZATION_CONFIG_REGISTRY["aqlm"].get_min_capability()) +aqlm_not_supported = (capability < + QUANTIZATION_METHODS["aqlm"].get_min_capability()) # In this test we hardcode prompts and generations for the model so we don't # need to require the AQLM package as a dependency From d3678950196c38fd773be14955ed52d6979c3d98 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Apr 2024 18:56:14 +0000 Subject: 
[PATCH 96/96] Cleanup namespaces --- CMakeLists.txt | 1 - csrc/quantization/aqlm/cuda_entry.cpp | 308 ------------------------- csrc/quantization/aqlm/gemm_kernels.cu | 253 ++++++++++++++++++++ 3 files changed, 253 insertions(+), 309 deletions(-) delete mode 100644 csrc/quantization/aqlm/cuda_entry.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e8e371764150..b2d0cf3e568b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,7 +173,6 @@ set(VLLM_EXT_SRC if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC - "csrc/quantization/aqlm/cuda_entry.cpp" "csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/marlin/marlin_cuda_kernel.cu" diff --git a/csrc/quantization/aqlm/cuda_entry.cpp b/csrc/quantization/aqlm/cuda_entry.cpp deleted file mode 100644 index 683488a2bb4ef..0000000000000 --- a/csrc/quantization/aqlm/cuda_entry.cpp +++ /dev/null @@ -1,308 +0,0 @@ -/* - * Modified by Neural Magic - * Adapted from https://github.com/Vahe1994/AQLM - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include - -void code1x16_matvec_cuda( - const void* A, - const void* B, - void* C, - const void* codebook, - int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. - const int codebook_stride // as int4. -); - -void code2x8_matvec_cuda( - const void* A, - const void* B, - void* C, - const void* codebook, - int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. - const int codebook_stride // as int4. -); - -void code1x16_dequant_cuda( - const void* A, - void* C, - const void* codebook, - int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. - const int codebook_stride // as int4. -); - -void code2x8_dequant_cuda( - const void* A, - void* C, - const void* codebook, - int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. - const int codebook_stride // as int4 -); - - -int codebook_stride(const torch::Tensor& codebooks) -{ - return codebooks.stride(0) * codebooks.element_size() / sizeof(int4); -} - -void code1x16_matvec( - const torch::Tensor& A, - const torch::Tensor& B, - torch::Tensor& C, - const torch::Tensor& codebook, - const int4 codebook_a_sizes // cumulative sizes of A spanning each codebook, at most 3 long. 
-) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); - int prob_m = C.size(0); - int prob_k = B.size(0); - - code1x16_matvec_cuda( - A.data_ptr(), - B.data_ptr(), - C.data_ptr(), - codebook.data_ptr(), - prob_m, - prob_k, - codebook_a_sizes, - codebook_stride(codebook) - ); -} - -torch::Tensor code1x16_matmat( - const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const int4 codebook_a_sizes, - const std::optional& bias) { - auto input_sizes = input.sizes(); - auto out_features = codes.size(0) * codebooks.size(2); - auto flat_input = input.reshape({-1, input.size(-1)}); - auto flat_output = torch::empty({flat_input.size(0), out_features}, - torch::TensorOptions() - .dtype(input.dtype()) - .device(input.device()) - ); - - for (int i = 0; i < flat_input.size(0); ++i) { - auto input_vec = flat_input.index({i}); - auto output_vec = flat_output.index({i}); - code1x16_matvec( - codes.squeeze(2), - input_vec, - output_vec, - codebooks, - codebook_a_sizes - ); - } - flat_output *= scales.flatten().unsqueeze(0); - - if (bias.has_value()) { - flat_output += bias->unsqueeze(0); - } - - auto output_sizes = input_sizes.vec(); - output_sizes.pop_back(); - output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes); - return output; -} - -void code2x8_matvec( - const torch::Tensor& A, - const torch::Tensor& B, - torch::Tensor& C, - const torch::Tensor& codebook, - const int4 codebook_a_sizes -) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); - int prob_m = C.size(0); - int prob_k = B.size(0); - code2x8_matvec_cuda( - A.data_ptr(), - B.data_ptr(), - C.data_ptr(), - codebook.data_ptr(), - prob_m, - prob_k, - codebook_a_sizes, - 2 * codebook_stride(codebook) - ); -} - -torch::Tensor code2x8_matmat( - const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const int4 codebook_a_sizes, - const std::optional& bias -) { - auto input_sizes = input.sizes(); - auto out_features = codes.size(0) * codebooks.size(2); - auto flat_input = input.reshape({-1, input.size(-1)}); - auto flat_output = torch::empty({flat_input.size(0), out_features}, - torch::TensorOptions() - .dtype(input.dtype()) - .device(input.device()) - ); - - for (int i = 0; i < flat_input.size(0); ++i) { - auto input_vec = flat_input.index({i}); - auto output_vec = flat_output.index({i}); - code2x8_matvec( - codes.squeeze(2), - input_vec, - output_vec, - codebooks, - codebook_a_sizes - ); - } - flat_output *= scales.flatten().unsqueeze(0); - if (bias.has_value()) { - flat_output += bias->unsqueeze(0); - } - - auto output_sizes = input_sizes.vec(); - output_sizes.pop_back(); - output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes); - return output; -} - -// Accumulate the partition sizes. -int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes) -{ - int4 cumulative_sizes; - auto cumulative_size = &cumulative_sizes.x; - int i = 0; - int last = 0; - assert(codebook_partition_sizes.size(0) <= 4); - for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) - { - *cumulative_size = codebook_partition_sizes[i].item() + last; - last = *cumulative_size; - } - // fill in the rest with unreachable. 
- for (; i < 4; ++i, ++cumulative_size) - { - *cumulative_size = last*10; - } - return cumulative_sizes; -} - -torch::Tensor aqlm_gemm( - const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const torch::Tensor& codebook_partition_sizes, - const std::optional& bias -) -{ - int4 cumulative_sizes = accumulate_sizes(codebook_partition_sizes); - - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); - int const entries = codebooks.size(1); - - if (nbooks == 1 && entries == (1 << 16)) - { - return code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); - } - if (nbooks == 2 && entries == (1 << 8)) - { - return code2x8_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); - } - - TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") - return {}; -} - -torch::Tensor aqlm_dequant( - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& codebook_partition_sizes -) -{ - int4 cumulative_sizes = accumulate_sizes(codebook_partition_sizes); - - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); - int const entries = codebooks.size(1); - - const at::cuda::OptionalCUDAGuard device_guard(device_of(codes)); - int rows = codes.size(1); - int cols = codes.size(0); - - auto in_features = codes.size(1) * 8; - auto out_features = codes.size(0); - - assert(out_features = codebook_partition_sizes.sum().item()); - - auto weights = torch::empty({out_features, in_features}, - torch::TensorOptions() - .dtype(codebooks.dtype()) - .device(codebooks.device()) - ); - - if (nbooks == 1 && entries == (1 << 16)) - { - code1x16_dequant_cuda( - codes.data_ptr(), - weights.data_ptr(), - codebooks.data_ptr(), - out_features, - in_features, - cumulative_sizes, - codebook_stride(codebooks)); - - // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation.) - // weights *= scales.index({"...", 0, 0}); - - return weights; - } - - if (nbooks == 2 && entries == (1 << 8)) - { - code2x8_dequant_cuda( - codes.data_ptr(), - weights.data_ptr(), - codebooks.data_ptr(), - out_features, - in_features, - cumulative_sizes, - codebook_stride(codebooks)); - - // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation) - // weights *= scales.index({"...", 0, 0}); - - return weights; - } - - TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") - return {}; -} diff --git a/csrc/quantization/aqlm/gemm_kernels.cu b/csrc/quantization/aqlm/gemm_kernels.cu index d2e950f0d24c2..4415316e1e8cd 100644 --- a/csrc/quantization/aqlm/gemm_kernels.cu +++ b/csrc/quantization/aqlm/gemm_kernels.cu @@ -18,9 +18,16 @@ #include #include #include +#include #include +#include #include +#include + + +namespace vllm { +namespace aqlm { __global__ void Code1x16MatVec( const int4* __restrict__ A, @@ -457,3 +464,249 @@ void code2x8_dequant_cuda( codebook_stride ); } + +int codebook_stride(const torch::Tensor& codebooks) +{ + return codebooks.stride(0) * codebooks.element_size() / sizeof(int4); +} + +void code1x16_matvec( + const torch::Tensor& A, + const torch::Tensor& B, + torch::Tensor& C, + const torch::Tensor& codebook, + const int4 codebook_a_sizes // cumulative sizes of A spanning each codebook, at most 3 long. 
+) {
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
+  int prob_m = C.size(0);
+  int prob_k = B.size(0);
+
+  code1x16_matvec_cuda(
+    A.data_ptr(),
+    B.data_ptr(),
+    C.data_ptr(),
+    codebook.data_ptr(),
+    prob_m,
+    prob_k,
+    codebook_a_sizes,
+    codebook_stride(codebook)
+  );
+}
+
+torch::Tensor code1x16_matmat(
+  const torch::Tensor& input,
+  const torch::Tensor& codes,
+  const torch::Tensor& codebooks,
+  const torch::Tensor& scales,
+  const int4 codebook_a_sizes,
+  const std::optional<torch::Tensor>& bias) {
+  auto input_sizes = input.sizes();
+  auto out_features = codes.size(0) * codebooks.size(2);
+  auto flat_input = input.reshape({-1, input.size(-1)});
+  auto flat_output = torch::empty({flat_input.size(0), out_features},
+    torch::TensorOptions()
+      .dtype(input.dtype())
+      .device(input.device())
+  );
+
+  for (int i = 0; i < flat_input.size(0); ++i) {
+    auto input_vec = flat_input.index({i});
+    auto output_vec = flat_output.index({i});
+    code1x16_matvec(
+      codes.squeeze(2),
+      input_vec,
+      output_vec,
+      codebooks,
+      codebook_a_sizes
+    );
+  }
+  flat_output *= scales.flatten().unsqueeze(0);
+
+  if (bias.has_value()) {
+    flat_output += bias->unsqueeze(0);
+  }
+
+  auto output_sizes = input_sizes.vec();
+  output_sizes.pop_back();
+  output_sizes.push_back(-1);
+  auto output = flat_output.reshape(output_sizes);
+  return output;
+}
+
+void code2x8_matvec(
+  const torch::Tensor& A,
+  const torch::Tensor& B,
+  torch::Tensor& C,
+  const torch::Tensor& codebook,
+  const int4 codebook_a_sizes
+) {
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
+  int prob_m = C.size(0);
+  int prob_k = B.size(0);
+  code2x8_matvec_cuda(
+    A.data_ptr(),
+    B.data_ptr(),
+    C.data_ptr(),
+    codebook.data_ptr(),
+    prob_m,
+    prob_k,
+    codebook_a_sizes,
+    2 * codebook_stride(codebook)
+  );
+}
+
+torch::Tensor code2x8_matmat(
+  const torch::Tensor& input,
+  const torch::Tensor& codes,
+  const torch::Tensor& codebooks,
+  const torch::Tensor& scales,
+  const int4 codebook_a_sizes,
+  const std::optional<torch::Tensor>& bias
+) {
+  auto input_sizes = input.sizes();
+  auto out_features = codes.size(0) * codebooks.size(2);
+  auto flat_input = input.reshape({-1, input.size(-1)});
+  auto flat_output = torch::empty({flat_input.size(0), out_features},
+    torch::TensorOptions()
+      .dtype(input.dtype())
+      .device(input.device())
+  );
+
+  for (int i = 0; i < flat_input.size(0); ++i) {
+    auto input_vec = flat_input.index({i});
+    auto output_vec = flat_output.index({i});
+    code2x8_matvec(
+      codes.squeeze(2),
+      input_vec,
+      output_vec,
+      codebooks,
+      codebook_a_sizes
+    );
+  }
+  flat_output *= scales.flatten().unsqueeze(0);
+  if (bias.has_value()) {
+    flat_output += bias->unsqueeze(0);
+  }
+
+  auto output_sizes = input_sizes.vec();
+  output_sizes.pop_back();
+  output_sizes.push_back(-1);
+  auto output = flat_output.reshape(output_sizes);
+  return output;
+}
+
+// Accumulate the partition sizes.
+int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes)
+{
+  int4 cumulative_sizes;
+  auto cumulative_size = &cumulative_sizes.x;
+  int i = 0;
+  int last = 0;
+  assert(codebook_partition_sizes.size(0) <= 4);
+  for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size)
+  {
+    *cumulative_size = codebook_partition_sizes[i].item<int>() + last;
+    last = *cumulative_size;
+  }
+  // fill in the rest with unreachable.
+  for (; i < 4; ++i, ++cumulative_size)
+  {
+    *cumulative_size = last*10;
+  }
+  return cumulative_sizes;
+}
+
+} // namespace aqlm
+} // namespace vllm
+
+
+torch::Tensor aqlm_gemm(
+  const torch::Tensor& input,
+  const torch::Tensor& codes,
+  const torch::Tensor& codebooks,
+  const torch::Tensor& scales,
+  const torch::Tensor& codebook_partition_sizes,
+  const std::optional<torch::Tensor>& bias
+)
+{
+  int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
+
+  int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
+  int const entries = codebooks.size(1);
+
+  if (nbooks == 1 && entries == (1 << 16))
+  {
+    return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias);
+  }
+  if (nbooks == 2 && entries == (1 << 8))
+  {
+    return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, cumulative_sizes, bias);
+  }
+
+  TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.")
+  return {};
+}
+
+torch::Tensor aqlm_dequant(
+  const torch::Tensor& codes,
+  const torch::Tensor& codebooks,
+  const torch::Tensor& codebook_partition_sizes
+)
+{
+  int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
+
+  int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
+  int const entries = codebooks.size(1);
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(codes));
+  int rows = codes.size(1);
+  int cols = codes.size(0);
+
+  auto in_features = codes.size(1) * 8;
+  auto out_features = codes.size(0);
+
+  assert(out_features == codebook_partition_sizes.sum().item<int>());
+
+  auto weights = torch::empty({out_features, in_features},
+    torch::TensorOptions()
+      .dtype(codebooks.dtype())
+      .device(codebooks.device())
+  );
+
+  if (nbooks == 1 && entries == (1 << 16))
+  {
+    vllm::aqlm::code1x16_dequant_cuda(
+      codes.data_ptr(),
+      weights.data_ptr(),
+      codebooks.data_ptr(),
+      out_features,
+      in_features,
+      cumulative_sizes,
+      vllm::aqlm::codebook_stride(codebooks));
+
+    // To scale the weights here instead, uncomment the line below (roughly 30% slower and not consistent with the gemv implementation).
+    // weights *= scales.index({"...", 0, 0});
+
+    return weights;
+  }
+
+  if (nbooks == 2 && entries == (1 << 8))
+  {
+    vllm::aqlm::code2x8_dequant_cuda(
+      codes.data_ptr(),
+      weights.data_ptr(),
+      codebooks.data_ptr(),
+      out_features,
+      in_features,
+      cumulative_sizes,
+      vllm::aqlm::codebook_stride(codebooks));
+
+    // To scale the weights here instead, uncomment the line below (roughly 30% slower and not consistent with the gemv implementation).
+    // weights *= scales.index({"...", 0, 0});
+
+    return weights;
+  }
+
+  TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.")
+  return {};
+}
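For reference, the cumulative-size packing consumed by aqlm_gemm and aqlm_dequant can be exercised on its own. The sketch below is illustrative only: it mirrors accumulate_sizes in plain host C++ (std::array stands in for CUDA's int4, and the partition sizes in main are hypothetical), showing how the running prefix sum over codebook partitions is built and how unused slots are padded with a value no row index can reach.

// Standalone mirror of accumulate_sizes (illustrative only; std::array replaces int4).
#include <array>
#include <cassert>
#include <cstdio>
#include <vector>

std::array<int, 4> accumulate_sizes_ref(const std::vector<int>& partition_sizes) {
  assert(partition_sizes.size() <= 4);
  std::array<int, 4> cumulative{};
  int last = 0;
  size_t i = 0;
  for (; i < partition_sizes.size(); ++i) {
    cumulative[i] = partition_sizes[i] + last;  // running prefix sum, as in the kernel helper
    last = cumulative[i];
  }
  for (; i < 4; ++i) {
    cumulative[i] = last * 10;                  // unreachable padding for unused codebook slots
  }
  return cumulative;
}

int main() {
  // Hypothetical partition sizes, e.g. fused projections packed into one weight.
  auto c = accumulate_sizes_ref({1024, 1024, 4096});
  std::printf("%d %d %d %d\n", c[0], c[1], c[2], c[3]);  // prints: 1024 2048 6144 61440
  return 0;
}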