Skip to content

Commit

Permalink
[Misc] Reduce supported Punica dtypes (vllm-project#4304)
Browse files Browse the repository at this point in the history
  • Loading branch information
WoosukKwon authored and jimpang committed Apr 25, 2024
1 parent f21e5b9 commit 1b358df
Show file tree
Hide file tree
Showing 16 changed files with 66 additions and 72 deletions.
12 changes: 0 additions & 12 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -212,23 +212,11 @@ define_gpu_extension_target(

set(VLLM_PUNICA_EXT_SRC
"csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu"
"csrc/punica/punica_ops.cc")

#
Expand Down
4 changes: 0 additions & 4 deletions csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu

This file was deleted.

4 changes: 0 additions & 4 deletions csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu

This file was deleted.

4 changes: 0 additions & 4 deletions csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu

This file was deleted.

4 changes: 0 additions & 4 deletions csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu

This file was deleted.

4 changes: 0 additions & 4 deletions csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu

This file was deleted.

4 changes: 0 additions & 4 deletions csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu

This file was deleted.

4 changes: 0 additions & 4 deletions csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu

This file was deleted.

4 changes: 0 additions & 4 deletions csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu

This file was deleted.

4 changes: 0 additions & 4 deletions csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu

This file was deleted.

4 changes: 0 additions & 4 deletions csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu

This file was deleted.

4 changes: 0 additions & 4 deletions csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu

This file was deleted.

4 changes: 0 additions & 4 deletions csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu

This file was deleted.

20 changes: 20 additions & 0 deletions csrc/punica/bgmv/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,26 @@
if weight_dtype == "fp32":
# FP32 weights are not supported.
continue
if output_dtype == "fp32":
# LoRA A matrix.
if input_dtype != weight_dtype:
# NOTE(woosuk): While Punica supports the case where the
# input and weight dtypes are different, we only generate
# the kernels with the same dtypes to reduce the binary size.
continue
elif input_dtype == "fp32":
# LoRA B matrix.
if output_dtype != weight_dtype:
# NOTE(woosuk): While Punica supports the case where the
# output and weight dtypes are different, we only generate
# the kernels with the same dtypes to reduce the binary size.
continue
elif not (input_dtype == output_dtype == weight_dtype):
# NOTE(woosuk): While Punica supports mixed data types for
# input, output, and weight, we only generate the kernels with
# the same data types to reduce the binary size.
continue

kernel_definition = TEMPLATE.format(
input_dtype=DTYPE_MAP[input_dtype],
output_dtype=DTYPE_MAP[output_dtype],
Expand Down
17 changes: 17 additions & 0 deletions csrc/punica/punica_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,23 @@ inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W,
int64_t y_offset, int64_t full_y_size,
int64_t batch_size, int64_t num_layers,
int64_t layer_idx, float scale) {
// NOTE(woosuk): While Punica supports various combinations of input/output
// data types, we limit the supported data types to reduce the binary size.
constexpr bool is_input_float = std::is_same<in_T, float>::value;
constexpr bool is_output_float = std::is_same<out_T, float>::value;
if (is_input_float) {
if (!std::is_same<out_T, W_T>::value) {
return false;
}
} else if (is_output_float) {
if (!std::is_same<in_T, W_T>::value) {
return false;
}
} else if (!(std::is_same<in_T, W_T>::value &&
std::is_same<out_T, W_T>::value)) {
return false;
}

switch (pack_u32(in_features, out_features)) {
#define CASE_ONESIDE(_in_T, _out_T, _W_T, feat_in, feat_out) \
case pack_u32(feat_in, feat_out): \
Expand Down
41 changes: 29 additions & 12 deletions tests/lora/test_layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,9 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,

def _pretest():
linear = ParallelLMHead(vocab_size + lora_config.lora_extra_vocab_size,
1024, vocab_size)
1024,
vocab_size,
params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data)
linear.weight.data[:, vocab_size:] = 0
logits_processor = LogitsProcessor(
Expand Down Expand Up @@ -445,7 +447,7 @@ def _pretest():
num_inputs=8 * num_loras, # * 3,
input_size=(1, 1024),
input_range=(0, 1),
input_type=torch.float32,
input_type=torch.float16,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

Expand Down Expand Up @@ -494,7 +496,7 @@ def _pretest():
num_inputs=8 * num_loras * 3,
input_size=(1, 1024),
input_range=(0, 1),
input_type=torch.float32,
input_type=torch.float16,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

Expand Down Expand Up @@ -533,11 +535,17 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:

def create_random_linear_parallel_layer():
if orientation == "row":
linear = RowParallelLinear(4096, 4096, bias=False)
linear = RowParallelLinear(4096,
4096,
bias=False,
params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = RowParallelLinearWithLoRA(linear)
else:
linear = ColumnParallelLinear(4096, 4096, bias=False)
linear = ColumnParallelLinear(4096,
4096,
bias=False,
params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = ColumnParallelLinearWithLoRA(linear)
lora_linear.create_lora_weights(max_loras, lora_config)
Expand All @@ -561,7 +569,7 @@ def create_random_linear_parallel_layer():
num_inputs=32 * num_loras,
input_size=(1, 4096),
input_range=(0, 1),
input_type=torch.float32,
input_type=torch.float16,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

Expand Down Expand Up @@ -600,7 +608,7 @@ def create_random_linear_parallel_layer():
num_inputs=32 * num_loras,
input_size=(1, 4096),
input_range=(0, 1),
input_type=torch.float32,
input_type=torch.float16,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

Expand Down Expand Up @@ -633,15 +641,24 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
def create_column_parallel_packed_layer():
if repeats == 2:
linear = MergedColumnParallelLinear(4096, [4096] * repeats,
bias=False)
bias=False,
params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = MergedColumnParallelLinearWithLoRA(linear)
elif repeats == 3:
linear = QKVParallelLinear(4096, 64, 32, bias=False)
linear = QKVParallelLinear(4096,
64,
32,
bias=False,
params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = MergedQKVParallelLinearWithLora(linear)
else:
linear = QKVParallelLinear(4096, 64, 32, bias=False)
linear = QKVParallelLinear(4096,
64,
32,
bias=False,
params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = QKVParallelLinearWithLora(linear)

Expand Down Expand Up @@ -676,7 +693,7 @@ class FakeConfig:
num_inputs=32 * num_loras,
input_size=(1, 4096),
input_range=(0, 1),
input_type=torch.float32,
input_type=torch.float16,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

Expand Down Expand Up @@ -716,7 +733,7 @@ class FakeConfig:
num_inputs=32 * num_loras,
input_size=(1, 4096),
input_range=(0, 1),
input_type=torch.float32,
input_type=torch.float16,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

Expand Down

0 comments on commit 1b358df

Please sign in to comment.