forked from vllm-project/vllm
Commit 0fbfc4b (1 parent: c06170c)
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing 35 changed files with 1,781 additions and 81 deletions.
@@ -0,0 +1,64 @@
/*
Copied from https://github.com/turboderp/exllamav2
*/

#ifndef _compat_cuh
#define _compat_cuh

namespace vllm {
namespace gptq {
// atomicAdd for half types, to support CC < 7.x

__device__ __forceinline__ void atomicAdd_half(half* address, half val)
{
    unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
    unsigned int old = *address_as_ui;
    unsigned int assumed;

    do
    {
        assumed = old;
        __half_raw hsum;
        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
        half tmpres = __hadd(hsum, val);
        hsum = __half_raw(tmpres);
        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
        old = atomicCAS(address_as_ui, assumed, old);
    }
    while (assumed != old);
}

// atomicAdd for half2 types

__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
{
    unsigned int* address_as_ui = (unsigned int*)address;
    unsigned int old = *address_as_ui;
    unsigned int assumed;
    do
    {
        assumed = old;
        half2 old_val = *((half2*)&old);
        half2 new_val = __hadd2(old_val, val);
        old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
    }
    while (assumed != old);
}

//

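// Remap the generic atomicAdd name onto the fallbacks only where hardware
// support is missing: native half atomics require CC >= 7.0 and native half2
// atomics CC >= 6.0, so older architectures (and ROCm builds) use the CAS
// loops above.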
#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)

__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }

#if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
#endif

#endif
#endif

} // namespace gptq
} // namespace vllm
#endif
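For context, here is a minimal sketch (not part of this commit) of how device code can rely on this header: inside the vllm::gptq namespace, an unqualified atomicAdd on half resolves to the CAS-based fallback above when compiling for compute capability below 7.0 or under ROCm, and to the native half atomic otherwise. The kernel name and the include path are illustrative assumptions.

#include <cuda_fp16.h>
#include "compat.cuh"  // assumed include path for the header shown above

namespace vllm {
namespace gptq {

// Toy reduction: each thread atomically adds one half element into *out.
__global__ void sum_half_kernel(const half* in, half* out, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) atomicAdd(out, in[idx]);
}

} // namespace gptq
} // namespace vllm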
@@ -0,0 +1,151 @@
/*
Adapted from https://github.com/turboderp/exllamav2 and https://github.com/turboderp/exllama
*/

#ifndef _matrix_view_cuh
#define _matrix_view_cuh

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "qdq_util.cuh"

namespace vllm {
namespace gptq {

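// Read-only, row-major view over a half matrix; the item4/item4_f/item4_h2
// helpers fetch four adjacent elements of a row via two half2 loads.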
class MatrixView_half
{
public:
    const half* data;
    const int height;
    const int width;

    __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width)
        : data(data), height(height), width(width)
    { }

    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
    __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); }
    __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; }

    __device__ __forceinline__ void item4(half (&items)[4], int row, int column) const
    {
        half2* ptr = (half2*) item_ptr(row, column);
        half2 i01 = ptr[0];
        half2 i23 = ptr[1];
        items[0] = __low2half(i01);
        items[1] = __high2half(i01);
        items[2] = __low2half(i23);
        items[3] = __high2half(i23);
    }
    __device__ __forceinline__ void item4_f(float (&items)[4], int row, int column) const
    {
        half2* ptr = (half2*)item_ptr(row, column);
        half2 i01 = ptr[0];
        half2 i23 = ptr[1];
        items[0] = __half2float(__low2half(i01));
        items[1] = __half2float(__high2half(i01));
        items[2] = __half2float(__low2half(i23));
        items[3] = __half2float(__high2half(i23));
    }

    __device__ __forceinline__ void item4_h2(half2 (&items)[4], int row, int column) const
    {
        half2* ptr = (half2*)item_ptr(row, column);
        half2 i01 = ptr[0];
        half2 i23 = ptr[1];
        items[0] = __half2half2(__low2half(i01));
        items[1] = __half2half2(__high2half(i01));
        items[2] = __half2half2(__low2half(i23));
        items[3] = __half2half2(__high2half(i23));
    }
};

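// Writable variant of the half view; set_half2/set4 store two or four adjacent
// elements of a row at once.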
class MatrixView_half_rw
{
public:
    half* data;
    const int height;
    const int width;

    __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width)
        : data(data), height(height), width(width)
    { }

    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
    __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; }
    __device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; }
    __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; }

    __device__ __forceinline__ void set4(int row, int column, half v0, half v1, half v2, half v3)
    {
        half2 v01 = __halves2half2(v0, v1);
        half2 v23 = __halves2half2(v2, v3);
        half2* ptr = (half2*) item_ptr(row, column);
        ptr[0] = v01;
        ptr[1] = v23;
    }
};

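// Row-major view of 4-bit quantized data: each uint32_t packs eight consecutive
// columns of one row, low nibble first.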
class MatrixView_q4_row
{
public:
    const uint32_t* data;
    const int height;
    const int width;

    __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width)
        : data(data), height(height), width(width)
    { }

    __device__ __forceinline__ int item(int row, int column) const
    {
        int shift = (column & 0x07) * 4;
        return (data[row * width / 8 + column / 8] >> shift) & 0x0f;
    }

    __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const
    {
        int shift = (column & 0x07) * 4;
        uint32_t d = data[row * width / 8 + column / 8] >> shift;
        items[0] = d & 0x0f;
        items[1] = (d >> 4) & 0x0f;
    }

    __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const
    {
        int shift = (column & 0x07) * 4;
        uint32_t d = data[row * width / 8 + column / 8] >> shift;
        items[0] = d & 0x0f;
        items[1] = (d >> 4) & 0x0f;
        items[2] = (d >> 8) & 0x0f;
        items[3] = (d >> 12) & 0x0f;
    }
};

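// Column-major view of 4-bit quantized data: each uint32_t packs eight
// consecutive rows of one column.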
class MatrixView_q4_column
{
public:
    const uint32_t* data;
    const int height;
    const int width;

    __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, const int height, const int width)
        : data(data), height(height), width(width)
    { }

    __device__ __forceinline__ int item(int row, int column) const
    {
        int shift = (row & 0x07) * 4;
        return (data[row / 8 * width + column] >> shift) & 0x0f;
    }

    __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) { return data[row / 8 * width + column]; }
    __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; }
};

} // namespace gptq
} // namespace vllm
#endif
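As a usage illustration (again not part of this commit), the sketch below dequantizes four adjacent 4-bit weights from a row-packed matrix with per-column half scales, using MatrixView_q4_row::item4 and MatrixView_half::item4. The kernel, its fixed zero point of 8, and the include path are assumptions for the example, not how the accompanying GPTQ kernels necessarily dequantize.

#include <cuda_fp16.h>
#include "matrix_view.cuh"  // assumed include path for the header shown above

namespace vllm {
namespace gptq {

// Illustrative dequantization: out[row, col..col+3] = (q - 8) * scale[col..col+3],
// where q is a 4-bit value unpacked from the row-packed weight matrix.
__global__ void dequant4_kernel(const uint32_t* w, const half* scales, half* out,
                                int height, int width)
{
    int col = (blockIdx.x * blockDim.x + threadIdx.x) * 4;
    int row = blockIdx.y;
    if (row >= height || col + 3 >= width) return;

    MatrixView_q4_row w_(w, height, width);
    MatrixView_half scales_(scales, 1, width);

    int q[4];
    w_.item4(q, row, col);        // four packed 4-bit values
    half s[4];
    scales_.item4(s, 0, col);     // four per-column scales

    for (int i = 0; i < 4; ++i)
        out[row * width + col + i] = __hmul(__int2half_rn(q[i] - 8), s[i]);
}

} // namespace gptq
} // namespace vllm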