remove extra, backend from ggml.c, ggml.h
JohannesGaessler committed Jun 6, 2023
1 parent e645d12 commit f000018
Showing 4 changed files with 38 additions and 56 deletions.
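
At a glance: the diff drops the context-wide default backend (the default_backend field of struct ggml_context and the ggml_set_default_backend() API), so new tensors always start on GGML_BACKEND_CPU, and llama.cpp instead marks individual tensors for offload through a function pointer — llama_nop for CPU layers, ggml_cuda_assign_buffers for offloaded layers. Below is a minimal, self-contained sketch of that function-pointer dispatch pattern; it uses stand-in types and names (tensor, nop, assign_gpu_buffers, the layer counts) rather than the real ggml API.

    #include <stdio.h>

    struct tensor { const char * name; int on_gpu; };   /* stand-in for ggml_tensor */

    typedef void (*offload_func_t)(struct tensor * t);

    static void nop(struct tensor * t) { (void) t; }     /* CPU layer: do nothing */
    static void assign_gpu_buffers(struct tensor * t) {  /* stand-in for ggml_cuda_assign_buffers */
        t->on_gpu = 1;
    }

    int main(void) {
        const int n_layer = 4, n_gpu_layers = 2;
        const int i_gpu_start = n_layer - n_gpu_layers;  /* same boundary llama_eval_internal uses */

        for (int il = 0; il < n_layer; ++il) {
            offload_func_t offload_func = (il >= i_gpu_start) ? assign_gpu_buffers : nop;

            struct tensor cur = { "rms_norm_0", 0 };
            offload_func(&cur);                          /* one call site, no per-layer #ifdefs */
            printf("layer %d: %s on_gpu=%d\n", il, cur.name, cur.on_gpu);
        }
        return 0;
    }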
8 changes: 6 additions & 2 deletions ggml-cuda.cu
@@ -1196,7 +1196,7 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true);
}

void ggml_cuda_noop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
(void) src0;
(void) src1;
(void) dst;
@@ -1287,6 +1287,10 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
}

void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
if (tensor->src0 != nullptr && tensor->src0->op == GGML_OP_RESHAPE) {
ggml_cuda_assign_buffers(tensor->src0);
}

const size_t size = ggml_nbytes(tensor);
const size_t scratch_size = g_n_batch * GGML_CUDA_SCRATCH_SIZE_PER_BATCH;
GGML_ASSERT(size <= scratch_size);
@@ -1367,7 +1371,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
if (!any_on_device) {
return false;
}
func = ggml_cuda_noop;
func = ggml_cuda_nop;
break;
case GGML_OP_ROPE:
if (!any_on_device) {
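For context on the new block in ggml_cuda_assign_buffers above: ggml_reshape and friends return no-copy views, and (per the ggml.c hunks below) views no longer inherit backend or extra from their source, so when a buffer is assigned to a tensor whose src0 is a reshape, the reshape is now flagged as well — otherwise a CPU-resident view could sit in the middle of an offloaded chain. A rough, self-contained illustration of that recursion with stand-in types (node, assign_buffers), not the real ggml structs:

    #include <stddef.h>
    #include <stdio.h>

    enum op { OP_NONE, OP_MUL_MAT, OP_RESHAPE };

    struct node {
        enum op       op;
        struct node * src0;   /* a reshape aliases the data of src0 */
        int           on_gpu;
    };

    /* Assign a (pretend) device buffer, and do the same for a reshape parent. */
    static void assign_buffers(struct node * t) {
        if (t->src0 != NULL && t->src0->op == OP_RESHAPE) {
            assign_buffers(t->src0);   /* recurse into the source, not into t */
        }
        t->on_gpu = 1;
    }

    int main(void) {
        struct node mul     = { OP_MUL_MAT, NULL,     0 };
        struct node reshape = { OP_RESHAPE, &mul,     0 };
        struct node next    = { OP_NONE,    &reshape, 0 };

        assign_buffers(&next);
        printf("next=%d reshape=%d mul=%d\n", next.on_gpu, reshape.on_gpu, mul.on_gpu);
        return 0;   /* prints next=1 reshape=1 mul=0 */
    }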
30 changes: 1 addition & 29 deletions ggml.c
@@ -3639,8 +3639,6 @@ struct ggml_context {

struct ggml_scratch scratch;
struct ggml_scratch scratch_save;

enum ggml_backend default_backend;
};

struct ggml_context_container {
@@ -3967,7 +3965,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
/*.objects_end =*/ NULL,
/*.scratch =*/ { 0, 0, NULL, },
/*.scratch_save =*/ { 0, 0, NULL, },
/*.default_backend =*/ GGML_BACKEND_CPU,
};

GGML_ASSERT(ctx->mem_buffer != NULL);
@@ -4026,10 +4023,6 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
ctx->no_alloc = no_alloc;
}

void ggml_set_default_backend(struct ggml_context * ctx, enum ggml_backend backend) {
ctx->default_backend = backend;
}

void * ggml_get_mem_buffer(struct ggml_context * ctx) {
return ctx->mem_buffer;
}
@@ -4141,7 +4134,7 @@ struct ggml_tensor * ggml_new_tensor_impl(

*result = (struct ggml_tensor) {
/*.type =*/ type,
/*.backend =*/ ctx->default_backend,
/*.backend =*/ GGML_BACKEND_CPU,
/*.n_dims =*/ n_dims,
/*.ne =*/ { 1, 1, 1, 1 },
/*.nb =*/ { 0, 0, 0, 0 },
@@ -4174,15 +4167,6 @@ struct ggml_tensor * ggml_new_tensor_impl(
result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
}

#ifdef GGML_USE_CUBLAS
if (result->backend == GGML_BACKEND_GPU) {
ggml_cuda_assign_buffers(result);
}
#else
GGML_ASSERT(result->backend == GGML_BACKEND_CPU);
#endif // GGML_USE_CUBLAS
GGML_ASSERT(result->backend != GGML_BACKEND_GPU_SPLIT);

ctx->n_objects++;

return result;
@@ -4537,8 +4521,6 @@ struct ggml_tensor * ggml_view_tensor(
result->nb[1] = src->nb[1];
result->nb[2] = src->nb[2];
result->nb[3] = src->nb[3];
result->backend = src->backend;
result->extra = src->extra;

return result;
}
@@ -5691,8 +5673,6 @@ struct ggml_tensor * ggml_reshape(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
result->backend = a->backend;
result->extra = a->extra;

return result;
}
@@ -5717,8 +5697,6 @@ struct ggml_tensor * ggml_reshape_1d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
result->backend = a->backend;
result->extra = a->extra;

return result;
}
@@ -5744,8 +5722,6 @@ struct ggml_tensor * ggml_reshape_2d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
result->backend = a->backend;
result->extra = a->extra;

return result;
}
@@ -5772,8 +5748,6 @@ struct ggml_tensor * ggml_reshape_3d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
result->backend = a->backend;
result->extra = a->extra;

return result;
}
@@ -5802,8 +5776,6 @@ struct ggml_tensor * ggml_reshape_4d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
result->backend = a->backend;
result->extra = a->extra;

return result;
}
1 change: 0 additions & 1 deletion ggml.h
@@ -479,7 +479,6 @@ extern "C" {

GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
GGML_API void ggml_set_default_backend(struct ggml_context * ctx, enum ggml_backend backend);

GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
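With ggml_set_default_backend() removed from the public header and the backend field of new tensors hard-coded to GGML_BACKEND_CPU, GPU placement becomes an explicit, per-tensor request on the CUDA side. A hedged before/after sketch of the API shape — these are fragments, not a complete program, and ctx0 (an existing ggml_context) and n_embd are assumed to be in scope:

    /* before this commit (API removed here): new tensors followed the context default,
       and ggml_new_tensor_impl itself called ggml_cuda_assign_buffers for GPU tensors */
    ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
    struct ggml_tensor * t0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd);  /* landed on the GPU */
    ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);

    /* after this commit: */
    struct ggml_tensor * t1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd);  /* always GGML_BACKEND_CPU */
    #ifdef GGML_USE_CUBLAS
    ggml_cuda_assign_buffers(t1);   /* opt in explicitly, as llama.cpp now does via offload_func */
    #endif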
55 changes: 31 additions & 24 deletions llama.cpp
@@ -60,6 +60,9 @@ static const size_t MB = 1024*1024;
// TODO: dynamically determine these sizes
// needs modifications in ggml

typedef void (*offload_func_t)(struct ggml_tensor * tensor);
void llama_nop(struct ggml_tensor * tensor) {} // do nothing by default

static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
{
static std::map<e_model, size_t> k_sizes = {
@@ -1300,10 +1303,11 @@ static bool llama_eval_internal(
const int i_gpu_start = n_layer - n_gpu_layers;

for (int il = 0; il < n_layer; ++il) {
ggml_backend backend_offload = GGML_BACKEND_CPU;
offload_func_t offload_func = llama_nop;

#ifdef GGML_USE_CUBLAS
if (il >= i_gpu_start) {
backend_offload = GGML_BACKEND_GPU;
offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
}
#endif // GGML_USE_CUBLAS

@@ -1313,40 +1317,31 @@

// norm
{
ggml_set_default_backend(ctx0, backend_offload);
cur = ggml_rms_norm(ctx0, inpL);
offload_func(cur);
ggml_set_name(cur, "rms_norm_0");

// cur = cur*attention_norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
offload_func(cur);
ggml_set_name(cur, "attention_norm_0");
}

// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * tmpq = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N);
offload_func(cur);
ggml_set_name(tmpq, "tmpq");
struct ggml_tensor * tmpk = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N);
offload_func(cur);
ggml_set_name(tmpk, "tmpk");
ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);

#ifdef GGML_USE_CUBLAS
struct ggml_tensor * Kcur;
struct ggml_tensor * Qcur;
if (backend_offload == GGML_BACKEND_GPU) {
Kcur = ggml_rope(ctx0, tmpk, n_past, n_rot, 0);
Qcur = ggml_rope(ctx0, tmpq, n_past, n_rot, 0);
} else {
Kcur = ggml_rope_inplace(ctx0, tmpk, n_past, n_rot, 0);
Qcur = ggml_rope_inplace(ctx0, tmpq, n_past, n_rot, 0);
}
#else
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, tmpk, n_past, n_rot, 0);
ggml_set_name(Kcur, "Kcur");

struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, tmpq, n_past, n_rot, 0);
#endif // GGML_USE_CUBLAS
ggml_set_name(Qcur, "Qcur");
ggml_set_name(Kcur, "Kcur");

// store key and value to memory
{
@@ -1430,62 +1425,70 @@
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
ggml_set_name(cur, "KQV_merged_contiguous");

ggml_set_default_backend(ctx0, backend_offload);
// projection (no bias)
cur = ggml_mul_mat(ctx0,
model.layers[il].wo,
cur);
offload_func(cur);
ggml_set_name(cur, "result_wo");
}

lctx.use_buf(ctx0, 1);
//ggml_cuda_set_scratch(1);

struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
offload_func(inpFF);
ggml_set_name(inpFF, "inpFF");

// feed-forward network
{
// norm
{
cur = ggml_rms_norm(ctx0, inpFF);
offload_func(cur);
ggml_set_name(cur, "rms_norm_1");

// cur = cur*ffn_norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
offload_func(cur);
ggml_set_name(cur, "ffn_norm");
}

struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
model.layers[il].w3,
cur);
ggml_set_name(cur, "result_w3");
offload_func(tmp);
ggml_set_name(tmp, "result_w3");

cur = ggml_mul_mat(ctx0,
model.layers[il].w1,
cur);
offload_func(cur);
ggml_set_name(cur, "result_w2");

// SILU activation
cur = ggml_silu(ctx0, cur);
offload_func(cur);
ggml_set_name(cur, "silu");

cur = ggml_mul(ctx0, cur, tmp);
offload_func(cur);
ggml_set_name(cur, "silu_x_result_w3");

cur = ggml_mul_mat(ctx0,
model.layers[il].w2,
cur);
offload_func(cur);
ggml_set_name(cur, "result_w2");
}

cur = ggml_add(ctx0, cur, inpFF);
offload_func(cur);
ggml_set_name(cur, "inpFF_+_result_w2");

// input for next layer
inpL = cur;

ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
}

lctx.use_buf(ctx0, 0);
@@ -1494,28 +1497,32 @@
// used at the end to optionally extract the embeddings
struct ggml_tensor * embeddings = NULL;

offload_func_t offload_func = llama_nop;

#ifdef GGML_USE_CUBLAS
if (n_gpu_layers > n_layer) {
ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
}
if (n_gpu_layers > n_layer) {
offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
}
#endif // GGML_USE_CUBLAS

// norm
{
cur = ggml_rms_norm(ctx0, inpL);
offload_func(cur);
ggml_set_name(cur, "rms_norm_inpL");

cur = ggml_rms_norm(ctx0, cur);
offload_func(cur);
ggml_set_name(cur, "rms_norm_after");

// cur = cur*norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.norm);
offload_func(cur);
ggml_set_name(cur, "result_norm");

embeddings = cur;
}

ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);

// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
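One detail worth calling out from the last hunk: the norm after the layer loop is only offloaded when n_gpu_layers is strictly greater than n_layer, so requesting exactly n_layer GPU layers offloads the repeating layers but leaves the output norm — and the lm_head matmul, which gets no offload_func call in the hunks shown — on the CPU. A small sketch of that selection logic; the variable names come from llama.cpp, while the concrete values (32 layers, 33 requested, e.g. via --n-gpu-layers) are made up for illustration:

    #include <stdio.h>

    int main(void) {
        const int n_layer = 32;        /* repeating layers in a 7B LLaMA-style model */
        const int n_gpu_layers = 33;   /* requested offload count (illustrative) */

        const int i_gpu_start = n_layer - n_gpu_layers;          /* -1: every il >= i_gpu_start */
        const int offload_output_norm = n_gpu_layers > n_layer;  /* 1 only because 33 > 32 */

        printf("i_gpu_start=%d offload_output_norm=%d\n", i_gpu_start, offload_output_norm);
        return 0;
    }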
