From 7cdc3eaa76b077aaea7a91b6a3a795bb05c9e5da Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Wed, 20 Sep 2023 17:11:33 -0700 Subject: [PATCH 01/22] Produces garbage output --- convert-adept-st-to-gguf.py | 149 ++++++++++++++++ convert.py | 7 +- gguf-py/gguf/gguf.py | 42 +++++ llama.cpp | 339 +++++++++++++++++++++++++++++++++++- 4 files changed, 528 insertions(+), 9 deletions(-) create mode 100644 convert-adept-st-to-gguf.py diff --git a/convert-adept-st-to-gguf.py b/convert-adept-st-to-gguf.py new file mode 100644 index 0000000000000..4844d5f81658d --- /dev/null +++ b/convert-adept-st-to-gguf.py @@ -0,0 +1,149 @@ +from convert import lazy_load_safetensors_file +import sys +import torch +from safetensors import safe_open +from pathlib import Path +from pprint import pprint +from sentencepiece import SentencePieceProcessor +import argparse +import gguf +import json +import struct + +def file_is_safetensors(path: Path) -> bool: + fp = open(path, 'rb') + first8 = fp.read(8) + fp.seek(0) + if first8[:2] == b'PK': + # A zip file, i.e. PyTorch format + return False + return struct.unpack(' None: + parser = argparse.ArgumentParser(description="Convert an Adept model (e.g. Persimmon 8b) to a GGML compatible file") + parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") + parser.add_argument("--outtype", choices=["f32"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") + parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm") + args = parser.parse_args(args_in) + + assert file_is_safetensors(args.model), 'Error: model file is not a SafeTensors file' + model = lazy_load_safetensors_file(open(args.model, 'rb'), args.model) + dir_model = args.model.parent + with open(dir_model / 'config.json', 'r') as f: + hparams = json.load(f) + pprint(hparams) + arch = gguf.MODEL_ARCH.ADEPT + gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch]) + + block_count = hparams['num_layers'] + head_count = hparams['num_attention_heads'] + head_count_kv = head_count + ctx_length = hparams['seq_length'] + hidden_size = hparams['hidden_size'] + + gguf_writer.add_name('persimmon-8b-chat') + gguf_writer.add_context_length(ctx_length) + gguf_writer.add_embedding_length(hidden_size) + gguf_writer.add_block_count(block_count) + gguf_writer.add_feed_forward_length(hparams['ffn_hidden_size']) + gguf_writer.add_rope_dimension_count(hidden_size // head_count) + gguf_writer.add_head_count(head_count) + gguf_writer.add_head_count_kv(head_count_kv) + if True: + tokens, scores, toktypes = handle_tokenizer(dir_model) + gguf_writer.add_tokenizer_model('llama') + gguf_writer.add_token_list(tokens) + gguf_writer.add_token_scores(scores) + gguf_writer.add_token_types(toktypes) + tensor_map = gguf.get_tensor_name_map(arch, block_count) + print(tensor_map) + tensors = {} + with safe_open(args.model, framework="pt") as f: + for k in f.keys(): + tensors[k] = f.get_tensor(k) + print(len(tensors.keys())) + for name in tensors.keys(): + data = tensors[name] + print(name) + + # we don't need these + + if name.endswith(".self_attention.rotary_emb.inv_freq"): + continue + old_dtype = data.dtype + if 'layernorm.weight' in name: + data = data.to(torch.float32) + else: + 
if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float16) + # check for nans + if torch.isnan(data).any(): + print("WARNING: tensor '" + name + "' contains NaNs") + sys.exit() + if torch.isinf(data).any(): + print("WARNING: tensor '" + name + "' contains infinities") + sys.exit() + + data = data.squeeze().numpy() + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + n_dims = len(data.shape) + print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + + gguf_writer.add_tensor(new_name, data) + print("gguf: write header") + gguf_writer.write_header_to_file() + print("gguf: write metadata") + gguf_writer.write_kv_data_to_file() + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() + + gguf_writer.close() + + print(f"gguf: model successfully exported to '{args.outfile}'") + print("") + + + +if __name__ == '__main__': + main() diff --git a/convert.py b/convert.py index 4ac5030db61eb..cf6e88cf348d4 100755 --- a/convert.py +++ b/convert.py @@ -701,13 +701,18 @@ def rebuild_from_type_v2(func, new_type, args, state): def find_class(self, module: str, name: str) -> Any: if not module.startswith('torch'): return super().find_class(module, name) - return self.CLASSES[(module, name)] + if (module, name) in self.CLASSES: + return self.CLASSES[(module, name)] + else: + print(f'Missing mapping for {module}.{name}') + raise KeyError def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus: zf = zipfile.ZipFile(outer_fp) pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')] assert len(pickle_paths) == 1, pickle_paths + print(pickle_paths) pickle_fp = zf.open(pickle_paths[0], 'r') unpickler = LazyUnpickler(pickle_fp, data_base_path=pickle_paths[0][:-4], diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index e0e0dbcbbe840..93a397109fa64 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -85,6 +85,7 @@ class MODEL_ARCH(IntEnum): GPTNEOX : int = auto() MPT : int = auto() STARCODER : int = auto() + ADEPT : int = auto() class MODEL_TENSOR(IntEnum): @@ -105,6 +106,8 @@ class MODEL_TENSOR(IntEnum): FFN_DOWN : int = auto() FFN_UP : int = auto() FFN_NORM : int = auto() + ATTN_Q_NORM : int = auto() + ATTN_K_NORM : int = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -116,6 +119,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GPTNEOX: "gptneox", MODEL_ARCH.MPT: "mpt", MODEL_ARCH.STARCODER: "starcoder", + MODEL_ARCH.ADEPT: "adept", } MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = { @@ -185,6 +189,20 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", }, + MODEL_ARCH.ADEPT: { + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", + MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", + MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", + }, MODEL_ARCH.GPT2: { # TODO }, @@ -201,6 +219,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.ADEPT: [ + 
MODEL_TENSOR.ROPE_FREQS, + ] } @@ -213,6 +234,7 @@ class TensorNameMap: "transformer.word_embeddings", # falcon "model.embed_tokens", # llama-hf "tok_embeddings", # llama-pth + "language_model.embedding.word_embeddings", # adept ), # Position embeddings @@ -225,6 +247,7 @@ class TensorNameMap: "embed_out", # gptneox "lm_head", # gpt2 mpt falcon llama-hf baichuan "output", # llama-pth + "word_embeddings_for_head", # adept ), # Output norm @@ -233,6 +256,7 @@ class TensorNameMap: "transformer.ln_f", # gpt2 falcon "model.norm", # llama-hf baichuan "norm", # llama-pth + "language_model.encoder.final_layernorm", # adept ), # Rope frequencies @@ -251,6 +275,7 @@ class TensorNameMap: "transformer.h.{bid}.ln_mlp", # falcon40b "model.layers.{bid}.input_layernorm", # llama-hf "layers.{bid}.attention_norm", # llama-pth + "language_model.encoder.layers.{bid}.input_layernorm", # adept ), # Attention norm 2 @@ -264,6 +289,7 @@ class TensorNameMap: "transformer.h.{bid}.attn.c_attn", # gpt2 "transformer.blocks.{bid}.attn.Wqkv", # mpt "transformer.h.{bid}.self_attention.query_key_value", # falcon + "language_model.encoder.layers.{bid}.self_attention.query_key_value", # adept ), # Attention query @@ -292,6 +318,7 @@ class TensorNameMap: "transformer.h.{bid}.self_attention.dense", # falcon "model.layers.{bid}.self_attn.o_proj", # llama-hf "layers.{bid}.attention.wo", # llama-pth + "language_model.encoder.layers.{bid}.self_attention.dense" # adept ), # Rotary embeddings @@ -307,6 +334,7 @@ class TensorNameMap: "transformer.blocks.{bid}.norm_2", # mpt "model.layers.{bid}.post_attention_layernorm", # llama-hf "layers.{bid}.ffn_norm", # llama-pth + "language_model.encoder.layers.{bid}.post_attention_layernorm", # adept ), # Feed-forward up @@ -317,6 +345,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon "model.layers.{bid}.mlp.up_proj", # llama-hf "layers.{bid}.feed_forward.w3", # llama-pth + "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # adept ), # Feed-forward gate @@ -333,7 +362,20 @@ class TensorNameMap: "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon "model.layers.{bid}.mlp.down_proj", # llama-hf "layers.{bid}.feed_forward.w2", # llama-pth + "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # adept + ), + + MODEL_TENSOR.ATTN_Q_NORM: ( + "language_model.encoder.layers.{bid}.self_attention.q_layernorm", ), + + MODEL_TENSOR.ATTN_K_NORM: ( + "language_model.encoder.layers.{bid}.self_attention.k_layernorm", + ), + + MODEL_TENSOR.ROPE_FREQS: ( + "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # adept + ) } mapping: dict[str, tuple[MODEL_TENSOR, str]] diff --git a/llama.cpp b/llama.cpp index 79b48897d8bbe..c354f1ef22c97 100644 --- a/llama.cpp +++ b/llama.cpp @@ -162,6 +162,7 @@ enum llm_arch { LLM_ARCH_GPTNEOX, LLM_ARCH_MPT, LLM_ARCH_STARCODER, + LLM_ARCH_ADEPT, LLM_ARCH_UNKNOWN, }; @@ -174,6 +175,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_MPT, "mpt" }, { LLM_ARCH_BAICHUAN, "baichuan" }, { LLM_ARCH_STARCODER, "starcoder" }, + { LLM_ARCH_ADEPT, "adept" }, }; enum llm_kv { @@ -292,6 +294,8 @@ enum llm_tensor { LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, LLM_TENSOR_FFN_NORM, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, }; static std::map> LLM_TENSOR_NAMES = { @@ -373,6 +377,23 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_ADEPT, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd"}, + { LLM_TENSOR_OUTPUT_NORM, "output_norm"}, + { LLM_TENSOR_OUTPUT, "output"}, + { LLM_TENSOR_ATTN_NORM, 
"blk.%d.attn_norm"}, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"}, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"}, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"}, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"}, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"}, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"}, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"}, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"}, + }, + }, { LLM_ARCH_MPT, { @@ -916,6 +937,7 @@ enum e_model { MODEL_1B, MODEL_3B, MODEL_7B, + MODEL_8B, MODEL_13B, MODEL_15B, MODEL_30B, @@ -979,6 +1001,10 @@ struct llama_layer { struct ggml_tensor * attn_norm_b; struct ggml_tensor * attn_norm_2; struct ggml_tensor * attn_norm_2_b; + struct ggml_tensor * attn_q_norm; + struct ggml_tensor * attn_q_norm_b; + struct ggml_tensor * attn_k_norm; + struct ggml_tensor * attn_k_norm_b; // attention struct ggml_tensor * wq; @@ -1629,6 +1655,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_1B: return "1B"; case MODEL_3B: return "3B"; case MODEL_7B: return "7B"; + case MODEL_8B: return "8B"; case MODEL_13B: return "13B"; case MODEL_15B: return "15B"; case MODEL_30B: return "30B"; @@ -2296,6 +2323,57 @@ static void llm_load_tensors( } } } break; + case LLM_ARCH_ADEPT: + { + model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + + const uint32_t n_ff = hparams.n_ff; + const int i_gpu_start = n_layer - n_gpu_layers; + model.layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + const ggml_backend backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; + auto & layer = model.layers[i]; + /* + input_layernorm.bias torch.Size([4096]) + input_layernorm.weight torch.Size([4096]) + mlp.dense_4h_to_h.bias torch.Size([4096]) + mlp.dense_4h_to_h.weight torch.Size([4096, 16384]) + mlp.dense_h_to_4h.bias torch.Size([16384]) + mlp.dense_h_to_4h.weight torch.Size([16384, 4096]) + post_attention_layernorm.bias torch.Size([4096]) + post_attention_layernorm.weight torch.Size([4096]) + self_attention.dense.bias torch.Size([4096]) + self_attention.dense.weight torch.Size([4096, 4096]) + self_attention.k_layernorm.bias torch.Size([64]) + self_attention.k_layernorm.weight torch.Size([64]) + self_attention.q_layernorm.bias torch.Size([64]) + self_attention.q_layernorm.weight torch.Size([64]) + self_attention.query_key_value.bias torch.Size([12288]) + self_attention.query_key_value.weight torch.Size([12288, 4096]) + self_attention.rotary_emb.inv_freq torch.Size([16]) + */ + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); + layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); + layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend); + layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); + } + } break; default: throw std::runtime_error("unknown architecture"); }; @@ -2444,6 +2522,8 @@ static struct ggml_cgraph * llm_build_llama( int n_past) { GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT + LLAMA_LOG_INFO("%s: n_tokens = %d\n", __func__, n_tokens); + LLAMA_LOG_INFO("BUILDING GRAPH"); const int N = n_tokens; @@ -3664,6 +3744,238 @@ static struct ggml_cgraph * llm_build_starcoder( return gf; } +static struct ggml_cgraph * llm_build_adept( + llama_context & lctx, + const llama_token * tokens, + const float * embd, + int n_tokens, + int n_past +) { + GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT + const int N = n_tokens; + const auto & model = lctx.model; + const auto & hparams = model.hparams; + + const auto & kv_self = lctx.kv_self; + GGML_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = hparams.n_ctx; 
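A quick cross-check of the shapes listed in the comment above, since they drive every create_tensor call in this branch: with hidden_size 4096, 64 attention heads and ffn_hidden_size 16384 (assumptions inferred from the torch.Size dump, not read from the config), the fused QKV projection and the small per-head Q/K layernorms come out to exactly the sizes requested. n_head_kv equals n_head for this model, so n_embd + 2*n_embd_gqa is simply 3*n_embd.

```python
# Sanity-check the Persimmon-8B shapes against the torch.Size dump above.
# The config values below are assumptions inferred from that dump.
config = {"hidden_size": 4096, "num_attention_heads": 64, "ffn_hidden_size": 16384}

n_embd   = config["hidden_size"]
n_head   = config["num_attention_heads"]
n_ff     = config["ffn_hidden_size"]
head_dim = n_embd // n_head                 # 64 -> size of q/k_layernorm.{weight,bias}

assert 3 * n_embd == 12288                  # query_key_value.weight is (12288, 4096)
assert head_dim == 64
assert (n_ff, n_embd) == (16384, 4096)      # dense_h_to_4h.weight is (16384, 4096)
print("qkv rows:", 3 * n_embd, "head_dim:", head_dim)
```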
+ const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_head = hparams.n_head; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + const float freq_base = hparams.rope_freq_base; + const float freq_scale = hparams.rope_freq_scale; + + GGML_ASSERT(n_embd_head == hparams.n_rot); + auto & buf_compute = lctx.buf_compute; + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ false, + }; + params.no_alloc = true; + + struct ggml_context * ctx0 = ggml_init(params); + ggml_cgraph * gf = ggml_new_graph(ctx0); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + if (tokens) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + + ggml_allocr_alloc(lctx.alloc, inp_tokens); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inp_tokens->data, inp_tokens, N*ggml_element_size(inp_tokens)); + } + LLAMA_LOG_INFO("Token ids:\n", __func__); + for (int i = 0; i < N; ++i) { + LLAMA_LOG_INFO(" %d ", tokens[i]); + } + ggml_set_name(inp_tokens, "inp_tokens"); + + inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + } else { + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + ggml_allocr_alloc(lctx.alloc, inpL); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + } + } + // Log all of the token ids sequentially + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(lctx.alloc, KQ_scale); + if (!ggml_allocr_is_measure(lctx.alloc)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + //LLAMA_LOG_INFO("Entering n_layers loop\n", __func__); + for (int il=0; il < n_layer; ++il) { + struct ggml_tensor * attn_norm; + offload_func_t offload_func = llama_nop; + // Attention + { + // input norming + attn_norm = ggml_norm(ctx0, inpL, hparams.f_norm_eps); + attn_norm = ggml_add(ctx0, ggml_mul( + ctx0, attn_norm, model.layers[il].attn_norm), + model.layers[il].attn_norm_b); + + // QKV + bias + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm); + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + + const size_t wsize = ggml_type_size(cur->type); + // Apply Q, K layernorm + + struct ggml_tensor * tmpq = ggml_cont( + ctx0, ggml_view_3d( + ctx0, cur, n_embd_head, n_head, N, + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + 0 + ) + ); + struct ggml_tensor * tmpk = ggml_cont( + ctx0, ggml_view_3d( + ctx0, cur, n_embd_head, n_head, N, + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + wsize * n_embd_head * n_head + ) + ); + tmpk = ggml_norm(ctx0, tmpk, hparams.f_norm_eps); + tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); + tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); + + tmpq = ggml_norm(ctx0, tmpq, hparams.f_norm_eps); + tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); + tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); + + + struct ggml_tensor * Qcur = ggml_rope_custom_inplace( + ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale + ); + struct ggml_tensor * Kcur = ggml_rope_custom_inplace( + ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale + ); + + struct ggml_tensor * tmpv = ggml_view_3d( + ctx0, cur, n_embd_head, n_head_kv, N, + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + wsize * n_embd_head * (n_head + 
n_head_kv)); + + { + // Set kv cache elements? + struct ggml_tensor * Vcur = ggml_transpose( + ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N) + ); + ggml_set_name(Vcur, "Vcur"); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, + (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past) + ); + ggml_set_name(k, "k"); + + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + //LLAMA_LOG_INFO("3889\n", __func__); + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + ggml_set_name(Q, "Q"); + + // index into kv cache? + struct ggml_tensor * K = + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_past + N, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + + ggml_set_name(K, "K"); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + ggml_set_name(KQ, "KQ"); + + struct ggml_tensor * KQ_scaled = ggml_scale_inplace (ctx0, KQ, KQ_scale); + ggml_set_name(KQ_scaled, "KQ_scaled"); + + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + ggml_set_name(KQ_masked, "KQ_soft_max"); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + ggml_set_name(KQ_soft_max, "KQ_soft_max"); + + //LLAMA_LOG_INFO("3915\n", __func__); + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_past + N, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + ggml_set_name(V, "V"); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + ggml_set_name(KQV, "KQV"); + + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + ggml_set_name(KQV_merged, "KQV_merged"); + + cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_set_name(cur, "KQV_merged_contiguous"); + + cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + ggml_set_name(cur, "result_wo"); + //LLAMA_LOG_INFO("EoWo\n", __func__); + } + struct ggml_tensor * attn_out = cur; + { + struct ggml_tensor * inpFF = attn_norm; + // Norm + { + cur = ggml_norm(ctx0, inpFF, hparams.f_norm_eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); + } + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); + // Squared ReLU + cur = ggml_relu(ctx0, cur); + cur = ggml_mul(ctx0, cur, cur); + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); + } + cur = ggml_add(ctx0, cur, attn_out); + ggml_set_name(cur, "inpFF_+_attn_out"); + inpL = cur; + //LLAMA_LOG_INFO("EoL\n", __func__); + } + //LLAMA_LOG_INFO("Exited from n_layers loop\n", __func__); + cur = inpL; + { + //LLAMA_LOG_INFO("norm\n", __func__); + cur = ggml_norm(ctx0, cur, hparams.f_norm_eps); + //LLAMA_LOG_INFO("ggml_norm\n", __func__); + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, model.output_norm), + model.output_norm_b); + //LLAMA_LOG_INFO("result_norm\n", __func__); + ggml_set_name(cur, "result_norm"); + } + //LLAMA_LOG_INFO("matmul\n", __func__); + cur = ggml_mul_mat(ctx0, model.output, cur); + ggml_set_name(cur, "result_output"); + 
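One detail worth calling out in the feed-forward block above: Persimmon uses a squared ReLU activation, which is why the graph multiplies cur by itself right after ggml_relu instead of using SiLU or GELU with a gate tensor. A minimal NumPy sketch of the same block (weight names and sizes here are illustrative, not the model's):

```python
import numpy as np

def persimmon_ffn(x, w_up, b_up, w_down, b_down):
    # x: (n_embd,), w_up: (n_ff, n_embd), w_down: (n_embd, n_ff)
    h = w_up @ x + b_up          # dense_h_to_4h
    h = np.maximum(h, 0.0) ** 2  # squared ReLU: relu(h) * relu(h)
    return w_down @ h + b_down   # dense_4h_to_h

x = np.random.randn(8).astype(np.float32)
w_up, b_up = np.random.randn(32, 8).astype(np.float32), np.zeros(32, np.float32)
w_down, b_down = np.random.randn(8, 32).astype(np.float32), np.zeros(8, np.float32)
print(persimmon_ffn(x, w_up, b_up, w_down, b_down).shape)  # (8,)
```

The down projection brings the 4h activation back to n_embd before the residual add.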
//LLAMA_LOG_INFO("bf expand\n", __func__); + ggml_build_forward_expand(gf, cur); + //LLAMA_LOG_INFO("Freeing ctx0\n", __func__); + ggml_free(ctx0); + //LLAMA_LOG_INFO("Exiting fun\n", __func__); + return gf; +} + static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_token * tokens, @@ -3691,6 +4003,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past); } break; + case LLM_ARCH_ADEPT: + { + result = llm_build_adept(lctx, tokens, embd, n_tokens, n_past); + } break; default: GGML_ASSERT(false); }; @@ -3746,6 +4062,7 @@ static bool llama_eval_internal( ggml_allocr_reset(lctx.alloc); + //LLAMA_LOG_INFO("Building graph\n", __func__); ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past); ggml_allocr_alloc_graph(lctx.alloc, gf); @@ -6329,6 +6646,7 @@ struct llama_context * llama_new_context_with_model( llama_free(ctx); return nullptr; } + LLAMA_LOG_INFO("Kv self cache: %7.2f MB\n", ggml_nbytes(ctx->kv_self.k) / 1024.0 / 1024.0); { const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); @@ -6337,6 +6655,7 @@ struct llama_context * llama_new_context_with_model( const auto & hparams = ctx->model.hparams; + //LLAMA_LOG_INFO("hg\n", __func__); // resized during inference if (params.logits_all) { ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab); @@ -6360,17 +6679,21 @@ struct llama_context * llama_new_context_with_model( int n_tokens = std::min((int)hparams.n_ctx, params.n_batch); int n_past = hparams.n_ctx - n_tokens; llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + //LLAMA_LOG_INFO("Entering build graph () The Ree.\n", __func__); ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past); + //LLAMA_LOG_INFO("Egress from build graph.\n", __func__); #ifdef GGML_USE_METAL - if (params.n_gpu_layers > 0) { - ctx->ctx_metal = ggml_metal_init(1); - if (!ctx->ctx_metal) { - LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__); - llama_free(ctx); - return NULL; + if (false) { + if (params.n_gpu_layers > 0) { + ctx->ctx_metal = ggml_metal_init(1); + if (!ctx->ctx_metal) { + LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__); + llama_free(ctx); + return NULL; + } + ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); + ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); } - ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); - ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); } #endif // measure memory requirements for the graph From 4bcf412d86edd0379aba6921622cbecd2b391ec4 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Mon, 25 Sep 2023 23:49:35 -0700 Subject: [PATCH 02/22] wip: correct tensors up to RoPE --- convert-adept-st-to-gguf.py | 14 +- ggml.c | 111 ++++++++++++++++ llama.cpp | 254 ++++++++++++++++++++++++------------ 3 files changed, 295 insertions(+), 84 deletions(-) diff --git a/convert-adept-st-to-gguf.py b/convert-adept-st-to-gguf.py index 4844d5f81658d..1a6eda8a19f6d 100644 --- a/convert-adept-st-to-gguf.py +++ b/convert-adept-st-to-gguf.py @@ -87,12 +87,16 @@ def main(args_in: list[str] | None = None) -> None: gguf_writer.add_rope_dimension_count(hidden_size // head_count) gguf_writer.add_head_count(head_count) 
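For orientation: the rope dimension count written by the converter here is the full head width, while the graph code later in this series rotates only half of each head (n_rot = n_embd_head / 2) and passes the other half through. A minimal sketch of that arithmetic, using the Persimmon-8B values assumed from the earlier shape dump:

```python
# Assumed Persimmon-8B values, taken from the torch.Size dump in patch 01.
hidden_size = 4096
head_count  = 64

rope_dimension_count = hidden_size // head_count   # 64: what the converter writes
n_embd_head = hidden_size // head_count            # 64: per-head width in llama.cpp
n_rot       = n_embd_head // 2                     # 32: what llm_build_adept rotates
print(rope_dimension_count, n_embd_head, n_rot)    # 64 64 32
```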
gguf_writer.add_head_count_kv(head_count_kv) + gguf_writer.add_rope_freq_base(hparams['rotary_emb_base']) + gguf_writer.add_layer_norm_eps(hparams['layernorm_epsilon']) if True: tokens, scores, toktypes = handle_tokenizer(dir_model) gguf_writer.add_tokenizer_model('llama') gguf_writer.add_token_list(tokens) gguf_writer.add_token_scores(scores) gguf_writer.add_token_types(toktypes) + gguf_writer.add_bos_token_id(71013) + gguf_writer.add_eos_token_id(71013) tensor_map = gguf.get_tensor_name_map(arch, block_count) print(tensor_map) tensors = {} @@ -105,15 +109,17 @@ def main(args_in: list[str] | None = None) -> None: print(name) # we don't need these - - if name.endswith(".self_attention.rotary_emb.inv_freq"): + if name.endswith(".self_attention.rotary_emb.inv_freq"): continue old_dtype = data.dtype - if 'layernorm.weight' in name: + """ + if 'layernorm.weight' in name or 'word_embeddings.weight' in name: data = data.to(torch.float32) else: if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float16) + data = data.to(torch.float32) + """ + data = data.to(torch.float32) # check for nans if torch.isnan(data).any(): print("WARNING: tensor '" + name + "' contains NaNs") diff --git a/ggml.c b/ggml.c index a0be068d6c9f7..2f02865fc92bb 100644 --- a/ggml.c +++ b/ggml.c @@ -4290,6 +4290,65 @@ void ggml_print_objects(const struct ggml_context * ctx) { GGML_PRINT("%s: --- end ---\n", __func__); } +static void ggml_print_tensor(const struct ggml_tensor * tensor) { + GGML_PRINT("Tensor (null): %s | rank %d | shape (", ggml_type_name(tensor->type), tensor->n_dims); + for (int i=0; in_dims; ++i) { + GGML_PRINT("%lld ", tensor->ne[i]); + } + GGML_PRINT(") | strides ("); + for (int i=0; in_dims; ++i) { + GGML_PRINT("%lld ", tensor->nb[i]); + } + GGML_PRINT(")\n"); +} + +static void ggml_print_tensor_values(const struct ggml_tensor * tensor, int starts[], int dim, int nelts) { + GGML_ASSERT(tensor->type == GGML_TYPE_F32); + GGML_PRINT("printing values for %s[", tensor->name); + for (int i=0; in_dims; ++i) { + if (i!=dim) { + GGML_PRINT("%d", starts[i]); + } else { + if (starts[i] > 0) { + GGML_PRINT("%d:%d", starts[i], starts[i]+nelts); + } else { + GGML_PRINT(":%d", starts[i]+nelts); + } + } + if (in_dims-1) { + GGML_PRINT(","); + } + } + GGML_PRINT("]\n"); + + float *dataPtr = (float *) tensor->data; + + // Compute the offset into data for starts + int offset = 0; + for (int j = 0; j < tensor->n_dims; j++) { + offset += (starts[j] * tensor->nb[j]) / sizeof(float); // Assuming nb[j] is in bytes, divide by sizeof(float) to get float offset. + } + + dataPtr += offset; + + for (int i = 0; i < nelts; i++) { + GGML_PRINT("%f ", *dataPtr); + dataPtr += tensor->nb[dim] / sizeof(float); // Increment by strides for the given dimension. + } + GGML_PRINT("\n"); + /* + char * ptr = (char *)tensor->data; + for (int j=0; jn_dims;j++) { + ptr += tensor->nb[j]*starts[j]; + } + for (int i=0; inb[dim]; + } + GGML_PRINT("\n"); + */ +} + int64_t ggml_nelements(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); @@ -6162,6 +6221,7 @@ struct ggml_tensor * ggml_mul_mat( const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); + GGML_PRINT("ggml_mul_mat result shape : (%lld, %lld, %lld, %lld)\n", ne[0], ne[1], ne[2], ne[3]); result->op = GGML_OP_MUL_MAT; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -8823,6 +8883,15 @@ static void ggml_compute_forward_add_f32( } } } + if ((strncmp(src0->name, "preadd", 6) == 0 + || strncmp(src0->name, "qkv_preadd", 10) == 0) + && ith == 0) { + // print name + printf("\nadd outputs for %s\n", src0->name); + ggml_print_tensor(dst); + int starts[] = {0, 3, 0}; + ggml_print_tensor_values(dst, starts, 0, 10); + } } static void ggml_compute_forward_add_f16_f32( @@ -10804,6 +10873,18 @@ static void ggml_compute_forward_norm_f32( } GGML_ASSERT(src0->nb[0] == sizeof(float)); + // If the name starts with "layer_inputs", and we are on thread 0, print the tensor + if ((strncmp(src0->name, "layer_inputs", 12) == 0 + || strncmp(src0->name, "tmpq", 4) == 0) + && params->ith == 0) { + GGML_PRINT("\nlayernorm inputs for %s\n", src0->name); + ggml_print_tensor(src0); + int starts[] = {0, 1, 0}; + ggml_print_tensor_values(src0, starts, 0, 10); + for (int i=64; i<74; ++i) { + GGML_PRINT("%f ", ggml_get_f32_1d(src0, i)); + } + } const int ith = params->ith; const int nth = params->nth; @@ -11227,8 +11308,25 @@ static void ggml_compute_forward_mul_mat( struct ggml_tensor * dst) { int64_t t0 = ggml_perf_time_us(); UNUSED(t0); + if (strncmp(src1->name, "KQ_soft_max", 11) == 0 && params->ith == 0 + && src1->ne[0] == src1->ne[1]) { + GGML_PRINT("\n KQ_softmax at mul mat time for %s\n", src1->name); + ggml_print_tensor(src1); + if (ggml_nelements(src1) >= 14) { + for (int i=0; i < src1->ne[0] * src1->ne[1]; ++i) { + if (i % src1->ne[1] == 0) { + GGML_PRINT("\n"); + } + GGML_PRINT(" %f ", ((float *)src1->data)[i]); + } + GGML_PRINT("\n"); + } else { + GGML_PRINT("Not enough elements to print\n"); + } + } GGML_TENSOR_BINARY_OP_LOCALS; + // If on thread 0, src1 starts with KQ_softmax, print const int ith = params->ith; const int nth = params->nth; @@ -12628,6 +12726,12 @@ static void ggml_compute_forward_rope_f32( if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } + if (strncmp(src0->name, "qrot", 4) == 0 && params->ith == 0) { + GGML_PRINT("\nValues at RoPE time for %s\n", src0->name); + ggml_print_tensor(src0); + int starts[] = {0, 0, 1, 0}; + ggml_print_tensor_values(src0, starts, 1, 10); + } float freq_base; float freq_scale; @@ -12756,6 +12860,13 @@ static void ggml_compute_forward_rope_f32( } } } + if (strncmp(src0->name, "qrot", 4) == 0 && params->ith == 0) { + GGML_PRINT("\n dest at RoPE time for %s\n", src0->name); + // print shape and strides + int starts[4] = {0,0,0,0}; + ggml_print_tensor(dst); + ggml_print_tensor_values(dst, starts, 0, 10); + } } static void ggml_compute_forward_rope_f16( diff --git a/llama.cpp b/llama.cpp index c354f1ef22c97..a8a724c2c2f90 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2337,25 +2337,6 @@ static void llm_load_tensors( const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; const ggml_backend backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; auto & layer = model.layers[i]; - /* - input_layernorm.bias torch.Size([4096]) - input_layernorm.weight torch.Size([4096]) - mlp.dense_4h_to_h.bias torch.Size([4096]) - mlp.dense_4h_to_h.weight torch.Size([4096, 16384]) - mlp.dense_h_to_4h.bias torch.Size([16384]) - mlp.dense_h_to_4h.weight torch.Size([16384, 4096]) - post_attention_layernorm.bias torch.Size([4096]) - post_attention_layernorm.weight torch.Size([4096]) - self_attention.dense.bias torch.Size([4096]) - self_attention.dense.weight torch.Size([4096, 4096]) - self_attention.k_layernorm.bias torch.Size([64]) - self_attention.k_layernorm.weight torch.Size([64]) - self_attention.q_layernorm.bias torch.Size([64]) - self_attention.q_layernorm.weight torch.Size([64]) - self_attention.query_key_value.bias torch.Size([12288]) - self_attention.query_key_value.weight torch.Size([12288, 4096]) - self_attention.rotary_emb.inv_freq torch.Size([16]) - */ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); @@ -3744,6 +3725,20 @@ static struct ggml_cgraph * llm_build_starcoder( return gf; } +static void log_tensor( + ggml_tensor * a +) { + LLAMA_LOG_INFO("Shape of %s is ", a->name); + for (int i = 0; i < a->n_dims; ++i) { + LLAMA_LOG_INFO("%d", a->ne[i]); + if (i < a->n_dims - 1) { + LLAMA_LOG_INFO(","); + } + LLAMA_LOG_INFO(" "); + } + LLAMA_LOG_INFO("\n"); +} + static struct ggml_cgraph * llm_build_adept( llama_context & lctx, const llama_token * tokens, @@ -3760,7 +3755,7 @@ static struct ggml_cgraph * llm_build_adept( GGML_ASSERT(!!kv_self.ctx); const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; + const int64_t n_layer = 1; const int64_t n_ctx = hparams.n_ctx; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_head = hparams.n_head; @@ -3785,18 +3780,28 @@ static struct ggml_cgraph * llm_build_adept( struct ggml_tensor * inpL; if (tokens) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(lctx.alloc, inp_tokens); if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, inp_tokens, N*ggml_element_size(inp_tokens)); + memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); } + ggml_set_name(inp_tokens, "inp_tokens"); LLAMA_LOG_INFO("Token ids:\n", __func__); for (int i = 0; i < N; ++i) { LLAMA_LOG_INFO(" %d ", tokens[i]); } - ggml_set_name(inp_tokens, "inp_tokens"); - + LLAMA_LOG_INFO("\n", __func__); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + /* + LLAMA_LOG_INFO("\ninpL:\n", __func__); + if (ggml_nelements(model.tok_embeddings) >= 5) { + for (int i=0; i < 5; ++i) { + LLAMA_LOG_INFO(" %f ", ggml_get_f32_1d(model.tok_embeddings, i)); + } + LLAMA_LOG_INFO("\n"); + } else { + LLAMA_LOG_INFO("Not enough elements to print\n", __func__); + } + */ } else { inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); ggml_allocr_alloc(lctx.alloc, inpL); @@ -3804,7 +3809,6 @@ static struct ggml_cgraph * llm_build_adept( memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); } } - // Log all of the token ids sequentially struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); ggml_allocr_alloc(lctx.alloc, KQ_scale); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -3813,63 +3817,159 @@ 
static struct ggml_cgraph * llm_build_adept( ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); //LLAMA_LOG_INFO("Entering n_layers loop\n", __func__); for (int il=0; il < n_layer; ++il) { - struct ggml_tensor * attn_norm; offload_func_t offload_func = llama_nop; + // Input is (d_model, L) // Attention + struct ggml_tensor * residual = inpL; + ggml_set_name(residual, format((char*)"layer_inputs_%d", il).c_str()); { // input norming - attn_norm = ggml_norm(ctx0, inpL, hparams.f_norm_eps); - attn_norm = ggml_add(ctx0, ggml_mul( - ctx0, attn_norm, model.layers[il].attn_norm), + cur = ggml_norm(ctx0, inpL, hparams.f_norm_eps); + cur = ggml_add(ctx0, ggml_mul( + ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); - - // QKV + bias - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm); + } + ggml_set_name(cur, "cur"); + { + // QKV + log_tensor(cur); + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + // 3 * d_model, L + // or 2 * n_head_kv + n_embd_head, L + // + bias + ggml_format_name(cur, "qkv_preadd_%d", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - const size_t wsize = ggml_type_size(cur->type); // Apply Q, K layernorm + // Where is the Q/K/V? it's in order. Hopefully... + // So q has offset 0. + // And split into heads + // -> (d_h, n_head, L) + const size_t wsize = ggml_type_size(cur->type); + GGML_ASSERT(n_head_kv == n_head); + LLAMA_LOG_INFO("N: %d\n", N); + ggml_set_name(cur, format("qkv_%d", il).c_str()); + log_tensor(cur); + + // cur is (3 * d_head * n_head, N) + struct ggml_tensor * tmpqkv = ggml_view_4d( + ctx0, cur, n_embd_head, 3, n_head, N, + /* nb1 = */ wsize * n_embd_head, + /* nb2 = */ wsize * n_embd_head * 3, + /* nb3 = */ wsize * n_embd_head * 3 * n_head, + /* offset = */ 0 + ); + // get it to (d_h, n_head, L, 3) + struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); + ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il); + log_tensor(tmpqkv_perm); struct ggml_tensor * tmpq = ggml_cont( - ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head, N, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0 + ctx0, + ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, N, + /* nb1 = */ sizeof(float) * n_embd_head, + /* nb2 = */ sizeof(float) * n_embd_head * n_head, + /* offset = */ 0 ) ); struct ggml_tensor * tmpk = ggml_cont( - ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head, N, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head + ctx0, + ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, N, + /* nb1 = */ sizeof(float) * n_embd_head, + /* nb2 = */ sizeof(float) * n_embd_head * n_head, + /* offset = */ sizeof(float) * n_embd_head * n_head * N ) ); - tmpk = ggml_norm(ctx0, tmpk, hparams.f_norm_eps); - tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); - tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); - + struct ggml_tensor * tmpv = ggml_cont( + ctx0, + ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, N, + /* nb1 = */ sizeof(float) * n_embd_head, + /* nb2 = */ sizeof(float) * n_embd_head * n_head, + /* offset = */ sizeof(float) * n_embd_head * n_head * N * 2 + ) + ); + ggml_set_name(tmpq, format("tmpq_%d", il).c_str()); tmpq = ggml_norm(ctx0, tmpq, hparams.f_norm_eps); tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); + ggml_set_name(tmpq, format("preadd_%d", il).c_str()); tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); - - struct ggml_tensor * Qcur = 
ggml_rope_custom_inplace( - ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale - ); - struct ggml_tensor * Kcur = ggml_rope_custom_inplace( - ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale - ); - - struct ggml_tensor * tmpv = ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, N, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv)); + tmpk = ggml_norm(ctx0, tmpk, hparams.f_norm_eps); + tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); + tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); + ggml_set_name(tmpq, format("tmpq_%d", il).c_str()); + ggml_set_name(tmpk, format("tmpk_%d", il).c_str()); + log_tensor(tmpq); + log_tensor(tmpk); + + + const size_t n_rot = n_embd_head / 2; + struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d( + ctx0, tmpq, n_rot, n_head, N, + /* nb1 = */ wsize * n_embd_head, + /* nb2 = */ wsize * n_embd_head * n_head, + /* offset = */ 0 + )); + struct ggml_tensor * qpass = ggml_cont(ctx0, ggml_permute(ctx0, ggml_view_3d( + ctx0, tmpq, n_rot, n_head, N, + /* nb1 = */ wsize * n_rot, + /* nb2 = */ wsize * n_rot * n_head, + /* offset = */ (wsize * n_embd_head * n_head) / 2 + ), 2, 1, 0, 3)); + ggml_set_name(qrot, format("qrot_%d", il).c_str()); + ggml_set_name(qpass, format("qpass_%d", il).c_str()); + log_tensor(qrot); + log_tensor(qpass); + + struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d( + ctx0, tmpk, n_rot, n_head, N, + /* nb1 = */ wsize * n_rot, + /* nb2 = */ wsize * n_rot * n_head, + /* offset = */ 0 + )); + struct ggml_tensor * kpass = ggml_cont(ctx0, + ggml_permute(ctx0, + ggml_view_3d( + ctx0, tmpk, n_rot, n_head, N, + /* nb1 = */ wsize * n_rot, + /* nb2 = */ wsize * n_rot * n_head, + /* offset = */ (wsize * n_embd_head * n_head) / 2 + ), 2, 1, 0, 3)); + ggml_set_name(krot, format("krot_%d", il).c_str()); + ggml_set_name(kpass, format("kpass_%d", il).c_str()); + log_tensor(krot); + log_tensor(kpass); + + struct ggml_tensor * qrotated = ggml_cont(ctx0, ggml_permute(ctx0, + ggml_rope_custom_inplace( + ctx0, qrot, n_past, n_rot, 0, 0, freq_base, freq_scale + ), + 2, 1, 0, 3 + )); + struct ggml_tensor * krotated = ggml_cont(ctx0, ggml_permute(ctx0, + ggml_rope_custom_inplace( + ctx0, krot, n_past, n_rot, 0, 0, freq_base, freq_scale + ), + 2, 1, 0, 3 + )); + ggml_set_name(qrotated, format("qrotated_%d", il).c_str()); + ggml_set_name(krotated, format("krotated_%d", il).c_str()); + log_tensor(qrotated); + log_tensor(krotated); + struct ggml_tensor * Qcur = ggml_cont(ctx0, + ggml_permute(ctx0, + ggml_concat(ctx0, qrotated, qpass), + 2, 1, 0, 3)); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_permute(ctx0, ggml_concat(ctx0, krotated, kpass), 2, 1, 0, 3)); + ggml_set_name(Qcur, format("Qcur_%d", il).c_str()); + ggml_set_name(Kcur, format("Kcur_%d", il).c_str()); + log_tensor(Qcur); + log_tensor(Kcur); { - // Set kv cache elements? struct ggml_tensor * Vcur = ggml_transpose( ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N) ); @@ -3886,11 +3986,11 @@ static struct ggml_cgraph * llm_build_adept( ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } - //LLAMA_LOG_INFO("3889\n", __func__); struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); ggml_set_name(Q, "Q"); + log_tensor(Q); - // index into kv cache? + // view kv cache? 
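Stripped of the ggml view and permute mechanics, the qrot/qpass and krot/kpass plumbing above expresses a partial rotary embedding: only the first n_rot = 32 dims of each 64-wide head are rotated, the remaining 32 pass through unchanged, and the two halves are concatenated back before the result goes into the KV cache. A minimal NumPy sketch of that idea, assuming the NeoX-style pairing selected by mode 2 of ggml_rope_custom_inplace and an illustrative base frequency (the real value comes from rotary_emb_base in config.json):

```python
import numpy as np

def partial_rope(x, pos, n_rot, base=10000.0):
    # x: (n_head, head_dim); rotate the first n_rot dims, pass the rest through.
    rot, keep = x[:, :n_rot], x[:, n_rot:]
    half = n_rot // 2
    inv_freq = base ** (-np.arange(0, half) * 2.0 / n_rot)   # (half,)
    theta = pos * inv_freq
    cos, sin = np.cos(theta), np.sin(theta)
    x1, x2 = rot[:, :half], rot[:, half:]                    # NeoX-style split
    rotated = np.concatenate([x1 * cos - x2 * sin,
                              x1 * sin + x2 * cos], axis=-1)
    return np.concatenate([rotated, keep], axis=-1)

q = np.random.randn(64, 64).astype(np.float32)   # one token: (n_head, head_dim)
print(partial_rope(q, pos=3, n_rot=32).shape)    # (64, 64)
```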
struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, n_embd_head, n_past + N, n_head_kv, @@ -3907,12 +4007,11 @@ static struct ggml_cgraph * llm_build_adept( ggml_set_name(KQ_scaled, "KQ_scaled"); struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - ggml_set_name(KQ_masked, "KQ_soft_max"); + ggml_set_name(KQ_masked, "KQ_mask"); struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + ggml_set_name(KQ_soft_max, format("KQ_soft_max_%d", il).c_str()); - //LLAMA_LOG_INFO("3915\n", __func__); struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, n_past + N, n_embd_head, n_head_kv, @@ -3932,15 +4031,19 @@ static struct ggml_cgraph * llm_build_adept( cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); ggml_set_name(cur, "result_wo"); - //LLAMA_LOG_INFO("EoWo\n", __func__); } - struct ggml_tensor * attn_out = cur; + cur = ggml_add(ctx0, residual, cur); + residual = cur; + ggml_set_name(residual, "residual"); { - struct ggml_tensor * inpFF = attn_norm; + struct ggml_tensor * inpFF = cur; // Norm { cur = ggml_norm(ctx0, inpFF, hparams.f_norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, model.layers[il].ffn_norm), + model.layers[il].ffn_norm_b + ); } cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); // Squared ReLU @@ -3948,31 +4051,22 @@ static struct ggml_cgraph * llm_build_adept( cur = ggml_mul(ctx0, cur, cur); cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); } - cur = ggml_add(ctx0, cur, attn_out); + cur = ggml_add(ctx0, cur, residual); ggml_set_name(cur, "inpFF_+_attn_out"); inpL = cur; - //LLAMA_LOG_INFO("EoL\n", __func__); } - //LLAMA_LOG_INFO("Exited from n_layers loop\n", __func__); cur = inpL; { - //LLAMA_LOG_INFO("norm\n", __func__); cur = ggml_norm(ctx0, cur, hparams.f_norm_eps); - //LLAMA_LOG_INFO("ggml_norm\n", __func__); cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b); - //LLAMA_LOG_INFO("result_norm\n", __func__); ggml_set_name(cur, "result_norm"); } - //LLAMA_LOG_INFO("matmul\n", __func__); cur = ggml_mul_mat(ctx0, model.output, cur); ggml_set_name(cur, "result_output"); - //LLAMA_LOG_INFO("bf expand\n", __func__); ggml_build_forward_expand(gf, cur); - //LLAMA_LOG_INFO("Freeing ctx0\n", __func__); ggml_free(ctx0); - //LLAMA_LOG_INFO("Exiting fun\n", __func__); return gf; } From c9e1446f525088261e6cf649c3a9d47ebeef4a7f Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Tue, 26 Sep 2023 00:07:19 -0700 Subject: [PATCH 03/22] correct tensors thru RoPE --- ggml.c | 6 +++--- llama.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index 2f02865fc92bb..3cf682ab97bb3 100644 --- a/ggml.c +++ b/ggml.c @@ -12729,8 +12729,8 @@ static void ggml_compute_forward_rope_f32( if (strncmp(src0->name, "qrot", 4) == 0 && params->ith == 0) { GGML_PRINT("\nValues at RoPE time for %s\n", src0->name); ggml_print_tensor(src0); - int starts[] = {0, 0, 1, 0}; - ggml_print_tensor_values(src0, starts, 1, 10); + int starts[] = {0, 1, 0, 0}; + ggml_print_tensor_values(src0, starts, 0, 10); } float freq_base; @@ -12863,7 +12863,7 @@ static void ggml_compute_forward_rope_f32( if (strncmp(src0->name, "qrot", 4) == 0 && params->ith == 0) { GGML_PRINT("\n dest at RoPE time for %s\n", src0->name); // print shape and strides - int starts[4] = {0,0,0,0}; + int 
starts[4] = {0,0,1,0}; ggml_print_tensor(dst); ggml_print_tensor_values(dst, starts, 0, 10); } diff --git a/llama.cpp b/llama.cpp index a8a724c2c2f90..31f92cad2263b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3945,13 +3945,13 @@ static struct ggml_cgraph * llm_build_adept( struct ggml_tensor * qrotated = ggml_cont(ctx0, ggml_permute(ctx0, ggml_rope_custom_inplace( - ctx0, qrot, n_past, n_rot, 0, 0, freq_base, freq_scale + ctx0, qrot, n_past, n_rot, 2, 0, freq_base, freq_scale ), 2, 1, 0, 3 )); struct ggml_tensor * krotated = ggml_cont(ctx0, ggml_permute(ctx0, ggml_rope_custom_inplace( - ctx0, krot, n_past, n_rot, 0, 0, freq_base, freq_scale + ctx0, krot, n_past, n_rot, 2, 0, freq_base, freq_scale ), 2, 1, 0, 3 )); From d1b40efcfa2725e48db9182a85a1eca21f33d638 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Tue, 26 Sep 2023 11:36:36 -0700 Subject: [PATCH 04/22] Correct outputs through masked & softmax'd KQ --- ggml.c | 29 +++++++++---------- llama.cpp | 84 +++++++++++++++++++++++++++---------------------------- 2 files changed, 54 insertions(+), 59 deletions(-) diff --git a/ggml.c b/ggml.c index 3cf682ab97bb3..8eaad0c162429 100644 --- a/ggml.c +++ b/ggml.c @@ -11308,21 +11308,18 @@ static void ggml_compute_forward_mul_mat( struct ggml_tensor * dst) { int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - if (strncmp(src1->name, "KQ_soft_max", 11) == 0 && params->ith == 0 - && src1->ne[0] == src1->ne[1]) { - GGML_PRINT("\n KQ_softmax at mul mat time for %s\n", src1->name); + if ( + strncmp(src1->name, "printme", 7) == 0 + && params->ith == 0) { + GGML_PRINT("\nInputs to matmul: %s\n", src1->name); ggml_print_tensor(src1); - if (ggml_nelements(src1) >= 14) { - for (int i=0; i < src1->ne[0] * src1->ne[1]; ++i) { - if (i % src1->ne[1] == 0) { - GGML_PRINT("\n"); - } - GGML_PRINT(" %f ", ((float *)src1->data)[i]); + for (int i=0; i < src1->ne[0] * src1->ne[1]; ++i) { + if (i % src1->ne[0] == 0) { + GGML_PRINT("\n"); } - GGML_PRINT("\n"); - } else { - GGML_PRINT("Not enough elements to print\n"); + GGML_PRINT(" %f ", ((float *)src1->data)[i + (src1->ne[0] * src1->ne[1])]); } + GGML_PRINT("\n"); } GGML_TENSOR_BINARY_OP_LOCALS; @@ -12726,10 +12723,10 @@ static void ggml_compute_forward_rope_f32( if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - if (strncmp(src0->name, "qrot", 4) == 0 && params->ith == 0) { - GGML_PRINT("\nValues at RoPE time for %s\n", src0->name); + if (strncmp(src0->name, "krot", 4) == 0 && params->ith == 0) { + GGML_PRINT("\ninputs of RoPE for %s\n", src0->name); ggml_print_tensor(src0); - int starts[] = {0, 1, 0, 0}; + int starts[] = {0, 0, 1, 0}; ggml_print_tensor_values(src0, starts, 0, 10); } @@ -12860,7 +12857,7 @@ static void ggml_compute_forward_rope_f32( } } } - if (strncmp(src0->name, "qrot", 4) == 0 && params->ith == 0) { + if (strncmp(src0->name, "krot", 4) == 0 && params->ith == 0) { GGML_PRINT("\n dest at RoPE time for %s\n", src0->name); // print shape and strides int starts[4] = {0,0,1,0}; diff --git a/llama.cpp b/llama.cpp index 31f92cad2263b..7a00fe0395dc0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3791,17 +3791,6 @@ static struct ggml_cgraph * llm_build_adept( } LLAMA_LOG_INFO("\n", __func__); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - /* - LLAMA_LOG_INFO("\ninpL:\n", __func__); - if (ggml_nelements(model.tok_embeddings) >= 5) { - for (int i=0; i < 5; ++i) { - LLAMA_LOG_INFO(" %f ", ggml_get_f32_1d(model.tok_embeddings, i)); - } - LLAMA_LOG_INFO("\n"); - } else { - LLAMA_LOG_INFO("Not enough elements to 
print\n", __func__); - } - */ } else { inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); ggml_allocr_alloc(lctx.alloc, inpL); @@ -3812,7 +3801,7 @@ static struct ggml_cgraph * llm_build_adept( struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); ggml_allocr_alloc(lctx.alloc, KQ_scale); if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); } ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); //LLAMA_LOG_INFO("Entering n_layers loop\n", __func__); @@ -3891,18 +3880,19 @@ static struct ggml_cgraph * llm_build_adept( /* offset = */ sizeof(float) * n_embd_head * n_head * N * 2 ) ); + // Q / K layernorm ggml_set_name(tmpq, format("tmpq_%d", il).c_str()); tmpq = ggml_norm(ctx0, tmpq, hparams.f_norm_eps); tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); - ggml_set_name(tmpq, format("preadd_%d", il).c_str()); tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); + ggml_set_name(tmpq, format("tmpq_%d", il).c_str()); + log_tensor(tmpq); tmpk = ggml_norm(ctx0, tmpk, hparams.f_norm_eps); tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); + ggml_set_name(tmpk, format("preadd_%d", il).c_str()); tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); - ggml_set_name(tmpq, format("tmpq_%d", il).c_str()); ggml_set_name(tmpk, format("tmpk_%d", il).c_str()); - log_tensor(tmpq); log_tensor(tmpk); @@ -3913,12 +3903,13 @@ static struct ggml_cgraph * llm_build_adept( /* nb2 = */ wsize * n_embd_head * n_head, /* offset = */ 0 )); - struct ggml_tensor * qpass = ggml_cont(ctx0, ggml_permute(ctx0, ggml_view_3d( + // get the second half of tmpq, e.g tmpq[n_rot:, :, :] + struct ggml_tensor * qpass = ggml_cont(ctx0, ggml_view_3d( ctx0, tmpq, n_rot, n_head, N, - /* nb1 = */ wsize * n_rot, - /* nb2 = */ wsize * n_rot * n_head, - /* offset = */ (wsize * n_embd_head * n_head) / 2 - ), 2, 1, 0, 3)); + /* nb1 = */ wsize * n_embd_head, + /* nb2 = */ wsize * n_embd_head * n_head, + /* offset = */ wsize * n_rot + )); ggml_set_name(qrot, format("qrot_%d", il).c_str()); ggml_set_name(qpass, format("qpass_%d", il).c_str()); log_tensor(qrot); @@ -3926,18 +3917,16 @@ static struct ggml_cgraph * llm_build_adept( struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d( ctx0, tmpk, n_rot, n_head, N, - /* nb1 = */ wsize * n_rot, - /* nb2 = */ wsize * n_rot * n_head, + /* nb1 = */ wsize * n_embd_head, + /* nb2 = */ wsize * n_embd_head * n_head, /* offset = */ 0 )); - struct ggml_tensor * kpass = ggml_cont(ctx0, - ggml_permute(ctx0, - ggml_view_3d( + struct ggml_tensor * kpass = ggml_cont(ctx0, ggml_view_3d( ctx0, tmpk, n_rot, n_head, N, - /* nb1 = */ wsize * n_rot, - /* nb2 = */ wsize * n_rot * n_head, - /* offset = */ (wsize * n_embd_head * n_head) / 2 - ), 2, 1, 0, 3)); + /* nb1 = */ wsize * n_embd_head, + /* nb2 = */ wsize * n_embd_head * n_head, + /* offset = */ wsize * n_rot + )); ggml_set_name(krot, format("krot_%d", il).c_str()); ggml_set_name(kpass, format("kpass_%d", il).c_str()); log_tensor(krot); @@ -3949,68 +3938,77 @@ static struct ggml_cgraph * llm_build_adept( ), 2, 1, 0, 3 )); + ggml_set_name(qrotated, format("qrotated_%d", il).c_str()); + log_tensor(qrotated); + qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3)); struct ggml_tensor * krotated = ggml_cont(ctx0, ggml_permute(ctx0, ggml_rope_custom_inplace( ctx0, krot, n_past, n_rot, 2, 0, freq_base, freq_scale ), 2, 1, 0, 3 )); - ggml_set_name(qrotated, format("qrotated_%d", il).c_str()); 
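The Q/K layernorm applied a bit further up is also worth spelling out: unlike the usual pre-attention layernorm over the full hidden size, q_layernorm and k_layernorm normalize each 64-wide head on its own, which is why their weight and bias tensors are only {64}. A minimal NumPy sketch, with an illustrative epsilon (the graph takes it from hparams.f_norm_eps, i.e. layernorm_epsilon in config.json):

```python
import numpy as np

def per_head_layernorm(x, weight, bias, eps=1e-5):
    # x: (n_head, head_dim); weight, bias: (head_dim,)
    mu  = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps) * weight + bias

q = np.random.randn(64, 64).astype(np.float32)        # one token, 64 heads of width 64
w = np.ones(64, np.float32); b = np.zeros(64, np.float32)
print(per_head_layernorm(q, w, b).shape)               # (64, 64)
```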
ggml_set_name(krotated, format("krotated_%d", il).c_str()); - log_tensor(qrotated); log_tensor(krotated); + kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3)); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_permute(ctx0, ggml_concat(ctx0, qrotated, qpass), 2, 1, 0, 3)); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_permute(ctx0, ggml_concat(ctx0, krotated, kpass), 2, 1, 0, 3)); + struct ggml_tensor * Kcur = ggml_cont(ctx0, + ggml_permute(ctx0, ggml_concat(ctx0, krotated, kpass), + 2, 1, 0, 3) + ); ggml_set_name(Qcur, format("Qcur_%d", il).c_str()); ggml_set_name(Kcur, format("Kcur_%d", il).c_str()); log_tensor(Qcur); log_tensor(Kcur); - + log_tensor(kv_self.k); { + // View v as (N, n_embd) struct ggml_tensor * Vcur = ggml_transpose( - ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N) + ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd, N) ); ggml_set_name(Vcur, "Vcur"); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, - (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past) + + // Select k from kv cache as 1d view (N * n_embd) + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, + (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past) ); ggml_set_name(k, "k"); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3)); ggml_set_name(Q, "Q"); log_tensor(Q); - // view kv cache? 
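The cache writes just above follow the usual llama.cpp layout: K is stored per layer as n_ctx rows of n_embd values, so new keys land at element offset (il*n_ctx + n_past)*n_embd, while V is stored transposed so that each of the n_embd rows is a ring of n_ctx positions and one new column is written per token. A small NumPy sketch of that indexing, with made-up sizes purely for illustration:

```python
import numpy as np

n_layer, n_ctx, n_embd, N, il, n_past = 2, 16, 8, 3, 1, 5

k_cache = np.zeros((n_layer * n_ctx * n_embd,), np.float32)   # flat, like kv_self.k
v_cache = np.zeros((n_layer, n_embd, n_ctx), np.float32)      # transposed V per layer

Kcur = np.random.randn(N, n_embd).astype(np.float32)          # new keys, one row per token
Vcur = np.random.randn(N, n_embd).astype(np.float32)

# K: contiguous 1-D slice of length N*n_embd at (il*n_ctx + n_past)*n_embd
off = (il * n_ctx + n_past) * n_embd
k_cache[off : off + N * n_embd] = Kcur.reshape(-1)

# V: stored transposed, so columns n_past..n_past+N of layer il receive Vcur^T
v_cache[il, :, n_past : n_past + N] = Vcur.T
print(k_cache[off : off + 4], v_cache[il, 0, n_past])
```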
struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, + ggml_cont(ctx0, ggml_view_3d(ctx0, kv_self.k, n_embd_head, n_past + N, n_head_kv, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il)); ggml_set_name(K, "K"); + log_tensor(K); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); ggml_set_name(KQ, "KQ"); - struct ggml_tensor * KQ_scaled = ggml_scale_inplace (ctx0, KQ, KQ_scale); + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); ggml_set_name(KQ_scaled, "KQ_scaled"); struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); ggml_set_name(KQ_masked, "KQ_mask"); struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, format("KQ_soft_max_%d", il).c_str()); + ggml_set_name(KQ_soft_max, format("printme_KQ_soft_max_%d", il).c_str()); struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, From db2181a47bafeb2b76dfaa75bc450e252c42e1ac Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Tue, 26 Sep 2023 13:10:04 -0700 Subject: [PATCH 05/22] fp32 works --- ggml.c | 24 ++++++++------- llama.cpp | 88 +++++++++++++++++++++++++++++++------------------------ 2 files changed, 62 insertions(+), 50 deletions(-) diff --git a/ggml.c b/ggml.c index 8eaad0c162429..a1afd037f3711 100644 --- a/ggml.c +++ b/ggml.c @@ -6221,7 +6221,7 @@ struct ggml_tensor * ggml_mul_mat( const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); - GGML_PRINT("ggml_mul_mat result shape : (%lld, %lld, %lld, %lld)\n", ne[0], ne[1], ne[2], ne[3]); + //GGML_PRINT("ggml_mul_mat result shape : (%lld, %lld, %lld, %lld)\n", ne[0], ne[1], ne[2], ne[3]); result->op = GGML_OP_MUL_MAT; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -8883,13 +8883,14 @@ static void ggml_compute_forward_add_f32( } } } - if ((strncmp(src0->name, "preadd", 6) == 0 - || strncmp(src0->name, "qkv_preadd", 10) == 0) - && ith == 0) { - // print name - printf("\nadd outputs for %s\n", src0->name); + if ( + strncmp(src0->name, "printme", 7) == 0 + && params->ith == 0) { + GGML_PRINT("\noutputs of add: %s + %s\n", src0->name, src1->name); + ggml_print_tensor(src0); + ggml_print_tensor(src1); ggml_print_tensor(dst); - int starts[] = {0, 3, 0}; + int starts[] = {0, 1, 0}; ggml_print_tensor_values(dst, starts, 0, 10); } } @@ -10874,8 +10875,7 @@ static void ggml_compute_forward_norm_f32( GGML_ASSERT(src0->nb[0] == sizeof(float)); // If the name starts with "layer_inputs", and we are on thread 0, print the tensor - if ((strncmp(src0->name, "layer_inputs", 12) == 0 - || strncmp(src0->name, "tmpq", 4) == 0) + if (strncmp(src0->name, "printme", 7) == 0 && params->ith == 0) { GGML_PRINT("\nlayernorm inputs for %s\n", src0->name); ggml_print_tensor(src0); @@ -11313,6 +11313,7 @@ static void ggml_compute_forward_mul_mat( && params->ith == 0) { GGML_PRINT("\nInputs to matmul: %s\n", src1->name); ggml_print_tensor(src1); + /* for (int i=0; i < src1->ne[0] * src1->ne[1]; ++i) { if (i % src1->ne[0] == 0) { GGML_PRINT("\n"); @@ -11320,6 +11321,7 @@ static void ggml_compute_forward_mul_mat( GGML_PRINT(" %f ", ((float *)src1->data)[i + (src1->ne[0] * src1->ne[1])]); } GGML_PRINT("\n"); + */ } GGML_TENSOR_BINARY_OP_LOCALS; @@ -12723,7 +12725,7 @@ static void ggml_compute_forward_rope_f32( if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - if (strncmp(src0->name, "krot", 4) == 0 && params->ith == 0) { + if (strncmp(src0->name, "printme", 7) == 0 && params->ith == 0) { GGML_PRINT("\ninputs of RoPE for %s\n", src0->name); ggml_print_tensor(src0); int starts[] = {0, 0, 1, 0}; @@ -12857,7 +12859,7 @@ static void ggml_compute_forward_rope_f32( } } } - if (strncmp(src0->name, "krot", 4) == 0 && params->ith == 0) { + if (strncmp(src0->name, "printme", 7) == 0 && params->ith == 0) { GGML_PRINT("\n dest at RoPE time for %s\n", src0->name); // print shape and strides int starts[4] = {0,0,1,0}; diff --git a/llama.cpp b/llama.cpp index 7a00fe0395dc0..66cef8b59cf2a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3755,7 +3755,7 @@ static struct ggml_cgraph * llm_build_adept( GGML_ASSERT(!!kv_self.ctx); const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = 1; + const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = hparams.n_ctx; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_head = hparams.n_head; @@ -3785,11 +3785,12 @@ static struct ggml_cgraph * llm_build_adept( memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); } ggml_set_name(inp_tokens, "inp_tokens"); - LLAMA_LOG_INFO("Token ids:\n", __func__); + /*LLAMA_LOG_INFO("Token ids:\n", __func__); for (int i = 0; i < N; ++i) { LLAMA_LOG_INFO(" %d ", tokens[i]); } LLAMA_LOG_INFO("\n", __func__); + */ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); @@ -3809,7 +3810,7 @@ static struct ggml_cgraph * llm_build_adept( offload_func_t offload_func = llama_nop; // Input is (d_model, L) // Attention - struct ggml_tensor * residual = inpL; + struct ggml_tensor * residual = ggml_dup(ctx0, inpL); ggml_set_name(residual, format((char*)"layer_inputs_%d", il).c_str()); { // input norming @@ -3821,7 +3822,7 @@ static struct ggml_cgraph * 
llm_build_adept( ggml_set_name(cur, "cur"); { // QKV - log_tensor(cur); + //log_tensor(cur); cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); // 3 * d_model, L // or 2 * n_head_kv + n_embd_head, L @@ -3837,9 +3838,9 @@ static struct ggml_cgraph * llm_build_adept( // -> (d_h, n_head, L) const size_t wsize = ggml_type_size(cur->type); GGML_ASSERT(n_head_kv == n_head); - LLAMA_LOG_INFO("N: %d\n", N); + //LLAMA_LOG_INFO("N: %d\n", N); ggml_set_name(cur, format("qkv_%d", il).c_str()); - log_tensor(cur); + //log_tensor(cur); // cur is (3 * d_head * n_head, N) struct ggml_tensor * tmpqkv = ggml_view_4d( @@ -3852,7 +3853,7 @@ static struct ggml_cgraph * llm_build_adept( // get it to (d_h, n_head, L, 3) struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il); - log_tensor(tmpqkv_perm); + //log_tensor(tmpqkv_perm); struct ggml_tensor * tmpq = ggml_cont( ctx0, ggml_view_3d( @@ -3886,14 +3887,14 @@ static struct ggml_cgraph * llm_build_adept( tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); ggml_set_name(tmpq, format("tmpq_%d", il).c_str()); - log_tensor(tmpq); + //log_tensor(tmpq); tmpk = ggml_norm(ctx0, tmpk, hparams.f_norm_eps); tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); ggml_set_name(tmpk, format("preadd_%d", il).c_str()); tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); ggml_set_name(tmpk, format("tmpk_%d", il).c_str()); - log_tensor(tmpk); + //log_tensor(tmpk); const size_t n_rot = n_embd_head / 2; @@ -3912,8 +3913,8 @@ static struct ggml_cgraph * llm_build_adept( )); ggml_set_name(qrot, format("qrot_%d", il).c_str()); ggml_set_name(qpass, format("qpass_%d", il).c_str()); - log_tensor(qrot); - log_tensor(qpass); + //log_tensor(qrot); + //log_tensor(qpass); struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d( ctx0, tmpk, n_rot, n_head, N, @@ -3929,8 +3930,8 @@ static struct ggml_cgraph * llm_build_adept( )); ggml_set_name(krot, format("krot_%d", il).c_str()); ggml_set_name(kpass, format("kpass_%d", il).c_str()); - log_tensor(krot); - log_tensor(kpass); + //log_tensor(krot); + //log_tensor(kpass); struct ggml_tensor * qrotated = ggml_cont(ctx0, ggml_permute(ctx0, ggml_rope_custom_inplace( @@ -3939,7 +3940,7 @@ static struct ggml_cgraph * llm_build_adept( 2, 1, 0, 3 )); ggml_set_name(qrotated, format("qrotated_%d", il).c_str()); - log_tensor(qrotated); + //log_tensor(qrotated); qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3)); struct ggml_tensor * krotated = ggml_cont(ctx0, ggml_permute(ctx0, ggml_rope_custom_inplace( @@ -3948,7 +3949,7 @@ static struct ggml_cgraph * llm_build_adept( 2, 1, 0, 3 )); ggml_set_name(krotated, format("krotated_%d", il).c_str()); - log_tensor(krotated); + //log_tensor(krotated); kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3)); struct ggml_tensor * Qcur = ggml_cont(ctx0, @@ -3961,9 +3962,9 @@ static struct ggml_cgraph * llm_build_adept( ); ggml_set_name(Qcur, format("Qcur_%d", il).c_str()); ggml_set_name(Kcur, format("Kcur_%d", il).c_str()); - log_tensor(Qcur); - log_tensor(Kcur); - log_tensor(kv_self.k); + //log_tensor(Qcur); + //////log_tensor(Kcur); + //log_tensor(kv_self.k); { // View v as (N, n_embd) struct ggml_tensor * Vcur = ggml_transpose( @@ -3986,7 +3987,7 @@ static struct ggml_cgraph * llm_build_adept( } struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3)); ggml_set_name(Q, "Q"); - log_tensor(Q); + 
//log_tensor(Q); struct ggml_tensor * K = ggml_cont(ctx0, ggml_view_3d(ctx0, kv_self.k, @@ -3996,7 +3997,7 @@ static struct ggml_cgraph * llm_build_adept( ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il)); ggml_set_name(K, "K"); - log_tensor(K); + //log_tensor(K); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); ggml_set_name(KQ, "KQ"); @@ -4008,7 +4009,7 @@ static struct ggml_cgraph * llm_build_adept( ggml_set_name(KQ_masked, "KQ_mask"); struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, format("printme_KQ_soft_max_%d", il).c_str()); + ggml_set_name(KQ_soft_max, format("KQ_soft_max_%d", il).c_str()); struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, @@ -4028,28 +4029,37 @@ static struct ggml_cgraph * llm_build_adept( ggml_set_name(cur, "KQV_merged_contiguous"); cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + cur = ggml_add(ctx0, cur, model.layers[il].bo); ggml_set_name(cur, "result_wo"); + //log_tensor(cur); } cur = ggml_add(ctx0, residual, cur); - residual = cur; - ggml_set_name(residual, "residual"); - { - struct ggml_tensor * inpFF = cur; - // Norm - { - cur = ggml_norm(ctx0, inpFF, hparams.f_norm_eps); - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.layers[il].ffn_norm), - model.layers[il].ffn_norm_b - ); - } - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); - // Squared ReLU - cur = ggml_relu(ctx0, cur); - cur = ggml_mul(ctx0, cur, cur); - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); + struct ggml_tensor * residual2 = ggml_dup(ctx0, cur); + ggml_set_name(residual2, "residual2"); + // Norm + { + cur = ggml_norm(ctx0, cur, hparams.f_norm_eps); + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, model.layers[il].ffn_norm), + model.layers[il].ffn_norm_b + ); } - cur = ggml_add(ctx0, cur, residual); + // FFN + cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); + ggml_format_name(cur, "pre_act_%d", il); + cur = ggml_add(ctx0, cur, model.layers[il].b3); + // //log_tensor(cur); + // Correct through here. + // Squared ReLU + cur = ggml_relu(ctx0, cur); + cur = ggml_sqr(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); + ggml_format_name(cur, "post_ffn_down_%d", il); + struct ggml_tensor * ffn_out = ggml_add(ctx0, + cur, + model.layers[il].b2); + ggml_format_name(ffn_out, "pre_residual2_%d", il); + cur = ggml_add(ctx0, ffn_out, residual2); ggml_set_name(cur, "inpFF_+_attn_out"); inpL = cur; } From 3f3179996d8b882e8ac5bf8a18fb2e3fc3d6d9ed Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Thu, 28 Sep 2023 10:47:44 -0700 Subject: [PATCH 06/22] Rename adept->persimmon --- ...gguf.py => convert-persimmon-st-to-gguf.py | 36 +---- ggml.c | 53 +++---- gguf-py/gguf/gguf.py | 8 +- llama.cpp | 139 ++++++------------ 4 files changed, 76 insertions(+), 160 deletions(-) rename convert-adept-st-to-gguf.py => convert-persimmon-st-to-gguf.py (78%) diff --git a/convert-adept-st-to-gguf.py b/convert-persimmon-st-to-gguf.py similarity index 78% rename from convert-adept-st-to-gguf.py rename to convert-persimmon-st-to-gguf.py index 1a6eda8a19f6d..ee0d2b1d85096 100644 --- a/convert-adept-st-to-gguf.py +++ b/convert-persimmon-st-to-gguf.py @@ -19,7 +19,7 @@ def file_is_safetensors(path: Path) -> bool: return False return struct.unpack(' None: - parser = argparse.ArgumentParser(description="Convert an Adept model (e.g. Persimmon 8b) to a GGML compatible file") + parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. 
Persimmon 8b chat) to a GGML compatible file") parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") - parser.add_argument("--outtype", choices=["f32"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") + parser.add_argument("--outtype", choices=["f32"], help="currently only support fp32") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.safetensors)") parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm") args = parser.parse_args(args_in) assert file_is_safetensors(args.model), 'Error: model file is not a SafeTensors file' - model = lazy_load_safetensors_file(open(args.model, 'rb'), args.model) dir_model = args.model.parent with open(dir_model / 'config.json', 'r') as f: hparams = json.load(f) pprint(hparams) - arch = gguf.MODEL_ARCH.ADEPT + arch = gguf.MODEL_ARCH.PERSIMMON gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch]) block_count = hparams['num_layers'] @@ -90,7 +89,7 @@ def main(args_in: list[str] | None = None) -> None: gguf_writer.add_rope_freq_base(hparams['rotary_emb_base']) gguf_writer.add_layer_norm_eps(hparams['layernorm_epsilon']) if True: - tokens, scores, toktypes = handle_tokenizer(dir_model) + tokens, scores, toktypes = get_tokenizer_info(dir_model) gguf_writer.add_tokenizer_model('llama') gguf_writer.add_token_list(tokens) gguf_writer.add_token_scores(scores) @@ -103,32 +102,13 @@ def main(args_in: list[str] | None = None) -> None: with safe_open(args.model, framework="pt") as f: for k in f.keys(): tensors[k] = f.get_tensor(k) - print(len(tensors.keys())) for name in tensors.keys(): data = tensors[name] - print(name) - - # we don't need these if name.endswith(".self_attention.rotary_emb.inv_freq"): continue old_dtype = data.dtype - """ - if 'layernorm.weight' in name or 'word_embeddings.weight' in name: - data = data.to(torch.float32) - else: - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - """ - data = data.to(torch.float32) - # check for nans - if torch.isnan(data).any(): - print("WARNING: tensor '" + name + "' contains NaNs") - sys.exit() - if torch.isinf(data).any(): - print("WARNING: tensor '" + name + "' contains infinities") - sys.exit() - - data = data.squeeze().numpy() + # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?) 
+ data = data.to(torch.float32).squeeze().numpy() new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) if new_name is None: print("Can not map tensor '" + name + "'") diff --git a/ggml.c b/ggml.c index a1afd037f3711..7e4099dcdf909 100644 --- a/ggml.c +++ b/ggml.c @@ -4304,49 +4304,34 @@ static void ggml_print_tensor(const struct ggml_tensor * tensor) { static void ggml_print_tensor_values(const struct ggml_tensor * tensor, int starts[], int dim, int nelts) { GGML_ASSERT(tensor->type == GGML_TYPE_F32); - GGML_PRINT("printing values for %s[", tensor->name); + GGML_PRINT("Printing values for tensor %s[", tensor->name); for (int i=0; in_dims; ++i) { - if (i!=dim) { - GGML_PRINT("%d", starts[i]); - } else { - if (starts[i] > 0) { + GGML_ASSERT(starts[i] >= 0); + if (i == dim) { + if (starts[i] > 0) { GGML_PRINT("%d:%d", starts[i], starts[i]+nelts); } else { GGML_PRINT(":%d", starts[i]+nelts); } + } else { + GGML_PRINT("%d", starts[i]); } if (in_dims-1) { GGML_PRINT(","); } } GGML_PRINT("]\n"); - - float *dataPtr = (float *) tensor->data; - - // Compute the offset into data for starts + float *data_ptr = (float *) tensor->data; int offset = 0; for (int j = 0; j < tensor->n_dims; j++) { - offset += (starts[j] * tensor->nb[j]) / sizeof(float); // Assuming nb[j] is in bytes, divide by sizeof(float) to get float offset. + offset += (starts[j] * tensor->nb[j]) / ggml_type_size(GGML_TYPE_F32); } - - dataPtr += offset; - + data_ptr += offset; for (int i = 0; i < nelts; i++) { - GGML_PRINT("%f ", *dataPtr); - dataPtr += tensor->nb[dim] / sizeof(float); // Increment by strides for the given dimension. + GGML_PRINT("%f ", *data_ptr); + data_ptr += tensor->nb[dim] / ggml_type_size(GGML_TYPE_F32); } GGML_PRINT("\n"); - /* - char * ptr = (char *)tensor->data; - for (int j=0; jn_dims;j++) { - ptr += tensor->nb[j]*starts[j]; - } - for (int i=0; inb[dim]; - } - GGML_PRINT("\n"); - */ } int64_t ggml_nelements(const struct ggml_tensor * tensor) { @@ -8883,14 +8868,14 @@ static void ggml_compute_forward_add_f32( } } } - if ( - strncmp(src0->name, "printme", 7) == 0 + if ((strncmp(src0->name, "printme", 7) == 0 + ||strncmp(src1->name, "printme", 7) == 0) && params->ith == 0) { GGML_PRINT("\noutputs of add: %s + %s\n", src0->name, src1->name); ggml_print_tensor(src0); ggml_print_tensor(src1); ggml_print_tensor(dst); - int starts[] = {0, 1, 0}; + int starts[] = {0, 0, 0}; ggml_print_tensor_values(dst, starts, 0, 10); } } @@ -10879,11 +10864,8 @@ static void ggml_compute_forward_norm_f32( && params->ith == 0) { GGML_PRINT("\nlayernorm inputs for %s\n", src0->name); ggml_print_tensor(src0); - int starts[] = {0, 1, 0}; + int starts[] = {0, 0, 0}; ggml_print_tensor_values(src0, starts, 0, 10); - for (int i=64; i<74; ++i) { - GGML_PRINT("%f ", ggml_get_f32_1d(src0, i)); - } } const int ith = params->ith; @@ -11313,15 +11295,14 @@ static void ggml_compute_forward_mul_mat( && params->ith == 0) { GGML_PRINT("\nInputs to matmul: %s\n", src1->name); ggml_print_tensor(src1); - /* + size_t offset = 0;//(src1->ne[0] * src1->ne[1]) for (int i=0; i < src1->ne[0] * src1->ne[1]; ++i) { if (i % src1->ne[0] == 0) { GGML_PRINT("\n"); } - GGML_PRINT(" %f ", ((float *)src1->data)[i + (src1->ne[0] * src1->ne[1])]); + GGML_PRINT(" %f ", ((float *)src1->data)[i + offset]); } GGML_PRINT("\n"); - */ } GGML_TENSOR_BINARY_OP_LOCALS; diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 93a397109fa64..8a1fc93168c3e 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -85,7 +85,7 @@ class 
MODEL_ARCH(IntEnum): GPTNEOX : int = auto() MPT : int = auto() STARCODER : int = auto() - ADEPT : int = auto() + PERSIMMON : int = auto() class MODEL_TENSOR(IntEnum): @@ -119,7 +119,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GPTNEOX: "gptneox", MODEL_ARCH.MPT: "mpt", MODEL_ARCH.STARCODER: "starcoder", - MODEL_ARCH.ADEPT: "adept", + MODEL_ARCH.PERSIMMON: "persimmon", } MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = { @@ -189,7 +189,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", }, - MODEL_ARCH.ADEPT: { + MODEL_ARCH.PERSIMMON: { MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.OUTPUT: "output", MODEL_TENSOR.OUTPUT_NORM: "output_norm", @@ -219,7 +219,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], - MODEL_ARCH.ADEPT: [ + MODEL_ARCH.PERSIMMON: [ MODEL_TENSOR.ROPE_FREQS, ] } diff --git a/llama.cpp b/llama.cpp index 66cef8b59cf2a..1b155e5b778a4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -162,7 +162,7 @@ enum llm_arch { LLM_ARCH_GPTNEOX, LLM_ARCH_MPT, LLM_ARCH_STARCODER, - LLM_ARCH_ADEPT, + LLM_ARCH_PERSIMMON, LLM_ARCH_UNKNOWN, }; @@ -175,7 +175,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_MPT, "mpt" }, { LLM_ARCH_BAICHUAN, "baichuan" }, { LLM_ARCH_STARCODER, "starcoder" }, - { LLM_ARCH_ADEPT, "adept" }, + { LLM_ARCH_PERSIMMON, "persimmon" }, }; enum llm_kv { @@ -378,7 +378,7 @@ static std::map> LLM_TENSOR_NAMES = }, }, { - LLM_ARCH_ADEPT, + LLM_ARCH_PERSIMMON, { { LLM_TENSOR_TOKEN_EMBD, "token_embd"}, { LLM_TENSOR_OUTPUT_NORM, "output_norm"}, @@ -2323,7 +2323,7 @@ static void llm_load_tensors( } } } break; - case LLM_ARCH_ADEPT: + case LLM_ARCH_PERSIMMON: { model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); @@ -3739,7 +3739,7 @@ static void log_tensor( LLAMA_LOG_INFO("\n"); } -static struct ggml_cgraph * llm_build_adept( +static struct ggml_cgraph * llm_build_persimmon( llama_context & lctx, const llama_token * tokens, const float * embd, @@ -3756,6 +3756,7 @@ static struct ggml_cgraph * llm_build_adept( const int64_t n_embd = hparams.n_embd; const int64_t n_layer = hparams.n_layer; + //const int64_t n_layer = 1; const int64_t n_ctx = hparams.n_ctx; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_head = hparams.n_head; @@ -3811,105 +3812,74 @@ static struct ggml_cgraph * llm_build_adept( // Input is (d_model, L) // Attention struct ggml_tensor * residual = ggml_dup(ctx0, inpL); - ggml_set_name(residual, format((char*)"layer_inputs_%d", il).c_str()); + //ggml_format_name(inpL, "printme_layer_inputs_%d", il); { // input norming cur = ggml_norm(ctx0, inpL, hparams.f_norm_eps); - cur = ggml_add(ctx0, ggml_mul( - ctx0, cur, model.layers[il].attn_norm), - model.layers[il].attn_norm_b); + cur = ggml_mul( + ctx0, cur, model.layers[il].attn_norm); + //ggml_format_name(cur, "printme_normed_%d", il); + cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); } ggml_set_name(cur, "cur"); { // QKV //log_tensor(cur); cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - // 3 * d_model, L - // or 2 * n_head_kv + n_embd_head, L - // + bias ggml_format_name(cur, "qkv_preadd_%d", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); // Apply Q, K layernorm - // Where is the Q/K/V? it's in order. Hopefully... - // So q has offset 0. 
- // And split into heads - // -> (d_h, n_head, L) - const size_t wsize = ggml_type_size(cur->type); + // split qkv GGML_ASSERT(n_head_kv == n_head); - //LLAMA_LOG_INFO("N: %d\n", N); ggml_set_name(cur, format("qkv_%d", il).c_str()); - //log_tensor(cur); - - // cur is (3 * d_head * n_head, N) - struct ggml_tensor * tmpqkv = ggml_view_4d( - ctx0, cur, n_embd_head, 3, n_head, N, - /* nb1 = */ wsize * n_embd_head, - /* nb2 = */ wsize * n_embd_head * 3, - /* nb3 = */ wsize * n_embd_head * 3 * n_head, - /* offset = */ 0 - ); + struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, N); // get it to (d_h, n_head, L, 3) struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il); - //log_tensor(tmpqkv_perm); - struct ggml_tensor * tmpq = ggml_cont( - ctx0, - ggml_view_3d( + struct ggml_tensor * tmpq = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, N, - /* nb1 = */ sizeof(float) * n_embd_head, - /* nb2 = */ sizeof(float) * n_embd_head * n_head, + /* nb1 = */ ggml_element_size(tmpqkv_perm) * n_embd_head, + /* nb2 = */ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, /* offset = */ 0 - ) - ); - struct ggml_tensor * tmpk = ggml_cont( - ctx0, - ggml_view_3d( + ); + struct ggml_tensor * tmpk = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, N, - /* nb1 = */ sizeof(float) * n_embd_head, - /* nb2 = */ sizeof(float) * n_embd_head * n_head, - /* offset = */ sizeof(float) * n_embd_head * n_head * N - ) - ); - struct ggml_tensor * tmpv = ggml_cont( - ctx0, - ggml_view_3d( + /* nb1 = */ ggml_element_size(tmpqkv_perm) * n_embd_head, + /* nb2 = */ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + /* offset = */ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * N + ); + + struct ggml_tensor * tmpv = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, N, - /* nb1 = */ sizeof(float) * n_embd_head, - /* nb2 = */ sizeof(float) * n_embd_head * n_head, - /* offset = */ sizeof(float) * n_embd_head * n_head * N * 2 - ) - ); - // Q / K layernorm - ggml_set_name(tmpq, format("tmpq_%d", il).c_str()); + /* nb1 = */ ggml_element_size(tmpqkv_perm) * n_embd_head, + /* nb2 = */ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + /* offset = */ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * N * 2 + ); + //ggml_format_name(tmpq, "printme_tmpq_%d", il); tmpq = ggml_norm(ctx0, tmpq, hparams.f_norm_eps); tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); - ggml_set_name(tmpq, format("tmpq_%d", il).c_str()); - //log_tensor(tmpq); + //ggml_format_name(tmpq, "printme_tmpk_%d", il); tmpk = ggml_norm(ctx0, tmpk, hparams.f_norm_eps); tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); - ggml_set_name(tmpk, format("preadd_%d", il).c_str()); tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); - ggml_set_name(tmpk, format("tmpk_%d", il).c_str()); - //log_tensor(tmpk); - - const size_t n_rot = n_embd_head / 2; + struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d( ctx0, tmpq, n_rot, n_head, N, - /* nb1 = */ wsize * n_embd_head, - /* nb2 = */ wsize * n_embd_head * n_head, + /* nb1 = */ ggml_element_size(tmpq) * n_embd_head, + /* nb2 = */ ggml_element_size(tmpq) * n_embd_head * n_head, /* offset = */ 0 )); // get the second half of tmpq, e.g tmpq[n_rot:, :, :] struct ggml_tensor * qpass = ggml_cont(ctx0, ggml_view_3d( ctx0, tmpq, n_rot, n_head, N, - /* nb1 = */ wsize * n_embd_head, - /* nb2 = */ wsize 
* n_embd_head * n_head, - /* offset = */ wsize * n_rot + /* nb1 = */ ggml_element_size(tmpq) * n_embd_head, + /* nb2 = */ ggml_element_size(tmpq) * n_embd_head * n_head, + /* offset = */ ggml_element_size(tmpq) * n_rot )); ggml_set_name(qrot, format("qrot_%d", il).c_str()); ggml_set_name(qpass, format("qpass_%d", il).c_str()); @@ -3918,20 +3888,18 @@ static struct ggml_cgraph * llm_build_adept( struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d( ctx0, tmpk, n_rot, n_head, N, - /* nb1 = */ wsize * n_embd_head, - /* nb2 = */ wsize * n_embd_head * n_head, + /* nb1 = */ ggml_element_size(tmpk) * n_embd_head, + /* nb2 = */ ggml_element_size(tmpk) * n_embd_head * n_head, /* offset = */ 0 )); struct ggml_tensor * kpass = ggml_cont(ctx0, ggml_view_3d( ctx0, tmpk, n_rot, n_head, N, - /* nb1 = */ wsize * n_embd_head, - /* nb2 = */ wsize * n_embd_head * n_head, - /* offset = */ wsize * n_rot + /* nb1 = */ ggml_element_size(tmpk) * n_embd_head, + /* nb2 = */ ggml_element_size(tmpk) * n_embd_head * n_head, + /* offset = */ ggml_element_size(tmpk) * n_rot )); ggml_set_name(krot, format("krot_%d", il).c_str()); ggml_set_name(kpass, format("kpass_%d", il).c_str()); - //log_tensor(krot); - //log_tensor(kpass); struct ggml_tensor * qrotated = ggml_cont(ctx0, ggml_permute(ctx0, ggml_rope_custom_inplace( @@ -3939,17 +3907,15 @@ static struct ggml_cgraph * llm_build_adept( ), 2, 1, 0, 3 )); - ggml_set_name(qrotated, format("qrotated_%d", il).c_str()); - //log_tensor(qrotated); qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3)); + + //ggml_format_name(krot, "printme_krot_%d", il); struct ggml_tensor * krotated = ggml_cont(ctx0, ggml_permute(ctx0, ggml_rope_custom_inplace( ctx0, krot, n_past, n_rot, 2, 0, freq_base, freq_scale ), 2, 1, 0, 3 )); - ggml_set_name(krotated, format("krotated_%d", il).c_str()); - //log_tensor(krotated); kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3)); struct ggml_tensor * Qcur = ggml_cont(ctx0, @@ -3962,16 +3928,12 @@ static struct ggml_cgraph * llm_build_adept( ); ggml_set_name(Qcur, format("Qcur_%d", il).c_str()); ggml_set_name(Kcur, format("Kcur_%d", il).c_str()); - //log_tensor(Qcur); - //////log_tensor(Kcur); - //log_tensor(kv_self.k); { // View v as (N, n_embd) struct ggml_tensor * Vcur = ggml_transpose( ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd, N) ); ggml_set_name(Vcur, "Vcur"); - // Select k from kv cache as 1d view (N * n_embd) struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past) @@ -3997,7 +3959,6 @@ static struct ggml_cgraph * llm_build_adept( ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il)); ggml_set_name(K, "K"); - //log_tensor(K); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); ggml_set_name(KQ, "KQ"); @@ -4009,7 +3970,7 @@ static struct ggml_cgraph * llm_build_adept( ggml_set_name(KQ_masked, "KQ_mask"); struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, format("KQ_soft_max_%d", il).c_str()); + //ggml_set_name(KQ_soft_max, format("printme_KQ_soft_max_%d", il).c_str()); struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, @@ -4031,7 +3992,6 @@ static struct ggml_cgraph * llm_build_adept( cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); cur = ggml_add(ctx0, cur, model.layers[il].bo); ggml_set_name(cur, "result_wo"); - //log_tensor(cur); } cur = ggml_add(ctx0, residual, cur); struct ggml_tensor * residual2 = ggml_dup(ctx0, cur); @@ -4044,17 +4004,12 @@ static struct ggml_cgraph * 
llm_build_adept( model.layers[il].ffn_norm_b ); } - // FFN cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - ggml_format_name(cur, "pre_act_%d", il); cur = ggml_add(ctx0, cur, model.layers[il].b3); - // //log_tensor(cur); - // Correct through here. - // Squared ReLU cur = ggml_relu(ctx0, cur); cur = ggml_sqr(ctx0, cur); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - ggml_format_name(cur, "post_ffn_down_%d", il); + //ggml_format_name(cur, "printme_ffn_down_%d", il); struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, model.layers[il].b2); @@ -4105,9 +4060,9 @@ static struct ggml_cgraph * llama_build_graph( { result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past); } break; - case LLM_ARCH_ADEPT: + case LLM_ARCH_PERSIMMON: { - result = llm_build_adept(lctx, tokens, embd, n_tokens, n_past); + result = llm_build_persimmon(lctx, tokens, embd, n_tokens, n_past); } break; default: GGML_ASSERT(false); From d61eed0a39b223cbcb6e56e5cb453db26a6eeb55 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Thu, 28 Sep 2023 22:10:45 -0700 Subject: [PATCH 07/22] Produces correct outputs --- ggml.c | 67 +++++++++++--- llama.cpp | 262 +++++++++++++++++++++++++++++++++++------------------- 2 files changed, 225 insertions(+), 104 deletions(-) diff --git a/ggml.c b/ggml.c index 25fa236a2bbad..2b727cb07e504 100644 --- a/ggml.c +++ b/ggml.c @@ -8831,6 +8831,12 @@ static void ggml_compute_forward_dup( struct ggml_tensor * dst) { if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { ggml_compute_forward_dup_same_cont(params, src0, dst); + if (strncmp(src0->name, "printme_tmp_", 12) == 0 && params->ith == 0) { + GGML_PRINT("\noutputs of dupe for %s\n", src0->name); + ggml_print_tensor(dst); + int starts[] = {0, 0, 0, 0}; + ggml_print_tensor_values(dst, starts, 0, 10); + } return; } switch (src0->type) { @@ -8847,6 +8853,12 @@ static void ggml_compute_forward_dup( GGML_ASSERT(false); } break; } + if (strncmp(src0->name, "printme_tmp_", 12) == 0 && params->ith == 0) { + GGML_PRINT("\noutputs of dupe for %s\n", src0->name); + ggml_print_tensor(dst); + int starts[] = {0, 0, 0, 0}; + ggml_print_tensor_values(dst, starts, 0, 10); + } } // ggml_compute_forward_add @@ -8926,10 +8938,8 @@ static void ggml_compute_forward_add_f32( ||strncmp(src1->name, "printme", 7) == 0) && params->ith == 0) { GGML_PRINT("\noutputs of add: %s + %s\n", src0->name, src1->name); - ggml_print_tensor(src0); - ggml_print_tensor(src1); ggml_print_tensor(dst); - int starts[] = {0, 0, 0}; + int starts[] = {0, 0, 0, 0}; ggml_print_tensor_values(dst, starts, 0, 10); } } @@ -10918,7 +10928,7 @@ static void ggml_compute_forward_norm_f32( && params->ith == 0) { GGML_PRINT("\nlayernorm inputs for %s\n", src0->name); ggml_print_tensor(src0); - int starts[] = {0, 0, 0}; + int starts[] = {0, 1, 0}; ggml_print_tensor_values(src0, starts, 0, 10); } @@ -11344,19 +11354,36 @@ static void ggml_compute_forward_mul_mat( struct ggml_tensor * dst) { int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - if ( - strncmp(src1->name, "printme", 7) == 0 + if ((strncmp(src0->name, "printme", 7) == 0 || + strncmp(src1->name, "printme", 7) == 0) && params->ith == 0) { GGML_PRINT("\nInputs to matmul: %s\n", src1->name); - ggml_print_tensor(src1); size_t offset = 0;//(src1->ne[0] * src1->ne[1]) - for (int i=0; i < src1->ne[0] * src1->ne[1]; ++i) { - if (i % src1->ne[0] == 0) { + size_t x = src1->ne[0]; + size_t y = src1->ne[1]; + for (int i=0; i < x * y; ++i) { + if (i % x == 0) { GGML_PRINT("\n"); } - GGML_PRINT(" %f ", ((float 
*)src1->data)[i + offset]); + if (i % x < 4) { + GGML_PRINT(" %f ", ((float *)src1->data)[i + offset]); + } + } + GGML_PRINT("\n"); + /* + GGML_PRINT("\nInputs to matmul: %s\n", src0->name); + ggml_print_tensor(src0); + if (src0->type == GGML_TYPE_F16) { + for (int i=0; i < src0->ne[0] * src0->ne[1]; ++i) { + if (i % src0->ne[0] == 0) { + GGML_PRINT("\n"); + } + GGML_PRINT(" %f", ((ggml_fp16_t *) src0->data)[i]); + } } GGML_PRINT("\n"); + */ + } GGML_TENSOR_BINARY_OP_LOCALS; @@ -11753,6 +11780,12 @@ static void ggml_compute_forward_scale_f32( } ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v); } + if (strncmp(src0->name, "printme", 7) == 0 && params->ith == 0) { + GGML_PRINT("\nInputs of scale: %s\n", dst->name); + ggml_print_tensor(src0); + int starts[4] = {0, 0, 0, 0}; + ggml_print_tensor_values(src0, starts, 0, 32); + } } static void ggml_compute_forward_scale( @@ -11910,8 +11943,16 @@ static void ggml_compute_forward_view( const struct ggml_compute_params * params, const struct ggml_tensor * src0) { // NOP - UNUSED(params); - UNUSED(src0); + if (strncmp(src0->name, "cache_k", 7) == 0 && params->ith == 0) { + /* + GGML_PRINT("\noutputs of cache_k for view%s\n", src0->name); + ggml_print_tensor(src0); + int starts[] = {4096 * }; + ggml_print_tensor_values(src0, starts, 0, 10); + */ + } + //UNUSED(params); + //UNUSED(src0); } // ggml_compute_forward_permute @@ -12895,7 +12936,7 @@ static void ggml_compute_forward_rope_f32( if (strncmp(src0->name, "printme", 7) == 0 && params->ith == 0) { GGML_PRINT("\n dest at RoPE time for %s\n", src0->name); // print shape and strides - int starts[4] = {0,0,1,0}; + int starts[3] = {0,0,1}; ggml_print_tensor(dst); ggml_print_tensor_values(dst, starts, 0, 10); } diff --git a/llama.cpp b/llama.cpp index baf3ac0fef4f3..0d4df77a52c96 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2640,8 +2640,8 @@ static bool llama_model_load( } static struct ggml_cgraph * llm_build_llama( - llama_context & lctx, - const llama_batch & batch) { + llama_context & lctx, + const llama_batch & batch) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -2668,6 +2668,10 @@ static struct ggml_cgraph * llm_build_llama( const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? 
n_ctx - n_tokens : kv_self.head; + LLAMA_LOG_INFO("n_kv = %d\n", n_kv); + LLAMA_LOG_INFO("n_tokens = %d\n", n_tokens); + LLAMA_LOG_INFO("n_ctx = %d\n", n_ctx); + LLAMA_LOG_INFO("kvself.n = %d\n", kv_self.n); const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; @@ -2678,11 +2682,9 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_init_params params = { /*.mem_size =*/ buf_compute.size, /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, + /*.no_alloc =*/ true, }; - params.no_alloc = true; - struct ggml_context * ctx0 = ggml_init(params); ggml_cgraph * gf = ggml_new_graph(ctx0); @@ -2911,6 +2913,7 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); offload_func_v(KQ_soft_max); ggml_set_name(KQ_soft_max, "KQ_soft_max"); + //ggml_set_name(KQ_soft_max, format("printme_KQ_soft_max_%d", il).c_str()); // split cached V into n_head heads struct ggml_tensor * V = @@ -4077,6 +4080,7 @@ static struct ggml_cgraph * llm_build_persimmon( const auto & hparams = model.hparams; const auto & kv_self = lctx.kv_self; + GGML_ASSERT(!!kv_self.ctx); const int64_t n_embd = hparams.n_embd; @@ -4086,33 +4090,55 @@ static struct ggml_cgraph * llm_build_persimmon( const int64_t n_head = hparams.n_head; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + const float freq_base = hparams.rope_freq_base; + const float freq_scale = hparams.rope_freq_scale; + const float norm_eps = 1e-5f; + const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const size_t n_rot = n_embd_head / 2; + /* + printf("\nnorm_eps is %f\n", norm_eps); + printf("freq_base is %f\n", freq_base); + LLAMA_LOG_INFO("n_kv = %d\n", n_kv); + LLAMA_LOG_INFO("n_tokens = %d\n", n_tokens); + LLAMA_LOG_INFO("n_ctx = %d\n", n_ctx); + LLAMA_LOG_INFO("kvself.n = %d\n", kv_self.n); + */ - const float freq_base = hparams.rope_freq_base; - const float freq_scale = hparams.rope_freq_scale; + const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; - GGML_ASSERT(n_embd_head == hparams.n_rot); auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { /*.mem_size =*/ buf_compute.size, /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, + /*.no_alloc =*/ true, }; - params.no_alloc = true; struct ggml_context * ctx0 = ggml_init(params); + ggml_cgraph * gf = ggml_new_graph(ctx0); + struct ggml_tensor * cur; struct ggml_tensor * inpL; + if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_allocr_alloc(lctx.alloc, inp_tokens); if (!ggml_allocr_is_measure(lctx.alloc)) { memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); } ggml_set_name(inp_tokens, "inp_tokens"); + /* + LLAMA_LOG_INFO("\ninp_tokens: ["); + for (int i = 0; i < n_tokens; ++i) { + LLAMA_LOG_INFO("%d, ", batch.token[i]); + } + LLAMA_LOG_INFO("]\n"); + */ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); @@ -4121,12 +4147,32 @@ static struct ggml_cgraph * llm_build_persimmon( memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); } } + // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); ggml_allocr_alloc(lctx.alloc, 
KQ_scale); if (!ggml_allocr_is_measure(lctx.alloc)) { ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); } ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + ggml_set_name(KQ_mask, "KQ_mask"); + ggml_allocr_alloc(lctx.alloc, KQ_mask); + if (!ggml_allocr_is_measure(lctx.alloc)) { + float * data = (float *) KQ_mask->data; + memset(data, 0, ggml_nbytes(KQ_mask)); + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j]; + for (int i = 0; i < n_kv; ++i) { + if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + } + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ggml_set_name(KQ_pos, "KQ_pos"); ggml_allocr_alloc(lctx.alloc, KQ_pos); @@ -4136,31 +4182,49 @@ static struct ggml_cgraph * llm_build_persimmon( data[i] = batch.pos[i]; } } + if (do_rope_shift) { + LLAMA_LOG_INFO("do_rope_shift...?\n"); + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + ggml_set_name(K_shift, "K_shift"); + ggml_allocr_alloc(lctx.alloc, K_shift); + if (!ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) K_shift->data; + for (int i = 0; i < n_ctx; ++i) { + data[i] = kv_self.cells[i].delta; + } + } + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * tmp = + ggml_rope_custom_inplace(ctx0, + ggml_view_3d(ctx0, kv_self.k, + n_rot, n_head, n_ctx, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)// + n_rot) + ), + K_shift, n_rot, 2, 0, freq_base, freq_scale); + ggml_build_forward_expand(gf, tmp); + } + } //LLAMA_LOG_INFO("Entering n_layers loop\n", __func__); for (int il=0; il < n_layer; ++il) { - offload_func_t offload_func = llama_nop; - // Input is (d_model, L) - // Attention + //ggml_format_name(inpL, "printme_layer_input_%d", il); struct ggml_tensor * residual = ggml_dup(ctx0, inpL); - //ggml_format_name(inpL, "printme_layer_inputs_%d", il); { - // input norming - cur = ggml_norm(ctx0, inpL, hparams.f_norm_eps); - cur = ggml_mul( - ctx0, cur, model.layers[il].attn_norm); - //ggml_format_name(cur, "printme_normed_%d", il); + //ggml_format_name(inpL, "printme_inputs_%d", il); + cur = ggml_norm(ctx0, inpL, norm_eps); + cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); + //ggml_format_name(cur, "printme_layernorm_outputs%d", il); cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); + ggml_format_name(cur, "input_layernorm_%d", il); } - ggml_set_name(cur, "cur"); + // self attention { - // QKV //log_tensor(cur); cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); ggml_format_name(cur, "qkv_preadd_%d", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - // Apply Q, K layernorm - // split qkv GGML_ASSERT(n_head_kv == n_head); ggml_set_name(cur, format("qkv_%d", il).c_str()); @@ -4168,67 +4232,64 @@ static struct ggml_cgraph * llm_build_persimmon( // get it to (d_h, n_head, L, 3) struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il); - struct ggml_tensor * tmpq = ggml_view_3d( + struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - /* nb1 = */ ggml_element_size(tmpqkv_perm) * n_embd_head, - /* nb2 = */ 
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - /* offset = */ 0 - ); + ggml_element_size(tmpqkv_perm) * n_embd_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + 0 + )); struct ggml_tensor * tmpk = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - /* nb1 = */ ggml_element_size(tmpqkv_perm) * n_embd_head, - /* nb2 = */ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - /* offset = */ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens + ggml_element_size(tmpqkv_perm) * n_embd_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens ); - - struct ggml_tensor * tmpv = ggml_view_3d( + struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - /* nb1 = */ ggml_element_size(tmpqkv_perm) * n_embd_head, - /* nb2 = */ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - /* offset = */ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 - ); - //ggml_format_name(tmpq, "printme_tmpq_%d", il); - tmpq = ggml_norm(ctx0, tmpq, hparams.f_norm_eps); + ggml_element_size(tmpqkv_perm) * n_embd_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 + )); + tmpq = ggml_norm(ctx0, tmpq, norm_eps); tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); + //ggml_format_name(tmpq, "printme_tmpq_%d", il); tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); - //ggml_format_name(tmpq, "printme_tmpk_%d", il); - tmpk = ggml_norm(ctx0, tmpk, hparams.f_norm_eps); + tmpk = ggml_norm(ctx0, tmpk, norm_eps); tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); + //ggml_format_name(tmpk, "printme_tmpk_%d", il); tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); - const size_t n_rot = n_embd_head / 2; struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d( ctx0, tmpq, n_rot, n_head, n_tokens, - /* nb1 = */ ggml_element_size(tmpq) * n_embd_head, - /* nb2 = */ ggml_element_size(tmpq) * n_embd_head * n_head, - /* offset = */ 0 + ggml_element_size(tmpq) * n_embd_head, + ggml_element_size(tmpq) * n_embd_head * n_head, + 0 )); - // get the second half of tmpq, e.g tmpq[n_rot:, :, :] - struct ggml_tensor * qpass = ggml_cont(ctx0, ggml_view_3d( - ctx0, tmpq, n_rot, n_head, n_tokens, - /* nb1 = */ ggml_element_size(tmpq) * n_embd_head, - /* nb2 = */ ggml_element_size(tmpq) * n_embd_head * n_head, - /* offset = */ ggml_element_size(tmpq) * n_rot - )); - ggml_set_name(qrot, format("qrot_%d", il).c_str()); - ggml_set_name(qpass, format("qpass_%d", il).c_str()); - //log_tensor(qrot); - //log_tensor(qpass); - - struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d( + struct ggml_tensor * krottmp = ggml_view_3d( ctx0, tmpk, n_rot, n_head, n_tokens, /* nb1 = */ ggml_element_size(tmpk) * n_embd_head, /* nb2 = */ ggml_element_size(tmpk) * n_embd_head * n_head, /* offset = */ 0 + ); + //ggml_format_name(krottmp, "printme_krottmp_%d", il); + struct ggml_tensor * krot = ggml_cont(ctx0, krottmp); + // get the second half of tmpq, e.g tmpq[n_rot:, :, :] + struct ggml_tensor * qpass = ggml_cont(ctx0, ggml_view_3d( + ctx0, tmpq, n_rot, n_head, n_tokens, + ggml_element_size(tmpq) * n_embd_head, + ggml_element_size(tmpq) * n_embd_head * n_head, + ggml_element_size(tmpq) * n_rot )); struct ggml_tensor * kpass = ggml_cont(ctx0, ggml_view_3d( - ctx0, tmpk, n_rot, n_head, n_tokens, - /* nb1 = */ ggml_element_size(tmpk) * n_embd_head, - /* nb2 = */ 
ggml_element_size(tmpk) * n_embd_head * n_head, - /* offset = */ ggml_element_size(tmpk) * n_rot + ctx0, tmpk, n_rot, n_head, n_tokens, + ggml_element_size(tmpk) * n_embd_head, + ggml_element_size(tmpk) * n_embd_head * n_head, + ggml_element_size(tmpk) * n_rot )); - ggml_set_name(krot, format("krot_%d", il).c_str()); + ggml_set_name(qrot, format("qrot_%d", il).c_str()); + //ggml_set_name(krot, format("printme_krot_%d", il).c_str()); + ggml_set_name(qpass, format("qpass_%d", il).c_str()); ggml_set_name(kpass, format("kpass_%d", il).c_str()); struct ggml_tensor * qrotated = ggml_cont(ctx0, ggml_permute(ctx0, @@ -4239,7 +4300,6 @@ static struct ggml_cgraph * llm_build_persimmon( )); qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3)); - //ggml_format_name(krot, "printme_krot_%d", il); struct ggml_tensor * krotated = ggml_cont(ctx0, ggml_permute(ctx0, ggml_rope_custom( ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale @@ -4252,18 +4312,38 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_permute(ctx0, ggml_concat(ctx0, qrotated, qpass), 2, 1, 0, 3)); - struct ggml_tensor * Kcur = ggml_cont(ctx0, - ggml_permute(ctx0, ggml_concat(ctx0, krotated, kpass), - 2, 1, 0, 3) - ); + struct ggml_tensor * tmp = ggml_permute(ctx0, ggml_concat(ctx0, krotated, kpass), 2, 1, 0, 3); + //ggml_format_name(tmp, "printme_tmp_%d", il); + struct ggml_tensor * Kcur = ggml_cont(ctx0, tmp); ggml_set_name(Qcur, format("Qcur_%d", il).c_str()); + // kcur appears healthy. ggml_set_name(Kcur, format("Kcur_%d", il).c_str()); { - struct ggml_tensor * Vcur = ggml_transpose( - ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd, n_tokens) + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); + ggml_set_name(Vcur, "Vcur"); + + struct ggml_tensor * k = ggml_view_1d( + ctx0, kv_self.k, n_tokens*n_embd_gqa, + (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head) ); + ggml_set_name(k, "k"); + + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + ggml_set_name(v, "v"); + + // important: storing RoPE-ed version of K in the KV cache! + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + /* + struct ggml_tensor * Vcur = ggml_cont(ctx0, + ggml_transpose( + ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd, n_tokens) + )); ggml_set_name(Vcur, "Vcur"); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd, + struct ggml_tensor * k = ggml_view_1d( + ctx0, kv_self.k, n_tokens*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + kv_head) ); ggml_set_name(k, "k"); @@ -4274,28 +4354,28 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + */ } - struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3)); + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); ggml_set_name(Q, "Q"); //log_tensor(Q); - - struct ggml_tensor * K = - ggml_cont(ctx0, ggml_view_3d(ctx0, kv_self.k, + // For some reason this is all zeros and no balls... 
+ struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, n_embd_head, n_kv, n_head_kv, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il)); - - ggml_set_name(K, "K"); + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + //ggml_format_name(K, "printme_K_%d", il); + //log_tensor(K); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); - - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + //ggml_set_name(KQ, "KQ"); + //ggml_format_name(KQ, "printme_KQ_%d", il); + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); ggml_set_name(KQ_scaled, "KQ_scaled"); - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_kv); - ggml_set_name(KQ_masked, "KQ_mask"); + struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); + ggml_set_name(KQ_masked, "KQ_masked"); struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); //ggml_set_name(KQ_soft_max, format("printme_KQ_soft_max_%d", il).c_str()); @@ -4314,10 +4394,11 @@ static struct ggml_cgraph * llm_build_persimmon( struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); ggml_set_name(KQV_merged, "KQV_merged"); - cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens)); + cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); ggml_set_name(cur, "KQV_merged_contiguous"); cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + //ggml_format_name(cur, "printme_wo_%d", il); cur = ggml_add(ctx0, cur, model.layers[il].bo); ggml_set_name(cur, "result_wo"); } @@ -4326,7 +4407,7 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_set_name(residual2, "residual2"); // Norm { - cur = ggml_norm(ctx0, cur, hparams.f_norm_eps); + cur = ggml_norm(ctx0, cur, norm_eps); cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b @@ -4334,8 +4415,7 @@ static struct ggml_cgraph * llm_build_persimmon( } cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); cur = ggml_add(ctx0, cur, model.layers[il].b3); - cur = ggml_relu(ctx0, cur); - cur = ggml_sqr(ctx0, cur); + cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur)); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); //ggml_format_name(cur, "printme_ffn_down_%d", il); struct ggml_tensor * ffn_out = ggml_add(ctx0, @@ -4348,10 +4428,10 @@ static struct ggml_cgraph * llm_build_persimmon( } cur = inpL; { - cur = ggml_norm(ctx0, cur, hparams.f_norm_eps); - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.output_norm), - model.output_norm_b); + cur = ggml_norm(ctx0, cur, norm_eps); + cur = ggml_mul(ctx0, cur, model.output_norm); + //ggml_set_name(cur, "printme_final"); + cur = ggml_add(ctx0, cur, model.output_norm_b); ggml_set_name(cur, "result_norm"); } cur = ggml_mul_mat(ctx0, model.output, cur); From fa92f6e82790232efd1719c8a81756c2aa70c0c4 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Thu, 28 Sep 2023 22:16:59 -0700 Subject: [PATCH 08/22] clean up convert scripts --- convert-persimmon-st-to-gguf.py | 30 +++++++++++++++--------------- convert.py | 9 ++------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/convert-persimmon-st-to-gguf.py b/convert-persimmon-st-to-gguf.py index ee0d2b1d85096..f8fcbb4bdaae5 100644 --- a/convert-persimmon-st-to-gguf.py +++ b/convert-persimmon-st-to-gguf.py @@ -21,7 +21,7 @@ def file_is_safetensors(path: Path) -> bool: def get_tokenizer_info(dir_model: Path): tokenizer_path = dir_model / 
'adept_vocab.model' - print('gguf: get sentencepiece tokenizer from', tokenizer_path) + print('gguf: getting sentencepiece tokenizer from', tokenizer_path) tokenizer = SentencePieceProcessor(str(tokenizer_path)) tokens: list[bytes] = [] scores: list[float] = [] @@ -55,20 +55,20 @@ def get_tokenizer_info(dir_model: Path): return tokens, scores, toktypes -def main(args_in: list[str] | None = None) -> None: +def get_args(): parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file") - parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") - parser.add_argument("--outtype", choices=["f32"], help="currently only support fp32") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.safetensors)") - parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm") - args = parser.parse_args(args_in) + args = parser.parse_args() + return args + +def main() -> None: + args = get_args() assert file_is_safetensors(args.model), 'Error: model file is not a SafeTensors file' dir_model = args.model.parent with open(dir_model / 'config.json', 'r') as f: hparams = json.load(f) - pprint(hparams) arch = gguf.MODEL_ARCH.PERSIMMON gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch]) @@ -88,14 +88,14 @@ def main(args_in: list[str] | None = None) -> None: gguf_writer.add_head_count_kv(head_count_kv) gguf_writer.add_rope_freq_base(hparams['rotary_emb_base']) gguf_writer.add_layer_norm_eps(hparams['layernorm_epsilon']) - if True: - tokens, scores, toktypes = get_tokenizer_info(dir_model) - gguf_writer.add_tokenizer_model('llama') - gguf_writer.add_token_list(tokens) - gguf_writer.add_token_scores(scores) - gguf_writer.add_token_types(toktypes) - gguf_writer.add_bos_token_id(71013) - gguf_writer.add_eos_token_id(71013) + tokens, scores, toktypes = get_tokenizer_info(dir_model) + gguf_writer.add_tokenizer_model('llama') + gguf_writer.add_token_list(tokens) + gguf_writer.add_token_scores(scores) + gguf_writer.add_token_types(toktypes) + gguf_writer.add_bos_token_id(71013) + gguf_writer.add_eos_token_id(71013) + tensor_map = gguf.get_tensor_name_map(arch, block_count) print(tensor_map) tensors = {} diff --git a/convert.py b/convert.py index de752cb0190de..4ac5030db61eb 100755 --- a/convert.py +++ b/convert.py @@ -439,7 +439,7 @@ def __repr__(self) -> str: def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv + n_head //= n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) .swapaxes(1, 2) .reshape(weights.shape)) @@ -701,18 +701,13 @@ def rebuild_from_type_v2(func, new_type, args, state): def find_class(self, module: str, name: str) -> Any: if not module.startswith('torch'): return super().find_class(module, name) - if (module, name) in self.CLASSES: - return self.CLASSES[(module, name)] - else: - print(f'Missing mapping for {module}.{name}') - raise KeyError + return self.CLASSES[(module, name)] def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus: zf = zipfile.ZipFile(outer_fp) pickle_paths = [name for 
name in zf.namelist() if name.endswith('.pkl')] assert len(pickle_paths) == 1, pickle_paths - print(pickle_paths) pickle_fp = zf.open(pickle_paths[0], 'r') unpickler = LazyUnpickler(pickle_fp, data_base_path=pickle_paths[0][:-4], From c28a6c5ba0c25b70fc9d6796233a98e8d10194eb Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Thu, 28 Sep 2023 22:18:56 -0700 Subject: [PATCH 09/22] remove printing logic from ggml.c --- ggml.c | 83 ---------------------------------------------------------- 1 file changed, 83 deletions(-) diff --git a/ggml.c b/ggml.c index 8befffa675de5..072078806cf17 100644 --- a/ggml.c +++ b/ggml.c @@ -9079,12 +9079,6 @@ static void ggml_compute_forward_dup( struct ggml_tensor * dst) { if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { ggml_compute_forward_dup_same_cont(params, src0, dst); - if (strncmp(src0->name, "printme_tmp_", 12) == 0 && params->ith == 0) { - GGML_PRINT("\noutputs of dupe for %s\n", src0->name); - ggml_print_tensor(dst); - int starts[] = {0, 0, 0, 0}; - ggml_print_tensor_values(dst, starts, 0, 10); - } return; } switch (src0->type) { @@ -9101,12 +9095,6 @@ static void ggml_compute_forward_dup( GGML_ASSERT(false); } break; } - if (strncmp(src0->name, "printme_tmp_", 12) == 0 && params->ith == 0) { - GGML_PRINT("\noutputs of dupe for %s\n", src0->name); - ggml_print_tensor(dst); - int starts[] = {0, 0, 0, 0}; - ggml_print_tensor_values(dst, starts, 0, 10); - } } // ggml_compute_forward_add @@ -9182,14 +9170,6 @@ static void ggml_compute_forward_add_f32( } } } - if ((strncmp(src0->name, "printme", 7) == 0 - ||strncmp(src1->name, "printme", 7) == 0) - && params->ith == 0) { - GGML_PRINT("\noutputs of add: %s + %s\n", src0->name, src1->name); - ggml_print_tensor(dst); - int starts[] = {0, 0, 0, 0}; - ggml_print_tensor_values(dst, starts, 0, 10); - } } static void ggml_compute_forward_add_f16_f32( @@ -11226,13 +11206,6 @@ static void ggml_compute_forward_norm_f32( GGML_ASSERT(src0->nb[0] == sizeof(float)); // If the name starts with "layer_inputs", and we are on thread 0, print the tensor - if (strncmp(src0->name, "printme", 7) == 0 - && params->ith == 0) { - GGML_PRINT("\nlayernorm inputs for %s\n", src0->name); - ggml_print_tensor(src0); - int starts[] = {0, 1, 0}; - ggml_print_tensor_values(src0, starts, 0, 10); - } const int ith = params->ith; const int nth = params->nth; @@ -11656,44 +11629,7 @@ static void ggml_compute_forward_mul_mat( struct ggml_tensor * dst) { int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - if ((strncmp(src0->name, "printme", 7) == 0 || - strncmp(src1->name, "printme", 7) == 0) - && params->ith == 0) { - GGML_PRINT("\nInputs to matmul: %s\n", src1->name); - size_t offset = 0;//(src1->ne[0] * src1->ne[1]) - size_t x = src1->ne[0]; - size_t y = src1->ne[1]; - for (int i=0; i < x * y; ++i) { - if (i % x == 0) { - GGML_PRINT("\n"); - } - if (i % x < 4) { - GGML_PRINT(" %f ", ((float *)src1->data)[i + offset]); - } - } - GGML_PRINT("\n"); - /* - GGML_PRINT("\nInputs to matmul: %s\n", src0->name); - ggml_print_tensor(src0); - if (src0->type == GGML_TYPE_F16) { - for (int i=0; i < src0->ne[0] * src0->ne[1]; ++i) { - if (i % src0->ne[0] == 0) { - GGML_PRINT("\n"); - } - GGML_PRINT(" %f", ((ggml_fp16_t *) src0->data)[i]); - } - } - GGML_PRINT("\n"); - */ - - } - -<<<<<<< HEAD - GGML_TENSOR_BINARY_OP_LOCALS; - // If on thread 0, src1 starts with KQ_softmax, print -======= GGML_TENSOR_BINARY_OP_LOCALS ->>>>>>> bc39553c901a91cfcb757863586250838c83eeab const int ith = params->ith; const int nth = params->nth; 
@@ -12229,12 +12165,6 @@ static void ggml_compute_forward_scale_f32( } ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v); } - if (strncmp(src0->name, "printme", 7) == 0 && params->ith == 0) { - GGML_PRINT("\nInputs of scale: %s\n", dst->name); - ggml_print_tensor(src0); - int starts[4] = {0, 0, 0, 0}; - ggml_print_tensor_values(src0, starts, 0, 32); - } } static void ggml_compute_forward_scale( @@ -13245,12 +13175,6 @@ static void ggml_compute_forward_rope_f32( if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - if (strncmp(src0->name, "printme", 7) == 0 && params->ith == 0) { - GGML_PRINT("\ninputs of RoPE for %s\n", src0->name); - ggml_print_tensor(src0); - int starts[] = {0, 0, 1, 0}; - ggml_print_tensor_values(src0, starts, 0, 10); - } float freq_base; float freq_scale; @@ -13379,13 +13303,6 @@ static void ggml_compute_forward_rope_f32( } } } - if (strncmp(src0->name, "printme", 7) == 0 && params->ith == 0) { - GGML_PRINT("\n dest at RoPE time for %s\n", src0->name); - // print shape and strides - int starts[3] = {0,0,1}; - ggml_print_tensor(dst); - ggml_print_tensor_values(dst, starts, 0, 10); - } } static void ggml_compute_forward_rope_f16( From 47dcb9fcf54105d6500c973bdbe20e941abb1e66 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Thu, 28 Sep 2023 22:21:00 -0700 Subject: [PATCH 10/22] remove prints from llama.cpp & fix merge --- llama.cpp | 102 ++++-------------------------------------------------- 1 file changed, 6 insertions(+), 96 deletions(-) diff --git a/llama.cpp b/llama.cpp index 3a9d706f8221e..20feae50d524a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2633,10 +2633,6 @@ static struct ggml_cgraph * llm_build_llama( const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? 
n_ctx - n_tokens : kv_self.head; - LLAMA_LOG_INFO("n_kv = %d\n", n_kv); - LLAMA_LOG_INFO("n_tokens = %d\n", n_tokens); - LLAMA_LOG_INFO("n_ctx = %d\n", n_ctx); - LLAMA_LOG_INFO("kvself.n = %d\n", kv_self.n); const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; @@ -2875,7 +2871,6 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); offload_func_v(KQ_soft_max); ggml_set_name(KQ_soft_max, "KQ_soft_max"); - //ggml_set_name(KQ_soft_max, format("printme_KQ_soft_max_%d", il).c_str()); // split cached V into n_head heads struct ggml_tensor * V = @@ -4017,19 +4012,6 @@ static struct ggml_cgraph * llm_build_starcoder( return gf; } -static void log_tensor( - ggml_tensor * a -) { - LLAMA_LOG_INFO("Shape of %s is ", a->name); - for (int i = 0; i < a->n_dims; ++i) { - LLAMA_LOG_INFO("%d", a->ne[i]); - if (i < a->n_dims - 1) { - LLAMA_LOG_INFO(","); - } - LLAMA_LOG_INFO(" "); - } - LLAMA_LOG_INFO("\n"); -} static struct ggml_cgraph * llm_build_persimmon( llama_context & lctx, @@ -4042,31 +4024,24 @@ static struct ggml_cgraph * llm_build_persimmon( GGML_ASSERT(!!kv_self.ctx); + const auto & cparams = lctx.cparams; const int64_t n_embd = hparams.n_embd; const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = hparams.n_ctx; + const int64_t n_ctx = cparams.n_ctx; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_head = hparams.n_head; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); - const float freq_base = hparams.rope_freq_base; - const float freq_scale = hparams.rope_freq_scale; - const float norm_eps = 1e-5f; + const float freq_base = cparams.rope_freq_base; + const float freq_scale = cparams.rope_freq_scale; + + float norm_eps = hparams.f_norm_eps < 0 ? 1e-5f : hparams.f_norm_eps; const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? 
n_ctx - n_tokens : kv_self.head; const size_t n_rot = n_embd_head / 2; - /* - printf("\nnorm_eps is %f\n", norm_eps); - printf("freq_base is %f\n", freq_base); - LLAMA_LOG_INFO("n_kv = %d\n", n_kv); - LLAMA_LOG_INFO("n_tokens = %d\n", n_tokens); - LLAMA_LOG_INFO("n_ctx = %d\n", n_ctx); - LLAMA_LOG_INFO("kvself.n = %d\n", kv_self.n); - */ - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; auto & buf_compute = lctx.buf_compute; @@ -4091,13 +4066,6 @@ static struct ggml_cgraph * llm_build_persimmon( memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); } ggml_set_name(inp_tokens, "inp_tokens"); - /* - LLAMA_LOG_INFO("\ninp_tokens: ["); - for (int i = 0; i < n_tokens; ++i) { - LLAMA_LOG_INFO("%d, ", batch.token[i]); - } - LLAMA_LOG_INFO("]\n"); - */ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); @@ -4165,21 +4133,16 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_build_forward_expand(gf, tmp); } } - //LLAMA_LOG_INFO("Entering n_layers loop\n", __func__); for (int il=0; il < n_layer; ++il) { - //ggml_format_name(inpL, "printme_layer_input_%d", il); struct ggml_tensor * residual = ggml_dup(ctx0, inpL); { - //ggml_format_name(inpL, "printme_inputs_%d", il); cur = ggml_norm(ctx0, inpL, norm_eps); cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - //ggml_format_name(cur, "printme_layernorm_outputs%d", il); cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); ggml_format_name(cur, "input_layernorm_%d", il); } // self attention { - //log_tensor(cur); cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); ggml_format_name(cur, "qkv_preadd_%d", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -4211,12 +4174,10 @@ static struct ggml_cgraph * llm_build_persimmon( )); tmpq = ggml_norm(ctx0, tmpq, norm_eps); tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); - //ggml_format_name(tmpq, "printme_tmpq_%d", il); tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); tmpk = ggml_norm(ctx0, tmpk, norm_eps); tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); - //ggml_format_name(tmpk, "printme_tmpk_%d", il); tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d( @@ -4231,7 +4192,6 @@ static struct ggml_cgraph * llm_build_persimmon( /* nb2 = */ ggml_element_size(tmpk) * n_embd_head * n_head, /* offset = */ 0 ); - //ggml_format_name(krottmp, "printme_krottmp_%d", il); struct ggml_tensor * krot = ggml_cont(ctx0, krottmp); // get the second half of tmpq, e.g tmpq[n_rot:, :, :] struct ggml_tensor * qpass = ggml_cont(ctx0, ggml_view_3d( @@ -4247,7 +4207,6 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpk) * n_rot )); ggml_set_name(qrot, format("qrot_%d", il).c_str()); - //ggml_set_name(krot, format("printme_krot_%d", il).c_str()); ggml_set_name(qpass, format("qpass_%d", il).c_str()); ggml_set_name(kpass, format("kpass_%d", il).c_str()); @@ -4272,7 +4231,6 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_concat(ctx0, qrotated, qpass), 2, 1, 0, 3)); struct ggml_tensor * tmp = ggml_permute(ctx0, ggml_concat(ctx0, krotated, kpass), 2, 1, 0, 3); - //ggml_format_name(tmp, "printme_tmp_%d", il); struct ggml_tensor * Kcur = ggml_cont(ctx0, tmp); ggml_set_name(Qcur, format("Qcur_%d", il).c_str()); // kcur appears healthy. 
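A distinctive part of this graph, visible in the hunk above, is the Q/K layernorm: after the fused QKV projection is split, the query and key tensors are layer-normalised per head (ggml_norm, then a multiply by attn_q_norm / attn_k_norm and an add of the matching bias) before any rotary embedding is applied. A rough numpy sketch of that step, for illustration only; the function name is a stand-in, the shape convention here is (n_tokens, n_head, head_dim) rather than ggml's fastest-first layout, and eps follows norm_eps in the surrounding code (1e-5 unless f_norm_eps overrides it):

    import numpy as np

    def qk_layernorm(x, weight, bias, eps=1e-5):
        # x: (n_tokens, n_head, head_dim); normalise over head_dim,
        # then apply the learned per-dimension scale and shift.
        mu = x.mean(axis=-1, keepdims=True)
        var = x.var(axis=-1, keepdims=True)
        return (x - mu) / np.sqrt(var + eps) * weight + bias

Both tmpq and tmpk pass through this normalisation before the first n_rot dimensions are rotated.
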
@@ -4295,41 +4253,16 @@ static struct ggml_cgraph * llm_build_persimmon( // important: storing RoPE-ed version of K in the KV cache! ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - /* - struct ggml_tensor * Vcur = ggml_cont(ctx0, - ggml_transpose( - ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd, n_tokens) - )); - ggml_set_name(Vcur, "Vcur"); - struct ggml_tensor * k = ggml_view_1d( - ctx0, kv_self.k, n_tokens*n_embd, - (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + kv_head) - ); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - */ } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); ggml_set_name(Q, "Q"); - //log_tensor(Q); - // For some reason this is all zeros and no balls... struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, n_embd_head, n_kv, n_head_kv, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - //ggml_format_name(K, "printme_K_%d", il); - //log_tensor(K); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - //ggml_set_name(KQ, "KQ"); - //ggml_format_name(KQ, "printme_KQ_%d", il); struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); ggml_set_name(KQ_scaled, "KQ_scaled"); @@ -4337,7 +4270,6 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_set_name(KQ_masked, "KQ_masked"); struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - //ggml_set_name(KQ_soft_max, format("printme_KQ_soft_max_%d", il).c_str()); struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, @@ -4357,7 +4289,6 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_set_name(cur, "KQV_merged_contiguous"); cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - //ggml_format_name(cur, "printme_wo_%d", il); cur = ggml_add(ctx0, cur, model.layers[il].bo); ggml_set_name(cur, "result_wo"); } @@ -4376,7 +4307,6 @@ static struct ggml_cgraph * llm_build_persimmon( cur = ggml_add(ctx0, cur, model.layers[il].b3); cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur)); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - //ggml_format_name(cur, "printme_ffn_down_%d", il); struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, model.layers[il].b2); @@ -4389,7 +4319,6 @@ static struct ggml_cgraph * llm_build_persimmon( { cur = ggml_norm(ctx0, cur, norm_eps); cur = ggml_mul(ctx0, cur, model.output_norm); - //ggml_set_name(cur, "printme_final"); cur = ggml_add(ctx0, cur, model.output_norm_b); ggml_set_name(cur, "result_norm"); } @@ -7166,12 +7095,6 @@ struct llama_context * llama_new_context_with_model( LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } -<<<<<<< HEAD - const auto & hparams = ctx->model.hparams; - - //LLAMA_LOG_INFO("hg\n", __func__); -======= ->>>>>>> bc39553c901a91cfcb757863586250838c83eeab // resized during inference if (params.logits_all) { ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab); @@ -7198,25 +7121,12 @@ struct llama_context * llama_new_context_with_model( ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); #ifdef GGML_USE_METAL -<<<<<<< HEAD - if (false) { - if (params.n_gpu_layers > 0) { - ctx->ctx_metal = 
ggml_metal_init(1); - if (!ctx->ctx_metal) { - LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__); - llama_free(ctx); - return NULL; - } - ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); - ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); -======= if (model->n_gpu_layers > 0) { ctx->ctx_metal = ggml_metal_init(1); if (!ctx->ctx_metal) { LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__); llama_free(ctx); return NULL; ->>>>>>> bc39553c901a91cfcb757863586250838c83eeab } ggml_metal_log_set_callback(llama_log_callback_default, NULL); //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); From d904aff0403bb222b72880856e480e6684636c2f Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Thu, 28 Sep 2023 22:36:23 -0700 Subject: [PATCH 11/22] trivial cleanups --- ggml.c | 58 ++------------------------------------------ gguf-py/gguf/gguf.py | 20 +++++++-------- llama.cpp | 1 - 3 files changed, 12 insertions(+), 67 deletions(-) diff --git a/ggml.c b/ggml.c index 072078806cf17..3dda7547f6f30 100644 --- a/ggml.c +++ b/ggml.c @@ -4345,50 +4345,6 @@ void ggml_print_objects(const struct ggml_context * ctx) { GGML_PRINT("%s: --- end ---\n", __func__); } -static void ggml_print_tensor(const struct ggml_tensor * tensor) { - GGML_PRINT("Tensor (null): %s | rank %d | shape (", ggml_type_name(tensor->type), tensor->n_dims); - for (int i=0; in_dims; ++i) { - GGML_PRINT("%lld ", tensor->ne[i]); - } - GGML_PRINT(") | strides ("); - for (int i=0; in_dims; ++i) { - GGML_PRINT("%lld ", tensor->nb[i]); - } - GGML_PRINT(")\n"); -} - -static void ggml_print_tensor_values(const struct ggml_tensor * tensor, int starts[], int dim, int nelts) { - GGML_ASSERT(tensor->type == GGML_TYPE_F32); - GGML_PRINT("Printing values for tensor %s[", tensor->name); - for (int i=0; in_dims; ++i) { - GGML_ASSERT(starts[i] >= 0); - if (i == dim) { - if (starts[i] > 0) { - GGML_PRINT("%d:%d", starts[i], starts[i]+nelts); - } else { - GGML_PRINT(":%d", starts[i]+nelts); - } - } else { - GGML_PRINT("%d", starts[i]); - } - if (in_dims-1) { - GGML_PRINT(","); - } - } - GGML_PRINT("]\n"); - float *data_ptr = (float *) tensor->data; - int offset = 0; - for (int j = 0; j < tensor->n_dims; j++) { - offset += (starts[j] * tensor->nb[j]) / ggml_type_size(GGML_TYPE_F32); - } - data_ptr += offset; - for (int i = 0; i < nelts; i++) { - GGML_PRINT("%f ", *data_ptr); - data_ptr += tensor->nb[dim] / ggml_type_size(GGML_TYPE_F32); - } - GGML_PRINT("\n"); -} - int64_t ggml_nelements(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); @@ -6442,7 +6398,6 @@ struct ggml_tensor * ggml_mul_mat( const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); - //GGML_PRINT("ggml_mul_mat result shape : (%lld, %lld, %lld, %lld)\n", ne[0], ne[1], ne[2], ne[3]); result->op = GGML_OP_MUL_MAT; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -11205,7 +11160,6 @@ static void ggml_compute_forward_norm_f32( } GGML_ASSERT(src0->nb[0] == sizeof(float)); - // If the name starts with "layer_inputs", and we are on thread 0, print the tensor const int ith = params->ith; const int nth = params->nth; @@ -12322,16 +12276,8 @@ static void ggml_compute_forward_view( const struct ggml_compute_params * params, const struct ggml_tensor * src0) { // NOP - if (strncmp(src0->name, "cache_k", 7) == 0 && params->ith == 0) { - /* - GGML_PRINT("\noutputs of cache_k for view%s\n", src0->name); - ggml_print_tensor(src0); - int starts[] = {4096 * }; - ggml_print_tensor_values(src0, starts, 0, 10); - */ - } - //UNUSED(params); - //UNUSED(src0); + UNUSED(params); + UNUSED(src0); } // ggml_compute_forward_permute diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 6bb139c9ddb2b..882b96bc61ba6 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -234,7 +234,7 @@ class TensorNameMap: "transformer.word_embeddings", # falcon "model.embed_tokens", # llama-hf "tok_embeddings", # llama-pth - "language_model.embedding.word_embeddings", # adept + "language_model.embedding.word_embeddings", # persimmon ), # Position embeddings @@ -247,7 +247,7 @@ class TensorNameMap: "embed_out", # gptneox "lm_head", # gpt2 mpt falcon llama-hf baichuan "output", # llama-pth - "word_embeddings_for_head", # adept + "word_embeddings_for_head", # persimmon ), # Output norm @@ -256,7 +256,7 @@ class TensorNameMap: "transformer.ln_f", # gpt2 falcon "model.norm", # llama-hf baichuan "norm", # llama-pth - "language_model.encoder.final_layernorm", # adept + "language_model.encoder.final_layernorm", # persimmon ), # Rope frequencies @@ -275,7 +275,7 @@ class TensorNameMap: "transformer.h.{bid}.ln_mlp", # falcon40b "model.layers.{bid}.input_layernorm", # llama-hf "layers.{bid}.attention_norm", # llama-pth - "language_model.encoder.layers.{bid}.input_layernorm", # adept + "language_model.encoder.layers.{bid}.input_layernorm", # persimmon ), # Attention norm 2 @@ -289,7 +289,7 @@ class TensorNameMap: "transformer.h.{bid}.attn.c_attn", # gpt2 "transformer.blocks.{bid}.attn.Wqkv", # mpt "transformer.h.{bid}.self_attention.query_key_value", # falcon - "language_model.encoder.layers.{bid}.self_attention.query_key_value", # adept + "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon ), # Attention query @@ -318,7 +318,7 @@ class TensorNameMap: "transformer.h.{bid}.self_attention.dense", # falcon "model.layers.{bid}.self_attn.o_proj", # llama-hf "layers.{bid}.attention.wo", # llama-pth - "language_model.encoder.layers.{bid}.self_attention.dense" # adept + "language_model.encoder.layers.{bid}.self_attention.dense" # persimmon ), # Rotary embeddings @@ -334,7 +334,7 @@ class TensorNameMap: "transformer.blocks.{bid}.norm_2", # mpt "model.layers.{bid}.post_attention_layernorm", # llama-hf "layers.{bid}.ffn_norm", # llama-pth - "language_model.encoder.layers.{bid}.post_attention_layernorm", # adept + "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon ), # Feed-forward up @@ -345,7 +345,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon "model.layers.{bid}.mlp.up_proj", # llama-hf "layers.{bid}.feed_forward.w3", # llama-pth - "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # adept + "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon ), # Feed-forward gate @@ -362,7 +362,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.dense_4h_to_h", # 
falcon "model.layers.{bid}.mlp.down_proj", # llama-hf "layers.{bid}.feed_forward.w2", # llama-pth - "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # adept + "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon ), MODEL_TENSOR.ATTN_Q_NORM: ( @@ -374,7 +374,7 @@ class TensorNameMap: ), MODEL_TENSOR.ROPE_FREQS: ( - "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # adept + "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon ) } diff --git a/llama.cpp b/llama.cpp index 20feae50d524a..cebb5b6ca7f1e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7088,7 +7088,6 @@ struct llama_context * llama_new_context_with_model( llama_free(ctx); return nullptr; } - LLAMA_LOG_INFO("Kv self cache: %7.2f MB\n", ggml_nbytes(ctx->kv_self.k) / 1024.0 / 1024.0); { const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); From ec0ce978ff1f43beb3cc9936b1bc7370cd8fcf5c Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Fri, 29 Sep 2023 14:17:39 -0700 Subject: [PATCH 12/22] Add offload funcs --- llama.cpp | 229 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 157 insertions(+), 72 deletions(-) diff --git a/llama.cpp b/llama.cpp index cebb5b6ca7f1e..ad9ce2ceee069 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4032,17 +4032,19 @@ static struct ggml_cgraph * llm_build_persimmon( const int64_t n_head = hparams.n_head; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const size_t n_rot = n_embd_head / 2; const float freq_base = cparams.rope_freq_base; const float freq_scale = cparams.rope_freq_scale; - float norm_eps = hparams.f_norm_eps < 0 ? 1e-5f : hparams.f_norm_eps; + float norm_eps = 1e-5f;//: hparams.f_norm_eps; + LLAMA_LOG_INFO("norm_eps: %f\n", hparams.f_norm_eps); const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? 
n_ctx - n_tokens : kv_self.head; - const size_t n_rot = n_embd_head / 2; - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + + const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { @@ -4066,6 +4068,11 @@ static struct ggml_cgraph * llm_build_persimmon( memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); } ggml_set_name(inp_tokens, "inp_tokens"); + LLAMA_LOG_INFO("Input tokens are: ["); + for (int i = 0; i < n_tokens; ++i) { + LLAMA_LOG_INFO("%d, ", batch.token[i]); + } + LLAMA_LOG_INFO("]\n"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); @@ -4074,6 +4081,9 @@ static struct ggml_cgraph * llm_build_persimmon( memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); } } + offload_func_t offload_func_nr = llama_nop; // nr = non-repeating + offload_func_t offload_func_kq = llama_nop; + offload_func_t offload_func_v = llama_nop; // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); ggml_allocr_alloc(lctx.alloc, KQ_scale); @@ -4082,8 +4092,10 @@ static struct ggml_cgraph * llm_build_persimmon( } ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + offload_func_kq(KQ_mask); ggml_set_name(KQ_mask, "KQ_mask"); ggml_allocr_alloc(lctx.alloc, KQ_mask); + if (!ggml_allocr_is_measure(lctx.alloc)) { float * data = (float *) KQ_mask->data; memset(data, 0, ggml_nbytes(KQ_mask)); @@ -4101,6 +4113,7 @@ static struct ggml_cgraph * llm_build_persimmon( } struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + offload_func_kq(KQ_pos); ggml_set_name(KQ_pos, "KQ_pos"); ggml_allocr_alloc(lctx.alloc, KQ_pos); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -4110,8 +4123,8 @@ static struct ggml_cgraph * llm_build_persimmon( } } if (do_rope_shift) { - LLAMA_LOG_INFO("do_rope_shift...?\n"); struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + offload_func_kq(K_shift); ggml_set_name(K_shift, "K_shift"); ggml_allocr_alloc(lctx.alloc, K_shift); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -4122,154 +4135,195 @@ static struct ggml_cgraph * llm_build_persimmon( } for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = + // we rotate only the first n_rot dimensions. 
ggml_rope_custom_inplace(ctx0, ggml_view_3d(ctx0, kv_self.k, n_rot, n_head, n_ctx, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)// + n_rot) + ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il) ), K_shift, n_rot, 2, 0, freq_base, freq_scale); + offload_func_kq(tmp); ggml_build_forward_expand(gf, tmp); } } for (int il=0; il < n_layer; ++il) { - struct ggml_tensor * residual = ggml_dup(ctx0, inpL); + struct ggml_tensor * residual = inpL; + offload_func_t offload_func = llama_nop; { cur = ggml_norm(ctx0, inpL, norm_eps); + offload_func(cur); cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); + offload_func(cur); cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); + offload_func(cur); ggml_format_name(cur, "input_layernorm_%d", il); } // self attention { cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - ggml_format_name(cur, "qkv_preadd_%d", il); + offload_func_kq(cur); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + offload_func_kq(cur); // split qkv GGML_ASSERT(n_head_kv == n_head); ggml_set_name(cur, format("qkv_%d", il).c_str()); struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens); - // get it to (d_h, n_head, L, 3) + offload_func_kq(tmpqkv); struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); + offload_func_kq(tmpqkv_perm); ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il); - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d( + struct ggml_tensor * tmpq = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, ggml_element_size(tmpqkv_perm) * n_embd_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, 0 - )); + ); + offload_func_kq(tmpq); struct ggml_tensor * tmpk = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, ggml_element_size(tmpqkv_perm) * n_embd_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens ); - struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_3d( - ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - ggml_element_size(tmpqkv_perm) * n_embd_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 - )); + offload_func_kq(tmpk); + // Q/K Layernorm tmpq = ggml_norm(ctx0, tmpq, norm_eps); + offload_func_kq(tmpq); tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); + offload_func_kq(tmpq); tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); + offload_func_kq(tmpq); tmpk = ggml_norm(ctx0, tmpk, norm_eps); + offload_func_v(tmpk); tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); + offload_func_v(tmpk); tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); + offload_func_v(tmpk); - struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d( + // RoPE the first n_rot of q/k, pass the other half, and concat. 
+ struct ggml_tensor * qrot = ggml_view_3d( ctx0, tmpq, n_rot, n_head, n_tokens, ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head * n_head, 0 - )); - struct ggml_tensor * krottmp = ggml_view_3d( + ); + offload_func_kq(qrot); + ggml_format_name(qrot, "qrot_%d", il); + struct ggml_tensor * krot = ggml_view_3d( ctx0, tmpk, n_rot, n_head, n_tokens, - /* nb1 = */ ggml_element_size(tmpk) * n_embd_head, - /* nb2 = */ ggml_element_size(tmpk) * n_embd_head * n_head, - /* offset = */ 0 + ggml_element_size(tmpk) * n_embd_head, + ggml_element_size(tmpk) * n_embd_head * n_head, + 0 ); - struct ggml_tensor * krot = ggml_cont(ctx0, krottmp); + offload_func_kq(krot); + ggml_format_name(krot, "krot_%d", il); + // get the second half of tmpq, e.g tmpq[n_rot:, :, :] - struct ggml_tensor * qpass = ggml_cont(ctx0, ggml_view_3d( + struct ggml_tensor * qpass = ggml_view_3d( ctx0, tmpq, n_rot, n_head, n_tokens, ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head * n_head, ggml_element_size(tmpq) * n_rot - )); - struct ggml_tensor * kpass = ggml_cont(ctx0, ggml_view_3d( + ); + offload_func_kq(qpass); + ggml_format_name(qpass, "qpass_%d", il); + struct ggml_tensor * kpass = ggml_view_3d( ctx0, tmpk, n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, ggml_element_size(tmpk) * n_rot - )); - ggml_set_name(qrot, format("qrot_%d", il).c_str()); - ggml_set_name(qpass, format("qpass_%d", il).c_str()); - ggml_set_name(kpass, format("kpass_%d", il).c_str()); + ); + offload_func_kq(kpass); + ggml_format_name(kpass, "kpass_%d", il); - struct ggml_tensor * qrotated = ggml_cont(ctx0, ggml_permute(ctx0, - ggml_rope_custom( + struct ggml_tensor * qrotated = ggml_rope_custom( ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale - ), - 2, 1, 0, 3 - )); - qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3)); - - struct ggml_tensor * krotated = ggml_cont(ctx0, ggml_permute(ctx0, - ggml_rope_custom( + ); + offload_func_kq(qrotated); + struct ggml_tensor * krotated = ggml_rope_custom( ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale - ), - 2, 1, 0, 3 - )); + ); + offload_func_kq(krotated); + // ggml currently only supports concatenation on dim=2 + // so we need to permute qrot, qpass, concat, then permute back. + qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3)); + offload_func_kq(qrotated); + krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3)); + offload_func_kq(krotated); + + qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3)); + offload_func_kq(qpass); kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3)); + offload_func_kq(kpass); - struct ggml_tensor * Qcur = ggml_cont(ctx0, - ggml_permute(ctx0, - ggml_concat(ctx0, qrotated, qpass), - 2, 1, 0, 3)); - struct ggml_tensor * tmp = ggml_permute(ctx0, ggml_concat(ctx0, krotated, kpass), 2, 1, 0, 3); - struct ggml_tensor * Kcur = ggml_cont(ctx0, tmp); - ggml_set_name(Qcur, format("Qcur_%d", il).c_str()); - // kcur appears healthy. 
- ggml_set_name(Kcur, format("Kcur_%d", il).c_str()); + struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass); + offload_func_kq(Qcur); + struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass); + offload_func_kq(Kcur); + + struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3)); + offload_func_kq(Q); + + Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); + offload_func_kq(Kcur); { + struct ggml_tensor * tmpv = ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, + ggml_element_size(tmpqkv_perm) * n_embd_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 + ); + offload_func_v(tmpv); + // store K, V in cache struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); + offload_func_v(Vcur); ggml_set_name(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d( ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head) ); + offload_func_kq(k); ggml_set_name(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + offload_func_v(v); ggml_set_name(v, "v"); // important: storing RoPE-ed version of K in the KV cache! ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - ggml_set_name(Q, "Q"); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, n_embd_head, n_kv, n_head_kv, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + offload_func_kq(K); + ggml_format_name(K, "K_%d", il); + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + ggml_set_name(KQ, "KQ"); + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); + offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); + offload_func_kq(KQ_masked); ggml_set_name(KQ_masked, "KQ_masked"); struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + offload_func_kq(KQ_soft_max); + ggml_set_name(KQ_soft_max, "KQ_soft_max"); struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, @@ -4277,49 +4331,80 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + offload_func_v(V); ggml_set_name(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); ggml_set_name(KQV, "KQV"); struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); ggml_set_name(KQV_merged, "KQV_merged"); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); + offload_func_v(cur); ggml_set_name(cur, "KQV_merged_contiguous"); cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + offload_func(cur); cur = ggml_add(ctx0, cur, model.layers[il].bo); + offload_func(cur); ggml_set_name(cur, "result_wo"); } - cur = ggml_add(ctx0, residual, cur); - struct ggml_tensor * residual2 = ggml_dup(ctx0, cur); - ggml_set_name(residual2, "residual2"); - // Norm - { - cur = ggml_norm(ctx0, cur, norm_eps); + + struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur); + 
offload_func(inpFF); + ggml_set_name(inpFF, "inpFF"); + { + // MLP + { + // Norm + cur = ggml_norm(ctx0, inpFF, norm_eps); + offload_func(cur); + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, model.layers[il].ffn_norm), + model.layers[il].ffn_norm_b + ); + ggml_set_name(cur, "ffn_norm"); + offload_func(cur); + } + cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); + offload_func(cur); + + cur = ggml_add(ctx0, cur, model.layers[il].b3); + offload_func(cur); + ggml_set_name(cur, "result_ffn_up"); + + cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur)); + ggml_set_name(cur, "result_ffn_act"); + offload_func(cur); + offload_func(cur->src[0]); + + cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); + offload_func(cur); cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.layers[il].ffn_norm), - model.layers[il].ffn_norm_b - ); + cur, + model.layers[il].b2); + offload_func(cur); + ggml_set_name(cur, "outFF"); } - cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - cur = ggml_add(ctx0, cur, model.layers[il].b3); - cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur)); - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - struct ggml_tensor * ffn_out = ggml_add(ctx0, - cur, - model.layers[il].b2); - ggml_format_name(ffn_out, "pre_residual2_%d", il); - cur = ggml_add(ctx0, ffn_out, residual2); - ggml_set_name(cur, "inpFF_+_attn_out"); + cur = ggml_add(ctx0, cur, inpFF); + offload_func(cur); + ggml_set_name(cur, "inpFF_+_outFF"); inpL = cur; } cur = inpL; { cur = ggml_norm(ctx0, cur, norm_eps); + offload_func_nr(cur); cur = ggml_mul(ctx0, cur, model.output_norm); + offload_func_nr(cur); + + ggml_set_name(cur, "printme_final"); cur = ggml_add(ctx0, cur, model.output_norm_b); + // offload_func_nr(cur); + ggml_set_name(cur, "result_norm"); } cur = ggml_mul_mat(ctx0, model.output, cur); From 3db04db2b839cd16b79ca76e213e4e9da22a25bb Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Fri, 29 Sep 2023 14:59:51 -0700 Subject: [PATCH 13/22] update conversion script to directly take adept artifacts rather than .saftensors file --- ...to-gguf.py => convert-persimmon-to-gguf.py | 80 +++++++++---------- 1 file changed, 39 insertions(+), 41 deletions(-) rename convert-persimmon-st-to-gguf.py => convert-persimmon-to-gguf.py (65%) diff --git a/convert-persimmon-st-to-gguf.py b/convert-persimmon-to-gguf.py similarity index 65% rename from convert-persimmon-st-to-gguf.py rename to convert-persimmon-to-gguf.py index f8fcbb4bdaae5..25c8a5963ba19 100644 --- a/convert-persimmon-st-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -1,28 +1,31 @@ -from convert import lazy_load_safetensors_file -import sys import torch -from safetensors import safe_open -from pathlib import Path +import os from pprint import pprint -from sentencepiece import SentencePieceProcessor +import sys import argparse +from pathlib import Path +from sentencepiece import SentencePieceProcessor +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) import gguf -import json -import struct - -def file_is_safetensors(path: Path) -> bool: - fp = open(path, 'rb') - first8 = fp.read(8) - fp.seek(0) - if first8[:2] == b'PK': - # A zip file, i.e. 
PyTorch format - return False - return struct.unpack(' None: - args = get_args() - assert file_is_safetensors(args.model), 'Error: model file is not a SafeTensors file' - dir_model = args.model.parent - with open(dir_model / 'config.json', 'r') as f: - hparams = json.load(f) arch = gguf.MODEL_ARCH.PERSIMMON gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch]) - block_count = hparams['num_layers'] - head_count = hparams['num_attention_heads'] + block_count = hparams.num_layers + head_count = hparams.num_attention_heads head_count_kv = head_count - ctx_length = hparams['seq_length'] - hidden_size = hparams['hidden_size'] + ctx_length = hparams.seq_length + hidden_size = hparams.hidden_size gguf_writer.add_name('persimmon-8b-chat') gguf_writer.add_context_length(ctx_length) gguf_writer.add_embedding_length(hidden_size) gguf_writer.add_block_count(block_count) - gguf_writer.add_feed_forward_length(hparams['ffn_hidden_size']) + gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size) gguf_writer.add_rope_dimension_count(hidden_size // head_count) gguf_writer.add_head_count(head_count) gguf_writer.add_head_count_kv(head_count_kv) - gguf_writer.add_rope_freq_base(hparams['rotary_emb_base']) - gguf_writer.add_layer_norm_eps(hparams['layernorm_epsilon']) - tokens, scores, toktypes = get_tokenizer_info(dir_model) + gguf_writer.add_rope_freq_base(hparams.rotary_emb_base) + gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon) + tokens, scores, toktypes = get_tokenizer_info(args.model_dir) gguf_writer.add_tokenizer_model('llama') gguf_writer.add_token_list(tokens) gguf_writer.add_token_scores(scores) @@ -98,10 +100,6 @@ def main() -> None: tensor_map = gguf.get_tensor_name_map(arch, block_count) print(tensor_map) - tensors = {} - with safe_open(args.model, framework="pt") as f: - for k in f.keys(): - tensors[k] = f.get_tensor(k) for name in tensors.keys(): data = tensors[name] if name.endswith(".self_attention.rotary_emb.inv_freq"): @@ -132,4 +130,4 @@ def main() -> None: if __name__ == '__main__': - main() + main() \ No newline at end of file From f28f52c6d0b6875da79720017b46e79a50d9cf77 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Fri, 29 Sep 2023 15:25:25 -0700 Subject: [PATCH 14/22] Fix norm eps bug --- llama.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/llama.cpp b/llama.cpp index ad9ce2ceee069..f7bd2ed9d3342 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1939,6 +1939,14 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_PERSIMMON: + { + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + switch (hparams.n_layer) { + case 36: model.type = e_model::MODEL_8B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } default: (void)0; } @@ -4037,8 +4045,7 @@ static struct ggml_cgraph * llm_build_persimmon( const float freq_base = cparams.rope_freq_base; const float freq_scale = cparams.rope_freq_scale; - float norm_eps = 1e-5f;//: hparams.f_norm_eps; - LLAMA_LOG_INFO("norm_eps: %f\n", hparams.f_norm_eps); + float norm_eps = hparams.f_norm_eps; const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? 
n_ctx : kv_self.n; @@ -4068,11 +4075,6 @@ static struct ggml_cgraph * llm_build_persimmon( memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); } ggml_set_name(inp_tokens, "inp_tokens"); - LLAMA_LOG_INFO("Input tokens are: ["); - for (int i = 0; i < n_tokens; ++i) { - LLAMA_LOG_INFO("%d, ", batch.token[i]); - } - LLAMA_LOG_INFO("]\n"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); @@ -4401,7 +4403,6 @@ static struct ggml_cgraph * llm_build_persimmon( cur = ggml_mul(ctx0, cur, model.output_norm); offload_func_nr(cur); - ggml_set_name(cur, "printme_final"); cur = ggml_add(ctx0, cur, model.output_norm_b); // offload_func_nr(cur); From 2b565916dd322e8a984d05050df94b587920ca8a Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Sat, 30 Sep 2023 14:11:52 -0700 Subject: [PATCH 15/22] Support sqr and concat on metal, persimmon-8b-q4 runs correctly --- ggml-metal.m | 82 ++++++++++++++++++++++++++++++++++++++++-------- ggml-metal.metal | 63 +++++++++++++++++++++++++++++++++++++ llama.cpp | 6 +++- 3 files changed, 137 insertions(+), 14 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index b3c463f03ad3d..836e6e42cb07e 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -109,6 +109,8 @@ GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f32); GGML_METAL_DECL_KERNEL(cpy_f16_f16); + GGML_METAL_DECL_KERNEL(concat); + GGML_METAL_DECL_KERNEL(sqr); #undef GGML_METAL_DECL_KERNEL }; @@ -300,6 +302,8 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){ GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f32); GGML_METAL_ADD_KERNEL(cpy_f16_f16); + GGML_METAL_ADD_KERNEL(concat); + GGML_METAL_ADD_KERNEL(sqr); #undef GGML_METAL_ADD_KERNEL } @@ -375,6 +379,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(cpy_f32_f16); GGML_METAL_DEL_KERNEL(cpy_f32_f32); GGML_METAL_DEL_KERNEL(cpy_f16_f16); + GGML_METAL_DEL_KERNEL(concat); + GGML_METAL_DEL_KERNEL(sqr); #undef GGML_METAL_DEL_KERNEL @@ -744,18 +750,19 @@ void ggml_metal_graph_compute( id id_dst = dst ? 
ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); - //if (src0) { - // GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, - // ggml_is_contiguous(src0), src0->name); - //} - //if (src1) { - // GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, - // ggml_is_contiguous(src1), src1->name); - //} - //if (dst) { - // GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, - // dst->name); - //} + if (src0) { + GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + ggml_is_contiguous(src0), src0->name); + } + if (src1) { + GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + ggml_is_contiguous(src1), src1->name); + } + if (dst) { + GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + dst->name); + } +*/ switch (dst->op) { case GGML_OP_NONE: @@ -766,6 +773,45 @@ void ggml_metal_graph_compute( { // noop } break; + case GGML_OP_CONCAT: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + + int64_t nb = ne00; + [encoder setComputePipelineState:ctx->pipeline_concat]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:27]; + + const int nth = MIN(1024, ne0); + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; case GGML_OP_ADD: { GGML_ASSERT(ggml_is_contiguous(src0)); @@ -827,8 +873,8 @@ void ggml_metal_graph_compute( } break; case GGML_OP_MUL: { - GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(src0)); // utilize float4 GGML_ASSERT(ne00 % 4 == 
0); @@ -903,6 +949,16 @@ void ggml_metal_graph_compute( GGML_ASSERT(false); } } break; + case GGML_OP_SQR: + { + [encoder setComputePipelineState:ctx->pipeline_sqr]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + //GGML_METAL_LOG_WARN("%s: node %3d, op = %8s dispatching \n", __func__, i, ggml_op_name(dst->op)); + const int64_t n = ggml_nelements(dst); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; case GGML_OP_SOFT_MAX: { const int nth = MIN(32, ne00); diff --git a/ggml-metal.metal b/ggml-metal.metal index 5e1af6a092aed..b7038aa702799 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -132,6 +132,13 @@ kernel void kernel_relu( dst[tpig] = max(0.0f, src0[tpig]); } +kernel void kernel_sqr( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] * src0[tpig]; +} + constant float GELU_COEF_A = 0.044715f; constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; @@ -1091,6 +1098,62 @@ kernel void kernel_cpy_f32_f32( } } +kernel void kernel_concat( + device const char * src0, + device const char * src1, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const int64_t i03 = tgpig.z; + const int64_t i02 = tgpig.y; + const int64_t i01 = tgpig.x; + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + device const char * src0_ptr = src0 + i03 * nb03 + i02 * nb02 + i01 * nb01 + tpitg.x*nb00; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0; + + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + if (i02 < ne02) { + ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0]; + src0_ptr += ntg.x*nb00; + } else { + ((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0]; + src1_ptr += ntg.x*nb10; + } + dst_ptr += ntg.x*nb0; + } +} + //============================================ k-quants ====================================================== #ifndef QK_K diff --git a/llama.cpp b/llama.cpp index b78abee76c677..db28652198dda 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4044,8 +4044,10 @@ static struct ggml_cgraph * llm_build_persimmon( const float freq_base = cparams.rope_freq_base; const float freq_scale = cparams.rope_freq_scale; + const float norm_eps = hparams.f_norm_eps; + + const int n_gpu_layers = model.n_gpu_layers; - float norm_eps = hparams.f_norm_eps; const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? 
n_ctx : kv_self.n; @@ -4083,6 +4085,8 @@ static struct ggml_cgraph * llm_build_persimmon( memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); } } + const int i_gpu_start = n_layer - n_gpu_layers; + (void) i_gpu_start; offload_func_t offload_func_nr = llama_nop; // nr = non-repeating offload_func_t offload_func_kq = llama_nop; offload_func_t offload_func_v = llama_nop; From e6bf87f78575442161d51876d4d2e39fcfd26b6c Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Mon, 2 Oct 2023 10:21:16 -0700 Subject: [PATCH 16/22] Small changes from review --- ggml-metal.m | 29 ++++++++++++++--------------- llama.cpp | 12 +++--------- 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 836e6e42cb07e..adcb3b0f07ba1 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -750,19 +750,18 @@ void ggml_metal_graph_compute( id id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); - if (src0) { - GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, - ggml_is_contiguous(src0), src0->name); - } - if (src1) { - GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, - ggml_is_contiguous(src1), src1->name); - } - if (dst) { - GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, - dst->name); - } -*/ + //if (src0) { + // GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + // ggml_is_contiguous(src0), src0->name); + //} + //if (src1) { + // GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + // ggml_is_contiguous(src1), src1->name); + //} + //if (dst) { + // GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + // dst->name); + //} switch (dst->op) { case GGML_OP_NONE: @@ -775,8 +774,6 @@ void ggml_metal_graph_compute( } break; case GGML_OP_CONCAT: { - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(src1)); int64_t nb = ne00; [encoder setComputePipelineState:ctx->pipeline_concat]; @@ -951,6 +948,8 @@ void ggml_metal_graph_compute( } break; case GGML_OP_SQR: { + GGML_ASSERT(ggml_is_contiguous(src0)); + [encoder setComputePipelineState:ctx->pipeline_sqr]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; diff --git a/llama.cpp b/llama.cpp index db28652198dda..0f276e7f7cdd2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3037,11 +3037,9 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_init_params params = { /*.mem_size =*/ buf_compute.size, /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, + /*.no_alloc =*/ true, }; - params.no_alloc = true; - struct ggml_context * ctx0 = ggml_init(params); ggml_cgraph * gf = ggml_new_graph(ctx0); @@ -3445,11 +3443,9 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_init_params params = { /*.mem_size =*/ buf_compute.size, /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, + /*.no_alloc =*/ true, }; - params.no_alloc = true; - struct ggml_context * ctx0 = ggml_init(params); ggml_cgraph * gf = ggml_new_graph(ctx0); @@ -3805,11 +3801,9 @@ static struct ggml_cgraph * llm_build_starcoder( struct ggml_init_params params = { 
/*.mem_size =*/ buf_compute.size, /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, + /*.no_alloc =*/ true, }; - params.no_alloc = true; - struct ggml_context * ctx0 = ggml_init(params); ggml_cgraph * gf = ggml_new_graph(ctx0); From cd4d3df8207c16bc3f44d0688ffce8fd89016957 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Mon, 2 Oct 2023 10:26:39 -0700 Subject: [PATCH 17/22] Formatting changes --- ggml-metal.m | 5 ++--- gguf-py/gguf/gguf.py | 4 ++-- llama.cpp | 11 +++++------ 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index adcb3b0f07ba1..9f754fe62ab71 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -870,8 +870,8 @@ void ggml_metal_graph_compute( } break; case GGML_OP_MUL: { - GGML_ASSERT(ggml_is_contiguous(src1)); GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); // utilize float4 GGML_ASSERT(ne00 % 4 == 0); @@ -952,9 +952,8 @@ void ggml_metal_graph_compute( [encoder setComputePipelineState:ctx->pipeline_sqr]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - //GGML_METAL_LOG_WARN("%s: node %3d, op = %8s dispatching \n", __func__, i, ggml_op_name(dst->op)); const int64_t n = ggml_nelements(dst); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 882b96bc61ba6..56b309125680f 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -85,7 +85,7 @@ class MODEL_ARCH(IntEnum): GPTNEOX : int = auto() MPT : int = auto() STARCODER : int = auto() - PERSIMMON : int = auto() + PERSIMMON : int = auto() class MODEL_TENSOR(IntEnum): @@ -190,7 +190,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", }, MODEL_ARCH.PERSIMMON: { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.OUTPUT: "output", MODEL_TENSOR.OUTPUT_NORM: "output_norm", MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", diff --git a/llama.cpp b/llama.cpp index 0f276e7f7cdd2..c2d10d59733c1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2473,7 +2473,7 @@ static void llm_load_tensors( } break; case LLM_ARCH_PERSIMMON: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); @@ -2612,8 +2612,8 @@ static bool llama_model_load( } static struct ggml_cgraph * llm_build_llama( - llama_context & lctx, - const llama_batch & batch) { + llama_context & lctx, + const llama_batch & batch) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -4016,9 +4016,8 @@ static struct ggml_cgraph * llm_build_starcoder( static struct ggml_cgraph * llm_build_persimmon( - llama_context & lctx, - const llama_batch & batch -) { + llama_context & lctx, + const llama_batch & batch) { const auto & model = lctx.model; const auto & hparams = model.hparams; From 422b110841518e13362576db0b2fb6ef03edf910 Mon Sep 17 00:00:00 
2001 From: Phillip Kravtsov Date: Mon, 2 Oct 2023 10:56:31 -0700 Subject: [PATCH 18/22] Minor changes to conversion script --- convert-persimmon-to-gguf.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py index 25c8a5963ba19..307610428b118 100644 --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -21,7 +21,7 @@ def _flatten_dict(dct, tensors, prefix=None): raise ValueError(type(dct[key])) return None -def get_tokenizer_info(dir_model: Path): +def _get_sentencepiece_tokenizer_info(dir_model: Path): tokenizer_path = dir_model / 'adept_vocab.model' print('gguf: getting sentencepiece tokenizer from', tokenizer_path) tokenizer = SentencePieceProcessor(str(tokenizer_path)) @@ -38,14 +38,11 @@ def get_tokenizer_info(dir_model: Path): text = piece.encode("utf-8") score = tokenizer.get_score(i) - toktype = 1 # defualt to normal token type + toktype = 1 if tokenizer.is_unknown(i): toktype = 2 if tokenizer.is_control(i): toktype = 3 - - # toktype = 4 is user-defined = tokens from added_tokens.json - if tokenizer.is_unused(i): toktype = 5 if tokenizer.is_byte(i): @@ -90,7 +87,8 @@ def main(): gguf_writer.add_head_count_kv(head_count_kv) gguf_writer.add_rope_freq_base(hparams.rotary_emb_base) gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon) - tokens, scores, toktypes = get_tokenizer_info(args.model_dir) + + tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir) gguf_writer.add_tokenizer_model('llama') gguf_writer.add_token_list(tokens) gguf_writer.add_token_scores(scores) @@ -113,7 +111,6 @@ def main(): sys.exit() n_dims = len(data.shape) print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - gguf_writer.add_tensor(new_name, data) print("gguf: write header") gguf_writer.write_header_to_file() From 7a279fe5a86fa95cda58d6e0d897024da74670f3 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Mon, 2 Oct 2023 14:25:41 -0700 Subject: [PATCH 19/22] Remove old script --- convert-persimmon-st-to-gguf.py | 135 -------------------------------- 1 file changed, 135 deletions(-) delete mode 100644 convert-persimmon-st-to-gguf.py diff --git a/convert-persimmon-st-to-gguf.py b/convert-persimmon-st-to-gguf.py deleted file mode 100644 index f8fcbb4bdaae5..0000000000000 --- a/convert-persimmon-st-to-gguf.py +++ /dev/null @@ -1,135 +0,0 @@ -from convert import lazy_load_safetensors_file -import sys -import torch -from safetensors import safe_open -from pathlib import Path -from pprint import pprint -from sentencepiece import SentencePieceProcessor -import argparse -import gguf -import json -import struct - -def file_is_safetensors(path: Path) -> bool: - fp = open(path, 'rb') - first8 = fp.read(8) - fp.seek(0) - if first8[:2] == b'PK': - # A zip file, i.e. 
PyTorch format - return False - return struct.unpack(' None: - args = get_args() - assert file_is_safetensors(args.model), 'Error: model file is not a SafeTensors file' - dir_model = args.model.parent - with open(dir_model / 'config.json', 'r') as f: - hparams = json.load(f) - arch = gguf.MODEL_ARCH.PERSIMMON - gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch]) - - block_count = hparams['num_layers'] - head_count = hparams['num_attention_heads'] - head_count_kv = head_count - ctx_length = hparams['seq_length'] - hidden_size = hparams['hidden_size'] - - gguf_writer.add_name('persimmon-8b-chat') - gguf_writer.add_context_length(ctx_length) - gguf_writer.add_embedding_length(hidden_size) - gguf_writer.add_block_count(block_count) - gguf_writer.add_feed_forward_length(hparams['ffn_hidden_size']) - gguf_writer.add_rope_dimension_count(hidden_size // head_count) - gguf_writer.add_head_count(head_count) - gguf_writer.add_head_count_kv(head_count_kv) - gguf_writer.add_rope_freq_base(hparams['rotary_emb_base']) - gguf_writer.add_layer_norm_eps(hparams['layernorm_epsilon']) - tokens, scores, toktypes = get_tokenizer_info(dir_model) - gguf_writer.add_tokenizer_model('llama') - gguf_writer.add_token_list(tokens) - gguf_writer.add_token_scores(scores) - gguf_writer.add_token_types(toktypes) - gguf_writer.add_bos_token_id(71013) - gguf_writer.add_eos_token_id(71013) - - tensor_map = gguf.get_tensor_name_map(arch, block_count) - print(tensor_map) - tensors = {} - with safe_open(args.model, framework="pt") as f: - for k in f.keys(): - tensors[k] = f.get_tensor(k) - for name in tensors.keys(): - data = tensors[name] - if name.endswith(".self_attention.rotary_emb.inv_freq"): - continue - old_dtype = data.dtype - # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?) 
- data = data.to(torch.float32).squeeze().numpy() - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - n_dims = len(data.shape) - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - print("gguf: write header") - gguf_writer.write_header_to_file() - print("gguf: write metadata") - gguf_writer.write_kv_data_to_file() - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - - gguf_writer.close() - - print(f"gguf: model successfully exported to '{args.outfile}'") - print("") - - - -if __name__ == '__main__': - main() From c90ed9f16b3c6551d3080a617a143d38909ae30d Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Tue, 3 Oct 2023 13:18:23 -0700 Subject: [PATCH 20/22] Fix editorconfig formatting --- convert-persimmon-to-gguf.py | 6 +++--- ggml-metal.metal | 2 +- llama.cpp | 20 ++++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py index 307610428b118..e022ffe46189e 100644 --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -24,7 +24,7 @@ def _flatten_dict(dct, tensors, prefix=None): def _get_sentencepiece_tokenizer_info(dir_model: Path): tokenizer_path = dir_model / 'adept_vocab.model' print('gguf: getting sentencepiece tokenizer from', tokenizer_path) - tokenizer = SentencePieceProcessor(str(tokenizer_path)) + tokenizer = SentencePieceProcessor(str(tokenizer_path)) print('gguf: adding tokens') tokens: list[bytes] = [] scores: list[float] = [] @@ -70,7 +70,7 @@ def main(): arch = gguf.MODEL_ARCH.PERSIMMON gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch]) - + block_count = hparams.num_layers head_count = hparams.num_attention_heads head_count_kv = head_count @@ -127,4 +127,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/ggml-metal.metal b/ggml-metal.metal index b7038aa702799..2ed1964fee8bf 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1129,7 +1129,7 @@ kernel void kernel_concat( uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]) { - + const int64_t i03 = tgpig.z; const int64_t i02 = tgpig.y; const int64_t i01 = tgpig.x; diff --git a/llama.cpp b/llama.cpp index 7e3c663cc89b7..150ba152b5c0e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2476,7 +2476,7 @@ static void llm_load_tensors( } } break; case LLM_ARCH_PERSIMMON: - { + { model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); @@ -4162,8 +4162,8 @@ static struct ggml_cgraph * llm_build_persimmon( cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); offload_func(cur); ggml_format_name(cur, "input_layernorm_%d", il); - } - // self attention + } + // self attention { cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); offload_func_kq(cur); @@ -4206,7 +4206,7 @@ static struct ggml_cgraph * llm_build_persimmon( offload_func_v(tmpk); tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); offload_func_v(tmpk); - + // RoPE the first n_rot of q/k, pass the other half, and concat. 
struct ggml_tensor * qrot = ggml_view_3d( ctx0, tmpq, n_rot, n_head, n_tokens, @@ -4227,7 +4227,7 @@ static struct ggml_cgraph * llm_build_persimmon( // get the second half of tmpq, e.g tmpq[n_rot:, :, :] struct ggml_tensor * qpass = ggml_view_3d( - ctx0, tmpq, n_rot, n_head, n_tokens, + ctx0, tmpq, n_rot, n_head, n_tokens, ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head * n_head, ggml_element_size(tmpq) * n_rot @@ -4328,9 +4328,9 @@ static struct ggml_cgraph * llm_build_persimmon( offload_func_kq(KQ_soft_max); ggml_set_name(KQ_soft_max, "KQ_soft_max"); - struct ggml_tensor * V = + struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, + n_kv, n_embd_head, n_head_kv, ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); @@ -4361,11 +4361,11 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_set_name(inpFF, "inpFF"); { // MLP - { + { // Norm cur = ggml_norm(ctx0, inpFF, norm_eps); offload_func(cur); - cur = ggml_add(ctx0, + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b ); @@ -4386,7 +4386,7 @@ static struct ggml_cgraph * llm_build_persimmon( cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); offload_func(cur); - cur = ggml_add(ctx0, + cur = ggml_add(ctx0, cur, model.layers[il].b2); offload_func(cur); From 1d518d65d34213f395c45cff2f1ef02e5e6a2828 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Thu, 5 Oct 2023 12:24:06 -0700 Subject: [PATCH 21/22] Fix build --- llama.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llama.cpp b/llama.cpp index 4ad80f563ade7..8cf9a1b5fd2d1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4832,6 +4832,7 @@ static struct ggml_cgraph * llama_build_graph( case LLM_ARCH_PERSIMMON: { result = llm_build_persimmon(lctx, batch); + } case LLM_ARCH_REFACT: { result = llm_build_refact(lctx, batch); From 485a471e9389db1e2b5b5b0f720bd2eec432ebf2 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Fri, 6 Oct 2023 12:39:27 -0700 Subject: [PATCH 22/22] add overlooked offload code ggml-ci --- llama.cpp | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 9f6ea69939156..3e923629526d1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2587,9 +2587,38 @@ static void llm_load_tensors( case LLM_ARCH_PERSIMMON: { model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + + { + ggml_backend backend_norm; + ggml_backend backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = LLAMA_BACKEND_OFFLOAD; +#else + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? 
GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#endif // _WIN32 + + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + vram_weights += ggml_nbytes(model.output_norm_b); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } const uint32_t n_ff = hparams.n_ff; const int i_gpu_start = n_layer - n_gpu_layers; @@ -3549,11 +3578,9 @@ static struct ggml_cgraph * llm_build_refact( struct ggml_init_params params = { /*.mem_size =*/ buf_compute.size, /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, + /*.no_alloc =*/ true, }; - params.no_alloc = true; - struct ggml_context * ctx0 = ggml_init(params); ggml_cgraph * gf = ggml_new_graph(ctx0);
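
Note on the vocab conversion in PATCH 18: the toktype integers written into the GGUF vocabulary follow the SentencePiece piece classification (1 = normal, 2 = unknown, 3 = control, 5 = unused, 6 = byte — the values visible in the diff above). A minimal sketch of that mapping; the helper name sp_toktype is illustrative and not part of the script:

    # Illustrative sketch of the SentencePiece -> GGUF token-type mapping used by
    # _get_sentencepiece_tokenizer_info: 1 = normal, 2 = unknown, 3 = control,
    # 5 = unused, 6 = byte. The function name is hypothetical.
    from sentencepiece import SentencePieceProcessor

    def sp_toktype(tokenizer: SentencePieceProcessor, i: int) -> int:
        if tokenizer.is_unknown(i):
            return 2
        if tokenizer.is_control(i):
            return 3
        if tokenizer.is_unused(i):
            return 5
        if tokenizer.is_byte(i):
            return 6
        return 1  # default: normal token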
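
Note on the attention changes in PATCH 20: llm_build_persimmon applies RoPE to only the first n_rot dimensions of each query/key head and passes the remaining dimensions through unchanged, concatenating the two halves afterwards (the qrot/qpass views above). A rough NumPy sketch of that scheme, assuming an interleaved even/odd pairing of the rotated dimensions and treating freq_base as a parameter; the real kernel operates on ggml views and may pair dimensions differently:

    # Rough sketch of partial RoPE: rotate the first n_rot dims per head, pass the rest.
    # Assumes interleaved (even/odd) pairing of rotated dimensions; freq_base is a parameter.
    import numpy as np

    def partial_rope(x: np.ndarray, pos: np.ndarray, n_rot: int, freq_base: float = 10000.0) -> np.ndarray:
        # x: (n_tokens, n_head, head_dim); pos: (n_tokens,) absolute positions
        xrot, xpass = x[..., :n_rot], x[..., n_rot:]
        inv_freq = 1.0 / freq_base ** (np.arange(0, n_rot, 2) / n_rot)   # (n_rot/2,)
        angles = pos[:, None] * inv_freq[None, :]                        # (n_tokens, n_rot/2)
        cos = np.cos(angles)[:, None, :]                                 # broadcast over heads
        sin = np.sin(angles)[:, None, :]
        x1, x2 = xrot[..., 0::2], xrot[..., 1::2]
        rotated = np.empty_like(xrot)
        rotated[..., 0::2] = x1 * cos - x2 * sin
        rotated[..., 1::2] = x1 * sin + x2 * cos
        # concatenate rotated and pass-through halves, mirroring the qrot/qpass concat
        return np.concatenate([rotated, xpass], axis=-1)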