# diff --git a/migrate-ggml-2023-03-30-pr613.py b/migrate-ggml-2023-03-30-pr613.py
# new file mode 100644
# index 0000000000000..b6ef2476e052d
# --- /dev/null
# +++ b/migrate-ggml-2023-03-30-pr613.py
# @@ -0,0 +1,311 @@
# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
#
# We caused a breaking change to the file format on 2023-03-30 in:
# https://github.com/ggerganov/llama.cpp/pull/613
#
# (1) If you still have the Meta LLaMA .pth files, then close this
#     file now; you can just run `convert-pth-to-ggml.py` again to
#     migrate to the new format. The tool is easier to use too. It
#     isn't necessary anymore to manage split output files because
#     the new format always combines things into a single file.
#
# (2) If you deleted the Meta LLaMA .pth files to save disk
#     space, then this tool is intended to help you. Please check
#     out the instructions below.
#
# USAGE
#
#   python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
#
# PREREQUISITES
#
#   pip install numpy
#   cd llama.cpp
#   make -j4
#
# EXAMPLE (7B MODEL)
#
#   # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
#   python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
#
#   # check that it works
#   ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
#   # you can delete the old files
#   rm -f models/7B/ggml-model-f16.bin
#   mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
#
# EXAMPLE (13B MODEL)
#
#   # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
#   python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
#
#   # check that it works
#   ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
+# +# # you can delete the old files +# rm -f models/13B/ggml-model-f16.bin* +# mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin +# + +import argparse +import os +import sys +import json +import struct +import numpy as np + +QK = 32 + +GGML_TYPE_Q4_0 = 0 +GGML_TYPE_Q4_1 = 1 +GGML_TYPE_I8 = 2 +GGML_TYPE_I16 = 3 +GGML_TYPE_I32 = 4 +GGML_TYPE_F16 = 5 +GGML_TYPE_F32 = 6 + +WTYPE_NAMES = { + 0: "F32", + 1: "F16", + 2: "Q4_0", + 3: "Q4_1", +} + +WTYPES = { + 0: GGML_TYPE_F32, + 1: GGML_TYPE_F16, + 2: GGML_TYPE_Q4_0, + 3: GGML_TYPE_Q4_1, +} + +GGML_BLCK_SIZE = { + GGML_TYPE_Q4_0: QK, + GGML_TYPE_Q4_1: QK, + GGML_TYPE_I8: 1, + GGML_TYPE_I16: 1, + GGML_TYPE_I32: 1, + GGML_TYPE_F16: 1, + GGML_TYPE_F32: 1, +} + +GGML_TYPE_SIZE = { + GGML_TYPE_Q4_0: 4 + QK//2, + GGML_TYPE_Q4_1: 4*2 + QK//2, + GGML_TYPE_I8: 1, + GGML_TYPE_I16: 2, + GGML_TYPE_I32: 4, + GGML_TYPE_F16: 2, + GGML_TYPE_F32: 4, +} + +HPARAMS = [ + 'magic', # int32 + 'version', # int32 + 'n_vocab', # int32 + 'n_embd', # int32 + 'n_mult', # int32 + 'n_head', # int32 + 'n_layer', # int32 + 'n_rot', # int32 + 'f16', # int32 +] + +def read_hparams(fin): + struct_fmt = "i" * len(HPARAMS) + struct_size = struct.calcsize(struct_fmt) + buf = fin.read(struct_size) + ints = struct.unpack(struct_fmt, buf) + hparams = dict(zip(HPARAMS, ints)) + return hparams + +def write_hparams(fout, hparams): + struct_fmt = "i" * len(HPARAMS) + struct_size = struct.calcsize(struct_fmt) + ints = [hparams[h] for h in HPARAMS] + fout.write(struct.pack(struct_fmt, *ints)) + +def read_tokens(fin, hparams): + tokens = [] + for i in range(hparams['n_vocab']): + len_b = fin.read(4) + (length,) = struct.unpack("i", len_b) + word = fin.read(length) + score_b = fin.read(4) + (score,) = struct.unpack("f", score_b) + tokens.append((word, score)) + return tokens + +def write_tokens(fout, tokens): + for word, score in tokens: + fout.write(struct.pack("i", len(word))) + fout.write(word) + fout.write(struct.pack("f", score)) + +def 
ggml_nelements(shape): + r = 1 + for i in shape: + r *= i + return r + +def ggml_nbytes(shape, ftype): + x = ggml_nelements(shape) + t = WTYPES[ftype] + x *= GGML_TYPE_SIZE[t] + x //= GGML_BLCK_SIZE[t] + return x + +def copy_tensors(fin, fout, part_id, n_parts): + while True: + + b = fin.read(4) + if not b: break + (n_dims,) = struct.unpack("i", b) + b = fin.read(4) + (length,) = struct.unpack("i", b) + b = fin.read(4) + (ftype,) = struct.unpack("i", b) + + assert n_dims in (1, 2) + + partshape = list(range(n_dims)) + for i in range(n_dims): + b = fin.read(4) + partshape[i] = struct.unpack("i", b)[0] + partshape = list(reversed(partshape)) + + name = fin.read(length) + data = fin.read(ggml_nbytes(partshape, ftype)) + + blck_size = GGML_BLCK_SIZE[WTYPES[ftype]] + type_size = GGML_TYPE_SIZE[WTYPES[ftype]] + + print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}") + + # determine dimension along which multipart tensor is sharded + # + # split_dim 0 regex: + # - output.* + # - layers.*.attention.wq.weight + # - layers.*.attention.wk.weight + # - layers.*.attention.wv.weight + # - layers.*.feed_forward.w1.weight + # - layers.*.feed_forward.w3.weight + # + # split_dim 1 regex: + # - tok_embeddings.* + # - layers.*.attention.wo.weight + # - layers.*.feed_forward.w2.weight + # + if n_dims > 1: + split_dim = 1 + if b"tok_embeddings" in name: + split_dim = 1 + elif b"layers" in name: + if b"attention.wo.weight" in name: + split_dim = 1 + elif b"feed_forward.w2.weight" in name: + split_dim = 1 + else: + split_dim = 0 + elif b"output" in name: + split_dim = 0 + + # output tensor header + fullshape = list(partshape) + if n_dims > 1: + fullshape[split_dim] *= n_parts + fout.write(struct.pack("iii", n_dims, len(name), ftype)) + for dim in reversed(fullshape): + fout.write(struct.pack("i", dim)) + fout.write(name) + + # ensure tensor data is aligned + tensor_data_offset = fout.tell() + while tensor_data_offset % QK != 0: + 
fout.write(struct.pack("B", 0)) + tensor_data_offset += 1 + + # output unified mappable tensor data + if n_dims == 1 or n_parts == 1: + # copy tensor which we thankfully received in one piece + if part_id == 0: + fout.write(data) + elif split_dim == 0: + # reassemble multifile tensor containing some of the rows + rows_per_chunk = partshape[0] + current_row = part_id * rows_per_chunk + bytes_per_row = fullshape[1] // blck_size * type_size + offset = current_row * bytes_per_row + fout.seek(tensor_data_offset + offset) + fout.write(data) + elif split_dim == 1: + # reassemble multifile tensor containing some of the cols + cols_per_chunk = partshape[1] + current_col = part_id * cols_per_chunk + bpr = partshape[1] // blck_size * type_size + bytes_per_row = fullshape[1] // blck_size * type_size + offset_current_col = current_col // blck_size * type_size + for row in range(partshape[0]): + offset_row = row * bytes_per_row + offset = offset_row + offset_current_col + fout.seek(tensor_data_offset + offset) + fout.write(data[row * bpr:row * bpr + bpr]) + + # advance file position to next tensor + fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype)) + +def parse_args(): + parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format') + parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)') + parser.add_argument('fout_path', help='your new ggjt file name') + return parser.parse_args() + +def main(): + args = parse_args() + assert args.fin_path + assert args.fout_path + assert args.fin_path != args.fout_path + + with open(args.fin_path, "rb") as fin: + hparams = read_hparams(fin) + tokens = read_tokens(fin, hparams) + + if hparams['magic'] == 0x67676a74: # ggjt + print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n") + sys.exit(1) + + if hparams['magic'] != 0x67676d66: # ggmf + print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n") + 
sys.exit(1) + + hparams['magic'] = 0x67676a74 # ggjt + + # count number of multipart files by convention + n_parts = 1 + while True: + if os.path.exists(f"{args.fin_path}.{n_parts}"): + n_parts += 1 + else: + break + + # we output a single file for ggml + with open(args.fout_path, "wb") as fout: + write_hparams(fout, hparams) + write_tokens(fout, tokens) + offset_of_tensors = fout.tell() + # the tensors we load could be split across multiple files + for part_id in range(n_parts): + fout.seek(offset_of_tensors) + print(f"Processing part {part_id+1} of {n_parts}\n") + fin_path = args.fin_path + if part_id > 0: + fin_path += f".{part_id}" + with open(fin_path, "rb") as fin: + read_tokens(fin, read_hparams(fin)) + copy_tensors(fin, fout, part_id, n_parts) + + print(f"Done. Output file: {args.fout_path}\n") + +if __name__ == "__main__": + main() diff --git a/play.py b/play.py new file mode 100644 index 0000000000000..5c0107a9918dd --- /dev/null +++ b/play.py @@ -0,0 +1,24 @@ +''' +Personal playground for playing with an instance of GPT4ALL.cpp +''' +from pyllamacpp.model import Model + +def new_text_callback(text: str): + print(text, end="", flush=True) + +def ask(message, n=70): + outline = model.generate(message, + n_predict=n, + new_text_callback=new_text_callback, + n_threads=8) + return outline + +model = Model(ggml_model='./ggjt-model.bin', n_ctx=1024) +prompt="Write a story about a girl and her love of sewing." + +print("===================") +# print(outline) +# outline=outline.replace(prompt,"") +q = "{}".format( prompt)#, " ###Instruction: What happens next?") +ask(q,n=200) +