This repository has been archived by the owner on Sep 17, 2024. It is now read-only.

added a demonstration script for use with pyllama! #7

Open · wants to merge 3 commits into master

311 changes: 311 additions & 0 deletions migrate-ggml-2023-03-30-pr613.py
@@ -0,0 +1,311 @@
# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
#
# We caused a breaking change to the file format on 2023-03-30 in:
# https://github.com/ggerganov/llama.cpp/pull/613
#
# (1) If you still have the Meta LLaMA .pth files, then close this
# file now; you can just run `convert-pth-to-ggml.py` again to
# migrate to the new format. The tool is easier to use too. It
# isn't necessary anymore to manage split output files because
# the new format always combines things into a single file.
#
# (2) If you deleted the Meta LLaMA .pth files to save disk
# space, then this tool is intended to help you. Please check
# out the instructions below.
#
# USAGE
#
# python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
#
# PREREQUISITES
#
# pip install numpy
# cd llama.cpp
# make -j4
#
# EXAMPLE (7B MODEL)
#
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
# python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
#
# # check that it works
# ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
# # you can delete the old files
# rm -f models/7B/ggml-model-f16.bin
# mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
#
# EXAMPLE (13B MODEL)
#
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
# python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
#
# # check that it works
# ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
# # you can delete the old files
# rm -f models/13B/ggml-model-f16.bin*
# mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
#
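# MULTIPART INPUTS
#
# If your old model was split into several ggmf files (a base file plus
# .1, .2, ... suffixes), pass only the base file as INPUT; the remaining
# parts are found automatically and merged into the single OUTPUT file.
#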

import argparse
import os
import sys
import json
import struct
import numpy as np

QK = 32

GGML_TYPE_Q4_0 = 0
GGML_TYPE_Q4_1 = 1
GGML_TYPE_I8 = 2
GGML_TYPE_I16 = 3
GGML_TYPE_I32 = 4
GGML_TYPE_F16 = 5
GGML_TYPE_F32 = 6

WTYPE_NAMES = {
0: "F32",
1: "F16",
2: "Q4_0",
3: "Q4_1",
}

WTYPES = {
0: GGML_TYPE_F32,
1: GGML_TYPE_F16,
2: GGML_TYPE_Q4_0,
3: GGML_TYPE_Q4_1,
}

GGML_BLCK_SIZE = {
GGML_TYPE_Q4_0: QK,
GGML_TYPE_Q4_1: QK,
GGML_TYPE_I8: 1,
GGML_TYPE_I16: 1,
GGML_TYPE_I32: 1,
GGML_TYPE_F16: 1,
GGML_TYPE_F32: 1,
}

GGML_TYPE_SIZE = {
GGML_TYPE_Q4_0: 4 + QK//2,
GGML_TYPE_Q4_1: 4*2 + QK//2,
GGML_TYPE_I8: 1,
GGML_TYPE_I16: 2,
GGML_TYPE_I32: 4,
GGML_TYPE_F16: 2,
GGML_TYPE_F32: 4,
}
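# illustrative arithmetic from the tables above: one block of 32 Q4_0 values
# takes 4 + 32//2 = 20 bytes, so a row of 4096 Q4_0 elements occupies
# 4096 // 32 * 20 = 2560 bytes, versus 4096 * 2 = 8192 bytes in F16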

HPARAMS = [
'magic', # int32
'version', # int32
'n_vocab', # int32
'n_embd', # int32
'n_mult', # int32
'n_head', # int32
'n_layer', # int32
'n_rot', # int32
'f16', # int32
]
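# read_hparams/write_hparams below pack these nine int32 fields back to back,
# i.e. struct format "i" * 9 (36 bytes)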

def read_hparams(fin):
struct_fmt = "i" * len(HPARAMS)
struct_size = struct.calcsize(struct_fmt)
buf = fin.read(struct_size)
ints = struct.unpack(struct_fmt, buf)
hparams = dict(zip(HPARAMS, ints))
return hparams

def write_hparams(fout, hparams):
struct_fmt = "i" * len(HPARAMS)
struct_size = struct.calcsize(struct_fmt)
ints = [hparams[h] for h in HPARAMS]
fout.write(struct.pack(struct_fmt, *ints))

def read_tokens(fin, hparams):
tokens = []
for i in range(hparams['n_vocab']):
len_b = fin.read(4)
(length,) = struct.unpack("i", len_b)
word = fin.read(length)
score_b = fin.read(4)
(score,) = struct.unpack("f", score_b)
tokens.append((word, score))
return tokens

def write_tokens(fout, tokens):
for word, score in tokens:
fout.write(struct.pack("i", len(word)))
fout.write(word)
fout.write(struct.pack("f", score))
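# each vocab entry above is a small record: an int32 byte length, the raw
# token bytes, then a float32 score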

def ggml_nelements(shape):
r = 1
for i in shape:
r *= i
return r

def ggml_nbytes(shape, ftype):
x = ggml_nelements(shape)
t = WTYPES[ftype]
x *= GGML_TYPE_SIZE[t]
x //= GGML_BLCK_SIZE[t]
return x
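# illustrative example: a [4096, 4096] tensor stored as F16 (ftype 1) is
# 4096 * 4096 * 2 = 33,554,432 bytes, while the same tensor stored as
# Q4_0 (ftype 2) is 4096 * 4096 * 20 // 32 = 10,485,760 bytes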

def copy_tensors(fin, fout, part_id, n_parts):
while True:

b = fin.read(4)
if not b: break
(n_dims,) = struct.unpack("i", b)
b = fin.read(4)
(length,) = struct.unpack("i", b)
b = fin.read(4)
(ftype,) = struct.unpack("i", b)

assert n_dims in (1, 2)

partshape = list(range(n_dims))
for i in range(n_dims):
b = fin.read(4)
partshape[i] = struct.unpack("i", b)[0]
partshape = list(reversed(partshape))

name = fin.read(length)
data = fin.read(ggml_nbytes(partshape, ftype))

blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
type_size = GGML_TYPE_SIZE[WTYPES[ftype]]

print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")

# determine dimension along which multipart tensor is sharded
#
# split_dim 0 regex:
# - output.*
# - layers.*.attention.wq.weight
# - layers.*.attention.wk.weight
# - layers.*.attention.wv.weight
# - layers.*.feed_forward.w1.weight
# - layers.*.feed_forward.w3.weight
#
# split_dim 1 regex:
# - tok_embeddings.*
# - layers.*.attention.wo.weight
# - layers.*.feed_forward.w2.weight
#
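# worked example with hypothetical shapes, assuming n_parts == 2: a dim-0
# split tensor arriving as partshape [2048, 8192] becomes fullshape
# [4096, 8192], and part 1's rows land at byte offset 2048 * bytes_per_row;
# a dim-1 split tensor arriving as partshape [4096, 2048] becomes
# [4096, 4096], and part 1 writes its 2048-column slice into every row at
# column offset 2048 // blck_size * type_size bytes
#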
if n_dims > 1:
split_dim = 1
if b"tok_embeddings" in name:
split_dim = 1
elif b"layers" in name:
if b"attention.wo.weight" in name:
split_dim = 1
elif b"feed_forward.w2.weight" in name:
split_dim = 1
else:
split_dim = 0
elif b"output" in name:
split_dim = 0

# output tensor header
fullshape = list(partshape)
if n_dims > 1:
fullshape[split_dim] *= n_parts
fout.write(struct.pack("iii", n_dims, len(name), ftype))
for dim in reversed(fullshape):
fout.write(struct.pack("i", dim))
fout.write(name)

# ensure tensor data is aligned
tensor_data_offset = fout.tell()
while tensor_data_offset % QK != 0:
fout.write(struct.pack("B", 0))
tensor_data_offset += 1

# output unified mappable tensor data
if n_dims == 1 or n_parts == 1:
# copy tensor which we thankfully received in one piece
if part_id == 0:
fout.write(data)
elif split_dim == 0:
# reassemble multifile tensor containing some of the rows
rows_per_chunk = partshape[0]
current_row = part_id * rows_per_chunk
bytes_per_row = fullshape[1] // blck_size * type_size
offset = current_row * bytes_per_row
fout.seek(tensor_data_offset + offset)
fout.write(data)
elif split_dim == 1:
# reassemble multifile tensor containing some of the cols
cols_per_chunk = partshape[1]
current_col = part_id * cols_per_chunk
bpr = partshape[1] // blck_size * type_size
bytes_per_row = fullshape[1] // blck_size * type_size
offset_current_col = current_col // blck_size * type_size
for row in range(partshape[0]):
offset_row = row * bytes_per_row
offset = offset_row + offset_current_col
fout.seek(tensor_data_offset + offset)
fout.write(data[row * bpr:row * bpr + bpr])

# advance file position to next tensor
fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))

def parse_args():
parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
parser.add_argument('fout_path', help='your new ggjt file name')
return parser.parse_args()

def main():
args = parse_args()
assert args.fin_path
assert args.fout_path
assert args.fin_path != args.fout_path

with open(args.fin_path, "rb") as fin:
hparams = read_hparams(fin)
tokens = read_tokens(fin, hparams)

if hparams['magic'] == 0x67676a74: # ggjt
print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
sys.exit(1)

if hparams['magic'] != 0x67676d66: # ggmf
print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
sys.exit(1)

hparams['magic'] = 0x67676a74 # ggjt

# count number of multipart files by convention
n_parts = 1
while True:
if os.path.exists(f"{args.fin_path}.{n_parts}"):
n_parts += 1
else:
break
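# e.g. if f"{args.fin_path}.1" exists but f"{args.fin_path}.2" does not,
# n_parts ends up as 2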

# we output a single file for ggml
with open(args.fout_path, "wb") as fout:
write_hparams(fout, hparams)
write_tokens(fout, tokens)
offset_of_tensors = fout.tell()
# the tensors we load could be split across multiple files
for part_id in range(n_parts):
fout.seek(offset_of_tensors)
print(f"Processing part {part_id+1} of {n_parts}\n")
fin_path = args.fin_path
if part_id > 0:
fin_path += f".{part_id}"
with open(fin_path, "rb") as fin:
read_tokens(fin, read_hparams(fin))
copy_tensors(fin, fout, part_id, n_parts)

print(f"Done. Output file: {args.fout_path}\n")

if __name__ == "__main__":
main()
24 changes: 24 additions & 0 deletions play.py
@@ -0,0 +1,24 @@
'''
Personal playground for experimenting with an instance of GPT4ALL.cpp.
'''
from pyllamacpp.model import Model

def new_text_callback(text: str):
print(text, end="", flush=True)

def ask(message, n=70):
outline = model.generate(message,
n_predict=n,
new_text_callback=new_text_callback,
n_threads=8)
return outline

model = Model(ggml_model='./ggjt-model.bin', n_ctx=1024)
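# './ggjt-model.bin' above is assumed to point at a model in the new ggjt
# format, e.g. the output of migrate-ggml-2023-03-30-pr613.py from this PR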
prompt = "Write a story about a girl and her love of sewing."

print("===================")
q = "{}".format(prompt)  # e.g. append " ###Instruction: What happens next?" to continue a story
outline = ask(q, n=200)
# print(outline)
# outline = outline.replace(prompt, "")  # strip the prompt from the returned text if desired