
Commit

some users report that this repo is now being flagged as malicious?
No idea why, but I am removing all prebuilt binaries except libopenblas. Windows users can still obtain it from /releases, and OSX and Linux users can rebuild from source code.
LostRuins committed Apr 6, 2023
2 parents b56f872 + d2beca9 commit 4f5faf9
Showing 6 changed files with 180 additions and 1 deletion.
Binary file removed koboldcpp.dll
7 changes: 7 additions & 0 deletions koboldcpp.py
@@ -302,6 +302,12 @@ def main(args):
     elif not args.noblas:
         print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas.dll will be required.")
         use_blas = True
+
+    if args.psutil_set_threads:
+        import psutil
+        args.threads = psutil.cpu_count(logical=False)
+        print("Overriding thread count, using " + str(args.threads) + " threads instead.")
+
     init_library() # Note: if blas does not exist and is enabled, program will crash.
     ggml_selected_file = args.model_file
     embedded_kailite = None
@@ -363,6 +369,7 @@ def main(args):
     physical_core_limit = int(os.cpu_count()/2)
     default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))
     parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
+    parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutil to determine thread count based on physical cores.", action='store_true')
     parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
     parser.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
     args = parser.parse_args()
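For context, a minimal standalone sketch (not part of this commit) contrasting the default thread heuristic above with the new psutil-based count. It assumes psutil is installed; note that psutil.cpu_count(logical=False) can return None on some platforms, in which case the flag would set args.threads to None.

import os
import psutil  # assumed installed; koboldcpp.py imports it lazily behind the flag

logical = os.cpu_count()                    # logical cores, including hyperthreads
physical = psutil.cpu_count(logical=False)  # physical cores only; may be None

# the default heuristic from the diff above
physical_core_limit = int(os.cpu_count()/2)
default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))

print("default --threads:", default_threads)
print("--psutil_set_threads would use:", physical)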
Binary file removed koboldcpp_blas.dll
173 changes: 173 additions & 0 deletions otherarch/convert_hf_gptj.py
@@ -0,0 +1,173 @@
# Convert GPT-J-6B h5 transformer model to ggml format
#
# Load the model using GPTJForCausalLM.
# Iterate over all variables and write them to a binary file.
#
# For each variable, write the following:
# - Number of dimensions (int)
# - Name length (int)
# - Data type (int: 0 = float32, 1 = float16)
# - Dimensions (int[n_dims])
# - Name (char[name_length])
# - Data (float[n_elements])
#
# By default, the bigger matrices are converted to 16-bit floats.
# This can be disabled by adding the "use-f32" CLI argument.
#
# At the start of the ggml file we write the model parameters
# and vocabulary.
#

import sys
import struct
import json
import torch
import numpy as np

from transformers import GPTJForCausalLM

# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a signficant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))

if len(sys.argv) < 2:
    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
    print("  ftype == 0 -> float32")
    print("  ftype == 1 -> float16")
    sys.exit(1)

# output in the same directory as the model
dir_model = sys.argv[1]
fname_out = sys.argv[1] + "/ggml-model.bin"

with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
    encoder = json.load(f)

with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
    encoder_added = json.load(f)

with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = 1
if len(sys.argv) > 2:
    ftype = int(sys.argv[2])
    if ftype < 0 or ftype > 1:
        print("Invalid ftype: " + str(ftype))
        sys.exit(1)
    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"


model = GPTJForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
#print (model)

list_vars = model.state_dict()
#print (list_vars)

fout = open(fname_out, "wb")

fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["n_positions"]))
fout.write(struct.pack("i", hparams["n_embd"]))
fout.write(struct.pack("i", hparams["n_head"]))
fout.write(struct.pack("i", hparams["n_layer"]))
fout.write(struct.pack("i", hparams["rotary_dim"]))
fout.write(struct.pack("i", ftype))

byte_encoder = bytes_to_unicode()
byte_decoder = {v:k for k, v in byte_encoder.items()}

fout.write(struct.pack("i", len(encoder) + len(encoder_added)))

for key in encoder:
    text = bytearray([byte_decoder[c] for c in key])
    fout.write(struct.pack("i", len(text)))
    fout.write(text)

for key in encoder_added:
    text = bytearray([byte_decoder[c] for c in key])
    fout.write(struct.pack("i", len(text)))
    fout.write(text)

for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()
    print("Processing variable: " + name + " with shape: ", data.shape)

    # we don't need these
    if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
        print("  Skipping variable: " + name)
        continue

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if ftype != 0:
        if name[-7:] == ".weight" and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    # for efficiency - transpose these matrices:
    # (note - with latest ggml this is no longer more efficient, so disabling it)
    #   "transformer.h.*.mlp.fc_in.weight"
    #   "transformer.h.*.attn.out_proj.weight"
    #   "transformer.h.*.attn.q_proj.weight"
    #   "transformer.h.*.attn.k_proj.weight"
    #   "transformer.h.*.attn.v_proj.weight"
    #if name.endswith(".mlp.fc_in.weight") or \
    #   name.endswith(".attn.out_proj.weight") or \
    #   name.endswith(".attn.q_proj.weight") or \
    #   name.endswith(".attn.k_proj.weight") or \
    #   name.endswith(".attn.v_proj.weight"):
    #    print("  Transposing")
    #    data = data.transpose()

    # header
    str_name = name.encode('utf-8')
    fout.write(struct.pack("iii", n_dims, len(str_name), ftype_cur))
    for i in range(n_dims):
        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
    fout.write(str_name)

    # data
    data.tofile(fout)

fout.close()

print("Done. Output file: " + fname_out)
print("")
1 change: 0 additions & 1 deletion otherarch/gptj_v2.cpp
@@ -143,7 +143,6 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 
         ctx_size += (5 + 10*n_layer)*256; // object overhead
 
-        ctx_size = ctx_size * 3 / 2;
         printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }

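A rough illustration (not part of this commit) of what removing the 1.5x safety factor means for the estimated allocation; n_layer = 28 matches GPT-J-6B, while base_bytes is a purely hypothetical stand-in for the tensor-size terms accumulated earlier in gptj_model_load, which this hunk does not show.

n_layer = 28                       # GPT-J-6B
base_bytes = 12 * 1024**3          # hypothetical total of the tensor-size terms
overhead = (5 + 10*n_layer)*256    # the object-overhead term from the hunk above

old_ctx = (base_bytes + overhead) * 3 // 2   # with the removed 1.5x margin
new_ctx = base_bytes + overhead              # after this commit
print("old: %.2f MB, new: %.2f MB" % (old_ctx/(1024.0*1024.0), new_ctx/(1024.0*1024.0)))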
Binary file removed quantize.exe
