
Update gpt2 preprocess and add deepseek coder preprocess #4070

Status: Open. Wants to merge 23 commits into base: master.
Changes from 4 commits
1 change: 1 addition & 0 deletions .gitignore
@@ -96,5 +96,6 @@ tests/test-quantize-perf
tests/test-sampling
tests/test-tokenizer-0-llama
tests/test-tokenizer-0-falcon
+tests/test-tokenizer-0-deepseek_coder
tests/test-tokenizer-1-llama
tests/test-tokenizer-1-bpe
8 changes: 7 additions & 1 deletion Makefile
@@ -8,7 +8,8 @@ BUILD_TARGETS = \
TEST_TARGETS = \
	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
+	tests/test-tokenizer-0-falcon tests/test-tokenizer-0-deepseek_coder \
+	tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe

# Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -69,6 +70,8 @@ test: $(TEST_TARGETS)
		./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
	elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
		./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
+	elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek_coder" ]; then \
+		./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
	elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
		continue; \
	elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
@@ -712,6 +715,9 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+tests/test-tokenizer-0-deepseek_coder: tests/test-tokenizer-0-deepseek_coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
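With these two hunks, the deepseek_coder tokenizer test gets its own build rule and is wired into the make test loop. Assuming models/ggml-vocab-deepseek-coder.gguf has already been generated, it should also be possible to build and run the test directly: make tests/test-tokenizer-0-deepseek_coder, then ./tests/test-tokenizer-0-deepseek_coder models/ggml-vocab-deepseek-coder.gguf.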
91 changes: 90 additions & 1 deletion convert-hf-to-gguf.py
@@ -169,6 +169,30 @@ def from_model_architecture(model_architecture):
        if model_architecture == "PersimmonForCausalLM":
            return PersimmonModel
        return Model

+    @staticmethod
+    def from_model_name(model_name: str):
+        if model_name == "StableLMEpoch":
+            return StableLMModel
+        if model_name == "GPTNeoX":
+            return GPTNeoXModel
+        if model_name == "Bloom":
+            return BloomModel
+        if model_name == "MPT":
+            return MPTModel
+        if model_name in ("Baichuan", "BaiChuan"):
+            return BaichuanModel
+        if model_name in ("Falcon", "RW"):
+            return FalconModel
+        if model_name == "GPTBigCode":
+            return StarCoderModel
+        if model_name == "GPTRefact":
+            return RefactModel
+        if model_name == "Persimmon":
+            return PersimmonModel
+        if model_name == "DeepseekCoder":
+            return DeepseekCoderModel
+        return Model

    def _is_model_safetensors(self) -> bool:
        return Model.count_model_parts(self.dir_model, ".safetensors") > 0
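For context, a minimal sketch of how the new name-based dispatch behaves; the calls are illustrative and not part of the diff:

# Illustrative use of from_model_name() as added above:
model_class = Model.from_model_name("DeepseekCoder")    # resolves to DeepseekCoderModel
fallback_class = Model.from_model_name("NotARealArch")  # unknown names fall back to the generic Model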
@@ -201,6 +225,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH:
            return gguf.MODEL_ARCH.REFACT
        if arch == "PersimmonForCausalLM":
            return gguf.MODEL_ARCH.PERSIMMON
+        if arch == "LlamaForCausalLM":
+            return gguf.MODEL_ARCH.LLAMA

        raise NotImplementedError(f'Architecture "{arch}" not supported!')

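For context: DeepSeek-Coder checkpoints reuse the LLaMA architecture, so their config.json lists "LlamaForCausalLM" under "architectures". This mapping is presumably what lets a DeepSeek-Coder model fall through to the standard LLAMA tensor layout when the converter selects the class by architecture rather than by name.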
@@ -823,6 +849,68 @@ def write_tensors(self):
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

+class DeepseekCoderModel(Model):
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_count = self.hparams["num_attention_heads"]
+        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
+        ctx_length = self.hparams["max_position_embeddings"]
+
+        self.gguf_writer.add_name("deepseek_coder")
+        self.gguf_writer.add_context_length(ctx_length)
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count(head_count)
+        self.gguf_writer.add_head_count_kv(head_count_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
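To make the hparams lookups above concrete, here is a hypothetical config.json excerpt with the fields set_gguf_parameters() reads; the values are illustrative and not taken from any real DeepSeek-Coder checkpoint:

# Hypothetical hparams (config.json) excerpt; values are illustrative only:
hparams = {
    "num_hidden_layers": 24,            # -> add_block_count
    "num_attention_heads": 16,          # -> add_head_count
    "num_key_value_heads": 16,          # optional; defaults to num_attention_heads
    "max_position_embeddings": 16384,   # -> add_context_length
    "hidden_size": 2048,                # -> add_embedding_length
    "intermediate_size": 5504,          # -> add_feed_forward_length
    "rms_norm_eps": 1e-6,               # -> add_layer_norm_rms_eps
    "rope_theta": 100000,               # -> add_rope_freq_base
    "rope_scaling": {"type": "linear", "factor": 4.0},  # optional linear RoPE scaling
}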
+    def set_vocab(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer  # type: ignore[attr-defined]
+        tokenizer = AutoTokenizer.from_pretrained(dir_model)
+        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+        assert max(tokenizer.vocab.values()) < vocab_size
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+        special_tokens = tokenizer.all_special_tokens
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                pad_token = f"[PAD{i}]".encode('utf-8')
+                tokens.append(bytearray(pad_token))
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                if reverse_vocab[i] in special_tokens:
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.USER_DEFINED)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("deepseek_coder")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
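The loop above has three paths: ids missing from the vocab become [PADn] filler tokens, added tokens become CONTROL (if special) or USER_DEFINED, and everything else is NORMAL. A self-contained toy sketch of those rules, with made-up tokens and a deliberately missing id:

# Toy reproduction of the classification rules above; tokens and ids are made up.
import gguf  # assumes the repo's gguf-py package is on the path

reverse_vocab = {0: "hello", 1: "<pad>", 3: "world"}  # id 2 intentionally missing
added_vocab = {"<pad>": 1, "world": 3}                # tokens added on top of the base vocab
special_tokens = ["<pad>"]                            # the special subset of the added tokens
vocab_size = 4

tokens, toktypes = [], []
for i in range(vocab_size):
    if i not in reverse_vocab:
        tokens.append(bytearray(f"[PAD{i}]".encode("utf-8")))  # gap in ids -> [PAD2]
        toktypes.append(gguf.TokenType.USER_DEFINED)
    elif reverse_vocab[i] in added_vocab:
        tokens.append(reverse_vocab[i])
        toktypes.append(gguf.TokenType.CONTROL if reverse_vocab[i] in special_tokens
                        else gguf.TokenType.USER_DEFINED)
    else:
        tokens.append(reverse_vocab[i])
        toktypes.append(gguf.TokenType.NORMAL)

# Result: hello -> NORMAL, <pad> -> CONTROL, [PAD2] -> USER_DEFINED, world -> USER_DEFINED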
+
+
+
+

###### CONVERSION LOGIC ######

@@ -845,6 +933,7 @@ def parse_args() -> argparse.Namespace:
        "model", type=Path,
        help="directory containing model file",
    )
+    parser.add_argument("--model-name", type=str, default=None, help="name of the model")

    return parser.parse_args()

@@ -871,7 +960,7 @@ def parse_args() -> argparse.Namespace:

hparams = Model.load_hparams(dir_model)

-model_class = Model.from_model_architecture(hparams["architectures"][0])
+model_class = Model.from_model_name(args.model_name) if args.model_name else Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)

print("Set model parameters")
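With the new flag, a DeepSeek-Coder conversion can presumably be invoked as python convert-hf-to-gguf.py /path/to/deepseek-coder --model-name DeepseekCoder (the path is hypothetical). Without --model-name, the converter falls back to the "architectures" entry in config.json, which for DeepSeek-Coder resolves to the LLaMA path added earlier in this diff.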