
Commit

Revert "Temporary revert to old vocab conversion for falcon"
This reverts commit 63dd07a.
Galunid committed Oct 31, 2023
1 parent 80de47c commit fcae724
Showing 1 changed file with 0 additions and 24 deletions.
model.py: 0 additions, 24 deletions

@@ -568,30 +568,6 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
         self.gguf_writer.add_file_type(self.ftype)
 
-    def set_vocab(self):
-        tokens = []
-        scores = []
-        toktypes = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
-        assert max(tokenizer.vocab.values()) < vocab_size
-
-        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-
-        for i in range(vocab_size):
-            tokens.append(reverse_vocab[i])
-            scores.append(0.0)  # dummy
-            toktypes.append(gguf.TokenType.NORMAL)
-
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges = True, n_vocab = len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
     def write_tensors(self):
         block_count = self.hparams.get("num_hidden_layers")
         if block_count is None:
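Removing this override means the Falcon model class presumably falls back to whatever set_vocab its base class in model.py defines. For reference, here is a minimal, self-contained sketch of the Hugging-Face-tokenizer-based conversion path that this commit deletes; the function name write_hf_vocab and its parameters are hypothetical stand-ins for the instance attributes (self.gguf_writer, self.dir_model, self.hparams) used in the original method.

# Sketch of the deleted conversion path, lifted from the diff above.
# Assumes `dir_model` points at a local Hugging Face checkpoint and
# `hparams` is the parsed config.json dict; `gguf_writer` is an
# already-initialized gguf.GGUFWriter.
import gguf
from transformers import AutoTokenizer

def write_hf_vocab(gguf_writer: gguf.GGUFWriter, dir_model: str, hparams: dict) -> None:
    tokenizer = AutoTokenizer.from_pretrained(dir_model)

    # Prefer the declared vocab_size; fall back to the tokenizer's own size.
    vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
    assert max(tokenizer.vocab.values()) < vocab_size

    # Invert token -> id so tokens can be emitted in id order.
    reverse_vocab = {tok_id: tok for tok, tok_id in tokenizer.vocab.items()}

    tokens = [reverse_vocab[i] for i in range(vocab_size)]
    scores = [0.0] * vocab_size                      # dummy scores
    toktypes = [gguf.TokenType.NORMAL] * vocab_size  # every token marked NORMAL

    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_scores(scores)
    gguf_writer.add_token_types(toktypes)

    # BPE merges and special token ids are added via SpecialVocab.
    special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab=len(tokens))
    special_vocab.add_to_gguf(gguf_writer)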
