Skip to content
Minhao Chou edited this page Apr 5, 2022 · 3 revisions

Guide to transformer tokenizers

BERT tokenizer

initialize tokenizer

tokenizers::BertTokenizer::Options options{};
options.vocab_file = "/bert/vocab/file/path";
std::unique_ptr<BertTokenizer> tokenizer = tokenizers::BertTokenizer::CreateTokenizer(options);

encode a batch of single sentences (pass nullptr as the second text list)

std::vector<std::string> texts = {"bert tokenizer", "gpt tokenizer"};
std::vector<EncodeOutput> batch_outputs = tokenizer->BatchEncode(&texts, nullptr, /*max_length=*/512);

encode a batch of sentence pairs

std::vector<std::string> a_texts = {"bert tokenizer", "gpt tokenizer"};
std::vector<std::string> b_texts = {"transformer encoder", "transformer decoder"};
std::vector<EncodeOutput> batch_outputs = tokenizer->BatchEncode(&a_texts, &b_texts, /*max_length=*/512);