fix train_new_from_iterator in the case of byte-level tokenizers #17549

Merged · 15 commits · Jun 8, 2022
3 changes: 3 additions & 0 deletions src/transformers/tokenization_utils_fast.py
@@ -21,6 +21,7 @@
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union

import tokenizers.pre_tokenizers as pre_tokenizers_fast
from tokenizers import Encoding as EncodingFast
from tokenizers import Tokenizer as TokenizerFast
from tokenizers.decoders import Decoder as DecoderFast
@@ -699,6 +700,8 @@ def train_new_from_iterator(
kwargs["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"]
if tokenizer_json["model"]["type"] == "Unigram" and unk_token is not None:
kwargs["unk_token"] = unk_token
if tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel":
kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()
Contributor comment:
At some point, everything in this block should be ported directly into tokenizers.

Information flow from the Tokenizer to the trainer is a long-standing issue (some options are recoverable, some are not, but it is currently inconsistent).


        trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
        trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
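For context, here is a minimal standalone sketch (using only the tokenizers library; vocab_size is an arbitrary illustrative value) of what the new branch feeds the trainer. The ByteLevel pre-tokenizer exposes its full byte-to-unicode alphabet, and passing it as initial_alphabet keeps all 256 byte symbols in the retrained vocabulary, so any string remains encodable.

from tokenizers import pre_tokenizers, trainers

# The 256 unicode characters that ByteLevel uses to represent every possible byte value.
alphabet = pre_tokenizers.ByteLevel.alphabet()
assert len(alphabet) == 256

# Seeding the trainer with this alphabet mirrors the kwargs["initial_alphabet"] line in the diff above.
trainer = trainers.BpeTrainer(vocab_size=300, initial_alphabet=alphabet)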
1 change: 1 addition & 0 deletions tests/models/bart/test_modeling_bart.py
@@ -150,6 +150,7 @@ def get_config(self):
    def get_pipeline_config(self):
        config = self.get_config()
        config.max_position_embeddings = 100
        config.vocab_size = 300
        return config

    def prepare_config_and_inputs_for_common(self):
1 change: 1 addition & 0 deletions tests/models/blenderbot/test_modeling_blenderbot.py
@@ -140,6 +140,7 @@ def get_config(self):
    def get_pipeline_config(self):
        config = self.get_config()
        config.max_position_embeddings = 100
        config.vocab_size = 300
        return config

    def prepare_config_and_inputs_for_common(self):
5 changes: 5 additions & 0 deletions tests/models/deberta/test_modeling_deberta.py
@@ -130,6 +130,11 @@ def get_config(self):
            pos_att_type=self.pos_att_type,
        )

    def get_pipeline_config(self):
        config = self.get_config()
        config.vocab_size = 300
        return config

    def check_loss_output(self, result):
        self.parent.assertListEqual(list(result.loss.size()), [])

5 changes: 5 additions & 0 deletions tests/models/gpt2/test_modeling_gpt2.py
@@ -166,6 +166,11 @@ def get_config(
            reorder_and_upcast_attn=reorder_and_upcast_attn,
        )

    def get_pipeline_config(self):
        config = self.get_config()
        config.vocab_size = 300
        return config

    def prepare_config_and_inputs_for_decoder(self):
        (
            config,
5 changes: 5 additions & 0 deletions tests/models/gpt_neo/test_modeling_gpt_neo.py
@@ -151,6 +151,11 @@ def get_config(self):
            attention_types=self.attention_types,
        )

    def get_pipeline_config(self):
        config = self.get_config()
        config.vocab_size = 300
        return config

    def prepare_config_and_inputs_for_decoder(self):
        (
            config,
5 changes: 5 additions & 0 deletions tests/models/gptj/test_modeling_gptj.py
@@ -155,6 +155,11 @@ def get_config(self):
            rotary_dim=self.rotary_dim,
        )

    def get_pipeline_config(self):
        config = self.get_config()
        config.vocab_size = 300
        return config

    def prepare_config_and_inputs_for_decoder(self):
        (
            config,
5 changes: 5 additions & 0 deletions tests/models/ibert/test_modeling_ibert.py
@@ -116,6 +116,11 @@ def get_config(self):
            quant_mode=True,
        )

    def get_pipeline_config(self):
        config = self.get_config()
        config.vocab_size = 300
        return config

    def create_and_check_model(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
1 change: 1 addition & 0 deletions tests/models/led/test_modeling_led.py
@@ -163,6 +163,7 @@ def get_config(self):
    def get_pipeline_config(self):
        config = self.get_config()
        config.max_position_embeddings = 100
        config.vocab_size = 300
        return config

    def prepare_config_and_inputs_for_common(self):
5 changes: 5 additions & 0 deletions tests/models/longformer/test_modeling_longformer.py
@@ -113,6 +113,11 @@ def get_config(self):
            attention_window=self.attention_window,
        )

    def get_pipeline_config(self):
        config = self.get_config()
        config.vocab_size = 300
        return config

    def create_and_check_attention_mask_determinism(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
5 changes: 5 additions & 0 deletions tests/models/roberta/test_modeling_roberta.py
@@ -112,6 +112,11 @@ def get_config(self):
            initializer_range=self.initializer_range,
        )

    def get_pipeline_config(self):
        config = self.get_config()
        config.vocab_size = 300
        return config

    def prepare_config_and_inputs_for_decoder(self):
        (
            config,
5 changes: 5 additions & 0 deletions tests/models/yoso/test_modeling_yoso.py
@@ -126,6 +126,11 @@ def get_config(self):
            initializer_range=self.initializer_range,
        )

    def get_pipeline_config(self):
        config = self.get_config()
        config.vocab_size = 300
        return config

    def prepare_config_and_inputs_for_decoder(self):
        (
            config,
10 changes: 10 additions & 0 deletions tests/tokenization/test_tokenization_fast.py
@@ -39,6 +39,7 @@ def setUp(self):
        self.test_rust_tokenizer = True

        model_paths = ["robot-test/dummy-tokenizer-fast", "robot-test/dummy-tokenizer-wordlevel"]
        self.bytelevel_bpe_model_name = "SaulLu/dummy-tokenizer-bytelevel-bpe"

        # Inclusion of 2 tokenizers to test different types of models (Unigram and WordLevel for the moment)
        self.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths]
@@ -99,6 +100,15 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
        shutil.rmtree(self.tmpdirname)
        self.tmpdirname = tmpdirname_orig

    def test_training_new_tokenizer_with_bytelevel(self):
        tokenizer = self.rust_tokenizer_class.from_pretrained(self.bytelevel_bpe_model_name)

        toy_text_iterator = ("a" for _ in range(1000))
        new_tokenizer = tokenizer.train_new_from_iterator(text_iterator=toy_text_iterator, length=1000, vocab_size=50)

        encoding_ids = new_tokenizer.encode("a🤗")
        self.assertEqual(encoding_ids, [64, 172, 253, 97, 245])
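For illustration, a minimal end-to-end sketch of the behaviour this test exercises, using the public gpt2 checkpoint (a byte-level BPE tokenizer) instead of the dummy model; the corpus and vocab_size below are arbitrary:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # gpt2 uses a ByteLevel pre-tokenizer
corpus = (line for line in ["only ascii text", "in this toy corpus"])
new_tokenizer = tokenizer.train_new_from_iterator(corpus, vocab_size=300)

# 🤗 never appears in the corpus, yet its UTF-8 bytes are present in the new
# vocabulary because the initial alphabet was seeded with ByteLevel.alphabet(),
# so the emoji still breaks down into known byte-level tokens.
print(new_tokenizer.tokenize("a🤗"))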


@require_tokenizers
class TokenizerVersioningTest(unittest.TestCase):