From 81f35baf5156c7dfdb81597d1984619028246ab3 Mon Sep 17 00:00:00 2001 From: wj-Mcat <1435130236@qq.com> Date: Sat, 6 Aug 2022 09:34:07 +0000 Subject: [PATCH 01/15] add roformer unittest --- paddlenlp/transformers/roformer/modeling.py | 12 + paddlenlp/transformers/roformer/tokenizer.py | 24 +- tests/transformers/roformer/__init__.py | 0 tests/transformers/roformer/test_modeling.py | 290 ++++++++++++++++++ tests/transformers/roformer/test_tokenizer.py | 268 ++++++++++++++++ 5 files changed, 592 insertions(+), 2 deletions(-) create mode 100644 tests/transformers/roformer/__init__.py create mode 100644 tests/transformers/roformer/test_modeling.py create mode 100644 tests/transformers/roformer/test_tokenizer.py diff --git a/paddlenlp/transformers/roformer/modeling.py b/paddlenlp/transformers/roformer/modeling.py index 7560d71b117d..36cdcc069f5d 100644 --- a/paddlenlp/transformers/roformer/modeling.py +++ b/paddlenlp/transformers/roformer/modeling.py @@ -451,6 +451,12 @@ def init_weights(self, layer): elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 + def get_input_embeddings(self) -> nn.Embedding: + return self.roformer.get_input_embeddings() + + def set_input_embeddings(self, embedding: nn.Embedding): + self.roformer.set_input_embeddings(embedding) + @register_base_model class RoFormerModel(RoFormerPretrainedModel): @@ -671,6 +677,12 @@ def forward( else: return sequence_output, pooled_output + def get_input_embeddings(self) -> nn.Embedding: + return self.embeddings.word_embeddings + + def set_input_embeddings(self, embedding: nn.Embedding): + self.embeddings.word_embeddings = embedding + class RoFormerForQuestionAnswering(RoFormerPretrainedModel): """ diff --git a/paddlenlp/transformers/roformer/tokenizer.py b/paddlenlp/transformers/roformer/tokenizer.py index 11b95960c953..54c95111e1c7 100644 --- a/paddlenlp/transformers/roformer/tokenizer.py +++ b/paddlenlp/transformers/roformer/tokenizer.py @@ -35,10 +35,18 @@ class JiebaBasicTokenizer(BasicTokenizer): Defaults to `True`. 
""" - def __init__(self, vocab, do_lower_case=True): + def __init__(self, + vocab, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None): """Constructs a JiebaBasicTokenizer.""" + super().__init__(never_split=never_split, + do_lower_case=do_lower_case, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents) self.vocab = vocab - self.do_lower_case = do_lower_case def _tokenize_chinese_chars(self, text): output = [] @@ -140,6 +148,18 @@ class RoFormerTokenizer(PretrainedTokenizer): "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-english-small-generator/vocab.txt", } } + max_model_input_sizes = { + "roformer-chinese-small": 512, + "roformer-chinese-base": 1536, + "roformer-chinese-char-small": 512, + "roformer-chinese-char-base": 512, + "roformer-chinese-sim-char-ft-small": 512, + "roformer-chinese-sim-char-ft-base": 512, + "roformer-chinese-sim-char-small": 512, + "roformer-chinese-sim-char-base": 512, + "roformer-english-small-discriminator": 128, + "roformer-english-small-generator": 128, + } pretrained_init_configuration = { "roformer-chinese-small": { "do_lower_case": True, diff --git a/tests/transformers/roformer/__init__.py b/tests/transformers/roformer/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/transformers/roformer/test_modeling.py b/tests/transformers/roformer/test_modeling.py new file mode 100644 index 000000000000..2ea456096f12 --- /dev/null +++ b/tests/transformers/roformer/test_modeling.py @@ -0,0 +1,290 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +from typing import Optional, Tuple +from dataclasses import dataclass, fields, Field +from dataclasses_json import dataclass_json + +import paddle + +from paddlenlp.transformers import ( + RoFormerModel, RoFormerPretrainedModel, RoFormerForPretraining, + RoFormerForSequenceClassification, RoFormerForTokenClassification, + RoFormerForQuestionAnswering, RoFormerForMultipleChoice, + RoFormerForMaskedLM) + +from tests.transformers.test_modeling_common import ids_tensor, floats_tensor, random_attention_mask, ModelTesterMixin +from tests.testing_utils import slow + + +@dataclass +class RoFormerModelTestModelConfig: + """RoFormerModel model config which keep consist with pretrained_init_configuration sub fields + """ + vocab_size: int = 50000 + embedding_size: int = 384 + hidden_size: int = 384 + num_hidden_layers: int = 6 + num_attention_heads: int = 6 + intermediate_size: int = 1536 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + pad_token_id: int = 0 + rotary_value: bool = False + + @property + def model_kwargs(self) -> dict: + """get the model kwargs configuration to init the model""" + model_config_fields: Tuple[Field, + ...] = fields(RoFormerModelTestModelConfig) + return { + field.name: getattr(self, field.name) + for field in model_config_fields + } + + +@dataclass +class RoFormerModelTestConfig(RoFormerModelTestModelConfig): + """train config under unittest code""" + batch_size: int = 2 + seq_length: int = 7 + is_training: bool = False + use_input_mask: bool = False + use_token_type_ids: bool = True + + # used for sequence classification + num_classes: int = 3 + num_choices: int = 3 + + +class RoFormerModelTester: + + def __init__( + self, + parent, + config: Optional[RoFormerModelTestConfig] = None, + ): + self.parent = parent + self.config: RoFormerModelTestConfig = config or RoFormerModelTestConfig( + ) + + self.is_training = self.config.is_training + self.num_classes = self.config.num_classes + self.num_choices = self.config.num_choices + + def prepare_config_and_inputs(self): + config = self.config + input_ids = ids_tensor([config.batch_size, config.seq_length], + config.vocab_size) + + input_mask = None + if self.config.use_input_mask: + input_mask = random_attention_mask( + [config.batch_size, config.seq_length]) + + token_type_ids = None + if self.config.use_token_type_ids: + token_type_ids = ids_tensor([config.batch_size, config.seq_length], + config.type_vocab_size) + + config = self.get_config() + return config, input_ids, token_type_ids, input_mask + + def get_config(self) -> dict: + return self.config.model_kwargs + + def create_and_check_model( + self, + config, + input_ids, + token_type_ids, + input_mask, + ): + model = RoFormerModel(**config) + model.eval() + result = model(input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result[0].shape, [ + self.config.batch_size, self.config.seq_length, + self.config.hidden_size + ]) + self.parent.assertEqual( + result[1].shape, [self.config.batch_size, self.config.hidden_size]) + + def create_and_check_for_multiple_choice( + self, + config, + input_ids, + token_type_ids, + input_mask, + ): + model = RoFormerForMultipleChoice(RoFormerModel(**config), + num_choices=self.config.num_choices) + model.eval() + 
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand( + [-1, self.config.num_choices, -1]) + + if token_type_ids is not None: + token_type_ids = token_type_ids.unsqueeze(1).expand( + [-1, self.config.num_choices, -1]) + + if input_mask is not None: + input_mask = input_mask.unsqueeze(1).expand( + [-1, self.config.num_choices, -1]) + + result = model( + multiple_choice_inputs_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + ) + self.parent.assertEqual( + result.shape, [self.config.batch_size, self.config.num_choices]) + + def create_and_check_for_masked_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + ): + model = RoFormerForMaskedLM(RoFormerModel(**config)) + model.eval() + result = model(input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids) + self.parent.assertEqual( + result.shape, + [self.config.batch_size, self.config.seq_length, self.vocab_size]) + + def create_and_check_for_sequence_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + ): + model = RoFormerForSequenceClassification( + RoFormerModel(**config), num_classes=self.config.num_classes) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + ) + self.parent.assertEqual( + result.shape, [self.config.batch_size, self.config.num_classes]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + ) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "attention_mask": input_mask + } + return config, inputs_dict + + +class RoFormerModelTest(ModelTesterMixin, unittest.TestCase): + base_model_class = RoFormerModel + + all_model_classes = ( + RoFormerModel, + RoFormerForMultipleChoice, + RoFormerForPretraining, + RoFormerForSequenceClassification, + ) + + def setUp(self): + self.model_tester = RoFormerModelTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice( + *config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification( + *config_and_inputs) + + # @slow + def test_model_from_pretrained(self): + for model_name in list( + RoFormerPretrainedModel.pretrained_init_configuration)[:1]: + model = RoFormerModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class RoFormerModelIntegrationTest(unittest.TestCase): + + # @slow + def test_inference_no_attention(self): + model = RoFormerModel.from_pretrained("roformer-chinese-small") + model.eval() + input_ids = paddle.to_tensor( + [[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + with paddle.no_grad(): + output = model(input_ids)[0] + expected_shape = [1, 11, 384] + self.assertEqual(output.shape, expected_shape) + + expected_slice = paddle.to_tensor( + [[[0.17788891, -2.17795515, 0.28824317], + [-1.70342600, -2.84062195, -0.53377795], + [-0.16374627, -0.67967212, -0.37192002]]]) + + self.assertTrue( + paddle.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) + + # @slow + def test_inference_with_attention(self): + model = 
RoFormerModel.from_pretrained("roformer-chinese-small") + model.eval() + input_ids = paddle.to_tensor( + [[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + attention_mask = paddle.to_tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + with paddle.no_grad(): + output = model(input_ids, attention_mask=attention_mask)[0] + expected_shape = [1, 11, 384] + self.assertEqual(output.shape, expected_shape) + + expected_slice = paddle.to_tensor( + [[[0.17788891, -2.17795515, 0.28824317], + [-1.70342600, -2.84062195, -0.53377795], + [-0.16374627, -0.67967212, -0.37192002]]]) + self.assertTrue( + paddle.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/transformers/roformer/test_tokenizer.py b/tests/transformers/roformer/test_tokenizer.py new file mode 100644 index 000000000000..e67bc5763d57 --- /dev/null +++ b/tests/transformers/roformer/test_tokenizer.py @@ -0,0 +1,268 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +from paddlenlp.data.vocab import Vocab +from paddlenlp.transformers.roformer.tokenizer import (JiebaBasicTokenizer, + RoFormerTokenizer, + WordpieceTokenizer) + +from tests.testing_utils import slow +from tests.transformers.test_tokenizer_common import TokenizerTesterMixin, filter_non_english + + +class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = RoFormerTokenizer + space_between_special_tokens = True + from_pretrained_filter = filter_non_english + test_seq2seq = True + + def setUp(self): + self.from_pretrained_kwargs = {"do_lower_case": False} + + super().setUp() + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + + self.vocab_file = os.path.join( + self.tmpdirname, + RoFormerTokenizer.resource_files_names["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + self.vocab = Vocab.from_dict( + {token: index + for index, token in enumerate(vocab_tokens)}, + unk_token='[UNK]', + pad_token='[PAD]', + bos_token='[CLS]', + eos_token='[SEP]', + ) + + def get_input_output_texts(self, tokenizer): + input_text = "UNwant\u00E9d,running" + output_text = "unwanted, running" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + tokens = tokenizer.tokenize("UNwant\u00E9d,running") + self.assertListEqual(tokens, + ["un", "##want", "##ed", ",", "runn", "##ing"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), + [9, 6, 7, 12, 10, 11]) + + def test_chinese(self): + tokenizer = JiebaBasicTokenizer(self.vocab) + + self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), + ["ah", "\u535A", "\u63A8", "zz"]) + + def 
test_basic_tokenizer_lower(self): + tokenizer = JiebaBasicTokenizer(self.vocab, do_lower_case=True) + + self.assertListEqual(tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), + ["hello", "!", "how", "are", "you", "?"]) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_lower_strip_accents_false(self): + tokenizer = JiebaBasicTokenizer(self.vocab, + do_lower_case=True, + strip_accents=False) + + self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), + ["hällo", "!", "how", "are", "you", "?"]) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) + + def test_basic_tokenizer_lower_strip_accents_true(self): + tokenizer = JiebaBasicTokenizer(self.vocab, do_lower_case=True) + + self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), + ["hallo", "!", "how", "are", "you", "?"]) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_lower_strip_accents_default(self): + tokenizer = JiebaBasicTokenizer(self.vocab, do_lower_case=True) + + self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), + ["hallo", "!", "how", "are", "you", "?"]) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_no_lower(self): + tokenizer = JiebaBasicTokenizer(self.vocab, do_lower_case=False) + + self.assertListEqual(tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), + ["HeLLo", "!", "how", "Are", "yoU", "?"]) + + def test_basic_tokenizer_no_lower_strip_accents_false(self): + tokenizer = JiebaBasicTokenizer(self.vocab, + do_lower_case=False, + strip_accents=False) + + self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), + ["HäLLo", "!", "how", "Are", "yoU", "?"]) + + def test_basic_tokenizer_no_lower_strip_accents_true(self): + tokenizer = JiebaBasicTokenizer(self.vocab, + do_lower_case=False, + strip_accents=True) + + self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), + ["HaLLo", "!", "how", "Are", "yoU", "?"]) + + def test_basic_tokenizer_respects_never_split_tokens(self): + tokenizer = JiebaBasicTokenizer(self.vocab, + do_lower_case=False, + never_split=["[UNK]"]) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? 
[UNK]"), + ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]) + + def test_wordpiece_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", + "runn", "##ing" + ] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") + + self.assertListEqual(tokenizer.tokenize(""), []) + + self.assertListEqual(tokenizer.tokenize("unwanted running"), + ["un", "##want", "##ed", "runn", "##ing"]) + + self.assertListEqual(tokenizer.tokenize("unwantedX running"), + ["[UNK]", "runn", "##ing"]) + + def test_clean_text(self): + tokenizer = self.get_tokenizer() + + # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340 + self.assertListEqual( + [tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], + [["[UNK]"], [], ["[UNK]"]]) + + # @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained( + "roformer-chinese-small") + + text = tokenizer.encode("sequence builders", + return_token_type_ids=None, + add_special_tokens=False)["input_ids"] + text_2 = tokenizer.encode("multi-sequence build", + return_token_type_ids=None, + add_special_tokens=False)["input_ids"] + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == [101] + text + [102] + assert encoded_pair == [101] + text + [102] + text_2 + [102] + + def test_offsets_with_special_characters(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest( + f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer = self.tokenizer_class.from_pretrained( + pretrained_name, **kwargs) + + # sentence = f"testing with {tokenizer.mask_token} simple sentence" + sentence = f"a simple {tokenizer.mask_token} allennlp sentence." 
+ tokens = tokenizer.encode( + sentence, + return_attention_mask=False, + return_token_type_ids=False, + return_offsets_mapping=True, + add_special_tokens=True, + ) + expected_results = [ + ((0, 0), tokenizer.cls_token), + ((0, 1), "a"), + ((2, 8), "simple"), + ((9, 15), tokenizer.mask_token), + ((16, 21), "allen"), + ((21, 23), "##nl"), + ((23, 24), "##p"), + ((25, 33), "sentence"), + ((33, 34), "."), + ((0, 0), tokenizer.sep_token), + ] + + self.assertEqual([e[1] for e in expected_results], + tokenizer.convert_ids_to_tokens( + tokens["input_ids"])) + self.assertEqual([e[0] for e in expected_results], + tokens["offset_mapping"]) + + def test_change_tokenize_chinese_chars(self): + list_of_commun_chinese_char = ["的", "人", "有"] + text_with_chinese_char = "".join(list_of_commun_chinese_char) + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest( + f"{tokenizer.__class__.__name__} ({pretrained_name})"): + + kwargs["tokenize_chinese_chars"] = True + tokenizer = self.tokenizer_class.from_pretrained( + pretrained_name, **kwargs) + + ids_without_spe_char_p = tokenizer.encode( + text_with_chinese_char, + return_token_type_ids=None, + add_special_tokens=False)["input_ids"] + + tokens_without_spe_char_p = tokenizer.convert_ids_to_tokens( + ids_without_spe_char_p) + + # it is expected that each Chinese character is not preceded by "##" + self.assertListEqual(tokens_without_spe_char_p, + list_of_commun_chinese_char) + ''' + kwargs["tokenize_chinese_chars"] = False + tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + ids_without_spe_char_p = tokenizer.encode(text_with_chinese_char, return_token_type_ids=None,add_special_tokens=False)["input_ids"] + + tokens_without_spe_char_p = tokenizer.convert_ids_to_tokens(ids_without_spe_char_p) + + # it is expected that only the first Chinese character is not preceded by "##". 
+ expected_tokens = [ + f"##{token}" if idx != 0 else token for idx, token in enumerate(list_of_commun_chinese_char) + ] + self.assertListEqual(tokens_without_spe_char_p, expected_tokens) + ''' From 543a349e555826100498939c7789c0cc8a26ded3 Mon Sep 17 00:00:00 2001 From: wj-Mcat <1435130236@qq.com> Date: Sat, 6 Aug 2022 09:34:07 +0000 Subject: [PATCH 02/15] add roformer unittest --- paddlenlp/transformers/roformer/modeling.py | 12 + paddlenlp/transformers/roformer/tokenizer.py | 24 +- tests/transformers/roformer/__init__.py | 0 tests/transformers/roformer/test_modeling.py | 290 ++++++++++++++++++ tests/transformers/roformer/test_tokenizer.py | 268 ++++++++++++++++ 5 files changed, 592 insertions(+), 2 deletions(-) create mode 100644 tests/transformers/roformer/__init__.py create mode 100644 tests/transformers/roformer/test_modeling.py create mode 100644 tests/transformers/roformer/test_tokenizer.py diff --git a/paddlenlp/transformers/roformer/modeling.py b/paddlenlp/transformers/roformer/modeling.py index 7560d71b117d..36cdcc069f5d 100644 --- a/paddlenlp/transformers/roformer/modeling.py +++ b/paddlenlp/transformers/roformer/modeling.py @@ -451,6 +451,12 @@ def init_weights(self, layer): elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 + def get_input_embeddings(self) -> nn.Embedding: + return self.roformer.get_input_embeddings() + + def set_input_embeddings(self, embedding: nn.Embedding): + self.roformer.set_input_embeddings(embedding) + @register_base_model class RoFormerModel(RoFormerPretrainedModel): @@ -671,6 +677,12 @@ def forward( else: return sequence_output, pooled_output + def get_input_embeddings(self) -> nn.Embedding: + return self.embeddings.word_embeddings + + def set_input_embeddings(self, embedding: nn.Embedding): + self.embeddings.word_embeddings = embedding + class RoFormerForQuestionAnswering(RoFormerPretrainedModel): """ diff --git a/paddlenlp/transformers/roformer/tokenizer.py b/paddlenlp/transformers/roformer/tokenizer.py index 11b95960c953..54c95111e1c7 100644 --- a/paddlenlp/transformers/roformer/tokenizer.py +++ b/paddlenlp/transformers/roformer/tokenizer.py @@ -35,10 +35,18 @@ class JiebaBasicTokenizer(BasicTokenizer): Defaults to `True`. 
""" - def __init__(self, vocab, do_lower_case=True): + def __init__(self, + vocab, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None): """Constructs a JiebaBasicTokenizer.""" + super().__init__(never_split=never_split, + do_lower_case=do_lower_case, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents) self.vocab = vocab - self.do_lower_case = do_lower_case def _tokenize_chinese_chars(self, text): output = [] @@ -140,6 +148,18 @@ class RoFormerTokenizer(PretrainedTokenizer): "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-english-small-generator/vocab.txt", } } + max_model_input_sizes = { + "roformer-chinese-small": 512, + "roformer-chinese-base": 1536, + "roformer-chinese-char-small": 512, + "roformer-chinese-char-base": 512, + "roformer-chinese-sim-char-ft-small": 512, + "roformer-chinese-sim-char-ft-base": 512, + "roformer-chinese-sim-char-small": 512, + "roformer-chinese-sim-char-base": 512, + "roformer-english-small-discriminator": 128, + "roformer-english-small-generator": 128, + } pretrained_init_configuration = { "roformer-chinese-small": { "do_lower_case": True, diff --git a/tests/transformers/roformer/__init__.py b/tests/transformers/roformer/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/transformers/roformer/test_modeling.py b/tests/transformers/roformer/test_modeling.py new file mode 100644 index 000000000000..9fae632f848f --- /dev/null +++ b/tests/transformers/roformer/test_modeling.py @@ -0,0 +1,290 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +from typing import Optional, Tuple +from dataclasses import dataclass, fields, Field +from dataclasses_json import dataclass_json + +import paddle + +from paddlenlp.transformers import ( + RoFormerModel, RoFormerPretrainedModel, RoFormerForPretraining, + RoFormerForSequenceClassification, RoFormerForTokenClassification, + RoFormerForQuestionAnswering, RoFormerForMultipleChoice, + RoFormerForMaskedLM) + +from tests.transformers.test_modeling_common import ids_tensor, floats_tensor, random_attention_mask, ModelTesterMixin +from tests.testing_utils import slow + + +@dataclass +class RoFormerModelTestModelConfig: + """RoFormerModel model config which keep consist with pretrained_init_configuration sub fields + """ + vocab_size: int = 50000 + embedding_size: int = 384 + hidden_size: int = 384 + num_hidden_layers: int = 6 + num_attention_heads: int = 6 + intermediate_size: int = 1536 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + pad_token_id: int = 0 + rotary_value: bool = False + + @property + def model_kwargs(self) -> dict: + """get the model kwargs configuration to init the model""" + model_config_fields: Tuple[Field, + ...] = fields(RoFormerModelTestModelConfig) + return { + field.name: getattr(self, field.name) + for field in model_config_fields + } + + +@dataclass +class RoFormerModelTestConfig(RoFormerModelTestModelConfig): + """train config under unittest code""" + batch_size: int = 2 + seq_length: int = 7 + is_training: bool = False + use_input_mask: bool = False + use_token_type_ids: bool = True + + # used for sequence classification + num_classes: int = 3 + num_choices: int = 3 + + +class RoFormerModelTester: + + def __init__( + self, + parent, + config: Optional[RoFormerModelTestConfig] = None, + ): + self.parent = parent + self.config: RoFormerModelTestConfig = config or RoFormerModelTestConfig( + ) + + self.is_training = self.config.is_training + self.num_classes = self.config.num_classes + self.num_choices = self.config.num_choices + + def prepare_config_and_inputs(self): + config = self.config + input_ids = ids_tensor([config.batch_size, config.seq_length], + config.vocab_size) + + input_mask = None + if self.config.use_input_mask: + input_mask = random_attention_mask( + [config.batch_size, config.seq_length]) + + token_type_ids = None + if self.config.use_token_type_ids: + token_type_ids = ids_tensor([config.batch_size, config.seq_length], + config.type_vocab_size) + + config = self.get_config() + return config, input_ids, token_type_ids, input_mask + + def get_config(self) -> dict: + return self.config.model_kwargs + + def create_and_check_model( + self, + config, + input_ids, + token_type_ids, + input_mask, + ): + model = RoFormerModel(**config) + model.eval() + result = model(input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result[0].shape, [ + self.config.batch_size, self.config.seq_length, + self.config.hidden_size + ]) + self.parent.assertEqual( + result[1].shape, [self.config.batch_size, self.config.hidden_size]) + + def create_and_check_for_multiple_choice( + self, + config, + input_ids, + token_type_ids, + input_mask, + ): + model = RoFormerForMultipleChoice(RoFormerModel(**config), + num_choices=self.config.num_choices) + model.eval() + 
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand( + [-1, self.config.num_choices, -1]) + + if token_type_ids is not None: + token_type_ids = token_type_ids.unsqueeze(1).expand( + [-1, self.config.num_choices, -1]) + + if input_mask is not None: + input_mask = input_mask.unsqueeze(1).expand( + [-1, self.config.num_choices, -1]) + + result = model( + multiple_choice_inputs_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + ) + self.parent.assertEqual( + result.shape, [self.config.batch_size, self.config.num_choices]) + + def create_and_check_for_masked_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + ): + model = RoFormerForMaskedLM(RoFormerModel(**config)) + model.eval() + result = model(input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids) + self.parent.assertEqual( + result.shape, + [self.config.batch_size, self.config.seq_length, self.vocab_size]) + + def create_and_check_for_sequence_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + ): + model = RoFormerForSequenceClassification( + RoFormerModel(**config), num_classes=self.config.num_classes) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + ) + self.parent.assertEqual( + result.shape, [self.config.batch_size, self.config.num_classes]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + ) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "attention_mask": input_mask + } + return config, inputs_dict + + +class RoFormerModelTest(ModelTesterMixin, unittest.TestCase): + base_model_class = RoFormerModel + + all_model_classes = ( + RoFormerModel, + RoFormerForMultipleChoice, + RoFormerForPretraining, + RoFormerForSequenceClassification, + ) + + def setUp(self): + self.model_tester = RoFormerModelTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice( + *config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification( + *config_and_inputs) + + # @slow + def test_model_from_pretrained(self): + for model_name in list( + RoFormerPretrainedModel.pretrained_init_configuration)[:1]: + model = RoFormerModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class RoFormerModelIntegrationTest(unittest.TestCase): + + @slow + def test_inference_no_attention(self): + model = RoFormerModel.from_pretrained("roformer-chinese-small") + model.eval() + input_ids = paddle.to_tensor( + [[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + with paddle.no_grad(): + output = model(input_ids)[0] + expected_shape = [1, 11, 384] + self.assertEqual(output.shape, expected_shape) + + expected_slice = paddle.to_tensor( + [[[0.17788891, -2.17795515, 0.28824317], + [-1.70342600, -2.84062195, -0.53377795], + [-0.16374627, -0.67967212, -0.37192002]]]) + + self.assertTrue( + paddle.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) + + @slow + def test_inference_with_attention(self): + model = 
RoFormerModel.from_pretrained("roformer-chinese-small") + model.eval() + input_ids = paddle.to_tensor( + [[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + attention_mask = paddle.to_tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + with paddle.no_grad(): + output = model(input_ids, attention_mask=attention_mask)[0] + expected_shape = [1, 11, 384] + self.assertEqual(output.shape, expected_shape) + + expected_slice = paddle.to_tensor( + [[[0.17788891, -2.17795515, 0.28824317], + [-1.70342600, -2.84062195, -0.53377795], + [-0.16374627, -0.67967212, -0.37192002]]]) + self.assertTrue( + paddle.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/transformers/roformer/test_tokenizer.py b/tests/transformers/roformer/test_tokenizer.py new file mode 100644 index 000000000000..e67bc5763d57 --- /dev/null +++ b/tests/transformers/roformer/test_tokenizer.py @@ -0,0 +1,268 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +from paddlenlp.data.vocab import Vocab +from paddlenlp.transformers.roformer.tokenizer import (JiebaBasicTokenizer, + RoFormerTokenizer, + WordpieceTokenizer) + +from tests.testing_utils import slow +from tests.transformers.test_tokenizer_common import TokenizerTesterMixin, filter_non_english + + +class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = RoFormerTokenizer + space_between_special_tokens = True + from_pretrained_filter = filter_non_english + test_seq2seq = True + + def setUp(self): + self.from_pretrained_kwargs = {"do_lower_case": False} + + super().setUp() + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + + self.vocab_file = os.path.join( + self.tmpdirname, + RoFormerTokenizer.resource_files_names["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + self.vocab = Vocab.from_dict( + {token: index + for index, token in enumerate(vocab_tokens)}, + unk_token='[UNK]', + pad_token='[PAD]', + bos_token='[CLS]', + eos_token='[SEP]', + ) + + def get_input_output_texts(self, tokenizer): + input_text = "UNwant\u00E9d,running" + output_text = "unwanted, running" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + tokens = tokenizer.tokenize("UNwant\u00E9d,running") + self.assertListEqual(tokens, + ["un", "##want", "##ed", ",", "runn", "##ing"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), + [9, 6, 7, 12, 10, 11]) + + def test_chinese(self): + tokenizer = JiebaBasicTokenizer(self.vocab) + + self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), + ["ah", "\u535A", "\u63A8", "zz"]) + + def 
test_basic_tokenizer_lower(self): + tokenizer = JiebaBasicTokenizer(self.vocab, do_lower_case=True) + + self.assertListEqual(tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), + ["hello", "!", "how", "are", "you", "?"]) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_lower_strip_accents_false(self): + tokenizer = JiebaBasicTokenizer(self.vocab, + do_lower_case=True, + strip_accents=False) + + self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), + ["hällo", "!", "how", "are", "you", "?"]) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) + + def test_basic_tokenizer_lower_strip_accents_true(self): + tokenizer = JiebaBasicTokenizer(self.vocab, do_lower_case=True) + + self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), + ["hallo", "!", "how", "are", "you", "?"]) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_lower_strip_accents_default(self): + tokenizer = JiebaBasicTokenizer(self.vocab, do_lower_case=True) + + self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), + ["hallo", "!", "how", "are", "you", "?"]) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_no_lower(self): + tokenizer = JiebaBasicTokenizer(self.vocab, do_lower_case=False) + + self.assertListEqual(tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), + ["HeLLo", "!", "how", "Are", "yoU", "?"]) + + def test_basic_tokenizer_no_lower_strip_accents_false(self): + tokenizer = JiebaBasicTokenizer(self.vocab, + do_lower_case=False, + strip_accents=False) + + self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), + ["HäLLo", "!", "how", "Are", "yoU", "?"]) + + def test_basic_tokenizer_no_lower_strip_accents_true(self): + tokenizer = JiebaBasicTokenizer(self.vocab, + do_lower_case=False, + strip_accents=True) + + self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), + ["HaLLo", "!", "how", "Are", "yoU", "?"]) + + def test_basic_tokenizer_respects_never_split_tokens(self): + tokenizer = JiebaBasicTokenizer(self.vocab, + do_lower_case=False, + never_split=["[UNK]"]) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? 
[UNK]"), + ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]) + + def test_wordpiece_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", + "runn", "##ing" + ] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") + + self.assertListEqual(tokenizer.tokenize(""), []) + + self.assertListEqual(tokenizer.tokenize("unwanted running"), + ["un", "##want", "##ed", "runn", "##ing"]) + + self.assertListEqual(tokenizer.tokenize("unwantedX running"), + ["[UNK]", "runn", "##ing"]) + + def test_clean_text(self): + tokenizer = self.get_tokenizer() + + # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340 + self.assertListEqual( + [tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], + [["[UNK]"], [], ["[UNK]"]]) + + # @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained( + "roformer-chinese-small") + + text = tokenizer.encode("sequence builders", + return_token_type_ids=None, + add_special_tokens=False)["input_ids"] + text_2 = tokenizer.encode("multi-sequence build", + return_token_type_ids=None, + add_special_tokens=False)["input_ids"] + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == [101] + text + [102] + assert encoded_pair == [101] + text + [102] + text_2 + [102] + + def test_offsets_with_special_characters(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest( + f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer = self.tokenizer_class.from_pretrained( + pretrained_name, **kwargs) + + # sentence = f"testing with {tokenizer.mask_token} simple sentence" + sentence = f"a simple {tokenizer.mask_token} allennlp sentence." 
+ tokens = tokenizer.encode( + sentence, + return_attention_mask=False, + return_token_type_ids=False, + return_offsets_mapping=True, + add_special_tokens=True, + ) + expected_results = [ + ((0, 0), tokenizer.cls_token), + ((0, 1), "a"), + ((2, 8), "simple"), + ((9, 15), tokenizer.mask_token), + ((16, 21), "allen"), + ((21, 23), "##nl"), + ((23, 24), "##p"), + ((25, 33), "sentence"), + ((33, 34), "."), + ((0, 0), tokenizer.sep_token), + ] + + self.assertEqual([e[1] for e in expected_results], + tokenizer.convert_ids_to_tokens( + tokens["input_ids"])) + self.assertEqual([e[0] for e in expected_results], + tokens["offset_mapping"]) + + def test_change_tokenize_chinese_chars(self): + list_of_commun_chinese_char = ["的", "人", "有"] + text_with_chinese_char = "".join(list_of_commun_chinese_char) + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest( + f"{tokenizer.__class__.__name__} ({pretrained_name})"): + + kwargs["tokenize_chinese_chars"] = True + tokenizer = self.tokenizer_class.from_pretrained( + pretrained_name, **kwargs) + + ids_without_spe_char_p = tokenizer.encode( + text_with_chinese_char, + return_token_type_ids=None, + add_special_tokens=False)["input_ids"] + + tokens_without_spe_char_p = tokenizer.convert_ids_to_tokens( + ids_without_spe_char_p) + + # it is expected that each Chinese character is not preceded by "##" + self.assertListEqual(tokens_without_spe_char_p, + list_of_commun_chinese_char) + ''' + kwargs["tokenize_chinese_chars"] = False + tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + ids_without_spe_char_p = tokenizer.encode(text_with_chinese_char, return_token_type_ids=None,add_special_tokens=False)["input_ids"] + + tokens_without_spe_char_p = tokenizer.convert_ids_to_tokens(ids_without_spe_char_p) + + # it is expected that only the first Chinese character is not preceded by "##". 
+ expected_tokens = [ + f"##{token}" if idx != 0 else token for idx, token in enumerate(list_of_commun_chinese_char) + ] + self.assertListEqual(tokens_without_spe_char_p, expected_tokens) + ''' From aa52e3c756fa450ac9c6027573d738655773b5b3 Mon Sep 17 00:00:00 2001 From: wj-Mcat <1435130236@qq.com> Date: Tue, 9 Aug 2022 01:38:06 +0000 Subject: [PATCH 03/15] update test_modeling --- tests/transformers/roformer/test_modeling.py | 53 ++++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/tests/transformers/roformer/test_modeling.py b/tests/transformers/roformer/test_modeling.py index 9fae632f848f..432088bfb97b 100644 --- a/tests/transformers/roformer/test_modeling.py +++ b/tests/transformers/roformer/test_modeling.py @@ -160,6 +160,38 @@ def create_and_check_for_multiple_choice( self.parent.assertEqual( result.shape, [self.config.batch_size, self.config.num_choices]) + def create_and_check_for_question_answering(self, config, input_ids, + token_type_ids, input_mask): + model = RoFormerForQuestionAnswering(RoFormerModel(**config)) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + ) + self.parent.assertEqual( + result[0].shape, [self.config.batch_size, self.config.seq_length]) + self.parent.assertEqual( + result[1].shape, [self.config.batch_size, self.config.seq_length]) + + def create_and_check_for_token_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + ): + model = RoFormerForTokenClassification(RoFormerModel(**config), + num_classes=self.num_classes) + model.eval() + result = model(input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids) + self.parent.assertEqual(result.shape, [ + self.config.batch_size, self.config.seq_length, + self.config.num_classes + ]) + def create_and_check_for_masked_lm( self, config, @@ -172,9 +204,10 @@ def create_and_check_for_masked_lm( result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual( - result.shape, - [self.config.batch_size, self.config.seq_length, self.vocab_size]) + self.parent.assertEqual(result.shape, [ + self.config.batch_size, self.config.seq_length, + self.config.vocab_size + ]) def create_and_check_for_sequence_classification( self, @@ -227,16 +260,30 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + def test_for_multiple_choice(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_multiple_choice( *config_and_inputs) + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering( + *config_and_inputs) + def test_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_sequence_classification( *config_and_inputs) + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification( + *config_and_inputs) + # @slow def test_model_from_pretrained(self): for model_name in list( From bb8e87437335ef8ac56d85ecbd6af6ced90416dc Mon Sep 17 00:00:00 2001 
From: wj-Mcat <1435130236@qq.com> Date: Thu, 11 Aug 2022 09:16:02 +0000 Subject: [PATCH 04/15] use relative import --- tests/transformers/roformer/test_modeling.py | 4 ++-- tests/transformers/roformer/test_tokenizer.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/transformers/roformer/test_modeling.py b/tests/transformers/roformer/test_modeling.py index 432088bfb97b..13ed9312a384 100644 --- a/tests/transformers/roformer/test_modeling.py +++ b/tests/transformers/roformer/test_modeling.py @@ -26,8 +26,8 @@ RoFormerForQuestionAnswering, RoFormerForMultipleChoice, RoFormerForMaskedLM) -from tests.transformers.test_modeling_common import ids_tensor, floats_tensor, random_attention_mask, ModelTesterMixin -from tests.testing_utils import slow +from ..test_modeling_common import ids_tensor, floats_tensor, random_attention_mask, ModelTesterMixin +from ...testing_utils import slow @dataclass diff --git a/tests/transformers/roformer/test_tokenizer.py b/tests/transformers/roformer/test_tokenizer.py index e67bc5763d57..2612d35f2a67 100644 --- a/tests/transformers/roformer/test_tokenizer.py +++ b/tests/transformers/roformer/test_tokenizer.py @@ -21,8 +21,8 @@ RoFormerTokenizer, WordpieceTokenizer) -from tests.testing_utils import slow -from tests.transformers.test_tokenizer_common import TokenizerTesterMixin, filter_non_english +from ...testing_utils import slow +from ..test_tokenizer_common import TokenizerTesterMixin, filter_non_english class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): From 282cf42692cff8ec6cb41c3c1ded765da560205e Mon Sep 17 00:00:00 2001 From: wj-Mcat <1435130236@qq.com> Date: Thu, 11 Aug 2022 12:28:37 +0000 Subject: [PATCH 05/15] reduce model config to accelerate testing --- tests/transformers/roformer/test_modeling.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/transformers/roformer/test_modeling.py b/tests/transformers/roformer/test_modeling.py index 13ed9312a384..44e7ffb5b52d 100644 --- a/tests/transformers/roformer/test_modeling.py +++ b/tests/transformers/roformer/test_modeling.py @@ -16,7 +16,6 @@ import unittest from typing import Optional, Tuple from dataclasses import dataclass, fields, Field -from dataclasses_json import dataclass_json import paddle @@ -34,16 +33,16 @@ class RoFormerModelTestModelConfig: """RoFormerModel model config which keep consist with pretrained_init_configuration sub fields """ - vocab_size: int = 50000 - embedding_size: int = 384 - hidden_size: int = 384 + vocab_size: int = 200 + embedding_size: int = 50 + hidden_size: int = 36 num_hidden_layers: int = 6 num_attention_heads: int = 6 - intermediate_size: int = 1536 + intermediate_size: int = 16 hidden_act: str = "gelu" hidden_dropout_prob: float = 0.1 attention_probs_dropout_prob: float = 0.1 - max_position_embeddings: int = 512 + max_position_embeddings: int = 20 type_vocab_size: int = 2 initializer_range: float = 0.02 pad_token_id: int = 0 From 1be830a9eef07352249493d8786179f9188f2e2b Mon Sep 17 00:00:00 2001 From: wj-Mcat <1435130236@qq.com> Date: Thu, 11 Aug 2022 12:40:22 +0000 Subject: [PATCH 06/15] remove input_embedding from pretrained model --- paddlenlp/transformers/roformer/modeling.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/paddlenlp/transformers/roformer/modeling.py b/paddlenlp/transformers/roformer/modeling.py index 36cdcc069f5d..f5788847a325 100644 --- a/paddlenlp/transformers/roformer/modeling.py +++ b/paddlenlp/transformers/roformer/modeling.py @@ -451,12 +451,6 @@ def 
init_weights(self, layer): elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 - def get_input_embeddings(self) -> nn.Embedding: - return self.roformer.get_input_embeddings() - - def set_input_embeddings(self, embedding: nn.Embedding): - self.roformer.set_input_embeddings(embedding) - @register_base_model class RoFormerModel(RoFormerPretrainedModel): From 2cc4243c0d5e3b07736301f2444bbda09c86932f Mon Sep 17 00:00:00 2001 From: wj-Mcat <1435130236@qq.com> Date: Fri, 12 Aug 2022 11:27:08 +0000 Subject: [PATCH 07/15] revert slow tag --- tests/transformers/roformer/test_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/transformers/roformer/test_modeling.py b/tests/transformers/roformer/test_modeling.py index 44e7ffb5b52d..23a46ddc3c93 100644 --- a/tests/transformers/roformer/test_modeling.py +++ b/tests/transformers/roformer/test_modeling.py @@ -283,7 +283,7 @@ def test_for_token_classification(self): self.model_tester.create_and_check_for_token_classification( *config_and_inputs) - # @slow + @slow def test_model_from_pretrained(self): for model_name in list( RoFormerPretrainedModel.pretrained_init_configuration)[:1]: From c458b9ae0e727e7ca0d32e6a18e01b584ae995ca Mon Sep 17 00:00:00 2001 From: wj-Mcat <1435130236@qq.com> Date: Tue, 16 Aug 2022 06:34:34 +0000 Subject: [PATCH 08/15] update local branch --- paddlenlp/transformers/roformer/tokenizer.py | 8 ++ tests/transformers/roformer/test_tokenizer.py | 82 +------------------ 2 files changed, 9 insertions(+), 81 deletions(-) diff --git a/paddlenlp/transformers/roformer/tokenizer.py b/paddlenlp/transformers/roformer/tokenizer.py index 54c95111e1c7..4482ece02238 100644 --- a/paddlenlp/transformers/roformer/tokenizer.py +++ b/paddlenlp/transformers/roformer/tokenizer.py @@ -417,3 +417,11 @@ def get_special_tokens_mask(self, return [1] + ([0] * len(token_ids_0)) + [1] + ( [0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1] + + def get_vocab(self): + vocab = { + self.convert_ids_to_tokens(i): i + for i in range(self.vocab_size) + } + vocab.update(self.added_tokens_encoder) + return vocab diff --git a/tests/transformers/roformer/test_tokenizer.py b/tests/transformers/roformer/test_tokenizer.py index 2612d35f2a67..2134a3e3962f 100644 --- a/tests/transformers/roformer/test_tokenizer.py +++ b/tests/transformers/roformer/test_tokenizer.py @@ -89,86 +89,6 @@ def test_chinese(self): self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) - def test_basic_tokenizer_lower(self): - tokenizer = JiebaBasicTokenizer(self.vocab, do_lower_case=True) - - self.assertListEqual(tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), - ["hello", "!", "how", "are", "you", "?"]) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) - - def test_basic_tokenizer_lower_strip_accents_false(self): - tokenizer = JiebaBasicTokenizer(self.vocab, - do_lower_case=True, - strip_accents=False) - - self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), - ["hällo", "!", "how", "are", "you", "?"]) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) - - def test_basic_tokenizer_lower_strip_accents_true(self): - tokenizer = JiebaBasicTokenizer(self.vocab, do_lower_case=True) - - self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how \n Are yoU? 
"), - ["hallo", "!", "how", "are", "you", "?"]) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) - - def test_basic_tokenizer_lower_strip_accents_default(self): - tokenizer = JiebaBasicTokenizer(self.vocab, do_lower_case=True) - - self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), - ["hallo", "!", "how", "are", "you", "?"]) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) - - def test_basic_tokenizer_no_lower(self): - tokenizer = JiebaBasicTokenizer(self.vocab, do_lower_case=False) - - self.assertListEqual(tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), - ["HeLLo", "!", "how", "Are", "yoU", "?"]) - - def test_basic_tokenizer_no_lower_strip_accents_false(self): - tokenizer = JiebaBasicTokenizer(self.vocab, - do_lower_case=False, - strip_accents=False) - - self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), - ["HäLLo", "!", "how", "Are", "yoU", "?"]) - - def test_basic_tokenizer_no_lower_strip_accents_true(self): - tokenizer = JiebaBasicTokenizer(self.vocab, - do_lower_case=False, - strip_accents=True) - - self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), - ["HaLLo", "!", "how", "Are", "yoU", "?"]) - - def test_basic_tokenizer_respects_never_split_tokens(self): - tokenizer = JiebaBasicTokenizer(self.vocab, - do_lower_case=False, - never_split=["[UNK]"]) - - self.assertListEqual( - tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"), - ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]) - - def test_wordpiece_tokenizer(self): - vocab_tokens = [ - "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", - "runn", "##ing" - ] - - vocab = {} - for (i, token) in enumerate(vocab_tokens): - vocab[token] = i - tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") - - self.assertListEqual(tokenizer.tokenize(""), []) - - self.assertListEqual(tokenizer.tokenize("unwanted running"), - ["un", "##want", "##ed", "runn", "##ing"]) - - self.assertListEqual(tokenizer.tokenize("unwantedX running"), - ["[UNK]", "runn", "##ing"]) - def test_clean_text(self): tokenizer = self.get_tokenizer() @@ -177,7 +97,7 @@ def test_clean_text(self): [tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]) - # @slow + @slow def test_sequence_builders(self): tokenizer = self.tokenizer_class.from_pretrained( "roformer-chinese-small") From edb99ea00c7fd87ac2f2a6097934fd712da41326 Mon Sep 17 00:00:00 2001 From: wj-Mcat <1435130236@qq.com> Date: Wed, 17 Aug 2022 06:37:30 +0000 Subject: [PATCH 09/15] update get_vocab method --- paddlenlp/transformers/roformer/tokenizer.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/paddlenlp/transformers/roformer/tokenizer.py b/paddlenlp/transformers/roformer/tokenizer.py index 4482ece02238..7073ba7f44c3 100644 --- a/paddlenlp/transformers/roformer/tokenizer.py +++ b/paddlenlp/transformers/roformer/tokenizer.py @@ -419,9 +419,4 @@ def get_special_tokens_mask(self, return [1] + ([0] * len(token_ids_0)) + [1] def get_vocab(self): - vocab = { - self.convert_ids_to_tokens(i): i - for i in range(self.vocab_size) - } - vocab.update(self.added_tokens_encoder) - return vocab + return dict(self.vocab.token_to_idx, **self.added_tokens_encoder) From 27d4e2f2eeadc1110c505a9da5cde63f2e8a7750 Mon Sep 17 00:00:00 2001 From: wj-Mcat <1435130236@qq.com> Date: Wed, 17 Aug 2022 06:37:30 +0000 Subject: [PATCH 10/15] update get_vocab method --- paddlenlp/transformers/roformer/tokenizer.py | 7 +------ tests/transformers/roformer/test_tokenizer.py | 1 - 
2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/paddlenlp/transformers/roformer/tokenizer.py b/paddlenlp/transformers/roformer/tokenizer.py index 4482ece02238..7073ba7f44c3 100644 --- a/paddlenlp/transformers/roformer/tokenizer.py +++ b/paddlenlp/transformers/roformer/tokenizer.py @@ -419,9 +419,4 @@ def get_special_tokens_mask(self, return [1] + ([0] * len(token_ids_0)) + [1] def get_vocab(self): - vocab = { - self.convert_ids_to_tokens(i): i - for i in range(self.vocab_size) - } - vocab.update(self.added_tokens_encoder) - return vocab + return dict(self.vocab.token_to_idx, **self.added_tokens_encoder) diff --git a/tests/transformers/roformer/test_tokenizer.py b/tests/transformers/roformer/test_tokenizer.py index 2134a3e3962f..ce37ebf8a391 100644 --- a/tests/transformers/roformer/test_tokenizer.py +++ b/tests/transformers/roformer/test_tokenizer.py @@ -30,7 +30,6 @@ class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = RoFormerTokenizer space_between_special_tokens = True from_pretrained_filter = filter_non_english - test_seq2seq = True def setUp(self): self.from_pretrained_kwargs = {"do_lower_case": False} From 9c85b1dae5ffc1c88c86b4ce17631b56f5aced6c Mon Sep 17 00:00:00 2001 From: wj-Mcat <1435130236@qq.com> Date: Thu, 18 Aug 2022 01:17:35 +0000 Subject: [PATCH 11/15] update test_chinese method --- tests/transformers/roformer/test_tokenizer.py | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/tests/transformers/roformer/test_tokenizer.py b/tests/transformers/roformer/test_tokenizer.py index ce37ebf8a391..18830704254f 100644 --- a/tests/transformers/roformer/test_tokenizer.py +++ b/tests/transformers/roformer/test_tokenizer.py @@ -21,8 +21,11 @@ RoFormerTokenizer, WordpieceTokenizer) -from ...testing_utils import slow -from ..test_tokenizer_common import TokenizerTesterMixin, filter_non_english +# from ...testing_utils import slow +# from ..test_tokenizer_common import TokenizerTesterMixin, filter_non_english + +from tests.testing_utils import slow, get_tests_dir +from tests.transformers.test_tokenizer_common import TokenizerTesterMixin, filter_non_english class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @@ -59,15 +62,6 @@ def setUp(self): with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - self.vocab = Vocab.from_dict( - {token: index - for index, token in enumerate(vocab_tokens)}, - unk_token='[UNK]', - pad_token='[PAD]', - bos_token='[CLS]', - eos_token='[SEP]', - ) - def get_input_output_texts(self, tokenizer): input_text = "UNwant\u00E9d,running" output_text = "unwanted, running" @@ -83,10 +77,14 @@ def test_full_tokenizer(self): [9, 6, 7, 12, 10, 11]) def test_chinese(self): - tokenizer = JiebaBasicTokenizer(self.vocab) - - self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), - ["ah", "\u535A", "\u63A8", "zz"]) + tokenizer = RoFormerTokenizer.from_pretrained(list( + RoFormerTokenizer.pretrained_init_configuration.keys())[0], + use_jieba=True) + # test jieba tokenizer in rofromer + jieba_tokenizer = tokenizer.basic_tokenizer + + self.assertListEqual(jieba_tokenizer.tokenize("ah\u535A\u63A8zz"), + ["ah", "博", "推", "zz"]) def test_clean_text(self): tokenizer = self.get_tokenizer() From d579c70a539867c4daf324e343d505e3466ce373 Mon Sep 17 00:00:00 2001 From: wj-Mcat <1435130236@qq.com> Date: Thu, 18 Aug 2022 01:19:25 +0000 Subject: [PATCH 12/15] change absolute import --- 
tests/transformers/roformer/test_tokenizer.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/transformers/roformer/test_tokenizer.py b/tests/transformers/roformer/test_tokenizer.py index 18830704254f..847d6c576258 100644 --- a/tests/transformers/roformer/test_tokenizer.py +++ b/tests/transformers/roformer/test_tokenizer.py @@ -21,11 +21,8 @@ RoFormerTokenizer, WordpieceTokenizer) -# from ...testing_utils import slow -# from ..test_tokenizer_common import TokenizerTesterMixin, filter_non_english - -from tests.testing_utils import slow, get_tests_dir -from tests.transformers.test_tokenizer_common import TokenizerTesterMixin, filter_non_english +from ...testing_utils import slow +from ..test_tokenizer_common import TokenizerTesterMixin, filter_non_english class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): From 54d5a3aac5e299c846de0395a5bd70e46639400b Mon Sep 17 00:00:00 2001 From: wj-Mcat <1435130236@qq.com> Date: Mon, 22 Aug 2022 03:37:28 +0000 Subject: [PATCH 13/15] update unittest --- tests/transformers/roformer/test_tokenizer.py | 37 ------------------- 1 file changed, 37 deletions(-) diff --git a/tests/transformers/roformer/test_tokenizer.py b/tests/transformers/roformer/test_tokenizer.py index 847d6c576258..b937995021f4 100644 --- a/tests/transformers/roformer/test_tokenizer.py +++ b/tests/transformers/roformer/test_tokenizer.py @@ -143,40 +143,3 @@ def test_offsets_with_special_characters(self): tokens["input_ids"])) self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"]) - - def test_change_tokenize_chinese_chars(self): - list_of_commun_chinese_char = ["的", "人", "有"] - text_with_chinese_char = "".join(list_of_commun_chinese_char) - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest( - f"{tokenizer.__class__.__name__} ({pretrained_name})"): - - kwargs["tokenize_chinese_chars"] = True - tokenizer = self.tokenizer_class.from_pretrained( - pretrained_name, **kwargs) - - ids_without_spe_char_p = tokenizer.encode( - text_with_chinese_char, - return_token_type_ids=None, - add_special_tokens=False)["input_ids"] - - tokens_without_spe_char_p = tokenizer.convert_ids_to_tokens( - ids_without_spe_char_p) - - # it is expected that each Chinese character is not preceded by "##" - self.assertListEqual(tokens_without_spe_char_p, - list_of_commun_chinese_char) - ''' - kwargs["tokenize_chinese_chars"] = False - tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - ids_without_spe_char_p = tokenizer.encode(text_with_chinese_char, return_token_type_ids=None,add_special_tokens=False)["input_ids"] - - tokens_without_spe_char_p = tokenizer.convert_ids_to_tokens(ids_without_spe_char_p) - - # it is expected that only the first Chinese character is not preceded by "##". 
- expected_tokens = [ - f"##{token}" if idx != 0 else token for idx, token in enumerate(list_of_commun_chinese_char) - ] - self.assertListEqual(tokens_without_spe_char_p, expected_tokens) - ''' From 3e42ca401d5d059fc40ecc15dce7adc3ebd39a77 Mon Sep 17 00:00:00 2001 From: wj-Mcat <1435130236@qq.com> Date: Mon, 22 Aug 2022 03:43:19 +0000 Subject: [PATCH 14/15] update chinese test case --- tests/transformers/roformer/test_tokenizer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/transformers/roformer/test_tokenizer.py b/tests/transformers/roformer/test_tokenizer.py index b937995021f4..bb27072b63a6 100644 --- a/tests/transformers/roformer/test_tokenizer.py +++ b/tests/transformers/roformer/test_tokenizer.py @@ -78,10 +78,12 @@ def test_chinese(self): RoFormerTokenizer.pretrained_init_configuration.keys())[0], use_jieba=True) # test jieba tokenizer in rofromer - jieba_tokenizer = tokenizer.basic_tokenizer - self.assertListEqual(jieba_tokenizer.tokenize("ah\u535A\u63A8zz"), - ["ah", "博", "推", "zz"]) + tokens = tokenizer.tokenize("ah\u535A\u63A8zz") + self.assertListEqual(tokens, ["ah", "博", "推", 'z', '##z']) + + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), + [5829, 713, 2093, 167, 48585]) def test_clean_text(self): tokenizer = self.get_tokenizer() From 1b4ca3a365dd9f73b3d98ead24e63921827c6085 Mon Sep 17 00:00:00 2001 From: wj-Mcat <1435130236@qq.com> Date: Tue, 30 Aug 2022 05:11:28 +0000 Subject: [PATCH 15/15] add roformer more output testing --- tests/transformers/roformer/test_modeling.py | 205 ++++++++++++------- 1 file changed, 127 insertions(+), 78 deletions(-) diff --git a/tests/transformers/roformer/test_modeling.py b/tests/transformers/roformer/test_modeling.py index 23a46ddc3c93..bc9c7e3945e5 100644 --- a/tests/transformers/roformer/test_modeling.py +++ b/tests/transformers/roformer/test_modeling.py @@ -16,14 +16,17 @@ import unittest from typing import Optional, Tuple from dataclasses import dataclass, fields, Field +from parameterized import parameterized_class import paddle +from paddle import Tensor -from paddlenlp.transformers import ( - RoFormerModel, RoFormerPretrainedModel, RoFormerForPretraining, - RoFormerForSequenceClassification, RoFormerForTokenClassification, - RoFormerForQuestionAnswering, RoFormerForMultipleChoice, - RoFormerForMaskedLM) +from paddlenlp.transformers import (RoFormerModel, RoFormerPretrainedModel, + RoFormerForSequenceClassification, + RoFormerForTokenClassification, + RoFormerForQuestionAnswering, + RoFormerForMultipleChoice, + RoFormerForMaskedLM) from ..test_modeling_common import ids_tensor, floats_tensor, random_attention_mask, ModelTesterMixin from ...testing_utils import slow @@ -67,6 +70,7 @@ class RoFormerModelTestConfig(RoFormerModelTestModelConfig): is_training: bool = False use_input_mask: bool = False use_token_type_ids: bool = True + type_sequence_label_size = 3 # used for sequence classification num_classes: int = 3 @@ -102,27 +106,43 @@ def prepare_config_and_inputs(self): if self.config.use_token_type_ids: token_type_ids = ids_tensor([config.batch_size, config.seq_length], config.type_vocab_size) + sequence_labels = None + token_labels = None + choice_labels = None + + if self.parent.use_labels: + sequence_labels = ids_tensor([self.batch_size], + self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], + self.num_classes) + choice_labels = ids_tensor([self.batch_size], self.num_choices) config = self.get_config() - return config, input_ids, 
token_type_ids, input_mask + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels def get_config(self) -> dict: return self.config.model_kwargs - def create_and_check_model( - self, - config, - input_ids, - token_type_ids, - input_mask, - ): + def __getattr__(self, key: str): + if not hasattr(self.config, key): + raise AttributeError(f'attribute <{key}> not exist') + return getattr(self.config, key) + + def create_and_check_model(self, config, input_ids: Tensor, + token_type_ids: Tensor, input_mask: Tensor, + sequence_labels: Tensor, token_labels: Tensor, + choice_labels: Tensor): model = RoFormerModel(**config) model.eval() result = model(input_ids, attention_mask=input_mask, - token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) + token_type_ids=token_type_ids, + return_dict=self.parent.return_dict) + result = model(input_ids, + token_type_ids=token_type_ids, + return_dict=self.parent.return_dict) + result = model(input_ids, return_dict=self.parent.return_dict) + self.parent.assertEqual(result[0].shape, [ self.config.batch_size, self.config.seq_length, self.config.hidden_size @@ -130,13 +150,12 @@ def create_and_check_model( self.parent.assertEqual( result[1].shape, [self.config.batch_size, self.config.hidden_size]) - def create_and_check_for_multiple_choice( - self, - config, - input_ids, - token_type_ids, - input_mask, - ): + def create_and_check_for_multiple_choice(self, config, input_ids: Tensor, + token_type_ids: Tensor, + input_mask: Tensor, + sequence_labels: Tensor, + token_labels: Tensor, + choice_labels: Tensor): model = RoFormerForMultipleChoice(RoFormerModel(**config), num_choices=self.config.num_choices) model.eval() @@ -151,89 +170,113 @@ def create_and_check_for_multiple_choice( input_mask = input_mask.unsqueeze(1).expand( [-1, self.config.num_choices, -1]) - result = model( - multiple_choice_inputs_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - ) - self.parent.assertEqual( - result.shape, [self.config.batch_size, self.config.num_choices]) + result = model(multiple_choice_inputs_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=choice_labels, + return_dict=self.parent.return_dict) + + if paddle.is_tensor(result): + result = [result] + elif choice_labels is not None: + result = result[1:] - def create_and_check_for_question_answering(self, config, input_ids, - token_type_ids, input_mask): + self.parent.assertEqual( + result[0].shape, [self.config.batch_size, self.config.num_choices]) + + def create_and_check_for_question_answering(self, config, input_ids: Tensor, + token_type_ids: Tensor, + input_mask: Tensor, + sequence_labels: Tensor, + token_labels: Tensor, + choice_labels: Tensor): model = RoFormerForQuestionAnswering(RoFormerModel(**config)) model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - ) + result = model(input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + return_dict=self.parent.return_dict) + + if paddle.is_tensor(result): + result = [result] + elif choice_labels is not None: + result = result[1:] + self.parent.assertEqual( result[0].shape, [self.config.batch_size, self.config.seq_length]) self.parent.assertEqual( result[1].shape, [self.config.batch_size, self.config.seq_length]) def create_and_check_for_token_classification( - self, - config, - input_ids, - 
token_type_ids, - input_mask, - ): + self, config, input_ids: Tensor, token_type_ids: Tensor, + input_mask: Tensor, sequence_labels: Tensor, token_labels: Tensor, + choice_labels: Tensor): model = RoFormerForTokenClassification(RoFormerModel(**config), num_classes=self.num_classes) model.eval() result = model(input_ids, attention_mask=input_mask, - token_type_ids=token_type_ids) - self.parent.assertEqual(result.shape, [ + token_type_ids=token_type_ids, + labels=token_labels, + return_dict=self.parent.return_dict) + if paddle.is_tensor(result): + result = [result] + elif choice_labels is not None: + result = result[1:] + + self.parent.assertEqual(result[0].shape, [ self.config.batch_size, self.config.seq_length, self.config.num_classes ]) - def create_and_check_for_masked_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - ): + def create_and_check_for_masked_lm(self, config, input_ids: Tensor, + token_type_ids: Tensor, + input_mask: Tensor, + sequence_labels: Tensor, + token_labels: Tensor, + choice_labels: Tensor): model = RoFormerForMaskedLM(RoFormerModel(**config)) model.eval() result = model(input_ids, attention_mask=input_mask, - token_type_ids=token_type_ids) - self.parent.assertEqual(result.shape, [ + token_type_ids=token_type_ids, + labels=token_labels, + return_dict=self.parent.return_dict) + if paddle.is_tensor(result): + result = [result] + elif choice_labels is not None: + result = result[1:] + + self.parent.assertEqual(result[0].shape, [ self.config.batch_size, self.config.seq_length, self.config.vocab_size ]) def create_and_check_for_sequence_classification( - self, - config, - input_ids, - token_type_ids, - input_mask, - ): + self, config, input_ids: Tensor, token_type_ids: Tensor, + input_mask: Tensor, sequence_labels: Tensor, token_labels: Tensor, + choice_labels: Tensor): model = RoFormerForSequenceClassification( RoFormerModel(**config), num_classes=self.config.num_classes) model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - ) + result = model(input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=sequence_labels, + return_dict=self.parent.return_dict) + if paddle.is_tensor(result): + result = [result] + elif choice_labels is not None: + result = result[1:] self.parent.assertEqual( - result.shape, [self.config.batch_size, self.config.num_classes]) + result[0].shape, [self.config.batch_size, self.config.num_classes]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - ) = config_and_inputs + (config, input_ids, token_type_ids, input_mask, _, _, + _) = config_and_inputs inputs_dict = { "input_ids": input_ids, "token_type_ids": token_type_ids, @@ -242,15 +285,21 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict +@parameterized_class(("return_dict", "use_labels"), [ + [False, False], + [False, True], + [True, False], + [True, True], +]) class RoFormerModelTest(ModelTesterMixin, unittest.TestCase): base_model_class = RoFormerModel + use_labels = False + return_dict = False - all_model_classes = ( - RoFormerModel, - RoFormerForMultipleChoice, - RoFormerForPretraining, - RoFormerForSequenceClassification, - ) + all_model_classes = (RoFormerModel, RoFormerForSequenceClassification, + RoFormerForTokenClassification, + RoFormerForQuestionAnswering, + RoFormerForMultipleChoice, RoFormerForMaskedLM) def setUp(self): self.model_tester = 
RoFormerModelTester(self)
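
For reference, the @parameterized_class decorator used in PATCH 15/15 comes from the third-party `parameterized` package: it clones the decorated test class once per value row and sets the named attributes on each clone, so RoFormerModelTest runs its full suite under every (return_dict, use_labels) combination. A minimal, self-contained sketch of that mechanism follows; the class and test names are illustrative only and are not part of the patch.

    import unittest
    from parameterized import parameterized_class

    # Same decorator pattern as in the patch: one generated subclass per row,
    # each with its own pair of class-level flag values.
    @parameterized_class(("return_dict", "use_labels"), [
        [False, False],
        [False, True],
        [True, False],
        [True, True],
    ])
    class ExampleConfigMatrixTest(unittest.TestCase):
        # Defaults mirror the attributes the decorator overrides.
        return_dict = False
        use_labels = False

        def test_flags_are_set_per_variant(self):
            # Each generated subclass sees exactly one combination of the flags.
            self.assertIn(self.return_dict, (True, False))
            self.assertIn(self.use_labels, (True, False))

    if __name__ == "__main__":
        unittest.main()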