From d3869f72482fcab96aa36c7ad1a4bce5606e4cc8 Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Wed, 14 Oct 2020 09:55:35 +0200 Subject: [PATCH 01/31] Removed LanguageModelTokenizer. Logic moved into HFTransformersNLP, LMTokenizer tests adjusted and moved into HFTransformersNLP tests --- .../dense_featurizer/lm_featurizer.py | 4 +- rasa/nlu/tokenizers/lm_tokenizer.py | 45 +- .../nlu/utils/hugging_face/hf_transformers.py | 16 +- tests/nlu/tokenizers/test_lm_tokenizer.py | 424 ------------------ tests/nlu/utils/test_hf_transformers.py | 419 ++++++++++++++++- 5 files changed, 445 insertions(+), 463 deletions(-) delete mode 100644 tests/nlu/tokenizers/test_lm_tokenizer.py diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index d0bea59d1c78..e28346560555 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -5,7 +5,7 @@ from rasa.nlu.featurizers.featurizer import DenseFeaturizer from rasa.shared.nlu.training_data.features import Features from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP -from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer +from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.shared.nlu.training_data.message import Message from rasa.nlu.constants import ( @@ -27,7 +27,7 @@ class LanguageModelFeaturizer(DenseFeaturizer): @classmethod def required_components(cls) -> List[Type[Component]]: - return [HFTransformersNLP, LanguageModelTokenizer] + return [HFTransformersNLP, Tokenizer] def train( self, diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py index 5e3bd61f41bb..778d664b1c06 100644 --- a/rasa/nlu/tokenizers/lm_tokenizer.py +++ b/rasa/nlu/tokenizers/lm_tokenizer.py @@ -1,35 +1,24 @@ -from typing import Text, List, Any, Dict, Type +from typing import Dict, Text, Any -from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer -from rasa.nlu.components import Component -from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP -from rasa.shared.nlu.training_data.message import Message +import rasa.shared.utils.io +from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer +from rasa.shared.constants import DOCS_URL_MIGRATION_GUIDE -from rasa.nlu.constants import LANGUAGE_MODEL_DOCS, TOKENS +class LanguageModelTokenizer(WhitespaceTokenizer): + """ + This tokenizer is deprecated and will be removed in the future. -class LanguageModelTokenizer(Tokenizer): - """Tokenizer using transformer based language models. - - Uses the output of HFTransformersNLP component to set the tokens + The HFTransformersNLP component now sets the tokens for dense featurizable attributes of each message object. 
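+
+    A minimal migration sketch (illustrative, per the docs change in this
+    patch): replace this component with any tokenizer placed before
+    ``HFTransformersNLP`` in the pipeline, e.g.
+
+        pipeline:
+          - name: "WhitespaceTokenizer"
+          - name: "HFTransformersNLP"
+            model_name: "bert"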
""" - @classmethod - def required_components(cls) -> List[Type[Component]]: - return [HFTransformersNLP] - - defaults = { - # Flag to check whether to split intents - "intent_tokenization_flag": False, - # Symbol on which intent should be split - "intent_split_symbol": "_", - } - - def get_doc(self, message: Message, attribute: Text) -> Dict[Text, Any]: - return message.get(LANGUAGE_MODEL_DOCS[attribute]) - - def tokenize(self, message: Message, attribute: Text) -> List[Token]: - doc = self.get_doc(message, attribute) - - return doc[TOKENS] + def __init__(self, component_config: Dict[Text, Any] = None) -> None: + super().__init__(component_config) + rasa.shared.utils.io.raise_warning( + f"'{self.__class__.__name__}' is deprecated and " + f"will be removed in the future. " + f"It is recommended to use the '{WhitespaceTokenizer.__name__}' instead.", + category=DeprecationWarning, + docs=DOCS_URL_MIGRATION_GUIDE, + ) \ No newline at end of file diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py index 8b818f3b8030..179022da57c8 100644 --- a/rasa/nlu/utils/hugging_face/hf_transformers.py +++ b/rasa/nlu/utils/hugging_face/hf_transformers.py @@ -1,14 +1,13 @@ import logging -from typing import Any, Dict, List, Text, Tuple, Optional +from typing import Any, Dict, List, Text, Tuple, Optional, Type from rasa.core.utils import get_dict_hash from rasa.nlu.model import Metadata -from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.shared.nlu.training_data.message import Message -from rasa.nlu.tokenizers.tokenizer import Token +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer import rasa.utils.train_utils as train_utils import numpy as np @@ -17,6 +16,7 @@ DENSE_FEATURIZABLE_ATTRIBUTES, TOKEN_IDS, TOKENS, + TOKENS_NAMES, SENTENCE_FEATURES, SEQUENCE_FEATURES, NUMBER_OF_SUB_TOKENS, @@ -55,6 +55,10 @@ class HFTransformersNLP(Component): "cache_dir": None, } + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [Tokenizer] + def __init__( self, component_config: Optional[Dict[Text, Any]] = None, @@ -64,7 +68,6 @@ def __init__( self._load_model_metadata() self._load_model_instance(skip_model_load) - self.whitespace_tokenizer = WhitespaceTokenizer() def _load_model_metadata(self) -> None: @@ -241,7 +244,7 @@ def _tokenize_example( Many language models add a special char in front of (some) words and split words into sub-words. To ensure the entity start and end values matches the - token values, tokenize the text first using the whitespace tokenizer. If + token values, use the tokens produced by the Tokenizer component. If individual tokens are split up into multiple tokens, we add this information to the respected token. @@ -255,8 +258,7 @@ def _tokenize_example( message. 
""" - tokens_in = self.whitespace_tokenizer.tokenize(message, attribute) - + tokens_in = message.get(TOKENS_NAMES[attribute]) tokens_out = [] token_ids_out = [] diff --git a/tests/nlu/tokenizers/test_lm_tokenizer.py b/tests/nlu/tokenizers/test_lm_tokenizer.py deleted file mode 100644 index ad284bfd09f0..000000000000 --- a/tests/nlu/tokenizers/test_lm_tokenizer.py +++ /dev/null @@ -1,424 +0,0 @@ -import pytest - -from rasa.shared.nlu.training_data.training_data import TrainingData -from rasa.shared.nlu.training_data.message import Message -from rasa.nlu.constants import ( - TOKENS_NAMES, - LANGUAGE_MODEL_DOCS, - TOKEN_IDS, - NUMBER_OF_SUB_TOKENS, -) -from rasa.shared.nlu.constants import TEXT, INTENT -from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer -from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP - - -# TODO: need to fix this failing test -@pytest.mark.skip(reason="Results in random crashing of github action workers") -@pytest.mark.parametrize( - "model_name, model_weights, texts, expected_tokens, expected_indices, expected_num_token_ids", - [ - ( - "bert", - None, - [ - "Good evening.", - "you're", - "r. n. b.", - "rock & roll", - "here is the sentence I want embeddings for.", - ], - [ - ["good", "evening"], - ["you", "re"], - ["r", "n", "b"], - ["rock", "&", "roll"], - [ - "here", - "is", - "the", - "sentence", - "i", - "want", - "em", - "bed", - "ding", - "s", - "for", - ], - ], - [ - [(0, 4), (5, 12)], - [(0, 3), (4, 6)], - [(0, 1), (3, 4), (6, 7)], - [(0, 4), (5, 6), (7, 11)], - [ - (0, 4), - (5, 7), - (8, 11), - (12, 20), - (21, 22), - (23, 27), - (28, 30), - (30, 33), - (33, 37), - (37, 38), - (39, 42), - ], - ], - [4, 4, 5, 5, 13], - ), - ( - "bert", - "bert-base-chinese", - [ - "晚上好", # normal & easy case - "没问题!", # `!` is a Chinese punctuation - "去东畈村", # `畈` is a OOV token for bert-base-chinese - "好的😃", # include a emoji which is common in Chinese text-based chat - ], - [ - ["晚", "上", "好"], - ["没", "问", "题", "!"], - ["去", "东", "畈", "村"], - ["好", "的", "😃"], - ], - [ - [(0, 1), (1, 2), (2, 3)], - [(0, 1), (1, 2), (2, 3), (3, 4)], - [(0, 1), (1, 2), (2, 3), (3, 4)], - [(0, 1), (1, 2), (2, 3)], - ], - [3, 4, 4, 3], - ), - ( - "gpt", - None, - [ - "Good evening.", - "hello", - "you're", - "r. n. b.", - "rock & roll", - "here is the sentence I want embeddings for.", - ], - [ - ["good", "evening"], - ["hello"], - ["you", "re"], - ["r", "n", "b"], - ["rock", "&", "roll"], - ["here", "is", "the", "sentence", "i", "want", "embe", "ddings", "for"], - ], - [ - [(0, 4), (5, 12)], - [(0, 5)], - [(0, 3), (4, 6)], - [(0, 1), (3, 4), (6, 7)], - [(0, 4), (5, 6), (7, 11)], - [ - (0, 4), - (5, 7), - (8, 11), - (12, 20), - (21, 22), - (23, 27), - (28, 32), - (32, 38), - (39, 42), - ], - ], - [2, 1, 2, 3, 3, 9], - ), - ( - "gpt2", - None, - [ - "Good evening.", - "hello", - "you're", - "r. n. b.", - "rock & roll", - "here is the sentence I want embeddings for.", - ], - [ - ["Good", "even", "ing"], - ["hello"], - ["you", "re"], - ["r", "n", "b"], - ["rock", "&", "roll"], - [ - "here", - "is", - "the", - "sent", - "ence", - "I", - "want", - "embed", - "d", - "ings", - "for", - ], - ], - [ - [(0, 4), (5, 9), (9, 12)], - [(0, 5)], - [(0, 3), (4, 6)], - [(0, 1), (3, 4), (6, 7)], - [(0, 4), (5, 6), (7, 11)], - [ - (0, 4), - (5, 7), - (8, 11), - (12, 16), - (16, 20), - (21, 22), - (23, 27), - (28, 33), - (33, 34), - (34, 38), - (39, 42), - ], - ], - [3, 1, 2, 3, 3, 11], - ), - ( - "xlnet", - None, - [ - "Good evening.", - "hello", - "you're", - "r. n. 
b.", - "rock & roll", - "here is the sentence I want embeddings for.", - ], - [ - ["Good", "evening"], - ["hello"], - ["you", "re"], - ["r", "n", "b"], - ["rock", "&", "roll"], - [ - "here", - "is", - "the", - "sentence", - "I", - "want", - "embed", - "ding", - "s", - "for", - ], - ], - [ - [(0, 4), (5, 12)], - [(0, 5)], - [(0, 3), (4, 6)], - [(0, 1), (3, 4), (6, 7)], - [(0, 4), (5, 6), (7, 11)], - [ - (0, 4), - (5, 7), - (8, 11), - (12, 20), - (21, 22), - (23, 27), - (28, 33), - (33, 37), - (37, 38), - (39, 42), - ], - ], - [4, 3, 4, 5, 5, 12], - ), - ( - "distilbert", - None, - [ - "Good evening.", - "you're", - "r. n. b.", - "rock & roll", - "here is the sentence I want embeddings for.", - ], - [ - ["good", "evening"], - ["you", "re"], - ["r", "n", "b"], - ["rock", "&", "roll"], - [ - "here", - "is", - "the", - "sentence", - "i", - "want", - "em", - "bed", - "ding", - "s", - "for", - ], - ], - [ - [(0, 4), (5, 12)], - [(0, 3), (4, 6)], - [(0, 1), (3, 4), (6, 7)], - [(0, 4), (5, 6), (7, 11)], - [ - (0, 4), - (5, 7), - (8, 11), - (12, 20), - (21, 22), - (23, 27), - (28, 30), - (30, 33), - (33, 37), - (37, 38), - (39, 42), - ], - ], - [4, 4, 5, 5, 13], - ), - ( - "roberta", - None, - [ - "Good evening.", - "hello", - "you're", - "r. n. b.", - "rock & roll", - "here is the sentence I want embeddings for.", - ], - [ - ["Good", "even", "ing"], - ["hello"], - ["you", "re"], - ["r", "n", "b"], - ["rock", "&", "roll"], - [ - "here", - "is", - "the", - "sent", - "ence", - "I", - "want", - "embed", - "d", - "ings", - "for", - ], - ], - [ - [(0, 4), (5, 9), (9, 12)], - [(0, 5)], - [(0, 3), (4, 6)], - [(0, 1), (3, 4), (6, 7)], - [(0, 4), (5, 6), (7, 11)], - [ - (0, 4), - (5, 7), - (8, 11), - (12, 16), - (16, 20), - (21, 22), - (23, 27), - (28, 33), - (33, 34), - (34, 38), - (39, 42), - ], - ], - [5, 3, 4, 5, 5, 13], - ), - ], -) -@pytest.mark.skip_on_windows -def test_lm_tokenizer_edge_cases( - model_name, - model_weights, - texts, - expected_tokens, - expected_indices, - expected_num_token_ids, -): - - if model_weights is None: - model_weights_config = {} - else: - model_weights_config = {"model_weights": model_weights} - transformers_config = {**{"model_name": model_name}, **model_weights_config} - - transformers_nlp = HFTransformersNLP(transformers_config) - lm_tokenizer = LanguageModelTokenizer() - - for text, gt_tokens, gt_indices, gt_num_indices in zip( - texts, expected_tokens, expected_indices, expected_num_token_ids - ): - - message = Message.build(text=text) - transformers_nlp.process(message) - tokens = lm_tokenizer.tokenize(message, TEXT) - token_ids = message.get(LANGUAGE_MODEL_DOCS[TEXT])[TOKEN_IDS] - - assert [t.text for t in tokens] == gt_tokens - assert [t.start for t in tokens] == [i[0] for i in gt_indices] - assert [t.end for t in tokens] == [i[1] for i in gt_indices] - assert len(token_ids) == gt_num_indices - - -@pytest.mark.parametrize( - "text, expected_tokens", - [ - ("Forecast_for_LUNCH", ["Forecast_for_LUNCH"]), - ("Forecast for LUNCH", ["Forecast for LUNCH"]), - ("Forecast+for+LUNCH", ["Forecast", "for", "LUNCH"]), - ], -) -@pytest.mark.skip_on_windows -def test_lm_tokenizer_custom_intent_symbol(text, expected_tokens): - component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"} - - transformers_config = {"model_name": "bert"} # Test for one should be enough - - transformers_nlp = HFTransformersNLP(transformers_config) - lm_tokenizer = LanguageModelTokenizer(component_config) - - message = Message.build(text=text) - message.set(INTENT, text) - - td = 
TrainingData([message]) - - transformers_nlp.train(td) - lm_tokenizer.train(td) - - assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens - - -@pytest.mark.parametrize( - "text, expected_number_of_sub_tokens", - [("sentence embeddings", [1, 4]), ("this is a test", [1, 1, 1, 1])], -) -@pytest.mark.skip_on_windows -def test_lm_tokenizer_number_of_sub_tokens(text, expected_number_of_sub_tokens): - transformers_config = {"model_name": "bert"} # Test for one should be enough - - transformers_nlp = HFTransformersNLP(transformers_config) - lm_tokenizer = LanguageModelTokenizer() - - message = Message.build(text=text) - - td = TrainingData([message]) - - transformers_nlp.train(td) - lm_tokenizer.train(td) - - assert [ - t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT]) - ] == expected_number_of_sub_tokens diff --git a/tests/nlu/utils/test_hf_transformers.py b/tests/nlu/utils/test_hf_transformers.py index a6918d928314..c8ceadf14c2f 100644 --- a/tests/nlu/utils/test_hf_transformers.py +++ b/tests/nlu/utils/test_hf_transformers.py @@ -5,6 +5,16 @@ from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP from rasa.shared.nlu.training_data.message import Message +from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer +from rasa.shared.nlu.training_data.training_data import TrainingData +from rasa.nlu.constants import ( + TOKENS_NAMES, + NUMBER_OF_SUB_TOKENS, LANGUAGE_MODEL_DOCS, TOKEN_IDS +) +from rasa.shared.nlu.constants import ( + TEXT, + INTENT +) @pytest.mark.parametrize( @@ -14,7 +24,6 @@ def test_sequence_length_overflow_train( input_sequence_length: int, model_name: Text, should_overflow: bool ): - component = HFTransformersNLP({"model_name": model_name}, skip_model_load=True) message = Message.build(text=" ".join(["hi"] * input_sequence_length)) if should_overflow: @@ -42,7 +51,6 @@ def test_long_sequences_extra_padding( model_name: Text, padding_needed: bool, ): - component = HFTransformersNLP({"model_name": model_name}, skip_model_load=True) modified_sequence_embeddings = component._add_extra_padding( sequence_embeddings, actual_sequence_lengths @@ -123,3 +131,410 @@ def test_attention_mask( assert np.all(mask_ones == 1) assert np.all(mask_zeros == 0) + +# TODO: need to fix this failing test +@pytest.mark.skip(reason="Results in random crashing of github action workers") +@pytest.mark.parametrize( + "model_name, model_weights, texts, expected_tokens, expected_indices, expected_num_token_ids", + [ + ( + "bert", + None, + [ + "Good evening.", + "you're", + "r. n. 
b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["good", "evening"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + [ + "here", + "is", + "the", + "sentence", + "i", + "want", + "em", + "bed", + "ding", + "s", + "for", + ], + ], + [ + [(0, 4), (5, 12)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 20), + (21, 22), + (23, 27), + (28, 30), + (30, 33), + (33, 37), + (37, 38), + (39, 42), + ], + ], + [4, 4, 5, 5, 13], + ), + ( + "bert", + "bert-base-chinese", + [ + "晚上好", # normal & easy case + "没问题!", # `!` is a Chinese punctuation + "去东畈村", # `畈` is a OOV token for bert-base-chinese + "好的😃", # include a emoji which is common in Chinese text-based chat + ], + [ + ["晚", "上", "好"], + ["没", "问", "题", "!"], + ["去", "东", "畈", "村"], + ["好", "的", "😃"], + ], + [ + [(0, 1), (1, 2), (2, 3)], + [(0, 1), (1, 2), (2, 3), (3, 4)], + [(0, 1), (1, 2), (2, 3), (3, 4)], + [(0, 1), (1, 2), (2, 3)], + ], + [3, 4, 4, 3], + ), + ( + "gpt", + None, + [ + "Good evening.", + "hello", + "you're", + "r. n. b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["good", "evening"], + ["hello"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + ["here", "is", "the", "sentence", "i", "want", "embe", "ddings", "for"], + ], + [ + [(0, 4), (5, 12)], + [(0, 5)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 20), + (21, 22), + (23, 27), + (28, 32), + (32, 38), + (39, 42), + ], + ], + [2, 1, 2, 3, 3, 9], + ), + ( + "gpt2", + None, + [ + "Good evening.", + "hello", + "you're", + "r. n. b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["Good", "even", "ing"], + ["hello"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + [ + "here", + "is", + "the", + "sent", + "ence", + "I", + "want", + "embed", + "d", + "ings", + "for", + ], + ], + [ + [(0, 4), (5, 9), (9, 12)], + [(0, 5)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 16), + (16, 20), + (21, 22), + (23, 27), + (28, 33), + (33, 34), + (34, 38), + (39, 42), + ], + ], + [3, 1, 2, 3, 3, 11], + ), + ( + "xlnet", + None, + [ + "Good evening.", + "hello", + "you're", + "r. n. b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["Good", "evening"], + ["hello"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + [ + "here", + "is", + "the", + "sentence", + "I", + "want", + "embed", + "ding", + "s", + "for", + ], + ], + [ + [(0, 4), (5, 12)], + [(0, 5)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 20), + (21, 22), + (23, 27), + (28, 33), + (33, 37), + (37, 38), + (39, 42), + ], + ], + [4, 3, 4, 5, 5, 12], + ), + ( + "distilbert", + None, + [ + "Good evening.", + "you're", + "r. n. 
b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["good", "evening"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + [ + "here", + "is", + "the", + "sentence", + "i", + "want", + "em", + "bed", + "ding", + "s", + "for", + ], + ], + [ + [(0, 4), (5, 12)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 20), + (21, 22), + (23, 27), + (28, 30), + (30, 33), + (33, 37), + (37, 38), + (39, 42), + ], + ], + [4, 4, 5, 5, 13], + ), + ( + "roberta", + None, + [ + "Good evening.", + "hello", + "you're", + "r. n. b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["Good", "even", "ing"], + ["hello"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + [ + "here", + "is", + "the", + "sent", + "ence", + "I", + "want", + "embed", + "d", + "ings", + "for", + ], + ], + [ + [(0, 4), (5, 9), (9, 12)], + [(0, 5)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 16), + (16, 20), + (21, 22), + (23, 27), + (28, 33), + (33, 34), + (34, 38), + (39, 42), + ], + ], + [5, 3, 4, 5, 5, 13], + ), + ], +) +@pytest.mark.skip_on_windows +def test_hf_transformers_edge_cases( + model_name, + model_weights, + texts, + expected_tokens, + expected_indices, + expected_num_token_ids, +): + + if model_weights is None: + model_weights_config = {} + else: + model_weights_config = {"model_weights": model_weights} + transformers_config = {**{"model_name": model_name}, **model_weights_config} + + transformers_nlp = HFTransformersNLP(transformers_config) + whitespace_tokenizer = WhitespaceTokenizer() + + for text, gt_tokens, gt_indices, gt_num_indices in zip( + texts, expected_tokens, expected_indices, expected_num_token_ids + ): + + message = Message.build(text=text) + tokens = whitespace_tokenizer.tokenize(message, TEXT) + message.set(TOKENS_NAMES[TEXT], tokens) + transformers_nlp.process(message) + token_ids = message.get(LANGUAGE_MODEL_DOCS[TEXT])[TOKEN_IDS] + + assert [t.text for t in tokens] == gt_tokens + assert [t.start for t in tokens] == [i[0] for i in gt_indices] + assert [t.end for t in tokens] == [i[1] for i in gt_indices] + assert len(token_ids) == gt_num_indices + +@pytest.mark.parametrize( + "text, expected_tokens", + [ + ("Forecast_for_LUNCH", ["Forecast_for_LUNCH"]), + ("Forecast for LUNCH", ["Forecast for LUNCH"]), + ("Forecast+for+LUNCH", ["Forecast", "for", "LUNCH"]), + ], +) +@pytest.mark.skip_on_windows +def test_hf_transformers_custom_intent_symbol(text, expected_tokens): + + transformers_config = {"model_name": "bert"} # Test for one should be enough + + transformers_nlp = HFTransformersNLP(transformers_config) + whitespace_tokenizer = WhitespaceTokenizer() + + message = Message.build(text=text) + message.set(INTENT, text) + + td = TrainingData([message]) + + whitespace_tokenizer.train(td) + transformers_nlp.train(td) + + assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens + +@pytest.mark.parametrize( + "text, expected_number_of_sub_tokens", + [("sentence embeddings", [1, 4]), ("this is a test", [1, 1, 1, 1])], +) +@pytest.mark.skip_on_windows +def test_hf_transformers_number_of_sub_tokens(text, expected_number_of_sub_tokens): + transformers_config = {"model_name": "bert"} # Test for one should be enough + + transformers_nlp = HFTransformersNLP(transformers_config) + whitespace_tokenizer = WhitespaceTokenizer() + + message = Message.build(text=text) + + td = 
TrainingData([message]) + whitespace_tokenizer.train(td) + transformers_nlp.train(td) + + assert [ + t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT]) + ] == expected_number_of_sub_tokens From e4cc85b800d963f797a0116925639cacb8091274 Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Wed, 14 Oct 2020 14:02:26 +0200 Subject: [PATCH 02/31] Updated Components doc to reflect deprecation of LanguageModelTokenizer --- docs/docs/components.mdx | 50 ++++++++-------------------------------- 1 file changed, 9 insertions(+), 41 deletions(-) diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index d94ad71cf26a..902db5ff830b 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -162,7 +162,7 @@ word vectors in your pipeline. Initializes specified pre-trained language model from HuggingFace's [Transformers library](https://huggingface.co/transformers/). The component applies language model specific tokenization and featurization to compute sequence and sentence level representations for each example in the training data. - Include [LanguageModelTokenizer](./components.mdx#languagemodeltokenizer) and [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) to utilize the output of this + Include a [tokenizer](./components.mdx#tokenizers) and [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) to utilize the output of this component for downstream NLU models. :::note @@ -458,46 +458,6 @@ word vectors in your pipeline. ``` - ### LanguageModelTokenizer - - - * **Short** - - Tokenizer from pre-trained language models - - - - * **Outputs** - - `tokens` for user messages, responses (if present), and intents (if specified) - - - - * **Requires** - - [HFTransformersNLP](./components.mdx#hftransformersnlp) - - - - * **Description** - - Creates tokens using the pre-trained language model specified in upstream [HFTransformersNLP](./components.mdx#hftransformersnlp) component. - Must be used whenever the [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) is used. - - - - * **Configuration** - - ```yaml-rasa - pipeline: - - name: "LanguageModelTokenizer" - # Flag to check whether to split intents - "intent_tokenization_flag": False - # Symbol on which intent should be split - "intent_split_symbol": "_" - ``` - - ## Featurizers Text featurizers are divided into two different categories: sparse featurizers and dense featurizers. @@ -2740,3 +2700,11 @@ the matrix contains a feature vector for every token in the sequence. The sentence features are represented by a matrix of size `(1 x feature-dimension)`. ::: + +## Deprecated Components + +:::caution Deprecated +The `LanguageModelTokenizer` is deprecated. The [HFTransformersNLP](./components.mdx#hftransformersnlp) now implements +its behaviour. Any [tokenizer](./components.mdx#tokenizers) can be used in its place; this must be placed before the +[HFTransformersNLP](./components.mdx#hftransformersnlp) Component. +::: \ No newline at end of file From 955312d92b4660152f9279c17475cfa70931916c Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Wed, 14 Oct 2020 14:18:04 +0200 Subject: [PATCH 03/31] Updated Components doc to reflect decoupling of LMFeaturizer and LMTokenizer --- docs/docs/components.mdx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index 902db5ff830b..17d22712b02b 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -154,7 +154,7 @@ word vectors in your pipeline. 
* **Requires** - Nothing + `tokens` @@ -652,7 +652,7 @@ Note: The `feature-dimension` for sequence and sentence features does not have t * **Requires** - [HFTransformersNLP](./components.mdx#hftransformersnlp) and [LanguageModelTokenizer](./components.mdx#languagemodeltokenizer) + `tokens` and [HFTransformersNLP](./components.mdx#hftransformersnlp). @@ -678,8 +678,8 @@ Note: The `feature-dimension` for sequence and sentence features does not have t * **Configuration** - Include [HFTransformersNLP](./components.mdx#hftransformersnlp) and [LanguageModelTokenizer](./components.mdx#languagemodeltokenizer) components before this component. Use - [LanguageModelTokenizer](./components.mdx#languagemodeltokenizer) to ensure tokens are correctly set for all components throughout the pipeline. + Include [HFTransformersNLP](./components.mdx#hftransformersnlp) and a [Tokenizer](./components.mdx#tokenizers) components before this component. Use + [HFTransformersNLP](./components.mdx#hftransformersnlp) to ensure tokens are correctly set for all components throughout the pipeline. ```yaml-rasa pipeline: From 2466515ebaa4921ad264efbcba80adca8a82d8fd Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Thu, 15 Oct 2020 17:19:20 +0200 Subject: [PATCH 04/31] Reformatted lm_tokenizer and hf_transformers --- rasa/nlu/tokenizers/lm_tokenizer.py | 2 +- tests/nlu/utils/test_hf_transformers.py | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py index 778d664b1c06..66cb41f4ee3c 100644 --- a/rasa/nlu/tokenizers/lm_tokenizer.py +++ b/rasa/nlu/tokenizers/lm_tokenizer.py @@ -21,4 +21,4 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: f"It is recommended to use the '{WhitespaceTokenizer.__name__}' instead.", category=DeprecationWarning, docs=DOCS_URL_MIGRATION_GUIDE, - ) \ No newline at end of file + ) diff --git a/tests/nlu/utils/test_hf_transformers.py b/tests/nlu/utils/test_hf_transformers.py index c8ceadf14c2f..22957e52d3e7 100644 --- a/tests/nlu/utils/test_hf_transformers.py +++ b/tests/nlu/utils/test_hf_transformers.py @@ -9,12 +9,11 @@ from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.nlu.constants import ( TOKENS_NAMES, - NUMBER_OF_SUB_TOKENS, LANGUAGE_MODEL_DOCS, TOKEN_IDS -) -from rasa.shared.nlu.constants import ( - TEXT, - INTENT + NUMBER_OF_SUB_TOKENS, + LANGUAGE_MODEL_DOCS, + TOKEN_IDS, ) +from rasa.shared.nlu.constants import TEXT, INTENT @pytest.mark.parametrize( @@ -132,6 +131,7 @@ def test_attention_mask( assert np.all(mask_ones == 1) assert np.all(mask_zeros == 0) + # TODO: need to fix this failing test @pytest.mark.skip(reason="Results in random crashing of github action workers") @pytest.mark.parametrize( @@ -492,6 +492,7 @@ def test_hf_transformers_edge_cases( assert [t.end for t in tokens] == [i[1] for i in gt_indices] assert len(token_ids) == gt_num_indices + @pytest.mark.parametrize( "text, expected_tokens", [ @@ -518,6 +519,7 @@ def test_hf_transformers_custom_intent_symbol(text, expected_tokens): assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens + @pytest.mark.parametrize( "text, expected_number_of_sub_tokens", [("sentence embeddings", [1, 4]), ("this is a test", [1, 1, 1, 1])], @@ -536,5 +538,5 @@ def test_hf_transformers_number_of_sub_tokens(text, expected_number_of_sub_token transformers_nlp.train(td) assert [ - t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT]) - ] == 
expected_number_of_sub_tokens + t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT]) + ] == expected_number_of_sub_tokens From a7fcffb524edc5d44ecf1e6b56cc517f0073b23f Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Thu, 15 Oct 2020 17:22:55 +0200 Subject: [PATCH 05/31] Removed ConveRTTokenizer and moved its logic into ConveRTFeaturizer. Also moved tests from test_convert_tokenizer into test_convert_featurizer and adjusted accordingly --- .../dense_featurizer/convert_featurizer.py | 70 +++++++++++- rasa/nlu/tokenizers/convert_tokenizer.py | 100 +++--------------- .../featurizers/test_convert_featurizer.py | 90 +++++++++++++--- .../nlu/tokenizers/test_convert_tokenizer.py | 79 -------------- 4 files changed, 156 insertions(+), 183 deletions(-) delete mode 100644 tests/nlu/tokenizers/test_convert_tokenizer.py diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 9d65e3ef3460..47a0091326eb 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -4,9 +4,11 @@ from tqdm import tqdm import rasa.shared.utils.io -from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer +from rasa.core.utils import get_dict_hash +from rasa.utils import common +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer +from rasa.nlu.model import Metadata from rasa.shared.constants import DOCS_URL_COMPONENTS -from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.components import Component from rasa.nlu.featurizers.featurizer import DenseFeaturizer from rasa.shared.nlu.training_data.features import Features @@ -17,6 +19,7 @@ DENSE_FEATURIZABLE_ATTRIBUTES, FEATURIZER_CLASS_ALIAS, TOKENS_NAMES, + NUMBER_OF_SUB_TOKENS, ) from rasa.shared.nlu.constants import TEXT, FEATURE_TYPE_SENTENCE, FEATURE_TYPE_SEQUENCE import numpy as np @@ -26,6 +29,10 @@ logger = logging.getLogger(__name__) +TF_HUB_MODULE_URL = ( + "https://github.com/connorbrinton/polyai-models/releases/download/v1.0/model.tar.gz" +) + class ConveRTFeaturizer(DenseFeaturizer): """Featurizer using ConveRT model. 
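+
+    With the removal of the ConveRTTokenizer, this featurizer also computes
+    the ConveRT sub-word tokens itself (see ``tokenize`` below), so any plain
+    tokenizer can run upstream of it.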
@@ -37,7 +44,7 @@ class ConveRTFeaturizer(DenseFeaturizer): @classmethod def required_components(cls) -> List[Type[Component]]: - return [ConveRTTokenizer] + return [Tokenizer] @classmethod def required_packages(cls) -> List[Text]: @@ -46,6 +53,11 @@ def required_packages(cls) -> List[Text]: def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: super(ConveRTFeaturizer, self).__init__(component_config) + self.model_url = self.component_config.get("model_url", TF_HUB_MODULE_URL) + + self.module = train_utils.load_tf_hub_model(self.model_url) + + self.tokenize_signature = self.module.signatures["tokenize"] @staticmethod def __get_signature(signature: Text, module: Any) -> NoReturn: @@ -92,7 +104,7 @@ def _compute_sequence_encodings( self, batch_examples: List[Message], module: Any, attribute: Text = TEXT ) -> Tuple[np.ndarray, List[int]]: list_of_tokens = [ - example.get(TOKENS_NAMES[attribute]) for example in batch_examples + self.tokenize(example, attribute) for example in batch_examples ] number_of_tokens_in_sentence = [ @@ -249,3 +261,53 @@ def _set_features( self.component_config[FEATURIZER_CLASS_ALIAS], ) example.add_features(_sentence_features) + + @classmethod + def cache_key( + cls, component_meta: Dict[Text, Any], model_metadata: Metadata + ) -> Optional[Text]: + _config = common.update_existing_keys(cls.defaults, component_meta) + return f"{cls.name}-{get_dict_hash(_config)}" + + def provide_context(self) -> Dict[Text, Any]: + return {"tf_hub_module": self.module} + + def _tokenize(self, sentence: Text) -> Any: + + return self.tokenize_signature(tf.convert_to_tensor([sentence]))[ + "default" + ].numpy() + + def tokenize(self, message: Message, attribute: Text) -> List[Token]: + """Tokenize the text using the ConveRT model. + ConveRT adds a special char in front of (some) words and splits words into + sub-words. To ensure the entity start and end values matches the token values, + tokenize the text first using the whitespace tokenizer. If individual tokens + are split up into multiple tokens, add this information to the + respected tokens. 
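+
+        Example (illustrative, mirroring the tests in this patch): for the
+        text "sentence embeddings" the whitespace tokens
+        ["sentence", "embeddings"] are returned unchanged, while their
+        NUMBER_OF_SUB_TOKENS values become 1 and 3, recording ConveRT's
+        sub-word split.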
+ """ + + tokens_in = message.get(TOKENS_NAMES[attribute]) + + tokens_out = [] + + for token in tokens_in: + # use ConveRT model to tokenize the text + split_token_strings = self._tokenize(token.text)[0] + + # clean tokens (remove special chars and empty tokens) + split_token_strings = self._clean_tokens(split_token_strings) + + token.set(NUMBER_OF_SUB_TOKENS, len(split_token_strings)) + + tokens_out.append(token) + + message.set(TOKENS_NAMES[attribute], tokens_out) + return tokens_out + + @staticmethod + def _clean_tokens(tokens: List[bytes]) -> List[Text]: + """Encode tokens and remove special char added by ConveRT.""" + + tokens = [string.decode("utf-8").replace("﹏", "") for string in tokens] + return [string for string in tokens if string] diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py index f439b0cf0630..627b7d714a44 100644 --- a/rasa/nlu/tokenizers/convert_tokenizer.py +++ b/rasa/nlu/tokenizers/convert_tokenizer.py @@ -1,96 +1,24 @@ -from typing import Any, Dict, List, Optional, Text +from typing import Dict, Text, Any -from rasa.core.utils import get_dict_hash -from rasa.nlu.constants import NUMBER_OF_SUB_TOKENS -from rasa.nlu.model import Metadata -from rasa.nlu.tokenizers.tokenizer import Token +import rasa.shared.utils.io from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer -from rasa.shared.nlu.training_data.message import Message -from rasa.utils import common -import rasa.utils.train_utils as train_utils -import tensorflow as tf - - -TF_HUB_MODULE_URL = ( - "https://github.com/PolyAI-LDN/polyai-models/releases/download/v1.0/model.tar.gz" -) +from rasa.shared.constants import DOCS_URL_MIGRATION_GUIDE class ConveRTTokenizer(WhitespaceTokenizer): - """Tokenizer using ConveRT model. - Loads the ConveRT(https://github.com/PolyAI-LDN/polyai-models#convert) - model from TFHub and computes sub-word tokens for dense - featurizable attributes of each message object. """ + This tokenizer is deprecated and will be removed in the future. - defaults = { - # Flag to check whether to split intents - "intent_tokenization_flag": False, - # Symbol on which intent should be split - "intent_split_symbol": "_", - # Regular expression to detect tokens - "token_pattern": None, - # Remote URL of hosted model - "model_url": TF_HUB_MODULE_URL, - } + The ConveRTFeaturizer component now sets the sub-token information + for dense featurizable attributes of each message object. + """ def __init__(self, component_config: Dict[Text, Any] = None) -> None: - """Construct a new tokenizer using the WhitespaceTokenizer framework.""" - super().__init__(component_config) - - self.model_url = self.component_config.get("model_url", TF_HUB_MODULE_URL) - - self.module = train_utils.load_tf_hub_model(self.model_url) - - self.tokenize_signature = self.module.signatures["tokenize"] - - @classmethod - def cache_key( - cls, component_meta: Dict[Text, Any], model_metadata: Metadata - ) -> Optional[Text]: - _config = common.update_existing_keys(cls.defaults, component_meta) - return f"{cls.name}-{get_dict_hash(_config)}" - - def provide_context(self) -> Dict[Text, Any]: - return {"tf_hub_module": self.module} - - def _tokenize(self, sentence: Text) -> Any: - - return self.tokenize_signature(tf.convert_to_tensor([sentence]))[ - "default" - ].numpy() - - def tokenize(self, message: Message, attribute: Text) -> List[Token]: - """Tokenize the text using the ConveRT model. - ConveRT adds a special char in front of (some) words and splits words into - sub-words. 
To ensure the entity start and end values matches the token values, - tokenize the text first using the whitespace tokenizer. If individual tokens - are split up into multiple tokens, add this information to the - respected tokens. - """ - - # perform whitespace tokenization - tokens_in = super().tokenize(message, attribute) - - tokens_out = [] - - for token in tokens_in: - # use ConveRT model to tokenize the text - split_token_strings = self._tokenize(token.text)[0] - - # clean tokens (remove special chars and empty tokens) - split_token_strings = self._clean_tokens(split_token_strings) - - token.set(NUMBER_OF_SUB_TOKENS, len(split_token_strings)) - - tokens_out.append(token) - - return tokens_out - - @staticmethod - def _clean_tokens(tokens: List[bytes]) -> List[Text]: - """Encode tokens and remove special char added by ConveRT.""" - - tokens = [string.decode("utf-8").replace("﹏", "") for string in tokens] - return [string for string in tokens if string] + rasa.shared.utils.io.raise_warning( + f"'{self.__class__.__name__}' is deprecated and " + f"will be removed in the future. " + f"It is recommended to use the '{WhitespaceTokenizer.__name__}' instead.", + category=DeprecationWarning, + docs=DOCS_URL_MIGRATION_GUIDE, + ) diff --git a/tests/nlu/featurizers/test_convert_featurizer.py b/tests/nlu/featurizers/test_convert_featurizer.py index a2f170b9fe57..db5e858374d0 100644 --- a/tests/nlu/featurizers/test_convert_featurizer.py +++ b/tests/nlu/featurizers/test_convert_featurizer.py @@ -1,30 +1,32 @@ import numpy as np import pytest -from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer +from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.shared.nlu.training_data.message import Message -from rasa.nlu.constants import TOKENS_NAMES +from rasa.nlu.constants import TOKENS_NAMES, NUMBER_OF_SUB_TOKENS from rasa.shared.nlu.constants import TEXT, INTENT, RESPONSE from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.featurizers.dense_featurizer.convert_featurizer import ConveRTFeaturizer # TODO -# skip tests as the ConveRT model is not publicly available anymore (see https://github.com/RasaHQ/rasa/issues/6806) +# skip tests as the ConveRT model is not publicly available anymore (see +# https://github.com/RasaHQ/rasa/issues/6806) @pytest.mark.skip def test_convert_featurizer_process(component_builder): - tokenizer = component_builder.create_component_from_class(ConveRTTokenizer) + tokenizer = WhitespaceTokenizer() featurizer = component_builder.create_component_from_class(ConveRTFeaturizer) - sentence = "Hey how are you today ?" 
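+    # tokens are now set by an upstream tokenizer (here a WhitespaceTokenizer
+    # trained on the message) before the featurizer sub-tokenizes them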
- message = Message(data={TEXT: sentence}) - tokens = tokenizer.tokenize(message, attribute=TEXT) - message.set(TOKENS_NAMES[TEXT], tokens) + message = Message.build(text=sentence) - featurizer.process(message, tf_hub_module=tokenizer.module) + td = TrainingData([message]) + tokenizer.train(td) + tokens = featurizer.tokenize(message, attribute=TEXT) + + featurizer.process(message, tf_hub_module=featurizer.module) expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456]) expected_cls = np.array( @@ -43,20 +45,23 @@ def test_convert_featurizer_process(component_builder): @pytest.mark.skip def test_convert_featurizer_train(component_builder): - tokenizer = component_builder.create_component_from_class(ConveRTTokenizer) + tokenizer = WhitespaceTokenizer() featurizer = component_builder.create_component_from_class(ConveRTFeaturizer) sentence = "Hey how are you today ?" message = Message(data={TEXT: sentence}) message.set(RESPONSE, sentence) - tokens = tokenizer.tokenize(message, attribute=TEXT) + td = TrainingData([message]) + tokenizer.train(td) + + tokens = featurizer.tokenize(message, attribute=TEXT) message.set(TOKENS_NAMES[TEXT], tokens) message.set(TOKENS_NAMES[RESPONSE], tokens) featurizer.train( - TrainingData([message]), RasaNLUModelConfig(), tf_hub_module=tokenizer.module + TrainingData([message]), RasaNLUModelConfig(), tf_hub_module=featurizer.module ) expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456]) @@ -100,9 +105,66 @@ def test_convert_featurizer_train(component_builder): ) @pytest.mark.skip def test_convert_featurizer_tokens_to_text(component_builder, sentence, expected_text): - tokenizer = component_builder.create_component_from_class(ConveRTTokenizer) - tokens = tokenizer.tokenize(Message(data={TEXT: sentence}), attribute=TEXT) + tokenizer = WhitespaceTokenizer() + featurizer = component_builder.create_component_from_class(ConveRTFeaturizer) + message = Message.build(text=sentence) + td = TrainingData([message]) + tokenizer.train(td) + tokens = featurizer.tokenize(message, attribute=TEXT) actual_text = ConveRTFeaturizer._tokens_to_text([tokens])[0] assert expected_text == actual_text + + +@pytest.mark.parametrize( + "text, expected_tokens, expected_indices", + [ + ( + "forecast for lunch", + ["forecast", "for", "lunch"], + [(0, 8), (9, 12), (13, 18)], + ), + ("hello", ["hello"], [(0, 5)]), + ("you're", ["you", "re"], [(0, 3), (4, 6)]), + ("r. n. 
b.", ["r", "n", "b"], [(0, 1), (3, 4), (6, 7)]), + ("rock & roll", ["rock", "&", "roll"], [(0, 4), (5, 6), (7, 11)]), + ("ńöñàśçií", ["ńöñàśçií"], [(0, 8)]), + ], +) +@pytest.mark.skip +def test_convert_featurizer_token_edge_cases( + component_builder, text, expected_tokens, expected_indices +): + tokenizer = WhitespaceTokenizer() + featurizer = component_builder.create_component_from_class(ConveRTFeaturizer) + message = Message.build(text=text) + td = TrainingData([message]) + tokenizer.train(td) + tokens = featurizer.tokenize(message, attribute=TEXT) + + assert [t.text for t in tokens] == expected_tokens + assert [t.start for t in tokens] == [i[0] for i in expected_indices] + assert [t.end for t in tokens] == [i[1] for i in expected_indices] + + +@pytest.mark.parametrize( + "text, expected_number_of_sub_tokens", + [("Aarhus is a city", [2, 1, 1, 1]), ("sentence embeddings", [1, 3])], +) +@pytest.mark.skip +def test_convert_featurizer_number_of_sub_tokens( + component_builder, text, expected_number_of_sub_tokens +): + tokenizer = WhitespaceTokenizer() + featurizer = component_builder.create_component_from_class(ConveRTFeaturizer) + + message = Message.build(text=text) + td = TrainingData([message]) + tokenizer.train(td) + + tokens = featurizer.tokenize(message, attribute=TEXT) + + assert [ + t.get(NUMBER_OF_SUB_TOKENS) for t in tokens + ] == expected_number_of_sub_tokens diff --git a/tests/nlu/tokenizers/test_convert_tokenizer.py b/tests/nlu/tokenizers/test_convert_tokenizer.py deleted file mode 100644 index e53bb7ecd8e0..000000000000 --- a/tests/nlu/tokenizers/test_convert_tokenizer.py +++ /dev/null @@ -1,79 +0,0 @@ -import pytest - -from rasa.shared.nlu.training_data.training_data import TrainingData -from rasa.shared.nlu.training_data.message import Message -from rasa.nlu.constants import TOKENS_NAMES, NUMBER_OF_SUB_TOKENS -from rasa.shared.nlu.constants import TEXT, INTENT -from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer - -# TODO -# skip tests as the ConveRT model is not publicly available anymore (see https://github.com/RasaHQ/rasa/issues/6806) - - -@pytest.mark.parametrize( - "text, expected_tokens, expected_indices", - [ - ( - "forecast for lunch", - ["forecast", "for", "lunch"], - [(0, 8), (9, 12), (13, 18)], - ), - ("hello", ["hello"], [(0, 5)]), - ("you're", ["you", "re"], [(0, 3), (4, 6)]), - ("r. n. 
b.", ["r", "n", "b"], [(0, 1), (3, 4), (6, 7)]), - ("rock & roll", ["rock", "&", "roll"], [(0, 4), (5, 6), (7, 11)]), - ("ńöñàśçií", ["ńöñàśçií"], [(0, 8)]), - ], -) -@pytest.mark.skip -def test_convert_tokenizer_edge_cases( - component_builder, text, expected_tokens, expected_indices -): - tk = component_builder.create_component_from_class(ConveRTTokenizer) - - tokens = tk.tokenize(Message(data={TEXT: text}), attribute=TEXT) - - assert [t.text for t in tokens] == expected_tokens - assert [t.start for t in tokens] == [i[0] for i in expected_indices] - assert [t.end for t in tokens] == [i[1] for i in expected_indices] - - -@pytest.mark.parametrize( - "text, expected_tokens", - [ - ("Forecast_for_LUNCH", ["Forecast_for_LUNCH"]), - ("Forecast for LUNCH", ["Forecast for LUNCH"]), - ], -) -@pytest.mark.skip -def test_custom_intent_symbol(component_builder, text, expected_tokens): - tk = component_builder.create_component_from_class( - ConveRTTokenizer, intent_tokenization_flag=True, intent_split_symbol="+" - ) - - message = Message(data={TEXT: text}) - message.set(INTENT, text) - - tk.train(TrainingData([message])) - - assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens - - -@pytest.mark.parametrize( - "text, expected_number_of_sub_tokens", - [("Aarhus is a city", [2, 1, 1, 1]), ("sentence embeddings", [1, 3])], -) -@pytest.mark.skip -def test_convert_tokenizer_number_of_sub_tokens( - component_builder, text, expected_number_of_sub_tokens -): - tk = component_builder.create_component_from_class(ConveRTTokenizer) - - message = Message(data={TEXT: text}) - message.set(INTENT, text) - - tk.train(TrainingData([message])) - - assert [ - t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT]) - ] == expected_number_of_sub_tokens From adf3c312ea6efe0707561ece38cbbb1c521daae8 Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Thu, 15 Oct 2020 17:36:05 +0200 Subject: [PATCH 06/31] Changed ConveRT model url back to official (broken) poly ai url --- rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 47a0091326eb..97a45ee6eb7a 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -30,7 +30,7 @@ logger = logging.getLogger(__name__) TF_HUB_MODULE_URL = ( - "https://github.com/connorbrinton/polyai-models/releases/download/v1.0/model.tar.gz" + "https://github.com/PolyAI-LDN/polyai-models/releases/download/v1.0/model.tar.gz" ) From df92bec8d8d7ec481e5618905b2607802749e817 Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Thu, 15 Oct 2020 17:48:50 +0200 Subject: [PATCH 07/31] Updated documentation to reflect deprecation of ConveRTTokenizer --- docs/docs/components.mdx | 60 ++++------------------------------------ 1 file changed, 6 insertions(+), 54 deletions(-) diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index 17d22712b02b..8bfa1c19875c 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -404,59 +404,6 @@ word vectors in your pipeline. ``` - ### ConveRTTokenizer - - - * **Short** - - Tokenizer using [ConveRT](https://github.com/PolyAI-LDN/polyai-models#convert) model. 
- - - - * **Outputs** - - `tokens` for user messages, responses (if present), and intents (if specified) - - - - * **Requires** - - Nothing - - - - * **Description** - - Creates tokens using the ConveRT tokenizer. Must be used whenever the [ConveRTFeaturizer](./components.mdx#convertfeaturizer) is used. - - :::note - Since `ConveRT` model is trained only on an English corpus of conversations, this tokenizer should only - be used if your training data is in English language. - - ::: - - :::note - To use `ConveRTTokenizer`, install Rasa Open Source with `pip3 install rasa[convert]`. - - ::: - - - - * **Configuration** - - ```yaml-rasa - pipeline: - - name: "ConveRTTokenizer" - # Flag to check whether to split intents - "intent_tokenization_flag": False - # Symbol on which intent should be split - "intent_split_symbol": "_" - # Regular expression to detect tokens - "token_pattern": None - # Remote URL of hosted model - "model_url": TF_HUB_MODULE_URL - ``` - ## Featurizers @@ -598,7 +545,7 @@ Note: The `feature-dimension` for sequence and sentence features does not have t * **Requires** - [ConveRTTokenizer](./components.mdx#converttokenizer) + `tokens` @@ -2707,4 +2654,9 @@ The sentence features are represented by a matrix of size `(1 x feature-dimensio The `LanguageModelTokenizer` is deprecated. The [HFTransformersNLP](./components.mdx#hftransformersnlp) now implements its behaviour. Any [tokenizer](./components.mdx#tokenizers) can be used in its place; this must be placed before the [HFTransformersNLP](./components.mdx#hftransformersnlp) Component. +::: + +:::caution Deprecated +The `ConveRTokenizer` is deprecated. The [ConveRTFeaturizer](./components.mdx#convertfeaturizer) now implements +its behaviour. Any [tokenizer](./components.mdx#tokenizers) can be used in its place. ::: \ No newline at end of file From ddea5183f23f6bf1dab86b96c4fa6a8c362c6ca7 Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Sun, 25 Oct 2020 20:06:10 +0100 Subject: [PATCH 08/31] Moved all featurizer and tokenizer logic from HFTransformersNLP to LanguageModelFeaturizer. HFTransformersNLP is deprecated; however will still work as before if included in pipeline (i.e. 
LanguageModelFeaturizer does not overwrite tokens or features) --- .../dense_featurizer/lm_featurizer.py | 736 +++++++++++++++++- rasa/nlu/tokenizers/lm_tokenizer.py | 2 - .../nlu/utils/hugging_face/hf_transformers.py | 27 +- tests/nlu/featurizers/test_lm_featurizer.py | 565 +++++++++++++- tests/nlu/utils/test_hf_transformers.py | 61 +- 5 files changed, 1294 insertions(+), 97 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index e28346560555..79e7ff8b6435 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -1,21 +1,47 @@ -from typing import Any, Optional, Text, List, Type +import numpy as np +import logging +from typing import Any, Optional, Text, List, Type, Dict, Tuple + +from rasa.core.utils import get_dict_hash from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.components import Component from rasa.nlu.featurizers.featurizer import DenseFeaturizer +from rasa.nlu.model import Metadata from rasa.shared.nlu.training_data.features import Features -from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP -from rasa.nlu.tokenizers.tokenizer import Tokenizer +from rasa.nlu.tokenizers.tokenizer import Tokenizer, Token from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.shared.nlu.training_data.message import Message from rasa.nlu.constants import ( - LANGUAGE_MODEL_DOCS, DENSE_FEATURIZABLE_ATTRIBUTES, SEQUENCE_FEATURES, SENTENCE_FEATURES, FEATURIZER_CLASS_ALIAS, + NO_LENGTH_RESTRICTION, + NUMBER_OF_SUB_TOKENS, + TOKEN_IDS, + TOKENS, + TOKENS_NAMES, + LANGUAGE_MODEL_DOCS, ) -from rasa.shared.nlu.constants import TEXT, FEATURE_TYPE_SENTENCE, FEATURE_TYPE_SEQUENCE +from rasa.shared.nlu.constants import ( + TEXT, + FEATURE_TYPE_SENTENCE, + FEATURE_TYPE_SEQUENCE, + ACTION_TEXT, +) +from rasa.utils import train_utils + +MAX_SEQUENCE_LENGTHS = { + "bert": 512, + "gpt": 512, + "gpt2": 512, + "xlnet": NO_LENGTH_RESTRICTION, + "distilbert": 512, + "roberta": 512, +} + +logger = logging.getLogger(__name__) class LanguageModelFeaturizer(DenseFeaturizer): @@ -25,9 +51,629 @@ class LanguageModelFeaturizer(DenseFeaturizer): level representations for dense featurizable attributes of each message object. """ + defaults = { + # name of the language model to load. + "model_name": "bert", + # Pre-Trained weights to be loaded(string) + "model_weights": None, + # an optional path to a specific directory to download + # and cache the pre-trained model weights. + "cache_dir": None, + } + @classmethod def required_components(cls) -> List[Type[Component]]: - return [HFTransformersNLP, Tokenizer] + return [Tokenizer] + + def __init__( + self, + component_config: Optional[Dict[Text, Any]] = None, + skip_model_load: bool = False, + ) -> None: + super(LanguageModelFeaturizer, self).__init__(component_config) + + self._load_model_metadata() + self._load_model_instance(skip_model_load) + + def _load_model_metadata(self) -> None: + + from rasa.nlu.utils.hugging_face.registry import ( + model_class_dict, + model_weights_defaults, + ) + + self.model_name = self.component_config["model_name"] + + if self.model_name not in model_class_dict: + raise KeyError( + f"'{self.model_name}' not a valid model name. Choose from " + f"{str(list(model_class_dict.keys()))} or create" + f"a new class inheriting from this class to support your model." 
+ ) + + self.model_weights = self.component_config["model_weights"] + self.cache_dir = self.component_config["cache_dir"] + + if not self.model_weights: + logger.info( + f"Model weights not specified. Will choose default model weights: " + f"{model_weights_defaults[self.model_name]}" + ) + self.model_weights = model_weights_defaults[self.model_name] + + self.max_model_sequence_length = MAX_SEQUENCE_LENGTHS[self.model_name] + + def _load_model_instance(self, skip_model_load: bool) -> None: + """Try loading the model instance + + Args: + skip_model_load: Skip loading the model instances to save time. This should be True only for pytests + """ + + if skip_model_load: + # This should be True only during pytests + return + + from rasa.nlu.utils.hugging_face.registry import ( + model_class_dict, + model_tokenizer_dict, + ) + + logger.debug(f"Loading Tokenizer and Model for {self.model_name}") + + self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained( + self.model_weights, cache_dir=self.cache_dir + ) + self.model = model_class_dict[self.model_name].from_pretrained( + self.model_weights, cache_dir=self.cache_dir + ) + + # Use a universal pad token since all transformer architectures do not have a + # consistent token. Instead of pad_token_id we use unk_token_id because + # pad_token_id is not set for all architectures. We can't add a new token as + # well since vocabulary resizing is not yet supported for TF classes. + # Also, this does not hurt the model predictions since we use an attention mask + # while feeding input. + self.pad_token_id = self.tokenizer.unk_token_id + + @classmethod + def cache_key( + cls, component_meta: Dict[Text, Any], model_metadata: Metadata + ) -> Optional[Text]: + + weights = component_meta.get("model_weights") or {} + + return f"{cls.name}-{component_meta.get('model_name')}-{get_dict_hash(weights)}" + + @classmethod + def required_packages(cls) -> List[Text]: + return ["transformers"] + + def _lm_tokenize(self, text: Text) -> Tuple[List[int], List[Text]]: + """Pass the text through the tokenizer of the language model. + + Args: + text: Text to be tokenized. + + Returns: + List of token ids and token strings. + + """ + split_token_ids = self.tokenizer.encode(text, add_special_tokens=False) + + split_token_strings = self.tokenizer.convert_ids_to_tokens(split_token_ids) + + return split_token_ids, split_token_strings + + def _add_lm_specific_special_tokens( + self, token_ids: List[List[int]] + ) -> List[List[int]]: + """Add language model specific special tokens which were used during their training. + + Args: + token_ids: List of token ids for each example in the batch. + + Returns: + Augmented list of token ids for each example in the batch. + """ + from rasa.nlu.utils.hugging_face.registry import ( + model_special_tokens_pre_processors, + ) + + augmented_tokens = [ + model_special_tokens_pre_processors[self.model_name](example_token_ids) + for example_token_ids in token_ids + ] + return augmented_tokens + + def _lm_specific_token_cleanup( + self, split_token_ids: List[int], token_strings: List[Text] + ) -> Tuple[List[int], List[Text]]: + """Clean up special chars added by tokenizers of language models. + + Many language models add a special char in front/back of (some) words. We clean + up those chars as they are not + needed once the features are already computed. + + Args: + split_token_ids: List of token ids received as output from the language + model specific tokenizer. 
+ token_strings: List of token strings received as output from the language + model specific tokenizer. + + Returns: + Cleaned up token ids and token strings. + """ + from rasa.nlu.utils.hugging_face.registry import model_tokens_cleaners + + return model_tokens_cleaners[self.model_name](split_token_ids, token_strings) + + def _post_process_sequence_embeddings( + self, sequence_embeddings: np.ndarray + ) -> Tuple[np.ndarray, np.ndarray]: + """Compute sentence level representations and sequence level representations + for relevant tokens. + + Args: + sequence_embeddings: Sequence level dense features received as output from + language model. + + Returns: + Sentence and sequence level representations. + """ + + from rasa.nlu.utils.hugging_face.registry import ( + model_embeddings_post_processors, + ) + + sentence_embeddings = [] + post_processed_sequence_embeddings = [] + + for example_embedding in sequence_embeddings: + ( + example_sentence_embedding, + example_post_processed_embedding, + ) = model_embeddings_post_processors[self.model_name](example_embedding) + + sentence_embeddings.append(example_sentence_embedding) + post_processed_sequence_embeddings.append(example_post_processed_embedding) + + return ( + np.array(sentence_embeddings), + np.array(post_processed_sequence_embeddings), + ) + + def _tokenize_example( + self, message: Message, attribute: Text + ) -> Tuple[List[Token], List[int]]: + """Tokenize a single message example. + + Many language models add a special char in front of (some) words and split + words into sub-words. To ensure the entity start and end values matches the + token values, use the tokens produced by the Tokenizer component. If + individual tokens are split up into multiple tokens, we add this information + to the respected token. + + Args: + message: Single message object to be processed. + attribute: Property of message to be processed, one of ``TEXT`` or + ``RESPONSE``. + + Returns: + List of token strings and token ids for the corresponding attribute of the + message. + """ + + tokens_in = message.get(TOKENS_NAMES[attribute]) + tokens_out = [] + + token_ids_out = [] + + for token in tokens_in: + # use lm specific tokenizer to further tokenize the text + split_token_ids, split_token_strings = self._lm_tokenize(token.text) + + split_token_ids, split_token_strings = self._lm_specific_token_cleanup( + split_token_ids, split_token_strings + ) + + token_ids_out += split_token_ids + + token.set(NUMBER_OF_SUB_TOKENS, len(split_token_strings)) + + tokens_out.append(token) + + return tokens_out, token_ids_out + + def _get_token_ids_for_batch( + self, batch_examples: List[Message], attribute: Text + ) -> Tuple[List[List[Token]], List[List[int]]]: + """Compute token ids and token strings for each example in batch. + + A token id is the id of that token in the vocabulary of the language model. + Args: + batch_examples: Batch of message objects for which tokens need to be + computed. + attribute: Property of message to be processed, one of ``TEXT`` or + ``RESPONSE``. + + Returns: + List of token strings and token ids for each example in the batch. 
+ """ + + batch_token_ids = [] + batch_tokens = [] + for example in batch_examples: + + example_tokens, example_token_ids = self._tokenize_example( + example, attribute + ) + batch_tokens.append(example_tokens) + batch_token_ids.append(example_token_ids) + + return batch_tokens, batch_token_ids + + @staticmethod + def _compute_attention_mask( + actual_sequence_lengths: List[int], max_input_sequence_length: int + ) -> np.ndarray: + """Compute a mask for padding tokens. + + This mask will be used by the language model so that it does not attend to + padding tokens. + + Args: + actual_sequence_lengths: List of length of each example without any padding. + max_input_sequence_length: Maximum length of a sequence that will be present in the input batch. This is + after taking into consideration the maximum input sequence the model can handle. Hence it can never be + greater than self.max_model_sequence_length in case the model applies length restriction. + + Returns: + Computed attention mask, 0 for padding and 1 for non-padding tokens. + """ + + attention_mask = [] + + for actual_sequence_length in actual_sequence_lengths: + # add 1s for present tokens, fill up the remaining space up to max + # sequence length with 0s (non-existing tokens) + padded_sequence = [1] * min( + actual_sequence_length, max_input_sequence_length + ) + [0] * ( + max_input_sequence_length + - min(actual_sequence_length, max_input_sequence_length) + ) + attention_mask.append(padded_sequence) + + attention_mask = np.array(attention_mask).astype(np.float32) + return attention_mask + + def _extract_sequence_lengths( + self, batch_token_ids: List[List[int]] + ) -> Tuple[List[int], int]: + + # Compute max length across examples + max_input_sequence_length = 0 + actual_sequence_lengths = [] + + for example_token_ids in batch_token_ids: + sequence_length = len(example_token_ids) + actual_sequence_lengths.append(sequence_length) + max_input_sequence_length = max( + max_input_sequence_length, len(example_token_ids) + ) + + # Take into account the maximum sequence length the model can handle + max_input_sequence_length = ( + max_input_sequence_length + if self.max_model_sequence_length == NO_LENGTH_RESTRICTION + else min(max_input_sequence_length, self.max_model_sequence_length) + ) + + return actual_sequence_lengths, max_input_sequence_length + + def _add_padding_to_batch( + self, batch_token_ids: List[List[int]], max_sequence_length_model: int + ) -> List[List[int]]: + """Add padding so that all examples in the batch are of the same length. + + Args: + batch_token_ids: Batch of examples where each example is a non-padded list + of token ids. + max_sequence_length_model: Maximum length of any input sequence in the batch + to be fed to the model. + + Returns: + Padded batch with all examples of the same length. + """ + padded_token_ids = [] + + # Add padding according to max_sequence_length + # Some models don't contain pad token, we use unknown token as padding token. + # This doesn't affect the computation since we compute an attention mask + # anyways. 
+ for example_token_ids in batch_token_ids: + + # Truncate any longer sequences so that they can be fed to the model + if len(example_token_ids) > max_sequence_length_model: + example_token_ids = example_token_ids[:max_sequence_length_model] + + padded_token_ids.append( + example_token_ids + + [self.pad_token_id] + * (max_sequence_length_model - len(example_token_ids)) + ) + return padded_token_ids + + @staticmethod + def _extract_nonpadded_embeddings( + embeddings: np.ndarray, actual_sequence_lengths: List[int] + ) -> np.ndarray: + """Use pre-computed non-padded lengths of each example to extract embeddings + for non-padding tokens. + + Args: + embeddings: sequence level representations for each example of the batch. + actual_sequence_lengths: non-padded lengths of each example of the batch. + + Returns: + Sequence level embeddings for only non-padding tokens of the batch. + """ + nonpadded_sequence_embeddings = [] + for index, embedding in enumerate(embeddings): + unmasked_embedding = embedding[: actual_sequence_lengths[index]] + nonpadded_sequence_embeddings.append(unmasked_embedding) + + return np.array(nonpadded_sequence_embeddings) + + def _compute_batch_sequence_features( + self, batch_attention_mask: np.ndarray, padded_token_ids: List[List[int]] + ) -> np.ndarray: + """Feed the padded batch to the language model. + + Args: + batch_attention_mask: Mask of 0s and 1s which indicate whether the token + is a padding token or not. + padded_token_ids: Batch of token ids for each example. The batch is padded + and hence can be fed at once. + + Returns: + Sequence level representations from the language model. + """ + model_outputs = self.model( + np.array(padded_token_ids), attention_mask=np.array(batch_attention_mask) + ) + + # sequence hidden states is always the first output from all models + sequence_hidden_states = model_outputs[0] + + sequence_hidden_states = sequence_hidden_states.numpy() + return sequence_hidden_states + + def _validate_sequence_lengths( + self, + actual_sequence_lengths: List[int], + batch_examples: List[Message], + attribute: Text, + inference_mode: bool = False, + ) -> None: + """Validate if sequence lengths of all inputs are less the max sequence length the model can handle + + This method should throw an error during training, whereas log a debug message during inference if + any of the input examples have a length greater than maximum sequence length allowed. + + Args: + actual_sequence_lengths: original sequence length of all inputs + batch_examples: all message instances in the batch + attribute: attribute of message object to be processed + inference_mode: Whether this is during training or during inferencing + """ + if self.max_model_sequence_length == NO_LENGTH_RESTRICTION: + # There is no restriction on sequence length from the model + return + + for sequence_length, example in zip(actual_sequence_lengths, batch_examples): + if sequence_length > self.max_model_sequence_length: + if not inference_mode: + raise RuntimeError( + f"The sequence length of '{example.get(attribute)[:20]}...' " + f"is too long({sequence_length} tokens) for the " + f"model chosen {self.model_name} which has a maximum " + f"sequence length of {self.max_model_sequence_length} tokens. Either " + f"shorten the message or use a model which has no " + f"restriction on input sequence length like XLNet." + ) + else: + logger.debug( + f"The sequence length of '{example.get(attribute)[:20]}...' 
" + f"is too long({sequence_length} tokens) for the " + f"model chosen {self.model_name} which has a maximum " + f"sequence length of {self.max_model_sequence_length} tokens. " + f"Downstream model predictions may be affected because of this." + ) + + def _add_extra_padding( + self, sequence_embeddings: np.ndarray, actual_sequence_lengths: List[int] + ) -> np.ndarray: + """ + Add extra zero padding to match the original sequence length. + + This is only done if the input was truncated during the batch preparation of input for the model. + Args: + sequence_embeddings: Embeddings returned from the model + actual_sequence_lengths: original sequence length of all inputs + + Returns: + Modified sequence embeddings with padding if necessary + """ + + if self.max_model_sequence_length == NO_LENGTH_RESTRICTION: + # No extra padding needed because there wouldn't have been any truncation in the first place + return sequence_embeddings + + reshaped_sequence_embeddings = [] + for index, embedding in enumerate(sequence_embeddings): + embedding_size = embedding.shape[-1] + if actual_sequence_lengths[index] > self.max_model_sequence_length: + embedding = np.concatenate( + [ + embedding, + np.zeros( + ( + actual_sequence_lengths[index] + - self.max_model_sequence_length, + embedding_size, + ), + dtype=np.float32, + ), + ] + ) + reshaped_sequence_embeddings.append(embedding) + + return np.array(reshaped_sequence_embeddings) + + def _get_model_features_for_batch( + self, + batch_token_ids: List[List[int]], + batch_tokens: List[List[Token]], + batch_examples: List[Message], + attribute: Text, + inference_mode: bool = False, + ) -> Tuple[np.ndarray, np.ndarray]: + """Compute dense features of each example in the batch. + + We first add the special tokens corresponding to each language model. Next, we + add appropriate padding and compute a mask for that padding so that it doesn't + affect the feature computation. The padded batch is next fed to the language + model and token level embeddings are computed. Using the pre-computed mask, + embeddings for non-padding tokens are extracted and subsequently sentence + level embeddings are computed. + + Args: + batch_token_ids: List of token ids of each example in the batch. + batch_tokens: List of token objects for each example in the batch. + batch_examples: List of examples in the batch. + attribute: attribute of the Message object to be processed. + inference_mode: Whether the call is during training or during inference. + + Returns: + Sentence and token level dense representations. 
+ """ + # Let's first add tokenizer specific special tokens to all examples + batch_token_ids_augmented = self._add_lm_specific_special_tokens( + batch_token_ids + ) + + # Compute sequence lengths for all examples + ( + actual_sequence_lengths, + max_input_sequence_length, + ) = self._extract_sequence_lengths(batch_token_ids_augmented) + + # Validate that all sequences can be processed based on their sequence lengths and + # the maximum sequence length the model can handle + self._validate_sequence_lengths( + actual_sequence_lengths, batch_examples, attribute, inference_mode + ) + + # Add padding so that whole batch can be fed to the model + padded_token_ids = self._add_padding_to_batch( + batch_token_ids_augmented, max_input_sequence_length + ) + + # Compute attention mask based on actual_sequence_length + batch_attention_mask = self._compute_attention_mask( + actual_sequence_lengths, max_input_sequence_length + ) + + # Get token level features from the model + sequence_hidden_states = self._compute_batch_sequence_features( + batch_attention_mask, padded_token_ids + ) + + # Extract features for only non-padding tokens + sequence_nonpadded_embeddings = self._extract_nonpadded_embeddings( + sequence_hidden_states, actual_sequence_lengths + ) + + # Extract sentence level and post-processed features + ( + sentence_embeddings, + sequence_embeddings, + ) = self._post_process_sequence_embeddings(sequence_nonpadded_embeddings) + + # Pad zeros for examples which were truncated in inference mode. + # This is intentionally done after sentence embeddings have been extracted so that they are not affected + sequence_embeddings = self._add_extra_padding( + sequence_embeddings, actual_sequence_lengths + ) + + # shape of matrix for all sequence embeddings + batch_dim = len(sequence_embeddings) + seq_dim = max(e.shape[0] for e in sequence_embeddings) + feature_dim = sequence_embeddings[0].shape[1] + shape = (batch_dim, seq_dim, feature_dim) + + # align features with tokens so that we have just one vector per token + # (don't include sub-tokens) + sequence_embeddings = train_utils.align_token_features( + batch_tokens, sequence_embeddings, shape + ) + + # sequence_embeddings is a padded numpy array + # remove the padding, keep just the non-zero vectors + sequence_final_embeddings = [] + for embeddings, tokens in zip(sequence_embeddings, batch_tokens): + sequence_final_embeddings.append(embeddings[: len(tokens)]) + sequence_final_embeddings = np.array(sequence_final_embeddings) + + return sentence_embeddings, sequence_final_embeddings + + def _get_docs_for_batch( + self, + batch_examples: List[Message], + attribute: Text, + inference_mode: bool = False, + ) -> List[Dict[Text, Any]]: + """Compute language model docs for all examples in the batch. + + Args: + batch_examples: Batch of message objects for which language model docs + need to be computed. + attribute: Property of message to be processed, one of ``TEXT`` or + ``RESPONSE``. + inference_mode: Whether the call is during inference or during training. + + + Returns: + List of language model docs for each message in batch. 
+ """ + + batch_tokens, batch_token_ids = self._get_token_ids_for_batch( + batch_examples, attribute + ) + + ( + batch_sentence_features, + batch_sequence_features, + ) = self._get_model_features_for_batch( + batch_token_ids, batch_tokens, batch_examples, attribute, inference_mode + ) + + # A doc consists of + # {'token_ids': ..., 'tokens': ..., 'sequence_features': ..., + # 'sentence_features': ...} + batch_docs = [] + for index in range(len(batch_examples)): + doc = { + TOKEN_IDS: batch_token_ids[index], + TOKENS: batch_tokens[index], + SEQUENCE_FEATURES: batch_sequence_features[index], + SENTENCE_FEATURES: np.reshape(batch_sentence_features[index], (1, -1)), + } + batch_docs.append(doc) + + return batch_docs def train( self, @@ -35,31 +681,75 @@ def train( config: Optional[RasaNLUModelConfig] = None, **kwargs: Any, ) -> None: + """Compute tokens and dense features for each message in training data. - for example in training_data.training_examples: - for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: - self._set_lm_features(example, attribute) + Args: + training_data: NLU training data to be tokenized and featurized + config: NLU pipeline config consisting of all components. - def _get_doc(self, message: Message, attribute: Text) -> Any: - """ - Get the language model doc. A doc consists of - {'token_ids': ..., 'tokens': ..., - 'sequence_features': ..., 'sentence_features': ...} """ - return message.get(LANGUAGE_MODEL_DOCS[attribute]) - def process(self, message: Message, **kwargs: Any) -> None: - """Sets the dense features from the language model doc to the incoming - message.""" + batch_size = 64 + for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: - self._set_lm_features(message, attribute) - def _set_lm_features(self, message: Message, attribute: Text = TEXT) -> None: + non_empty_examples = list( + filter(lambda x: x.get(attribute), training_data.training_examples) + ) + + batch_start_index = 0 + + while batch_start_index < len(non_empty_examples): + + batch_end_index = min( + batch_start_index + batch_size, len(non_empty_examples) + ) + # Collect batch examples + batch_messages = non_empty_examples[batch_start_index:batch_end_index] + + # Construct a doc with relevant features + # extracted(tokens, dense_features) + batch_docs = self._get_docs_for_batch(batch_messages, attribute) + + for index, ex in enumerate(batch_messages): + self._set_lm_features(batch_docs[index], ex, attribute) + batch_start_index += batch_size + + def process(self, message: Message, **kwargs: Any) -> None: + """Process an incoming message by computing its tokens and dense features. + + Args: + message: Incoming message object + """ + + # process of all featurizers operates only on TEXT and ACTION_TEXT attributes, + # because all other attributes are labels which are featurized during training + # and their features are stored by the model itself. 
+ for attribute in {TEXT, ACTION_TEXT}: + if message.get(attribute): + self._set_lm_features( + self._get_docs_for_batch( + [message], attribute=attribute, inference_mode=True + )[0], + message, + attribute, + ) + + def _set_lm_features(self, doc, message: Message, attribute: Text = TEXT) -> None: """Adds the precomputed word vectors to the messages features.""" - doc = self._get_doc(message, attribute) - if doc is None: - return + hf_transformers_doc = message.get(LANGUAGE_MODEL_DOCS[attribute]) + if hf_transformers_doc: + # This should only be the case if the deprecated + # HFTransformersNLP component is used in the pipeline + # TODO: remove this when HFTransformersNLP is removed for good + logging.debug( + f"{LANGUAGE_MODEL_DOCS[attribute]} set: this " + f"indicates you're using the deprecated component " + f"HFTransformersNLP, please remove it from your " + f"pipeline." + ) + doc = hf_transformers_doc sequence_features = doc[SEQUENCE_FEATURES] sentence_features = doc[SENTENCE_FEATURES] diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py index 66cb41f4ee3c..02f758d82a29 100644 --- a/rasa/nlu/tokenizers/lm_tokenizer.py +++ b/rasa/nlu/tokenizers/lm_tokenizer.py @@ -2,7 +2,6 @@ import rasa.shared.utils.io from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer -from rasa.shared.constants import DOCS_URL_MIGRATION_GUIDE class LanguageModelTokenizer(WhitespaceTokenizer): @@ -20,5 +19,4 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: f"will be removed in the future. " f"It is recommended to use the '{WhitespaceTokenizer.__name__}' instead.", category=DeprecationWarning, - docs=DOCS_URL_MIGRATION_GUIDE, ) diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py index 179022da57c8..0024e5efead7 100644 --- a/rasa/nlu/utils/hugging_face/hf_transformers.py +++ b/rasa/nlu/utils/hugging_face/hf_transformers.py @@ -1,13 +1,16 @@ import logging -from typing import Any, Dict, List, Text, Tuple, Optional, Type +from typing import Any, Dict, List, Text, Tuple, Optional from rasa.core.utils import get_dict_hash from rasa.nlu.model import Metadata +from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer +from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.shared.nlu.training_data.message import Message -from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer +from rasa.nlu.tokenizers.tokenizer import Token +import rasa.shared.utils.io import rasa.utils.train_utils as train_utils import numpy as np @@ -16,7 +19,6 @@ DENSE_FEATURIZABLE_ATTRIBUTES, TOKEN_IDS, TOKENS, - TOKENS_NAMES, SENTENCE_FEATURES, SEQUENCE_FEATURES, NUMBER_OF_SUB_TOKENS, @@ -43,6 +45,8 @@ class HFTransformersNLP(Component): is used to load pre-trained language models like BERT, GPT-2, etc. The component also tokenizes and featurizes dense featurizable attributes of each message. + + This Component is deprecated; use the LanguageModelFeaturizer in its place. 
""" defaults = { @@ -55,10 +59,6 @@ class HFTransformersNLP(Component): "cache_dir": None, } - @classmethod - def required_components(cls) -> List[Type[Component]]: - return [Tokenizer] - def __init__( self, component_config: Optional[Dict[Text, Any]] = None, @@ -68,6 +68,14 @@ def __init__( self._load_model_metadata() self._load_model_instance(skip_model_load) + self.whitespace_tokenizer = WhitespaceTokenizer() + rasa.shared.utils.io.raise_warning( + f"'{self.__class__.__name__}' is deprecated and " + f"will be removed in the future. " + f"It is recommended to use the '{LanguageModelFeaturizer.__name__}'" + f"instead.", + category=DeprecationWarning, + ) def _load_model_metadata(self) -> None: @@ -244,7 +252,7 @@ def _tokenize_example( Many language models add a special char in front of (some) words and split words into sub-words. To ensure the entity start and end values matches the - token values, use the tokens produced by the Tokenizer component. If + token values, tokenize the text first using the whitespace tokenizer. If individual tokens are split up into multiple tokens, we add this information to the respected token. @@ -258,7 +266,8 @@ def _tokenize_example( message. """ - tokens_in = message.get(TOKENS_NAMES[attribute]) + tokens_in = self.whitespace_tokenizer.tokenize(message, attribute) + tokens_out = [] token_ids_out = [] diff --git a/tests/nlu/featurizers/test_lm_featurizer.py b/tests/nlu/featurizers/test_lm_featurizer.py index bb87f8f90a79..928d9e922548 100644 --- a/tests/nlu/featurizers/test_lm_featurizer.py +++ b/tests/nlu/featurizers/test_lm_featurizer.py @@ -1,6 +1,17 @@ +from typing import Text, List + import numpy as np import pytest +import logging +from rasa.nlu.constants import ( + TOKENS_NAMES, + NUMBER_OF_SUB_TOKENS, + SEQUENCE_FEATURES, + SENTENCE_FEATURES, +) +from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer +from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.shared.nlu.training_data.message import Message from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer @@ -173,17 +184,15 @@ def test_lm_featurizer_shape_values( model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec ): - transformers_config = {"model_name": model_name} + config = {"model_name": model_name} - transformers_nlp = HFTransformersNLP(transformers_config) - lm_featurizer = LanguageModelFeaturizer() + lm_featurizer = LanguageModelFeaturizer(config) messages = [] for text in texts: messages.append(Message.build(text=text)) td = TrainingData(messages) - transformers_nlp.train(td) lm_featurizer.train(td) for index in range(len(texts)): @@ -223,3 +232,551 @@ def test_lm_featurizer_shape_values( assert intent_sequence_vec is None assert intent_sentence_vec is None + + +@pytest.mark.parametrize( + "input_sequence_length, model_name, should_overflow", + [(20, "bert", False), (1000, "bert", True), (1000, "xlnet", False)], +) +def test_sequence_length_overflow_train( + input_sequence_length: int, model_name: Text, should_overflow: bool +): + component = LanguageModelFeaturizer( + {"model_name": model_name}, skip_model_load=True + ) + message = Message.build(text=" ".join(["hi"] * input_sequence_length)) + if should_overflow: + with pytest.raises(RuntimeError): + component._validate_sequence_lengths( + [input_sequence_length], [message], "text", inference_mode=False + ) + else: + component._validate_sequence_lengths( + 
[input_sequence_length], [message], "text", inference_mode=False + ) + + +@pytest.mark.parametrize( + "sequence_embeddings, actual_sequence_lengths, model_name, padding_needed", + [ + (np.ones((1, 512, 5)), [1000], "bert", True), + (np.ones((1, 512, 5)), [1000], "xlnet", False), + (np.ones((1, 256, 5)), [256], "bert", False), + ], +) +def test_long_sequences_extra_padding( + sequence_embeddings: np.ndarray, + actual_sequence_lengths: List[int], + model_name: Text, + padding_needed: bool, +): + component = LanguageModelFeaturizer( + {"model_name": model_name}, skip_model_load=True + ) + modified_sequence_embeddings = component._add_extra_padding( + sequence_embeddings, actual_sequence_lengths + ) + if not padding_needed: + assert np.all(modified_sequence_embeddings) == np.all(sequence_embeddings) + else: + assert modified_sequence_embeddings.shape[1] == actual_sequence_lengths[0] + assert ( + modified_sequence_embeddings[0].shape[-1] + == sequence_embeddings[0].shape[-1] + ) + zero_embeddings = modified_sequence_embeddings[0][ + sequence_embeddings.shape[1] : + ] + assert np.all(zero_embeddings == 0) + + +@pytest.mark.parametrize( + "token_ids, max_sequence_length_model, resulting_length, padding_added", + [ + ([[1] * 200], 512, 512, True), + ([[1] * 700], 512, 512, False), + ([[1] * 200], 200, 200, False), + ], +) +def test_input_padding( + token_ids: List[List[int]], + max_sequence_length_model: int, + resulting_length: int, + padding_added: bool, +): + component = LanguageModelFeaturizer({"model_name": "bert"}, skip_model_load=True) + component.pad_token_id = 0 + padded_input = component._add_padding_to_batch(token_ids, max_sequence_length_model) + assert len(padded_input[0]) == resulting_length + if padding_added: + original_length = len(token_ids[0]) + assert np.all(np.array(padded_input[0][original_length:]) == 0) + + +@pytest.mark.parametrize( + "sequence_length, model_name, should_overflow", + [(1000, "bert", True), (256, "bert", False)], +) +@pytest.mark.skip_on_windows +def test_log_longer_sequence( + sequence_length: int, model_name: Text, should_overflow: bool, caplog +): + config = {"model_name": model_name} + + featurizer = LanguageModelFeaturizer(config) + + text = " ".join(["hi"] * sequence_length) + tokenizer = WhitespaceTokenizer() + message = Message.build(text=text) + td = TrainingData([message]) + tokenizer.train(td) + caplog.set_level(logging.DEBUG) + featurizer.process(message) + if should_overflow: + assert "hi hi hi" in caplog.text + assert len(message.features) >= 2 + + +@pytest.mark.parametrize( + "actual_sequence_length, max_input_sequence_length, zero_start_index", + [(256, 512, 256), (700, 700, 700), (700, 512, 512)], +) +def test_attention_mask( + actual_sequence_length: int, max_input_sequence_length: int, zero_start_index: int +): + component = LanguageModelFeaturizer({"model_name": "bert"}, skip_model_load=True) + + attention_mask = component._compute_attention_mask( + [actual_sequence_length], max_input_sequence_length + ) + mask_ones = attention_mask[0][:zero_start_index] + mask_zeros = attention_mask[0][zero_start_index:] + + assert np.all(mask_ones == 1) + assert np.all(mask_zeros == 0) + + +# TODO: need to fix this failing test +@pytest.mark.skip(reason="Results in random crashing of github action workers") +@pytest.mark.parametrize( + "model_name, model_weights, texts, expected_tokens, expected_indices, expected_num_token_ids", + [ + ( + "bert", + None, + [ + "Good evening.", + "you're", + "r. n. 
b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["good", "evening"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + [ + "here", + "is", + "the", + "sentence", + "i", + "want", + "em", + "bed", + "ding", + "s", + "for", + ], + ], + [ + [(0, 4), (5, 12)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 20), + (21, 22), + (23, 27), + (28, 30), + (30, 33), + (33, 37), + (37, 38), + (39, 42), + ], + ], + [4, 4, 5, 5, 13], + ), + ( + "bert", + "bert-base-chinese", + [ + "晚上好", # normal & easy case + "没问题!", # `!` is a Chinese punctuation + "去东畈村", # `畈` is a OOV token for bert-base-chinese + "好的😃", # include a emoji which is common in Chinese text-based chat + ], + [ + ["晚", "上", "好"], + ["没", "问", "题", "!"], + ["去", "东", "畈", "村"], + ["好", "的", "😃"], + ], + [ + [(0, 1), (1, 2), (2, 3)], + [(0, 1), (1, 2), (2, 3), (3, 4)], + [(0, 1), (1, 2), (2, 3), (3, 4)], + [(0, 1), (1, 2), (2, 3)], + ], + [3, 4, 4, 3], + ), + ( + "gpt", + None, + [ + "Good evening.", + "hello", + "you're", + "r. n. b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["good", "evening"], + ["hello"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + ["here", "is", "the", "sentence", "i", "want", "embe", "ddings", "for"], + ], + [ + [(0, 4), (5, 12)], + [(0, 5)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 20), + (21, 22), + (23, 27), + (28, 32), + (32, 38), + (39, 42), + ], + ], + [2, 1, 2, 3, 3, 9], + ), + ( + "gpt2", + None, + [ + "Good evening.", + "hello", + "you're", + "r. n. b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["Good", "even", "ing"], + ["hello"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + [ + "here", + "is", + "the", + "sent", + "ence", + "I", + "want", + "embed", + "d", + "ings", + "for", + ], + ], + [ + [(0, 4), (5, 9), (9, 12)], + [(0, 5)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 16), + (16, 20), + (21, 22), + (23, 27), + (28, 33), + (33, 34), + (34, 38), + (39, 42), + ], + ], + [3, 1, 2, 3, 3, 11], + ), + ( + "xlnet", + None, + [ + "Good evening.", + "hello", + "you're", + "r. n. b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["Good", "evening"], + ["hello"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + [ + "here", + "is", + "the", + "sentence", + "I", + "want", + "embed", + "ding", + "s", + "for", + ], + ], + [ + [(0, 4), (5, 12)], + [(0, 5)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 20), + (21, 22), + (23, 27), + (28, 33), + (33, 37), + (37, 38), + (39, 42), + ], + ], + [4, 3, 4, 5, 5, 12], + ), + ( + "distilbert", + None, + [ + "Good evening.", + "you're", + "r. n. 
b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["good", "evening"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + [ + "here", + "is", + "the", + "sentence", + "i", + "want", + "em", + "bed", + "ding", + "s", + "for", + ], + ], + [ + [(0, 4), (5, 12)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 20), + (21, 22), + (23, 27), + (28, 30), + (30, 33), + (33, 37), + (37, 38), + (39, 42), + ], + ], + [4, 4, 5, 5, 13], + ), + ( + "roberta", + None, + [ + "Good evening.", + "hello", + "you're", + "r. n. b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["Good", "even", "ing"], + ["hello"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + [ + "here", + "is", + "the", + "sent", + "ence", + "I", + "want", + "embed", + "d", + "ings", + "for", + ], + ], + [ + [(0, 4), (5, 9), (9, 12)], + [(0, 5)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 16), + (16, 20), + (21, 22), + (23, 27), + (28, 33), + (33, 34), + (34, 38), + (39, 42), + ], + ], + [5, 3, 4, 5, 5, 13], + ), + ], +) +@pytest.mark.skip_on_windows +def test_lm_featurizer_edge_cases( + model_name, + model_weights, + texts, + expected_tokens, + expected_indices, + expected_num_token_ids, +): + + if model_weights is None: + model_weights_config = {} + else: + model_weights_config = {"model_weights": model_weights} + transformers_config = {**{"model_name": model_name}, **model_weights_config} + + lm_featurizer = LanguageModelFeaturizer(transformers_config) + whitespace_tokenizer = WhitespaceTokenizer() + + for text, gt_tokens, gt_indices, gt_num_indices in zip( + texts, expected_tokens, expected_indices, expected_num_token_ids + ): + + message = Message.build(text=text) + tokens = whitespace_tokenizer.tokenize(message, TEXT) + message.set(TOKENS_NAMES[TEXT], tokens) + lm_featurizer.process(message) + + assert [t.text for t in tokens] == gt_tokens + assert [t.start for t in tokens] == [i[0] for i in gt_indices] + assert [t.end for t in tokens] == [i[1] for i in gt_indices] + + +@pytest.mark.parametrize( + "text, expected_number_of_sub_tokens", + [("sentence embeddings", [1, 4]), ("this is a test", [1, 1, 1, 1])], +) +@pytest.mark.skip_on_windows +def test_hf_transformers_number_of_sub_tokens(text, expected_number_of_sub_tokens): + config = {"model_name": "bert"} # Test for one should be enough + + lm_featurizer = LanguageModelFeaturizer(config) + whitespace_tokenizer = WhitespaceTokenizer() + + message = Message.build(text=text) + + td = TrainingData([message]) + whitespace_tokenizer.train(td) + lm_featurizer.train(td) + + assert [ + t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT]) + ] == expected_number_of_sub_tokens + + +@pytest.mark.parametrize( + "text", + [("hi there")], +) +@pytest.mark.skip_on_windows +def test_log_deprecation_warning_with_old_config(text, caplog): + message = Message.build(text) + + transformers_nlp = HFTransformersNLP() + transformers_nlp.process(message) + + caplog.set_level(logging.DEBUG) + lm_tokenizer = LanguageModelTokenizer() + lm_tokenizer.process(message) + lm_featurizer = LanguageModelFeaturizer() + lm_featurizer.process(message) + + assert "deprecated component HFTransformersNLP" in caplog.text + + +@pytest.mark.skip_on_windows +def test_preserve_sentence_and_sequence_features_old_config(): + attribute = "text" + message = Message.build("hi there") + + 
transformers_nlp = HFTransformersNLP({"model_name": "bert"}) + transformers_nlp.process(message) + lm_tokenizer = LanguageModelTokenizer() + lm_tokenizer.process(message) + + lm_featurizer = LanguageModelFeaturizer({"model_name": "gpt2"}) + lm_featurizer.process(message) + + lm_docs = lm_featurizer._get_docs_for_batch( + [message], attribute=attribute, inference_mode=True + )[0] + hf_docs = transformers_nlp._get_docs_for_batch( + [message], attribute=attribute, inference_mode=True + )[0] + assert not (message.features[0].features == lm_docs[SEQUENCE_FEATURES]).any() + assert not (message.features[1].features == lm_docs[SENTENCE_FEATURES]).any() + assert (message.features[0].features == hf_docs[SEQUENCE_FEATURES]).all() + assert (message.features[1].features == hf_docs[SENTENCE_FEATURES]).all() diff --git a/tests/nlu/utils/test_hf_transformers.py b/tests/nlu/utils/test_hf_transformers.py index 22957e52d3e7..70aa7f9ab19b 100644 --- a/tests/nlu/utils/test_hf_transformers.py +++ b/tests/nlu/utils/test_hf_transformers.py @@ -6,14 +6,8 @@ from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP from rasa.shared.nlu.training_data.message import Message from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer -from rasa.shared.nlu.training_data.training_data import TrainingData -from rasa.nlu.constants import ( - TOKENS_NAMES, - NUMBER_OF_SUB_TOKENS, - LANGUAGE_MODEL_DOCS, - TOKEN_IDS, -) -from rasa.shared.nlu.constants import TEXT, INTENT +from rasa.nlu.constants import TOKENS_NAMES +from rasa.shared.nlu.constants import TEXT @pytest.mark.parametrize( @@ -485,58 +479,7 @@ def test_hf_transformers_edge_cases( tokens = whitespace_tokenizer.tokenize(message, TEXT) message.set(TOKENS_NAMES[TEXT], tokens) transformers_nlp.process(message) - token_ids = message.get(LANGUAGE_MODEL_DOCS[TEXT])[TOKEN_IDS] assert [t.text for t in tokens] == gt_tokens assert [t.start for t in tokens] == [i[0] for i in gt_indices] assert [t.end for t in tokens] == [i[1] for i in gt_indices] - assert len(token_ids) == gt_num_indices - - -@pytest.mark.parametrize( - "text, expected_tokens", - [ - ("Forecast_for_LUNCH", ["Forecast_for_LUNCH"]), - ("Forecast for LUNCH", ["Forecast for LUNCH"]), - ("Forecast+for+LUNCH", ["Forecast", "for", "LUNCH"]), - ], -) -@pytest.mark.skip_on_windows -def test_hf_transformers_custom_intent_symbol(text, expected_tokens): - - transformers_config = {"model_name": "bert"} # Test for one should be enough - - transformers_nlp = HFTransformersNLP(transformers_config) - whitespace_tokenizer = WhitespaceTokenizer() - - message = Message.build(text=text) - message.set(INTENT, text) - - td = TrainingData([message]) - - whitespace_tokenizer.train(td) - transformers_nlp.train(td) - - assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens - - -@pytest.mark.parametrize( - "text, expected_number_of_sub_tokens", - [("sentence embeddings", [1, 4]), ("this is a test", [1, 1, 1, 1])], -) -@pytest.mark.skip_on_windows -def test_hf_transformers_number_of_sub_tokens(text, expected_number_of_sub_tokens): - transformers_config = {"model_name": "bert"} # Test for one should be enough - - transformers_nlp = HFTransformersNLP(transformers_config) - whitespace_tokenizer = WhitespaceTokenizer() - - message = Message.build(text=text) - - td = TrainingData([message]) - whitespace_tokenizer.train(td) - transformers_nlp.train(td) - - assert [ - t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT]) - ] == expected_number_of_sub_tokens From 
17058138b99bf7ea69353fa4f8a807392579a409 Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Sun, 25 Oct 2020 20:07:10 +0100 Subject: [PATCH 09/31] Adjusted warning, removed incorrect reference to MIGRATION_DOCS --- rasa/nlu/tokenizers/convert_tokenizer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py index 627b7d714a44..d0c2f8efd148 100644 --- a/rasa/nlu/tokenizers/convert_tokenizer.py +++ b/rasa/nlu/tokenizers/convert_tokenizer.py @@ -2,7 +2,6 @@ import rasa.shared.utils.io from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer -from rasa.shared.constants import DOCS_URL_MIGRATION_GUIDE class ConveRTTokenizer(WhitespaceTokenizer): @@ -20,5 +19,4 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: f"will be removed in the future. " f"It is recommended to use the '{WhitespaceTokenizer.__name__}' instead.", category=DeprecationWarning, - docs=DOCS_URL_MIGRATION_GUIDE, ) From 2ec801a5c9d12a2008f9cbb4b203116f1b2fa6e7 Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Sun, 25 Oct 2020 20:08:15 +0100 Subject: [PATCH 10/31] Updated docs to reflect deprecation of HFTransformersNLP --- docs/docs/components.mdx | 149 +++++++++++++++------------------------ 1 file changed, 58 insertions(+), 91 deletions(-) diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index 8bfa1c19875c..e357903fba03 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -137,85 +137,6 @@ word vectors in your pipeline. by passing in your language identifier as the `language` option. -### HFTransformersNLP - - -* **Short** - - HuggingFace's Transformers based pre-trained language model initializer - - - -* **Outputs** - - Nothing - - - -* **Requires** - - `tokens` - - - -* **Description** - - Initializes specified pre-trained language model from HuggingFace's [Transformers library](https://huggingface.co/transformers/). The component applies language model specific tokenization and - featurization to compute sequence and sentence level representations for each example in the training data. - Include a [tokenizer](./components.mdx#tokenizers) and [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) to utilize the output of this - component for downstream NLU models. - - :::note - To use `HFTransformersNLP` component, install Rasa Open Source with `pip3 install rasa[transformers]`. - - ::: - - - -* **Configuration** - - You should specify what language model to load via the parameter `model_name`. See the below table for the - available language models. - Additionally, you can also specify the architecture variation of the chosen language model by specifying the - parameter `model_weights`. - The full list of supported architectures can be found in the - [HuggingFace documentation](https://huggingface.co/transformers/pretrained_models.html). - If left empty, it uses the default model architecture that original Transformers library loads (see table below). 
- - ``` - +----------------+--------------+-------------------------+ - | Language Model | Parameter | Default value for | - | | "model_name" | "model_weights" | - +----------------+--------------+-------------------------+ - | BERT | bert | bert-base-uncased | - +----------------+--------------+-------------------------+ - | GPT | gpt | openai-gpt | - +----------------+--------------+-------------------------+ - | GPT-2 | gpt2 | gpt2 | - +----------------+--------------+-------------------------+ - | XLNet | xlnet | xlnet-base-cased | - +----------------+--------------+-------------------------+ - | DistilBERT | distilbert | distilbert-base-uncased | - +----------------+--------------+-------------------------+ - | RoBERTa | roberta | roberta-base | - +----------------+--------------+-------------------------+ - ``` - - The following configuration loads the language model BERT: - - ```yaml-rasa - pipeline: - - name: HFTransformersNLP - # Name of the language model to use - model_name: "bert" - # Pre-Trained weights to be loaded - model_weights: "bert-base-uncased" - - # An optional path to a specific directory to download and cache the pre-trained model weights. - # The `default` cache_dir is the same as https://huggingface.co/transformers/serialization.html#cache-directory . - cache_dir: null - ``` - ## Tokenizers @@ -568,7 +489,7 @@ Note: The `feature-dimension` for sequence and sentence features does not have t ::: :::note - To use `ConveRTTokenizer`, install Rasa Open Source with `pip3 install rasa[convert]`. + To use `ConveRTFeaturizer`, install Rasa Open Source with `pip3 install rasa[convert]`. ::: @@ -589,7 +510,15 @@ Note: The `feature-dimension` for sequence and sentence features does not have t Creates a vector representation of user message and response (if specified) using a pre-trained language model. +* **Description** + Initializes specified pre-trained language model from HuggingFace's [Transformers library](https://huggingface.co/transformers/). + The component applies language model specific tokenization and featurization to compute sequence and sentence level + representations for each example in the training data. + :::note + To use `LanguageModelFeaturizer` component, install Rasa Open Source with `pip3 install rasa[transformers]`. + + ::: * **Outputs** @@ -599,7 +528,7 @@ Note: The `feature-dimension` for sequence and sentence features does not have t * **Requires** - `tokens` and [HFTransformersNLP](./components.mdx#hftransformersnlp). + `tokens`. @@ -612,8 +541,7 @@ Note: The `feature-dimension` for sequence and sentence features does not have t * **Description** Creates features for entity extraction, intent classification, and response selection. - Uses the pre-trained language model specified in upstream [HFTransformersNLP](./components.mdx#hftransformersnlp) component to compute vector - representations of input text. + Uses the pre-trained language model to compute vector representations of input text. :::note Please make sure that you use a language model which is pre-trained on the same language corpus as that of your @@ -625,14 +553,49 @@ Note: The `feature-dimension` for sequence and sentence features does not have t * **Configuration** - Include [HFTransformersNLP](./components.mdx#hftransformersnlp) and a [Tokenizer](./components.mdx#tokenizers) components before this component. Use - [HFTransformersNLP](./components.mdx#hftransformersnlp) to ensure tokens are correctly set for all components throughout the pipeline. 
+ Include a [Tokenizer](./components.mdx#tokenizers) component before this component. + + You should specify what language model to load via the parameter `model_name`. See the below table for the + available language models. + Additionally, you can also specify the architecture variation of the chosen language model by specifying the + parameter `model_weights`. + The full list of supported architectures can be found in the + [HuggingFace documentation](https://huggingface.co/transformers/pretrained_models.html). + If left empty, it uses the default model architecture that original Transformers library loads (see table below). + + ``` + +----------------+--------------+-------------------------+ + | Language Model | Parameter | Default value for | + | | "model_name" | "model_weights" | + +----------------+--------------+-------------------------+ + | BERT | bert | bert-base-uncased | + +----------------+--------------+-------------------------+ + | GPT | gpt | openai-gpt | + +----------------+--------------+-------------------------+ + | GPT-2 | gpt2 | gpt2 | + +----------------+--------------+-------------------------+ + | XLNet | xlnet | xlnet-base-cased | + +----------------+--------------+-------------------------+ + | DistilBERT | distilbert | distilbert-base-uncased | + +----------------+--------------+-------------------------+ + | RoBERTa | roberta | roberta-base | + +----------------+--------------+-------------------------+ + ``` + + The following configuration loads the language model BERT: ```yaml-rasa pipeline: - - name: "LanguageModelFeaturizer" - ``` + - name: LanguageModelFeaturizer + # Name of the language model to use + model_name: "bert" + # Pre-Trained weights to be loaded + model_weights: "bert-base-uncased" + # An optional path to a specific directory to download and cache the pre-trained model weights. + # The `default` cache_dir is the same as https://huggingface.co/transformers/serialization.html#cache-directory . + cache_dir: null + ``` ### RegexFeaturizer @@ -2651,12 +2614,16 @@ The sentence features are represented by a matrix of size `(1 x feature-dimensio ## Deprecated Components :::caution Deprecated -The `LanguageModelTokenizer` is deprecated. The [HFTransformersNLP](./components.mdx#hftransformersnlp) now implements -its behaviour. Any [tokenizer](./components.mdx#tokenizers) can be used in its place; this must be placed before the -[HFTransformersNLP](./components.mdx#hftransformersnlp) Component. +The `LanguageModelTokenizer` is deprecated. The [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) now implements +its behaviour. +::: + +:::caution Deprecated +The `HFTransformersNLP` is deprecated. The [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) now implements +its behaviour. ::: :::caution Deprecated -The `ConveRTokenizer` is deprecated. The [ConveRTFeaturizer](./components.mdx#convertfeaturizer) now implements +The `ConveRTTokenizer` is deprecated. The [ConveRTFeaturizer](./components.mdx#convertfeaturizer) now implements its behaviour. Any [tokenizer](./components.mdx#tokenizers) can be used in its place. 
::: \ No newline at end of file From ef850542d98fd647e779091973cc374856978933 Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Mon, 26 Oct 2020 08:16:48 +0100 Subject: [PATCH 11/31] Adjusted the docstrings a little --- .../dense_featurizer/lm_featurizer.py | 8 +++++--- rasa/nlu/tokenizers/convert_tokenizer.py | 4 +++- rasa/nlu/tokenizers/lm_tokenizer.py | 7 ++++--- rasa/nlu/utils/hugging_face/hf_transformers.py | 17 +++++++---------- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index 79e7ff8b6435..200bcbd9afc5 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -45,10 +45,12 @@ class LanguageModelFeaturizer(DenseFeaturizer): - """Featurizer using transformer based language models. + """Featurizer using transformer-based language models. - Uses the output of HFTransformersNLP component to set the sequence and sentence - level representations for dense featurizable attributes of each message object. + The transformers(https://github.com/huggingface/transformers) library + is used to load pre-trained language models like BERT, GPT-2, etc. + The component also tokenizes and featurizes dense featurizable attributes of each + message. """ defaults = { diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py index d0c2f8efd148..9eeb8290515c 100644 --- a/rasa/nlu/tokenizers/convert_tokenizer.py +++ b/rasa/nlu/tokenizers/convert_tokenizer.py @@ -1,6 +1,7 @@ from typing import Dict, Text, Any import rasa.shared.utils.io +from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer @@ -17,6 +18,7 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: rasa.shared.utils.io.raise_warning( f"'{self.__class__.__name__}' is deprecated and " f"will be removed in the future. " - f"It is recommended to use the '{WhitespaceTokenizer.__name__}' instead.", + f"It is recommended to use the '{WhitespaceTokenizer.__name__}' or " + f"another {Tokenizer.__name__} instead.", category=DeprecationWarning, ) diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py index 02f758d82a29..3c31d683008d 100644 --- a/rasa/nlu/tokenizers/lm_tokenizer.py +++ b/rasa/nlu/tokenizers/lm_tokenizer.py @@ -1,6 +1,7 @@ from typing import Dict, Text, Any import rasa.shared.utils.io +from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer @@ -8,8 +9,7 @@ class LanguageModelTokenizer(WhitespaceTokenizer): """ This tokenizer is deprecated and will be removed in the future. - The HFTransformersNLP component now sets the tokens - for dense featurizable attributes of each message object. + Use the LanguageModelFeaturizer with any other Tokenizer instead. """ def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -17,6 +17,7 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: rasa.shared.utils.io.raise_warning( f"'{self.__class__.__name__}' is deprecated and " f"will be removed in the future. 
" - f"It is recommended to use the '{WhitespaceTokenizer.__name__}' instead.", + f"It is recommended to use the '{WhitespaceTokenizer.__name__}' or " + f"another {Tokenizer.__name__} instead.", category=DeprecationWarning, ) diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py index 0024e5efead7..bd6daef2e06f 100644 --- a/rasa/nlu/utils/hugging_face/hf_transformers.py +++ b/rasa/nlu/utils/hugging_face/hf_transformers.py @@ -39,14 +39,10 @@ class HFTransformersNLP(Component): - """Utility Component for interfacing between Transformers library and Rasa OS. - - The transformers(https://github.com/huggingface/transformers) library - is used to load pre-trained language models like BERT, GPT-2, etc. - The component also tokenizes and featurizes dense featurizable attributes of each - message. + """ + This component is deprecated and will be removed in the future. - This Component is deprecated; use the LanguageModelFeaturizer in its place. + Use the LanguageModelFeaturizer instead. """ defaults = { @@ -72,7 +68,7 @@ def __init__( rasa.shared.utils.io.raise_warning( f"'{self.__class__.__name__}' is deprecated and " f"will be removed in the future. " - f"It is recommended to use the '{LanguageModelFeaturizer.__name__}'" + f"It is recommended to use the '{LanguageModelFeaturizer.__name__}' " f"instead.", category=DeprecationWarning, ) @@ -89,7 +85,7 @@ def _load_model_metadata(self) -> None: if self.model_name not in model_class_dict: raise KeyError( f"'{self.model_name}' not a valid model name. Choose from " - f"{str(list(model_class_dict.keys()))} or create" + f"{str(list(model_class_dict.keys()))} or create " f"a new class inheriting from this class to support your model." ) @@ -109,7 +105,8 @@ def _load_model_instance(self, skip_model_load: bool) -> None: """Try loading the model instance Args: - skip_model_load: Skip loading the model instances to save time. This should be True only for pytests + skip_model_load: Skip loading the model instances to save time. + This should be True only for pytests """ if skip_model_load: From 5abfe12b8fd45920e65d668691d2f78dd0075f9a Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Wed, 28 Oct 2020 10:11:15 +0100 Subject: [PATCH 12/31] Adjusted to reflect code review comments --- changelog/7027.improvement.md | 6 ++++ docs/docs/components.mdx | 6 ++-- rasa/nlu/constants.py | 3 -- .../dense_featurizer/lm_featurizer.py | 34 +++++++++---------- .../nlu/utils/hugging_face/hf_transformers.py | 4 --- tests/nlu/featurizers/test_lm_featurizer.py | 2 ++ 6 files changed, 27 insertions(+), 28 deletions(-) create mode 100644 changelog/7027.improvement.md diff --git a/changelog/7027.improvement.md b/changelog/7027.improvement.md new file mode 100644 index 000000000000..baaa4813790e --- /dev/null +++ b/changelog/7027.improvement.md @@ -0,0 +1,6 @@ +Remove dependency between `ConveRTTokenizer` and `ConveRTFeaturizer`. The `ConveRTTokenizer` is now deprecated, and the +`ConveRTFeaturizer` can be used with any other `Tokenizer`. + +Remove dependency between `HFTransformersNLP`, `LanguageModelTokenizer`, and `LanguageModelFeaturizer`. Both +`HFTransformersNLP` and `LanguageModelTokenizer` are now deprecated. `LanguageModelFeaturizer` implements the behavior +of the stack and can be used with any other `Tokenizer`. 
diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index e357903fba03..4cd10bf89ccc 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -2615,15 +2615,15 @@ The sentence features are represented by a matrix of size `(1 x feature-dimensio :::caution Deprecated The `LanguageModelTokenizer` is deprecated. The [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) now implements -its behaviour. +its behavior. ::: :::caution Deprecated The `HFTransformersNLP` is deprecated. The [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) now implements -its behaviour. +its behavior. ::: :::caution Deprecated The `ConveRTTokenizer` is deprecated. The [ConveRTFeaturizer](./components.mdx#convertfeaturizer) now implements -its behaviour. Any [tokenizer](./components.mdx#tokenizers) can be used in its place. +its behavior. Any [tokenizer](./components.mdx#tokenizers) can be used in its place. ::: \ No newline at end of file diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index 49e0978b075b..14297822acb3 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -63,9 +63,6 @@ rasa.shared.nlu.constants.INTENT_RESPONSE_KEY: "intent_response_key_tokens", } -TOKENS = "tokens" -TOKEN_IDS = "token_ids" - SEQUENCE_FEATURES = "sequence_features" SENTENCE_FEATURES = "sentence_features" diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index 200bcbd9afc5..14ecf1f344ba 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -19,8 +19,6 @@ FEATURIZER_CLASS_ALIAS, NO_LENGTH_RESTRICTION, NUMBER_OF_SUB_TOKENS, - TOKEN_IDS, - TOKENS, TOKENS_NAMES, LANGUAGE_MODEL_DOCS, ) @@ -651,6 +649,19 @@ def _get_docs_for_batch( List of language model docs for each message in batch. """ + hf_transformers_doc = batch_examples[0].get(LANGUAGE_MODEL_DOCS[attribute]) + if hf_transformers_doc: + # This should only be the case if the deprecated + # HFTransformersNLP component is used in the pipeline + # TODO: remove this when HFTransformersNLP is removed for good + logging.debug( + f"{LANGUAGE_MODEL_DOCS[attribute]} set: this " + f"indicates you're using the deprecated component " + f"HFTransformersNLP, please remove it from your " + f"pipeline." 
+ ) + return [ex.get(LANGUAGE_MODEL_DOCS[attribute]) for ex in batch_examples] + batch_tokens, batch_token_ids = self._get_token_ids_for_batch( batch_examples, attribute ) @@ -668,8 +679,6 @@ def _get_docs_for_batch( batch_docs = [] for index in range(len(batch_examples)): doc = { - TOKEN_IDS: batch_token_ids[index], - TOKENS: batch_tokens[index], SEQUENCE_FEATURES: batch_sequence_features[index], SENTENCE_FEATURES: np.reshape(batch_sentence_features[index], (1, -1)), } @@ -737,22 +746,11 @@ def process(self, message: Message, **kwargs: Any) -> None: attribute, ) - def _set_lm_features(self, doc, message: Message, attribute: Text = TEXT) -> None: + def _set_lm_features( + self, doc: Dict[Text, Any], message: Message, attribute: Text = TEXT + ) -> None: """Adds the precomputed word vectors to the messages features.""" - hf_transformers_doc = message.get(LANGUAGE_MODEL_DOCS[attribute]) - if hf_transformers_doc: - # This should only be the case if the deprecated - # HFTransformersNLP component is used in the pipeline - # TODO: remove this when HFTransformersNLP is removed for good - logging.debug( - f"{LANGUAGE_MODEL_DOCS[attribute]} set: this " - f"indicates you're using the deprecated component " - f"HFTransformersNLP, please remove it from your " - f"pipeline." - ) - doc = hf_transformers_doc - sequence_features = doc[SEQUENCE_FEATURES] sentence_features = doc[SENTENCE_FEATURES] diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py index bd6daef2e06f..d5db6e76eb0c 100644 --- a/rasa/nlu/utils/hugging_face/hf_transformers.py +++ b/rasa/nlu/utils/hugging_face/hf_transformers.py @@ -17,8 +17,6 @@ from rasa.nlu.constants import ( LANGUAGE_MODEL_DOCS, DENSE_FEATURIZABLE_ATTRIBUTES, - TOKEN_IDS, - TOKENS, SENTENCE_FEATURES, SEQUENCE_FEATURES, NUMBER_OF_SUB_TOKENS, @@ -666,8 +664,6 @@ def _get_docs_for_batch( batch_docs = [] for index in range(len(batch_examples)): doc = { - TOKEN_IDS: batch_token_ids[index], - TOKENS: batch_tokens[index], SEQUENCE_FEATURES: batch_sequence_features[index], SENTENCE_FEATURES: np.reshape(batch_sentence_features[index], (1, -1)), } diff --git a/tests/nlu/featurizers/test_lm_featurizer.py b/tests/nlu/featurizers/test_lm_featurizer.py index 928d9e922548..660b225e9ad2 100644 --- a/tests/nlu/featurizers/test_lm_featurizer.py +++ b/tests/nlu/featurizers/test_lm_featurizer.py @@ -9,6 +9,7 @@ NUMBER_OF_SUB_TOKENS, SEQUENCE_FEATURES, SENTENCE_FEATURES, + LANGUAGE_MODEL_DOCS, ) from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer @@ -770,6 +771,7 @@ def test_preserve_sentence_and_sequence_features_old_config(): lm_featurizer = LanguageModelFeaturizer({"model_name": "gpt2"}) lm_featurizer.process(message) + message.set(LANGUAGE_MODEL_DOCS[attribute], None) lm_docs = lm_featurizer._get_docs_for_batch( [message], attribute=attribute, inference_mode=True )[0] From 19d1c14963adaca562b74df3a523bb39dc40c30b Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Wed, 28 Oct 2020 12:49:03 +0100 Subject: [PATCH 13/31] Fixed some deepsource errors --- .../dense_featurizer/convert_featurizer.py | 4 -- .../dense_featurizer/lm_featurizer.py | 63 ++++++++++++------- .../nlu/utils/hugging_face/hf_transformers.py | 11 +++- 3 files changed, 52 insertions(+), 26 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 97a45ee6eb7a..f1fde8df037f 100644 --- 
a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -74,7 +74,6 @@ def __get_signature(signature: Text, module: Any) -> NoReturn: def _compute_features( self, batch_examples: List[Message], module: Any, attribute: Text = TEXT ) -> Tuple[np.ndarray, np.ndarray]: - sentence_encodings = self._compute_sentence_encodings( batch_examples, module, attribute ) @@ -227,7 +226,6 @@ def train( def process( self, message: Message, *, tf_hub_module: Any = None, **kwargs: Any ) -> None: - for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: if message.get(attribute): sequence_features, sentence_features = self._compute_features( @@ -286,7 +284,6 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]: are split up into multiple tokens, add this information to the respected tokens. """ - tokens_in = message.get(TOKENS_NAMES[attribute]) tokens_out = [] @@ -308,6 +305,5 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]: @staticmethod def _clean_tokens(tokens: List[bytes]) -> List[Text]: """Encode tokens and remove special char added by ConveRT.""" - tokens = [string.decode("utf-8").replace("﹏", "") for string in tokens] return [string for string in tokens if string] diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index 14ecf1f344ba..c998ff682103 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -76,7 +76,10 @@ def __init__( self._load_model_instance(skip_model_load) def _load_model_metadata(self) -> None: - + """Load the metadata for the specified model and sets these properties. + This includes the model name, model weights, cache directory and the + maximum sequence length the model can handle. + """ from rasa.nlu.utils.hugging_face.registry import ( model_class_dict, model_weights_defaults, @@ -96,8 +99,8 @@ def _load_model_metadata(self) -> None: if not self.model_weights: logger.info( - f"Model weights not specified. Will choose default model weights: " - f"{model_weights_defaults[self.model_name]}" + f"Model weights not specified. Will choose default model " + f"weights: {model_weights_defaults[self.model_name]}" ) self.model_weights = model_weights_defaults[self.model_name] @@ -107,9 +110,9 @@ def _load_model_instance(self, skip_model_load: bool) -> None: """Try loading the model instance Args: - skip_model_load: Skip loading the model instances to save time. This should be True only for pytests + skip_model_load: Skip loading the model instances to save time. 
This + should be True only for pytests """ - if skip_model_load: # This should be True only during pytests return @@ -140,7 +143,6 @@ def _load_model_instance(self, skip_model_load: bool) -> None: def cache_key( cls, component_meta: Dict[Text, Any], model_metadata: Metadata ) -> Optional[Text]: - weights = component_meta.get("model_weights") or {} return f"{cls.name}-{component_meta.get('model_name')}-{get_dict_hash(weights)}" @@ -273,9 +275,10 @@ def _tokenize_example( # use lm specific tokenizer to further tokenize the text split_token_ids, split_token_strings = self._lm_tokenize(token.text) - split_token_ids, split_token_strings = self._lm_specific_token_cleanup( - split_token_ids, split_token_strings - ) + ( + split_token_ids, + split_token_strings, + ) = self._lm_specific_token_cleanup(split_token_ids, split_token_strings) token_ids_out += split_token_ids @@ -351,7 +354,17 @@ def _compute_attention_mask( def _extract_sequence_lengths( self, batch_token_ids: List[List[int]] ) -> Tuple[List[int], int]: + """Extracts the sequence length for each example, as well as the maximum + sequence length across examples. + + Args: + batch_token_ids: List of token ids for each example in the batch. + Returns: + Tuple consisting of: the actual sequence lengths for each example, + and the maximum input sequence length (taking into account the + maximum sequence length that the model can handle). + """ # Compute max length across examples max_input_sequence_length = 0 actual_sequence_lengths = [] @@ -427,7 +440,9 @@ def _extract_nonpadded_embeddings( return np.array(nonpadded_sequence_embeddings) def _compute_batch_sequence_features( - self, batch_attention_mask: np.ndarray, padded_token_ids: List[List[int]] + self, + batch_attention_mask: np.ndarray, + padded_token_ids: List[List[int]], ) -> np.ndarray: """Feed the padded batch to the language model. @@ -441,7 +456,8 @@ Sequence level representations from the language model. """ model_outputs = self.model( - np.array(padded_token_ids), attention_mask=np.array(batch_attention_mask) + np.array(padded_token_ids), + attention_mask=np.array(batch_attention_mask), ) # sequence hidden states is always the first output from all models @@ -483,17 +499,18 @@ def _validate_sequence_lengths( f"shorten the message or use a model which has no " f"restriction on input sequence length like XLNet." ) - else: - logger.debug( - f"The sequence length of '{example.get(attribute)[:20]}...' " - f"is too long({sequence_length} tokens) for the " - f"model chosen {self.model_name} which has a maximum " - f"sequence length of {self.max_model_sequence_length} tokens. " - f"Downstream model predictions may be affected because of this." - ) + logger.debug( + f"The sequence length of '{example.get(attribute)[:20]}...' " + f"is too long ({sequence_length} tokens) for the " + f"model chosen {self.model_name} which has a maximum " + f"sequence length of {self.max_model_sequence_length} tokens. " + f"Downstream model predictions may be affected because of this." + ) def _add_extra_padding( - self, sequence_embeddings: np.ndarray, actual_sequence_lengths: List[int] + self, + sequence_embeddings: np.ndarray, + actual_sequence_lengths: List[int], ) -> np.ndarray: """ Add extra zero padding to match the original sequence length. This is only done if the input was truncated during the batch preparation of input for the model.
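The `_extract_sequence_lengths` docstring added above is easier to follow next to a standalone sketch of the behavior it documents. This is an illustration, not the patch's code; the free-function form and the sentinel value are assumptions (the component reads `NO_LENGTH_RESTRICTION` from its constants).

```python
from typing import List, Tuple

NO_LENGTH_RESTRICTION = -1  # assumed sentinel for models without an input limit


def extract_sequence_lengths(
    batch_token_ids: List[List[int]], max_model_sequence_length: int
) -> Tuple[List[int], int]:
    # the true (untruncated) length of every example in the batch
    actual_sequence_lengths = [len(token_ids) for token_ids in batch_token_ids]
    # the padded batch length is capped at what the model can handle
    max_input_sequence_length = max(actual_sequence_lengths)
    if max_model_sequence_length != NO_LENGTH_RESTRICTION:
        max_input_sequence_length = min(
            max_input_sequence_length, max_model_sequence_length
        )
    return actual_sequence_lengths, max_input_sequence_length
```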
@@ -670,7 +687,11 @@ def _get_docs_for_batch( batch_sentence_features, batch_sequence_features, ) = self._get_model_features_for_batch( - batch_token_ids, batch_tokens, batch_examples, attribute, inference_mode + batch_token_ids, + batch_tokens, + batch_examples, + attribute, + inference_mode, ) # A doc consists of diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py index d5db6e76eb0c..a8762088aa4e 100644 --- a/rasa/nlu/utils/hugging_face/hf_transformers.py +++ b/rasa/nlu/utils/hugging_face/hf_transformers.py @@ -137,7 +137,6 @@ def _load_model_instance(self, skip_model_load: bool) -> None: def cache_key( cls, component_meta: Dict[Text, Any], model_metadata: Metadata ) -> Optional[Text]: - weights = component_meta.get("model_weights") or {} return f"{cls.name}-{component_meta.get('model_name')}-{get_dict_hash(weights)}" @@ -349,7 +348,17 @@ def _compute_attention_mask( def _extract_sequence_lengths( self, batch_token_ids: List[List[int]] ) -> Tuple[List[int], int]: + """Extracts the sequence length for each example, as well as the maximum + sequence length across examples. + + Args: + batch_token_ids: List of token ids for each example in the batch. + Returns: + Tuple consisting of: the actual sequence lengths for each example, + and the maximum input sequence length (taking into account the + maximum sequence length that the model can handle). + """ # Compute max length across examples max_input_sequence_length = 0 actual_sequence_lengths = [] From 5557cfb4e175c29d3cb134f29f16c944a6a468a0 Mon Sep 17 00:00:00 2001 From: koernerfelicia <45405119+koernerfelicia@users.noreply.github.com> Date: Thu, 29 Oct 2020 14:33:15 +0100 Subject: [PATCH 14/31] Update docstring for rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py Co-authored-by: Tanja --- rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index f1fde8df037f..6dee2ed2db45 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -280,7 +280,7 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]: """Tokenize the text using the ConveRT model. ConveRT adds a special char in front of (some) words and splits words into sub-words. To ensure the entity start and end values matches the token values, - tokenize the text first using the whitespace tokenizer. If individual tokens + reuse the tokens that are already assigned to the message. If individual tokens are split up into multiple tokens, add this information to the respected tokens.
""" From d6468cb65974b6a047da5d62b69c4e4a686d6e41 Mon Sep 17 00:00:00 2001 From: koernerfelicia <45405119+koernerfelicia@users.noreply.github.com> Date: Thu, 29 Oct 2020 14:34:10 +0100 Subject: [PATCH 15/31] Update warning about use of deprecated HFTransformersNLP in LMFeaturizer Co-authored-by: Tanja --- rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index c998ff682103..545d0604fed9 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -672,7 +672,7 @@ def _get_docs_for_batch( # HFTransformersNLP component is used in the pipeline # TODO: remove this when HFTransformersNLP is removed for good logging.debug( - f"{LANGUAGE_MODEL_DOCS[attribute]} set: this " + f"'{LANGUAGE_MODEL_DOCS[attribute]}' set: this " f"indicates you're using the deprecated component " f"HFTransformersNLP, please remove it from your " f"pipeline." From 717c7cfeb19843dc7d5887967991885b34da4512 Mon Sep 17 00:00:00 2001 From: koernerfelicia <45405119+koernerfelicia@users.noreply.github.com> Date: Thu, 29 Oct 2020 14:34:32 +0100 Subject: [PATCH 16/31] Update docstring in LMFeaturizer Co-authored-by: Tanja --- rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index 545d0604fed9..9a4a96fd42cb 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -695,8 +695,7 @@ def _get_docs_for_batch( ) # A doc consists of - # {'token_ids': ..., 'tokens': ..., 'sequence_features': ..., - # 'sentence_features': ...} + # {'sequence_features': ..., 'sentence_features': ...} batch_docs = [] for index in range(len(batch_examples)): doc = { From b9ec75e975ff06072c03b536dac1df134f8abe1d Mon Sep 17 00:00:00 2001 From: Daksh Date: Fri, 30 Oct 2020 21:58:27 +0100 Subject: [PATCH 17/31] add changes to migration guide --- docs/docs/migration-guide.mdx | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/docs/docs/migration-guide.mdx b/docs/docs/migration-guide.mdx index f800317be063..25a52c497788 100644 --- a/docs/docs/migration-guide.mdx +++ b/docs/docs/migration-guide.mdx @@ -10,6 +10,34 @@ description: | This page contains information about changes between major versions and how you can migrate from one version to another. +## Rasa 2.0 to Rasa 2.1 + +### Deprecations + +`ConveRTTokenizer` is now deprecated. [ConveRTFeaturizer](./components.mdx#convertfeaturizer) now implements +its behaviour. To migrate, remove `ConveRTTokenizer` with any other tokenizer, for e.g.: + +```yaml +pipeline: + - name: WhitespaceTokenizer + - name: ConveRTFeaturizer + model_url: + ... +``` + +`HFTransformersNLP` and `LanguageModelTokenizer` components are now deprecated. +[LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) now implements their behaviour. +To migrate, remove both the above components with any tokenizer and specify the model architecture and model weights +as part of `LanguageModelFeaturizer`, for e.g.: + +```yaml +pipeline: + - name: WhitespaceTokenizer + - name: LanguageModelFeaturizer + model_name: "bert" + model_weights: "rasa/LaBSE" + ... 
+``` ## Rasa 1.10 to Rasa 2.0 From 04590afc3c4e1312d3bb9718c5e4d10e41644b96 Mon Sep 17 00:00:00 2001 From: Daksh Date: Mon, 2 Nov 2020 15:43:08 +0100 Subject: [PATCH 18/31] make linter happy --- .../dense_featurizer/convert_featurizer.py | 22 +++++- .../dense_featurizer/lm_featurizer.py | 67 +++++++++---------- rasa/nlu/tokenizers/convert_tokenizer.py | 5 ++ rasa/nlu/tokenizers/lm_tokenizer.py | 8 ++- .../nlu/utils/hugging_face/hf_transformers.py | 19 ++++-- 5 files changed, 77 insertions(+), 44 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 311b7cf835d3..fb7b84336ca0 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -63,14 +63,20 @@ class ConveRTFeaturizer(DenseFeaturizer): @classmethod def required_components(cls) -> List[Type[Component]]: + """Components that should be included in the pipeline before this component.""" return [Tokenizer] @classmethod def required_packages(cls) -> List[Text]: + """Packages needed to be installed.""" return ["tensorflow_text", "tensorflow_hub"] def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: + """Initializes ConveRTFeaturizer with the model and different encoding signatures. + Args: + component_config: Configuration for the component. + """ super(ConveRTFeaturizer, self).__init__(component_config) self.model_url = self._get_validated_model_url() @@ -178,7 +184,6 @@ def _get_validated_model_url(self) -> Text: @staticmethod def _get_signature(signature: Text, module: Any) -> NoReturn: """Retrieve a signature from a (hopefully loaded) TF model.""" - if not module: raise Exception( "ConveRTFeaturizer needs a proper loaded tensorflow module when used. " @@ -295,6 +300,13 @@ def train( config: Optional[RasaNLUModelConfig] = None, **kwargs: Any, ) -> None: + """Featurize all message attributes in the training data with the ConveRT model. + + Args: + training_data: Training data to be featurized + config: Pipeline configuration + **kwargs: Any other arguments. + """ if config is not None and config.language != "en": rasa.shared.utils.io.raise_warning( f"Since ``ConveRT`` model is trained only on an english " @@ -337,6 +349,12 @@ def train( ) def process(self, message: Message, **kwargs: Any) -> None: + """Featurize an incoming message with the ConveRT model. + + Args: + message: Message to be featurized + **kwargs: Any other arguments. + """ for attribute in {TEXT, ACTION_TEXT}: if message.get(attribute): sequence_features, sentence_features = self._compute_features( @@ -387,6 +405,7 @@ def cache_key( return f"{cls.name}-{get_dict_hash(_config)}" def provide_context(self) -> Dict[Text, Any]: + """Store the model in pipeline context for future use.""" return {"tf_hub_module": self.module} def _tokenize(self, sentence: Text) -> Any: @@ -397,6 +416,7 @@ def _tokenize(self, sentence: Text) -> Any: def tokenize(self, message: Message, attribute: Text) -> List[Token]: """Tokenize the text using the ConveRT model. + ConveRT adds a special char in front of (some) words and splits words into sub-words. To ensure the entity start and end values matches the token values, reuse the tokens that are already assigned to the message. 
If individual tokens diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index 0c56e6eff552..0e1b49dc286f 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -63,6 +63,7 @@ class LanguageModelFeaturizer(DenseFeaturizer): @classmethod def required_components(cls) -> List[Type[Component]]: + """Components that should be included in the pipeline before this component.""" return [Tokenizer] def __init__( @@ -70,6 +71,12 @@ def __init__( component_config: Optional[Dict[Text, Any]] = None, skip_model_load: bool = False, ) -> None: + """Initializes LanguageModelFeaturizer with the specified model. + + Args: + component_config: Configuration for the component. + skip_model_load: Skip loading the model for pytests. + """ super(LanguageModelFeaturizer, self).__init__(component_config) self._load_model_metadata() @@ -77,6 +84,7 @@ def __init__( def _load_model_metadata(self) -> None: """Load the metadata for the specified model and sets these properties. + This includes the model name, model weights, cache directory and the maximum sequence length the model can handle. """ @@ -107,7 +115,7 @@ def _load_model_metadata(self) -> None: self.max_model_sequence_length = MAX_SEQUENCE_LENGTHS[self.model_name] def _load_model_instance(self, skip_model_load: bool) -> None: - """Try loading the model instance + """Try loading the model instance. Args: skip_model_load: Skip loading the model instances to save time. This should be True only for pytests """ @@ -143,12 +151,21 @@ def _load_model_instance(self, skip_model_load: bool) -> None: def cache_key( cls, component_meta: Dict[Text, Any], model_metadata: Metadata ) -> Optional[Text]: + """Cache the component for future use. + + Args: + component_meta: configuration for the component. + model_metadata: configuration for the whole pipeline. + + Returns: key of the cache for future retrievals. + """ weights = component_meta.get("model_weights") or {} return f"{cls.name}-{component_meta.get('model_name')}-{get_dict_hash(weights)}" @classmethod def required_packages(cls) -> List[Text]: + """Packages needed to be installed.""" return ["transformers"] def _lm_tokenize(self, text: Text) -> Tuple[List[int], List[Text]]: @@ -157,9 +174,7 @@ def _lm_tokenize(self, text: Text) -> Tuple[List[int], List[Text]]: Args: text: Text to be tokenized. - Returns: - List of token ids and token strings. - + Returns: List of token ids and token strings. """ split_token_ids = self.tokenizer.encode(text, add_special_tokens=False) @@ -175,8 +190,7 @@ def _add_lm_specific_special_tokens( Args: token_ids: List of token ids for each example in the batch. - Returns: - Augmented list of token ids for each example in the batch. + Returns: Augmented list of token ids for each example in the batch. """ from rasa.nlu.utils.hugging_face.registry import ( model_special_tokens_pre_processors, @@ -203,8 +217,7 @@ def _lm_specific_token_cleanup( token_strings: List of token strings received as output from the language model specific tokenizer. - Returns: - Cleaned up token ids and token strings. + Returns: Cleaned up token ids and token strings. """ from rasa.nlu.utils.hugging_face.registry import model_tokens_cleaners @@ -213,17 +226,14 @@ def _post_process_sequence_embeddings( self, sequence_embeddings: np.ndarray ) -> Tuple[np.ndarray, np.ndarray]: - """Compute sentence level representations and sequence level representations - for relevant tokens.
+ """Compute sentence and sequence level representations for relevant tokens. Args: sequence_embeddings: Sequence level dense features received as output from language model. - Returns: - Sentence and sequence level representations. + Returns: Sentence and sequence level representations. """ - from rasa.nlu.utils.hugging_face.registry import ( model_embeddings_post_processors, ) @@ -261,11 +271,9 @@ def _tokenize_example( attribute: Property of message to be processed, one of ``TEXT`` or ``RESPONSE``. - Returns: - List of token strings and token ids for the corresponding attribute of the + Returns: List of token strings and token ids for the corresponding attribute of the message. """ - tokens_in = message.get(TOKENS_NAMES[attribute]) tokens_out = [] @@ -293,16 +301,15 @@ def _get_token_ids_for_batch( """Compute token ids and token strings for each example in batch. A token id is the id of that token in the vocabulary of the language model. + Args: batch_examples: Batch of message objects for which tokens need to be computed. attribute: Property of message to be processed, one of ``TEXT`` or ``RESPONSE``. - Returns: - List of token strings and token ids for each example in the batch. + Returns: List of token strings and token ids for each example in the batch. """ - batch_token_ids = [] batch_tokens = [] for example in batch_examples: @@ -330,10 +337,8 @@ def _compute_attention_mask( after taking into consideration the maximum input sequence the model can handle. Hence it can never be greater than self.max_model_sequence_length in case the model applies length restriction. - Returns: - Computed attention mask, 0 for padding and 1 for non-padding tokens. + Returns: Computed attention mask, 0 for padding and 1 for non-padding tokens. """ - attention_mask = [] for actual_sequence_length in actual_sequence_lengths: @@ -353,8 +358,7 @@ def _compute_attention_mask( def _extract_sequence_lengths( self, batch_token_ids: List[List[int]] ) -> Tuple[List[int], int]: - """Extracts the sequence length for each example, as well as the maximum - sequence length across examples. + """Extracts the sequence length for each example and maximum sequence length. Args: batch_token_ids: List of token ids for each example in the batch. @@ -421,7 +425,9 @@ def _add_padding_to_batch( def _extract_nonpadded_embeddings( embeddings: np.ndarray, actual_sequence_lengths: List[int] ) -> np.ndarray: - """Use pre-computed non-padded lengths of each example to extract embeddings + """Extract embeddings for actual tokens. + + Use pre-computed non-padded lengths of each example to extract embeddings for non-padding tokens. Args: @@ -469,7 +475,7 @@ def _validate_sequence_lengths( attribute: Text, inference_mode: bool = False, ) -> None: - """Validate if sequence lengths of all inputs are less the max sequence length the model can handle + """Validate if sequence lengths of all inputs are less the max sequence length the model can handle. This method should throw an error during training, whereas log a debug message during inference if any of the input examples have a length greater than maximum sequence length allowed. @@ -506,8 +512,7 @@ def _validate_sequence_lengths( def _add_extra_padding( self, sequence_embeddings: np.ndarray, actual_sequence_lengths: List[int] ) -> np.ndarray: - """ - Add extra zero padding to match the original sequence length. + """Add extra zero padding to match the original sequence length. This is only done if the input was truncated during the batch preparation of input for the model. 
Args: @@ -517,7 +522,6 @@ def _add_extra_padding( Returns: Modified sequence embeddings with padding if necessary """ - if self.max_model_sequence_length == NO_LENGTH_RESTRICTION: # No extra padding needed because there wouldn't have been any truncation in the first place return sequence_embeddings @@ -659,7 +663,6 @@ def _get_docs_for_batch( Returns: List of language model docs for each message in batch. """ - hf_transformers_doc = batch_examples[0].get(LANGUAGE_MODEL_DOCS[attribute]) if hf_transformers_doc: # This should only be the case if the deprecated @@ -707,9 +710,7 @@ def train( Args: training_data: NLU training data to be tokenized and featurized config: NLU pipeline config consisting of all components. - """ - batch_size = 64 for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: @@ -742,7 +743,6 @@ def process(self, message: Message, **kwargs: Any) -> None: Args: message: Incoming message object """ - # process of all featurizers operates only on TEXT and ACTION_TEXT attributes, # because all other attributes are labels which are featurized during training # and their features are stored by the model itself. @@ -760,7 +760,6 @@ def _set_lm_features( self, doc: Dict[Text, Any], message: Message, attribute: Text = TEXT ) -> None: """Adds the precomputed word vectors to the messages features.""" - sequence_features = doc[SEQUENCE_FEATURES] sentence_features = doc[SENTENCE_FEATURES] diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py index 50749a0d9d0e..369753791960 100644 --- a/rasa/nlu/tokenizers/convert_tokenizer.py +++ b/rasa/nlu/tokenizers/convert_tokenizer.py @@ -13,6 +13,11 @@ class ConveRTTokenizer(WhitespaceTokenizer): """ def __init__(self, component_config: Dict[Text, Any] = None) -> None: + """Initializes ConveRTTokenizer with the ConveRT model. + + Args: + component_config: Configuration for the component. + """ super().__init__(component_config) rasa.shared.utils.io.raise_warning( f"'{self.__class__.__name__}' is deprecated and " diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py index 3c31d683008d..fbee73158ef1 100644 --- a/rasa/nlu/tokenizers/lm_tokenizer.py +++ b/rasa/nlu/tokenizers/lm_tokenizer.py @@ -6,13 +6,17 @@ class LanguageModelTokenizer(WhitespaceTokenizer): - """ - This tokenizer is deprecated and will be removed in the future. + """This tokenizer is deprecated and will be removed in the future. Use the LanguageModelFeaturizer with any other Tokenizer instead. """ def __init__(self, component_config: Dict[Text, Any] = None) -> None: + """Initializes LanguageModelTokenizer for tokenization. + + Args: + component_config: Configuration for the component. + """ super().__init__(component_config) rasa.shared.utils.io.raise_warning( f"'{self.__class__.__name__}' is deprecated and " diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py index a8762088aa4e..4efaccbbec21 100644 --- a/rasa/nlu/utils/hugging_face/hf_transformers.py +++ b/rasa/nlu/utils/hugging_face/hf_transformers.py @@ -37,8 +37,7 @@ class HFTransformersNLP(Component): - """ - This component is deprecated and will be removed in the future. + """This component is deprecated and will be removed in the future. Use the LanguageModelFeaturizer instead. 
""" @@ -58,6 +57,7 @@ def __init__( component_config: Optional[Dict[Text, Any]] = None, skip_model_load: bool = False, ) -> None: + """Initializes HFTransformsNLP with the models specified.""" super(HFTransformersNLP, self).__init__(component_config) self._load_model_metadata() @@ -100,13 +100,12 @@ def _load_model_metadata(self) -> None: self.max_model_sequence_length = MAX_SEQUENCE_LENGTHS[self.model_name] def _load_model_instance(self, skip_model_load: bool) -> None: - """Try loading the model instance + """Try loading the model instance. Args: skip_model_load: Skip loading the model instances to save time. This should be True only for pytests """ - if skip_model_load: # This should be True only during pytests return @@ -137,6 +136,14 @@ def _load_model_instance(self, skip_model_load: bool) -> None: def cache_key( cls, component_meta: Dict[Text, Any], model_metadata: Metadata ) -> Optional[Text]: + """Cache the component for future use. + + Args: + component_meta: configuration for the component. + model_metadata: configuration for the whole pipeline. + + Returns: key of the cache for future retrievals. + """ weights = component_meta.get("model_weights") or {} return f"{cls.name}-{component_meta.get('model_name')}-{get_dict_hash(weights)}" @@ -328,7 +335,6 @@ def _compute_attention_mask( Returns: Computed attention mask, 0 for padding and 1 for non-padding tokens. """ - attention_mask = [] for actual_sequence_length in actual_sequence_lengths: @@ -348,8 +354,7 @@ def _compute_attention_mask( def _extract_sequence_lengths( self, batch_token_ids: List[List[int]] ) -> Tuple[List[int], int]: - """Extracts the sequence length for each example, as well as the maximum - sequence length across examples. + """Extracts the sequence length for each example and maximum sequence length. Args: batch_token_ids: List of token ids for each example in the batch. 
From f75308ac369c04d8a3272dc13a89d7a2f713d61e Mon Sep 17 00:00:00 2001 From: Daksh Date: Mon, 2 Nov 2020 15:54:03 +0100 Subject: [PATCH 19/31] fix pytests --- tests/nlu/featurizers/test_lm_featurizer.py | 15 +++++++++++---- tests/nlu/test_train.py | 6 +++++- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/tests/nlu/featurizers/test_lm_featurizer.py b/tests/nlu/featurizers/test_lm_featurizer.py index ca0e15d1a1f6..596960933227 100644 --- a/tests/nlu/featurizers/test_lm_featurizer.py +++ b/tests/nlu/featurizers/test_lm_featurizer.py @@ -315,14 +315,21 @@ def test_input_padding( @pytest.mark.parametrize( - "sequence_length, model_name, should_overflow", - [(1000, "bert", True), (256, "bert", False)], + "sequence_length, model_name, model_weights, should_overflow", + [ + (1000, "bert", "bert-base-uncased", True), + (256, "bert", "bert-base-uncased", False), + ], ) @pytest.mark.skip_on_windows def test_log_longer_sequence( - sequence_length: int, model_name: Text, should_overflow: bool, caplog + sequence_length: int, + model_name: Text, + model_weights: Text, + should_overflow: bool, + caplog, ): - config = {"model_name": model_name} + config = {"model_name": model_name, "model_weights": model_weights} featurizer = LanguageModelFeaturizer(config) diff --git a/tests/nlu/test_train.py b/tests/nlu/test_train.py index 12f0520d1200..0417464d626a 100644 --- a/tests/nlu/test_train.py +++ b/tests/nlu/test_train.py @@ -7,12 +7,16 @@ from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.utils.tensorflow.constants import EPOCHS from tests.nlu.conftest import DEFAULT_DATA_PATH -from typing import Any, Dict, List, Tuple, Text, Union, Optional +from typing import Any, Dict, List, Tuple, Text, Union COMPONENTS_TEST_PARAMS = { "DIETClassifier": {EPOCHS: 1}, "ResponseSelector": {EPOCHS: 1}, "HFTransformersNLP": {"model_name": "bert", "model_weights": "bert-base-uncased"}, + "LanguageModelFeaturizer": { + "model_name": "bert", + "model_weights": "bert-base-uncased", + }, } From 5202a18a091e912ad19f8e0e334ea87213d8a2e6 Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Wed, 4 Nov 2020 16:28:54 +0100 Subject: [PATCH 20/31] Add check for HFTransformersNLP in pipeline, prevent model loading in LMFeaturizer if found --- .../featurizers/dense_featurizer/lm_featurizer.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index 0e1b49dc286f..81f1715e0869 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -70,18 +70,30 @@ def __init__( self, component_config: Optional[Dict[Text, Any]] = None, skip_model_load: bool = False, + hf_transformers_loaded: bool = False, ) -> None: """Initializes LanguageModelFeaturizer with the specified model. Args: component_config: Configuration for the component. skip_model_load: Skip loading the model for pytests. + hf_transformers_loaded: Skip loading of model and metadata, use + HFTransformers output instead. 
""" super(LanguageModelFeaturizer, self).__init__(component_config) - + if hf_transformers_loaded: + return self._load_model_metadata() self._load_model_instance(skip_model_load) + @classmethod + def create( + cls, component_config: Dict[Text, Any], config: RasaNLUModelConfig + ) -> "DenseFeaturizer": + # TODO: remove this when HFTransformersNLP is removed for good + hf_transformers_loaded = "HFTransformersNLP" in config.component_names + return cls(component_config, hf_transformers_loaded=hf_transformers_loaded) + def _load_model_metadata(self) -> None: """Load the metadata for the specified model and sets these properties. From 239d1966c8bede451f279365a02b7d05a94b12cf Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Thu, 5 Nov 2020 08:49:07 +0100 Subject: [PATCH 21/31] Add language check to create method in LanguageModelFeaturizer --- rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index 81f1715e0869..b017a2798920 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -5,7 +5,7 @@ from rasa.core.utils import get_dict_hash from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.components import Component +from rasa.nlu.components import Component, UnsupportedLanguageError from rasa.nlu.featurizers.featurizer import DenseFeaturizer from rasa.nlu.model import Metadata from rasa.shared.nlu.training_data.features import Features @@ -90,6 +90,10 @@ def __init__( def create( cls, component_config: Dict[Text, Any], config: RasaNLUModelConfig ) -> "DenseFeaturizer": + language = config.language + if not cls.can_handle_language(language): + # check failed + raise UnsupportedLanguageError(cls.name, language) # TODO: remove this when HFTransformersNLP is removed for good hf_transformers_loaded = "HFTransformersNLP" in config.component_names return cls(component_config, hf_transformers_loaded=hf_transformers_loaded) From 58f0add481350d128a6ed5a7cdb6999ea8daf36c Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Thu, 5 Nov 2020 09:34:22 +0100 Subject: [PATCH 22/31] Put info for deprecated components back into docs, with deprecation warning --- docs/docs/components.mdx | 228 ++++++++++++++++++++++++++++++++------- 1 file changed, 190 insertions(+), 38 deletions(-) diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index 6f1667e2efd3..0a593913308b 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -137,6 +137,89 @@ word vectors in your pipeline. by passing in your language identifier as the `language` option. +### HFTransformersNLP + + :::caution Deprecated + The `HFTransformersNLP` is deprecated and will be removed in a future release. The [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) + now implements its behavior. + ::: + + * **Short** + + HuggingFace's Transformers based pre-trained language model initializer + + + + * **Outputs** + + Nothing + + + + * **Requires** + + Nothing + + + + * **Description** + + Initializes specified pre-trained language model from HuggingFace's [Transformers library](https://huggingface.co/transformers/). The component applies language model specific tokenization and + featurization to compute sequence and sentence level representations for each example in the training data. 
+ Include [LanguageModelTokenizer](./components.mdx#languagemodeltokenizer) and [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) to utilize the output of this + component for downstream NLU models. + + :::note + To use `HFTransformersNLP` component, install Rasa Open Source with `pip3 install rasa[transformers]`. + + ::: + + + + * **Configuration** + + You should specify what language model to load via the parameter `model_name`. See the below table for the + available language models. + Additionally, you can also specify the architecture variation of the chosen language model by specifying the + parameter `model_weights`. + The full list of supported architectures can be found in the + [HuggingFace documentation](https://huggingface.co/transformers/pretrained_models.html). + If left empty, it uses the default model architecture that original Transformers library loads (see table below). + + ``` + +----------------+--------------+-------------------------+ + | Language Model | Parameter | Default value for | + | | "model_name" | "model_weights" | + +----------------+--------------+-------------------------+ + | BERT | bert | rasa/LaBSE | + +----------------+--------------+-------------------------+ + | GPT | gpt | openai-gpt | + +----------------+--------------+-------------------------+ + | GPT-2 | gpt2 | gpt2 | + +----------------+--------------+-------------------------+ + | XLNet | xlnet | xlnet-base-cased | + +----------------+--------------+-------------------------+ + | DistilBERT | distilbert | distilbert-base-uncased | + +----------------+--------------+-------------------------+ + | RoBERTa | roberta | roberta-base | + +----------------+--------------+-------------------------+ + ``` + + The following configuration loads the language model BERT: + + ```yaml-rasa + pipeline: + - name: HFTransformersNLP + # Name of the language model to use + model_name: "bert" + # Pre-Trained weights to be loaded + model_weights: "rasa/LaBSE" + + # An optional path to a specific directory to download and cache the pre-trained model weights. + # The `default` cache_dir is the same as https://huggingface.co/transformers/serialization.html#cache-directory . + cache_dir: null + ``` + ## Tokenizers @@ -325,6 +408,113 @@ word vectors in your pipeline. ``` + ### ConveRTTokenizer + +:::caution Deprecated +The `ConveRTTokenizer` is deprecated and will be removed in a future release. The [ConveRTFeaturizer](./components.mdx#convertfeaturizer) +now implements its behavior. Any [tokenizer](./components.mdx#tokenizers) can be used in its place. +::: + + * **Short** + + Tokenizer using [ConveRT](https://github.com/PolyAI-LDN/polyai-models#convert) model. + + + + * **Outputs** + + `tokens` for user messages, responses (if present), and intents (if specified) + + + + * **Requires** + + Nothing + + + + * **Description** + + Creates tokens using the ConveRT tokenizer. Must be used whenever the [ConveRTFeaturizer](./components.mdx#convertfeaturizer) is used. + + :::note + Since `ConveRT` model is trained only on an English corpus of conversations, this tokenizer should only + be used if your training data is in English language. + + ::: + + :::note + To use `ConveRTTokenizer`, install Rasa Open Source with `pip3 install rasa[convert]`. 
+ + ::: + + + + * **Configuration** + + ```yaml-rasa + pipeline: + - name: "ConveRTTokenizer" + # Flag to check whether to split intents + "intent_tokenization_flag": False + # Symbol on which intent should be split + "intent_split_symbol": "_" + # Regular expression to detect tokens + "token_pattern": None + # Remote URL/Local directory of model files(Required) + "model_url": None + ``` + + :::note + Since the public URL of the ConveRT model was taken offline recently, it is now mandatory + to set the parameter `model_url` to a community/self-hosted URL or path to a local directory containing model files. + + ::: + + + ### LanguageModelTokenizer + +:::caution Deprecated +The `LanguageModelTokenizer` is deprecated and will be removed in a future release. The [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) +now implements its behavior. +::: + + * **Short** + + Tokenizer from pre-trained language models + + + + * **Outputs** + + `tokens` for user messages, responses (if present), and intents (if specified) + + + + * **Requires** + + [HFTransformersNLP](./components.mdx#hftransformersnlp) + + + + * **Description** + + Creates tokens using the pre-trained language model specified in upstream [HFTransformersNLP](./components.mdx#hftransformersnlp) component. + Must be used whenever the [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) is used. + + + + * **Configuration** + + ```yaml-rasa + pipeline: + - name: "LanguageModelTokenizer" + # Flag to check whether to split intents + "intent_tokenization_flag": False + # Symbol on which intent should be split + "intent_split_symbol": "_" + ``` + ## Featurizers @@ -500,16 +690,8 @@ Note: The `feature-dimension` for sequence and sentence features does not have t ```yaml-rasa pipeline: - name: "ConveRTFeaturizer" - # Remote URL/Local directory of model files(Required) - "model_url": None ``` - :::note - Since the public URL of the ConveRT model was taken offline recently, it is now mandatory - to set the parameter `model_url` to a community/self-hosted URL or path to a local directory containing model files. - - ::: - ### LanguageModelFeaturizer @@ -518,15 +700,7 @@ Note: The `feature-dimension` for sequence and sentence features does not have t Creates a vector representation of user message and response (if specified) using a pre-trained language model. -* **Description** - Initializes specified pre-trained language model from HuggingFace's [Transformers library](https://huggingface.co/transformers/). - The component applies language model specific tokenization and featurization to compute sequence and sentence level - representations for each example in the training data. - :::note - To use `LanguageModelFeaturizer` component, install Rasa Open Source with `pip3 install rasa[transformers]`. - - ::: * **Outputs** @@ -2586,11 +2760,6 @@ See [starspace paper](https://arxiv.org/abs/1709.03856) for details. You can create a custom component to perform a specific task which NLU doesn't currently offer (for example, sentiment analysis). Below is the specification of the `rasa.nlu.components.Component`] class with the methods you'll need to implement. -:::tip follow the tutorial -There is a detailed tutorial on building custom components on the [Rasa Blog](https://blog.rasa.com/enhancing-rasa-nlu-with-custom-components/). - -::: - You can add a custom component to your pipeline by adding the module path. 
So if you have a module called `sentiment` containing a `SentimentAnalyzer` class: @@ -2634,20 +2803,3 @@ the matrix contains a feature vector for every token in the sequence. The sentence features are represented by a matrix of size `(1 x feature-dimension)`. ::: - -## Deprecated Components - -:::caution Deprecated -The `LanguageModelTokenizer` is deprecated. The [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) now implements -its behavior. -::: - -:::caution Deprecated -The `HFTransformersNLP` is deprecated. The [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) now implements -its behavior. -::: - -:::caution Deprecated -The `ConveRTTokenizer` is deprecated. The [ConveRTFeaturizer](./components.mdx#convertfeaturizer) now implements -its behavior. Any [tokenizer](./components.mdx#tokenizers) can be used in its place. -::: \ No newline at end of file From 917efbff5886e654c512faf9aefdbd02cc8f5ade Mon Sep 17 00:00:00 2001 From: koernerfelicia <45405119+koernerfelicia@users.noreply.github.com> Date: Fri, 6 Nov 2020 12:26:04 +0100 Subject: [PATCH 23/31] Apply suggestions from code review Co-authored-by: Tanja --- docs/docs/components.mdx | 4 ++-- docs/docs/migration-guide.mdx | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index 0a593913308b..a4e71f89e943 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -476,7 +476,7 @@ now implements its behavior. Any [tokenizer](./components.mdx#tokenizers) can be :::caution Deprecated The `LanguageModelTokenizer` is deprecated and will be removed in a future release. The [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) -now implements its behavior. +now implements its behavior. Any [tokenizer](./components.mdx#tokenizers) can be used in its place. ::: * **Short** @@ -723,7 +723,7 @@ Note: The `feature-dimension` for sequence and sentence features does not have t * **Description** Creates features for entity extraction, intent classification, and response selection. - Uses the pre-trained language model to compute vector representations of input text. + Uses a pre-trained language model to compute vector representations of input text. :::note Please make sure that you use a language model which is pre-trained on the same language corpus as that of your diff --git a/docs/docs/migration-guide.mdx b/docs/docs/migration-guide.mdx index 25a52c497788..60c812594701 100644 --- a/docs/docs/migration-guide.mdx +++ b/docs/docs/migration-guide.mdx @@ -15,7 +15,7 @@ how you can migrate from one version to another. ### Deprecations `ConveRTTokenizer` is now deprecated. [ConveRTFeaturizer](./components.mdx#convertfeaturizer) now implements -its behaviour. To migrate, remove `ConveRTTokenizer` with any other tokenizer, for e.g.: +its behaviour. To migrate, replace `ConveRTTokenizer` with any other tokenizer, for e.g.: ```yaml pipeline: @@ -27,7 +27,7 @@ pipeline: `HFTransformersNLP` and `LanguageModelTokenizer` components are now deprecated. [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) now implements their behaviour. 
-To migrate, remove both the above components with any tokenizer and specify the model architecture and model weights +To migrate, replace both the above components with any tokenizer and specify the model architecture and model weights as part of `LanguageModelFeaturizer`, for e.g.: ```yaml From 8fcd7158a762f18993c268e0faef5ac3de874cd7 Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Fri, 6 Nov 2020 12:33:57 +0100 Subject: [PATCH 24/31] Apply suggestions from code review --- docs/docs/components.mdx | 150 +++++++++--------- .../dense_featurizer/convert_featurizer.py | 4 +- .../dense_featurizer/lm_featurizer.py | 7 +- .../nlu/utils/hugging_face/hf_transformers.py | 7 +- 4 files changed, 87 insertions(+), 81 deletions(-) diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index a4e71f89e943..3d5369768e93 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -139,86 +139,86 @@ word vectors in your pipeline. ### HFTransformersNLP - :::caution Deprecated - The `HFTransformersNLP` is deprecated and will be removed in a future release. The [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) - now implements its behavior. - ::: +:::caution Deprecated +The `HFTransformersNLP` is deprecated and will be removed in a future release. The [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) +now implements its behavior. +::: - * **Short** +* **Short** - HuggingFace's Transformers based pre-trained language model initializer + HuggingFace's Transformers based pre-trained language model initializer - * **Outputs** +* **Outputs** - Nothing + Nothing - * **Requires** +* **Requires** - Nothing + Nothing - * **Description** +* **Description** - Initializes specified pre-trained language model from HuggingFace's [Transformers library](https://huggingface.co/transformers/). The component applies language model specific tokenization and - featurization to compute sequence and sentence level representations for each example in the training data. - Include [LanguageModelTokenizer](./components.mdx#languagemodeltokenizer) and [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) to utilize the output of this - component for downstream NLU models. + Initializes specified pre-trained language model from HuggingFace's [Transformers library](https://huggingface.co/transformers/). The component applies language model specific tokenization and + featurization to compute sequence and sentence level representations for each example in the training data. + Include [LanguageModelTokenizer](./components.mdx#languagemodeltokenizer) and [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) to utilize the output of this + component for downstream NLU models. - :::note - To use `HFTransformersNLP` component, install Rasa Open Source with `pip3 install rasa[transformers]`. + :::note + To use `HFTransformersNLP` component, install Rasa Open Source with `pip3 install rasa[transformers]`. - ::: + ::: - * **Configuration** +* **Configuration** - You should specify what language model to load via the parameter `model_name`. See the below table for the - available language models. - Additionally, you can also specify the architecture variation of the chosen language model by specifying the - parameter `model_weights`. - The full list of supported architectures can be found in the - [HuggingFace documentation](https://huggingface.co/transformers/pretrained_models.html). 
- If left empty, it uses the default model architecture that original Transformers library loads (see table below). + You should specify what language model to load via the parameter `model_name`. See the below table for the + available language models. + Additionally, you can also specify the architecture variation of the chosen language model by specifying the + parameter `model_weights`. + The full list of supported architectures can be found in the + [HuggingFace documentation](https://huggingface.co/transformers/pretrained_models.html). + If left empty, it uses the default model architecture that original Transformers library loads (see table below). - ``` - +----------------+--------------+-------------------------+ - | Language Model | Parameter | Default value for | - | | "model_name" | "model_weights" | - +----------------+--------------+-------------------------+ - | BERT | bert | rasa/LaBSE | - +----------------+--------------+-------------------------+ - | GPT | gpt | openai-gpt | - +----------------+--------------+-------------------------+ - | GPT-2 | gpt2 | gpt2 | - +----------------+--------------+-------------------------+ - | XLNet | xlnet | xlnet-base-cased | - +----------------+--------------+-------------------------+ - | DistilBERT | distilbert | distilbert-base-uncased | - +----------------+--------------+-------------------------+ - | RoBERTa | roberta | roberta-base | - +----------------+--------------+-------------------------+ - ``` + ``` + +----------------+--------------+-------------------------+ + | Language Model | Parameter | Default value for | + | | "model_name" | "model_weights" | + +----------------+--------------+-------------------------+ + | BERT | bert | rasa/LaBSE | + +----------------+--------------+-------------------------+ + | GPT | gpt | openai-gpt | + +----------------+--------------+-------------------------+ + | GPT-2 | gpt2 | gpt2 | + +----------------+--------------+-------------------------+ + | XLNet | xlnet | xlnet-base-cased | + +----------------+--------------+-------------------------+ + | DistilBERT | distilbert | distilbert-base-uncased | + +----------------+--------------+-------------------------+ + | RoBERTa | roberta | roberta-base | + +----------------+--------------+-------------------------+ + ``` - The following configuration loads the language model BERT: + The following configuration loads the language model BERT: - ```yaml-rasa - pipeline: - - name: HFTransformersNLP - # Name of the language model to use - model_name: "bert" - # Pre-Trained weights to be loaded - model_weights: "rasa/LaBSE" + ```yaml-rasa + pipeline: + - name: HFTransformersNLP + # Name of the language model to use + model_name: "bert" + # Pre-Trained weights to be loaded + model_weights: "rasa/LaBSE" - # An optional path to a specific directory to download and cache the pre-trained model weights. - # The `default` cache_dir is the same as https://huggingface.co/transformers/serialization.html#cache-directory . - cache_dir: null - ``` + # An optional path to a specific directory to download and cache the pre-trained model weights. + # The `default` cache_dir is the same as https://huggingface.co/transformers/serialization.html#cache-directory . + cache_dir: null + ``` ## Tokenizers @@ -417,7 +417,7 @@ now implements its behavior. Any [tokenizer](./components.mdx#tokenizers) can be * **Short** - Tokenizer using [ConveRT](https://github.com/PolyAI-LDN/polyai-models#convert) model. 
+ Tokenizer using [ConveRT](ht tps://github.com/PolyAI-LDN/polyai-models#convert) model. @@ -479,41 +479,41 @@ The `LanguageModelTokenizer` is deprecated and will be removed in a future relea now implements its behavior. Any [tokenizer](./components.mdx#tokenizers) can be used in its place. ::: - * **Short** +* **Short** - Tokenizer from pre-trained language models +Tokenizer from pre-trained language models - * **Outputs** +* **Outputs** - `tokens` for user messages, responses (if present), and intents (if specified) +`tokens` for user messages, responses (if present), and intents (if specified) - * **Requires** +* **Requires** - [HFTransformersNLP](./components.mdx#hftransformersnlp) +[HFTransformersNLP](./components.mdx#hftransformersnlp) - * **Description** +* **Description** - Creates tokens using the pre-trained language model specified in upstream [HFTransformersNLP](./components.mdx#hftransformersnlp) component. - Must be used whenever the [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) is used. +Creates tokens using the pre-trained language model specified in upstream [HFTransformersNLP](./components.mdx#hftransformersnlp) component. +Must be used whenever the [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) is used. - * **Configuration** +* **Configuration** - ```yaml-rasa - pipeline: - - name: "LanguageModelTokenizer" - # Flag to check whether to split intents - "intent_tokenization_flag": False - # Symbol on which intent should be split - "intent_split_symbol": "_" - ``` +```yaml-rasa +pipeline: +- name: "LanguageModelTokenizer" + # Flag to check whether to split intents + "intent_tokenization_flag": False + # Symbol on which intent should be split + "intent_split_symbol": "_" +``` ## Featurizers diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index fb7b84336ca0..8d6d875a2940 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -5,7 +5,7 @@ import os import rasa.shared.utils.io -from rasa.core.utils import get_dict_hash +import rasa.core.utils from rasa.utils import common from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.model import Metadata @@ -402,7 +402,7 @@ def cache_key( Returns: key of the cache for future retrievals. 
""" _config = common.update_existing_keys(cls.defaults, component_meta) - return f"{cls.name}-{get_dict_hash(_config)}" + return f"{cls.name}-{rasa.core.utils.get_dict_hash(_config)}" def provide_context(self) -> Dict[Text, Any]: """Store the model in pipeline context for future use.""" diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index b017a2798920..9562a7311c78 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -3,7 +3,7 @@ from typing import Any, Optional, Text, List, Type, Dict, Tuple -from rasa.core.utils import get_dict_hash +import rasa.core.utils from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.components import Component, UnsupportedLanguageError from rasa.nlu.featurizers.featurizer import DenseFeaturizer @@ -177,7 +177,10 @@ def cache_key( """ weights = component_meta.get("model_weights") or {} - return f"{cls.name}-{component_meta.get('model_name')}-{get_dict_hash(weights)}" + return ( + f"{cls.name}-{component_meta.get('model_name')}-" + f"{rasa.core.utils.get_dict_hash(weights)}" + ) @classmethod def required_packages(cls) -> List[Text]: diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py index 4efaccbbec21..79cbf7db171e 100644 --- a/rasa/nlu/utils/hugging_face/hf_transformers.py +++ b/rasa/nlu/utils/hugging_face/hf_transformers.py @@ -1,7 +1,7 @@ import logging from typing import Any, Dict, List, Text, Tuple, Optional -from rasa.core.utils import get_dict_hash +import rasa.core.utils from rasa.nlu.model import Metadata from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer @@ -146,7 +146,10 @@ def cache_key( """ weights = component_meta.get("model_weights") or {} - return f"{cls.name}-{component_meta.get('model_name')}-{get_dict_hash(weights)}" + return ( + f"{cls.name}-{component_meta.get('model_name')}-" + f"{rasa.core.utils.get_dict_hash(weights)}" + ) @classmethod def required_packages(cls) -> List[Text]: From 066091b4509977005c422e7a5ea8bf87228ed206 Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Fri, 6 Nov 2020 12:42:48 +0100 Subject: [PATCH 25/31] Fix typo --- docs/docs/components.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index 3d5369768e93..62e308ab6c24 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -417,7 +417,7 @@ now implements its behavior. Any [tokenizer](./components.mdx#tokenizers) can be * **Short** - Tokenizer using [ConveRT](ht tps://github.com/PolyAI-LDN/polyai-models#convert) model. + Tokenizer using [ConveRT](https://github.com/PolyAI-LDN/polyai-models#convert) model. 
From 1af7b9e164c110dfdc9e619d570f5108e796023e Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Mon, 9 Nov 2020 17:45:44 +0100 Subject: [PATCH 26/31] Fix pytests --- tests/nlu/test_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nlu/test_config.py b/tests/nlu/test_config.py index 0b052c9a6286..d682d5d490a1 100644 --- a/tests/nlu/test_config.py +++ b/tests/nlu/test_config.py @@ -54,7 +54,7 @@ def test_invalid_many_tokenizers_in_config(): { "pipeline": [ {"name": "WhitespaceTokenizer"}, - {"name": "LanguageModelFeaturizer"}, + {"name": "MitieIntentClassifier"}, ] } ), From 37c92d2f2ccb22d4790bde8235a0c7b36930c3b1 Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Mon, 9 Nov 2020 17:47:37 +0100 Subject: [PATCH 27/31] Use create instead of constructor for tests --- rasa/nlu/components.py | 2 +- rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py index 47bc38e5d902..eed424396b18 100644 --- a/rasa/nlu/components.py +++ b/rasa/nlu/components.py @@ -495,7 +495,7 @@ def load( if cached_component: return cached_component - return cls(meta) + return cls.create(meta, model_metadata) @classmethod def create( diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index 9562a7311c78..d8030810338e 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -95,7 +95,12 @@ def create( # check failed raise UnsupportedLanguageError(cls.name, language) # TODO: remove this when HFTransformersNLP is removed for good - hf_transformers_loaded = "HFTransformersNLP" in config.component_names + if isinstance(config, Metadata): + hf_transformers_loaded = "HFTransformersNLP" in [ + c["name"] for c in config.metadata["pipeline"] + ] + else: + hf_transformers_loaded = "HFTransformersNLP" in config.component_names return cls(component_config, hf_transformers_loaded=hf_transformers_loaded) def _load_model_metadata(self) -> None: From a82adfbf7ec7ad8aa4d8fc9bfd80d23808835e09 Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Tue, 10 Nov 2020 09:00:42 +0100 Subject: [PATCH 28/31] Fix pytest --- tests/nlu/featurizers/test_lm_featurizer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/nlu/featurizers/test_lm_featurizer.py b/tests/nlu/featurizers/test_lm_featurizer.py index 596960933227..b3a699844374 100644 --- a/tests/nlu/featurizers/test_lm_featurizer.py +++ b/tests/nlu/featurizers/test_lm_featurizer.py @@ -4,6 +4,8 @@ import pytest import logging +from _pytest.logging import LogCaptureFixture + from rasa.nlu.constants import ( TOKENS_NAMES, NUMBER_OF_SUB_TOKENS, @@ -748,7 +750,7 @@ def test_hf_transformers_number_of_sub_tokens(text, expected_number_of_sub_token @pytest.mark.parametrize("text", [("hi there")]) -def test_log_deprecation_warning_with_old_config(text, caplog): +def test_log_deprecation_warning_with_old_config(text: str, caplog: LogCaptureFixture): message = Message.build(text) transformers_nlp = HFTransformersNLP( @@ -760,7 +762,9 @@ def test_log_deprecation_warning_with_old_config(text, caplog): lm_tokenizer = LanguageModelTokenizer() lm_tokenizer.process(message) lm_featurizer = LanguageModelFeaturizer() - lm_featurizer.process(message) + caplog.clear() + with caplog.at_level(logging.DEBUG): + lm_featurizer.process(message) assert "deprecated component HFTransformersNLP" in 
caplog.text


From 5443f89570917daa1fe3f8d7fe145d564d6be8f5 Mon Sep 17 00:00:00 2001
From: koernerfelicia
Date: Tue, 10 Nov 2020 09:46:40 +0100
Subject: [PATCH 29/31] Some deepsource checks and hopefully actually fix
 pytests

---
 .../dense_featurizer/convert_featurizer.py    |  9 ++--
 .../dense_featurizer/lm_featurizer.py         | 49 ++++++++++++-------
 .../nlu/utils/hugging_face/hf_transformers.py |  7 ---
 tests/nlu/featurizers/test_lm_featurizer.py   | 37 ++------------
 tests/nlu/test_train.py                       |  4 +-
 tests/nlu/utils/test_hf_transformers.py       | 38 ++------------
 6 files changed, 47 insertions(+), 97 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 8d6d875a2940..e24c82d27219 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -72,7 +72,8 @@ def required_packages(cls) -> List[Text]:
         return ["tensorflow_text", "tensorflow_hub"]

     def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
-        """Initializes ConveRTFeaturizer with the model and different encoding signatures.
+        """Initializes ConveRTFeaturizer with the model and different
+        encoding signatures.

         Args:
             component_config: Configuration for the component.
@@ -248,7 +249,6 @@ def _get_features(
         number_of_tokens_in_sentence: List[int],
     ) -> Tuple[np.ndarray, np.ndarray]:
         """Get the sequence and sentence features."""
-
         sentence_embeddings = []
         sequence_embeddings = []

@@ -266,8 +266,9 @@ def _tokens_to_text(list_of_tokens: List[List[Token]]) -> List[Text]:
         """Convert list of tokens to text.

-        Add a whitespace between two tokens if the end value of the first tokens is
-        not the same as the end value of the second token."""
+        Add a whitespace between two tokens if the end value of the first token
+        is not the same as the start value of the second token.
+        """
         texts = []
         for tokens in list_of_tokens:
             text = ""
diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
index d8030810338e..1c17b342341d 100644
--- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
@@ -47,8 +47,8 @@ class LanguageModelFeaturizer(DenseFeaturizer):
     The transformers(https://github.com/huggingface/transformers) library
     is used to load pre-trained language models like BERT, GPT-2, etc.
-    The component also tokenizes and featurizes dense featurizable attributes of each
-    message.
+    The component also tokenizes and featurizes dense featurizable attributes of
+    each message.
     """

     defaults = {
@@ -209,7 +209,8 @@ def _lm_tokenize(self, text: Text) -> Tuple[List[int], List[Text]]:
     def _add_lm_specific_special_tokens(
         self, token_ids: List[List[int]]
     ) -> List[List[int]]:
-        """Add language model specific special tokens which were used during their training.
+        """Add language model specific special tokens which were used during
+        their training.

         Args:
             token_ids: List of token ids for each example in the batch.
@@ -295,8 +296,8 @@ def _tokenize_example(
             attribute: Property of message to be processed, one of ``TEXT`` or
             ``RESPONSE``.

-        Returns: List of token strings and token ids for the corresponding attribute of the
-        message.
+        Returns: List of token strings and token ids for the corresponding
+        attribute of the message.
""" tokens_in = message.get(TOKENS_NAMES[attribute]) tokens_out = [] @@ -356,12 +357,17 @@ def _compute_attention_mask( padding tokens. Args: - actual_sequence_lengths: List of length of each example without any padding. - max_input_sequence_length: Maximum length of a sequence that will be present in the input batch. This is - after taking into consideration the maximum input sequence the model can handle. Hence it can never be - greater than self.max_model_sequence_length in case the model applies length restriction. - - Returns: Computed attention mask, 0 for padding and 1 for non-padding tokens. + actual_sequence_lengths: List of length of each example without any + padding. + max_input_sequence_length: Maximum length of a sequence that will be + present in the input batch. This is + after taking into consideration the maximum input sequence the model + can handle. Hence it can never be + greater than self.max_model_sequence_length in case the model + applies length restriction. + + Returns: Computed attention mask, 0 for padding and 1 for non-padding + tokens. """ attention_mask = [] @@ -499,10 +505,12 @@ def _validate_sequence_lengths( attribute: Text, inference_mode: bool = False, ) -> None: - """Validate if sequence lengths of all inputs are less the max sequence length the model can handle. + """Validate if sequence lengths of all inputs are less the max sequence + length the model can handle. - This method should throw an error during training, whereas log a debug message during inference if - any of the input examples have a length greater than maximum sequence length allowed. + This method should throw an error during training, whereas log a debug + message during inference if any of the input examples have a length + greater than maximum sequence length allowed. Args: actual_sequence_lengths: original sequence length of all inputs @@ -538,7 +546,8 @@ def _add_extra_padding( ) -> np.ndarray: """Add extra zero padding to match the original sequence length. - This is only done if the input was truncated during the batch preparation of input for the model. + This is only done if the input was truncated during the batch + preparation of input for the model. Args: sequence_embeddings: Embeddings returned from the model actual_sequence_lengths: original sequence length of all inputs @@ -547,7 +556,8 @@ def _add_extra_padding( Modified sequence embeddings with padding if necessary """ if self.max_model_sequence_length == NO_LENGTH_RESTRICTION: - # No extra padding needed because there wouldn't have been any truncation in the first place + # No extra padding needed because there wouldn't have been any + # truncation in the first place return sequence_embeddings reshaped_sequence_embeddings = [] @@ -609,8 +619,8 @@ def _get_model_features_for_batch( max_input_sequence_length, ) = self._extract_sequence_lengths(batch_token_ids_augmented) - # Validate that all sequences can be processed based on their sequence lengths and - # the maximum sequence length the model can handle + # Validate that all sequences can be processed based on their sequence + # lengths and the maximum sequence length the model can handle self._validate_sequence_lengths( actual_sequence_lengths, batch_examples, attribute, inference_mode ) @@ -642,7 +652,8 @@ def _get_model_features_for_batch( ) = self._post_process_sequence_embeddings(sequence_nonpadded_embeddings) # Pad zeros for examples which were truncated in inference mode. 
- # This is intentionally done after sentence embeddings have been extracted so that they are not affected + # This is intentionally done after sentence embeddings have been + # extracted so that they are not affected sequence_embeddings = self._add_extra_padding( sequence_embeddings, actual_sequence_lengths ) diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py index 79cbf7db171e..8a512876d200 100644 --- a/rasa/nlu/utils/hugging_face/hf_transformers.py +++ b/rasa/nlu/utils/hugging_face/hf_transformers.py @@ -227,7 +227,6 @@ def _post_process_sequence_embeddings( Returns: Sentence and sequence level representations. """ - from rasa.nlu.utils.hugging_face.registry import ( model_embeddings_post_processors, ) @@ -269,7 +268,6 @@ def _tokenize_example( List of token strings and token ids for the corresponding attribute of the message. """ - tokens_in = self.whitespace_tokenizer.tokenize(message, attribute) tokens_out = [] @@ -307,7 +305,6 @@ def _get_token_ids_for_batch( Returns: List of token strings and token ids for each example in the batch. """ - batch_token_ids = [] batch_tokens = [] for example in batch_examples: @@ -521,7 +518,6 @@ def _add_extra_padding( Returns: Modified sequence embeddings with padding if necessary """ - if self.max_model_sequence_length == NO_LENGTH_RESTRICTION: # No extra padding needed because there wouldn't have been any truncation in the first place return sequence_embeddings @@ -663,7 +659,6 @@ def _get_docs_for_batch( Returns: List of language model docs for each message in batch. """ - batch_tokens, batch_token_ids = self._get_token_ids_for_batch( batch_examples, attribute ) @@ -701,7 +696,6 @@ def train( config: NLU pipeline config consisting of all components. """ - batch_size = 64 for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: @@ -736,7 +730,6 @@ def process(self, message: Message, **kwargs: Any) -> None: Args: message: Incoming message object """ - # process of all featurizers operates only on TEXT and ACTION_TEXT attributes, # because all other attributes are labels which are featurized during training # and their features are stored by the model itself. 
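For readers following the docstring rewraps in this patch: the contract `_compute_attention_mask` describes is simply 1 for real tokens and 0 for padding, with every sequence capped at the maximum input length the model can handle. Below is a minimal sketch of that contract under those assumptions; the names are illustrative and this is not the actual Rasa method.

```python
from typing import List

import numpy as np


def compute_attention_mask(
    actual_sequence_lengths: List[int], max_input_sequence_length: int
) -> np.ndarray:
    # 1 marks a real token, 0 marks padding. Sequences longer than the
    # model's limit are assumed to have been truncated upstream, so each
    # length is capped before the padding zeros are appended.
    attention_mask = []
    for length in actual_sequence_lengths:
        effective_length = min(length, max_input_sequence_length)
        padding = max_input_sequence_length - effective_length
        attention_mask.append([1] * effective_length + [0] * padding)
    return np.array(attention_mask).astype(np.float32)


print(compute_attention_mask([2, 4, 3], 4))
# [[1. 1. 0. 0.]
#  [1. 1. 1. 1.]
#  [1. 1. 1. 0.]]
```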
diff --git a/tests/nlu/featurizers/test_lm_featurizer.py b/tests/nlu/featurizers/test_lm_featurizer.py index b3a699844374..3f67063dddec 100644 --- a/tests/nlu/featurizers/test_lm_featurizer.py +++ b/tests/nlu/featurizers/test_lm_featurizer.py @@ -369,7 +369,7 @@ def test_attention_mask( # TODO: need to fix this failing test @pytest.mark.skip(reason="Results in random crashing of github action workers") @pytest.mark.parametrize( - "model_name, model_weights, texts, expected_tokens, expected_indices, expected_num_token_ids", + "model_name, model_weights, texts, expected_tokens, expected_indices", [ ( "bert", @@ -419,7 +419,6 @@ def test_attention_mask( (39, 42), ], ], - [4, 4, 5, 5, 13], ), ( "bert", @@ -442,7 +441,6 @@ def test_attention_mask( [(0, 1), (1, 2), (2, 3), (3, 4)], [(0, 1), (1, 2), (2, 3)], ], - [3, 4, 4, 3], ), ( "gpt", @@ -481,7 +479,6 @@ def test_attention_mask( (39, 42), ], ], - [2, 1, 2, 3, 3, 9], ), ( "gpt2", @@ -534,7 +531,6 @@ def test_attention_mask( (39, 42), ], ], - [3, 1, 2, 3, 3, 11], ), ( "xlnet", @@ -566,25 +562,6 @@ def test_attention_mask( "for", ], ], - [ - [(0, 4), (5, 12)], - [(0, 5)], - [(0, 3), (4, 6)], - [(0, 1), (3, 4), (6, 7)], - [(0, 4), (5, 6), (7, 11)], - [ - (0, 4), - (5, 7), - (8, 11), - (12, 20), - (21, 22), - (23, 27), - (28, 33), - (33, 37), - (37, 38), - (39, 42), - ], - ], [4, 3, 4, 5, 5, 12], ), ( @@ -635,7 +612,6 @@ def test_attention_mask( (39, 42), ], ], - [4, 4, 5, 5, 13], ), ( "roberta", @@ -688,7 +664,6 @@ def test_attention_mask( (39, 42), ], ], - [5, 3, 4, 5, 5, 13], ), ], ) @@ -699,7 +674,6 @@ def test_lm_featurizer_edge_cases( texts, expected_tokens, expected_indices, - expected_num_token_ids, ): if model_weights is None: @@ -711,9 +685,7 @@ def test_lm_featurizer_edge_cases( lm_featurizer = LanguageModelFeaturizer(transformers_config) whitespace_tokenizer = WhitespaceTokenizer() - for text, gt_tokens, gt_indices, gt_num_indices in zip( - texts, expected_tokens, expected_indices, expected_num_token_ids - ): + for text, gt_tokens, gt_indices in zip(texts, expected_tokens, expected_indices): message = Message.build(text=text) tokens = whitespace_tokenizer.tokenize(message, TEXT) @@ -729,7 +701,7 @@ def test_lm_featurizer_edge_cases( "text, expected_number_of_sub_tokens", [("sentence embeddings", [1, 4]), ("this is a test", [1, 1, 1, 1])], ) -def test_hf_transformers_number_of_sub_tokens(text, expected_number_of_sub_tokens): +def test_lm_featurizer_number_of_sub_tokens(text, expected_number_of_sub_tokens): config = { "model_name": "bert", "model_weights": "bert-base-uncased", @@ -761,7 +733,7 @@ def test_log_deprecation_warning_with_old_config(text: str, caplog: LogCaptureFi caplog.set_level(logging.DEBUG) lm_tokenizer = LanguageModelTokenizer() lm_tokenizer.process(message) - lm_featurizer = LanguageModelFeaturizer() + lm_featurizer = LanguageModelFeaturizer(skip_model_load=True) caplog.clear() with caplog.at_level(logging.DEBUG): lm_featurizer.process(message) @@ -769,6 +741,7 @@ def test_log_deprecation_warning_with_old_config(text: str, caplog: LogCaptureFi assert "deprecated component HFTransformersNLP" in caplog.text +@pytest.mark.skip(reason="Results in random crashing of github action workers") def test_preserve_sentence_and_sequence_features_old_config(): attribute = "text" message = Message.build("hi there") diff --git a/tests/nlu/test_train.py b/tests/nlu/test_train.py index 0417464d626a..459a93933950 100644 --- a/tests/nlu/test_train.py +++ b/tests/nlu/test_train.py @@ -116,8 +116,8 @@ def pipelines_for_non_windows_tests() 
-> List[Tuple[Text, List[Dict[Text, Any]]] def test_all_components_are_in_at_least_one_test_pipeline(): """There is a template that includes all components to test the train-persist-load-use cycle. Ensures that - really all components are in there.""" - + really all components are in there. + """ all_pipelines = pipelines_for_tests() + pipelines_for_non_windows_tests() all_components = [c["name"] for _, p in all_pipelines for c in p] diff --git a/tests/nlu/utils/test_hf_transformers.py b/tests/nlu/utils/test_hf_transformers.py index 72d6a5d20f09..30abf348dd3b 100644 --- a/tests/nlu/utils/test_hf_transformers.py +++ b/tests/nlu/utils/test_hf_transformers.py @@ -137,7 +137,7 @@ def test_attention_mask( # TODO: need to fix this failing test @pytest.mark.skip(reason="Results in random crashing of github action workers") @pytest.mark.parametrize( - "model_name, model_weights, texts, expected_tokens, expected_indices, expected_num_token_ids", + "model_name, model_weights, texts, expected_tokens, expected_indices", [ ( "bert", @@ -187,7 +187,6 @@ def test_attention_mask( (39, 42), ], ], - [4, 4, 5, 5, 13], ), ( "bert", @@ -210,7 +209,6 @@ def test_attention_mask( [(0, 1), (1, 2), (2, 3), (3, 4)], [(0, 1), (1, 2), (2, 3)], ], - [3, 4, 4, 3], ), ( "gpt", @@ -249,7 +247,6 @@ def test_attention_mask( (39, 42), ], ], - [2, 1, 2, 3, 3, 9], ), ( "gpt2", @@ -302,7 +299,6 @@ def test_attention_mask( (39, 42), ], ], - [3, 1, 2, 3, 3, 11], ), ( "xlnet", @@ -334,25 +330,6 @@ def test_attention_mask( "for", ], ], - [ - [(0, 4), (5, 12)], - [(0, 5)], - [(0, 3), (4, 6)], - [(0, 1), (3, 4), (6, 7)], - [(0, 4), (5, 6), (7, 11)], - [ - (0, 4), - (5, 7), - (8, 11), - (12, 20), - (21, 22), - (23, 27), - (28, 33), - (33, 37), - (37, 38), - (39, 42), - ], - ], [4, 3, 4, 5, 5, 12], ), ( @@ -403,7 +380,6 @@ def test_attention_mask( (39, 42), ], ], - [4, 4, 5, 5, 13], ), ( "roberta", @@ -456,18 +432,16 @@ def test_attention_mask( (39, 42), ], ], - [5, 3, 4, 5, 5, 13], ), ], ) @pytest.mark.skip_on_windows -def test_hf_transformers_edge_cases( +def test_hf_transformer_edge_cases( model_name, model_weights, texts, expected_tokens, expected_indices, - expected_num_token_ids, ): if model_weights is None: @@ -476,17 +450,15 @@ def test_hf_transformers_edge_cases( model_weights_config = {"model_weights": model_weights} transformers_config = {**{"model_name": model_name}, **model_weights_config} - transformers_nlp = HFTransformersNLP(transformers_config) + hf_transformer = HFTransformersNLP(transformers_config) whitespace_tokenizer = WhitespaceTokenizer() - for text, gt_tokens, gt_indices, gt_num_indices in zip( - texts, expected_tokens, expected_indices, expected_num_token_ids - ): + for text, gt_tokens, gt_indices in zip(texts, expected_tokens, expected_indices): message = Message.build(text=text) tokens = whitespace_tokenizer.tokenize(message, TEXT) message.set(TOKENS_NAMES[TEXT], tokens) - transformers_nlp.process(message) + hf_transformer.process(message) assert [t.text for t in tokens] == gt_tokens assert [t.start for t in tokens] == [i[0] for i in gt_indices] From b02ed915ad21a6ffb9da0c2bd141566fdfc62bf2 Mon Sep 17 00:00:00 2001 From: koernerfelicia Date: Tue, 10 Nov 2020 10:27:56 +0100 Subject: [PATCH 30/31] Reformat tests again --- tests/nlu/featurizers/test_lm_featurizer.py | 6 +----- tests/nlu/utils/test_hf_transformers.py | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/nlu/featurizers/test_lm_featurizer.py b/tests/nlu/featurizers/test_lm_featurizer.py index 3f67063dddec..4acdc78c8de4 
100644
--- a/tests/nlu/featurizers/test_lm_featurizer.py
+++ b/tests/nlu/featurizers/test_lm_featurizer.py
@@ -669,11 +669,7 @@ def test_attention_mask(
 )
 @pytest.mark.skip_on_windows
 def test_lm_featurizer_edge_cases(
-    model_name,
-    model_weights,
-    texts,
-    expected_tokens,
-    expected_indices,
+    model_name, model_weights, texts, expected_tokens, expected_indices
 ):

     if model_weights is None:
diff --git a/tests/nlu/utils/test_hf_transformers.py b/tests/nlu/utils/test_hf_transformers.py
index 30abf348dd3b..89362c822ca3 100644
--- a/tests/nlu/utils/test_hf_transformers.py
+++ b/tests/nlu/utils/test_hf_transformers.py
@@ -437,11 +437,7 @@ def test_attention_mask(
 )
 @pytest.mark.skip_on_windows
 def test_hf_transformer_edge_cases(
-    model_name,
-    model_weights,
-    texts,
-    expected_tokens,
-    expected_indices,
+    model_name, model_weights, texts, expected_tokens, expected_indices
 ):

     if model_weights is None:

From b548debaa79b372d6aae110c9b4533591fc33441 Mon Sep 17 00:00:00 2001
From: koernerfelicia
Date: Tue, 10 Nov 2020 12:10:12 +0100
Subject: [PATCH 31/31] Overloaded load method so that we do not call the
 constructor for LMFeaturizer

---
 rasa/nlu/components.py                        |  2 +-
 .../dense_featurizer/lm_featurizer.py         | 38 +++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index eed424396b18..47bc38e5d902 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -495,7 +495,7 @@ def load(
         if cached_component:
             return cached_component

-        return cls.create(meta, model_metadata)
+        return cls(meta)

     @classmethod
     def create(
diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
index 1c17b342341d..4583dcd6fad1 100644
--- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
@@ -103,6 +103,44 @@ def create(
             hf_transformers_loaded = "HFTransformersNLP" in config.component_names
         return cls(component_config, hf_transformers_loaded=hf_transformers_loaded)

+    @classmethod
+    def load(
+        cls,
+        meta: Dict[Text, Any],
+        model_dir: Optional[Text] = None,
+        model_metadata: Optional["Metadata"] = None,
+        cached_component: Optional["Component"] = None,
+        **kwargs: Any,
+    ) -> "Component":
+        """Load this component from file.
+
+        After a component has been trained, it will be persisted by
+        calling `persist`. When the pipeline gets loaded again,
+        this component needs to be able to restore itself.
+        Components can rely on any context attributes that are
+        created by :meth:`components.Component.create`
+        calls to components previous to this one.
+
+        This method differs from the parent method only in that it calls create
+        rather than the constructor if the component is not found. This is to
+        trigger the check for HFTransformersNLP, and the method can be removed
+        when HFTransformersNLP is removed.
+
+        Args:
+            meta: Any configuration parameter related to the model.
+            model_dir: The directory to load the component from.
+            model_metadata: The model's :class:`rasa.nlu.model.Metadata`.
+            cached_component: The cached component.
+
+        Returns:
+            the loaded component
+        """
+        # TODO: remove this when HFTransformersNLP is removed for good
+        if cached_component:
+            return cached_component
+
+        return cls.create(meta, model_metadata)
+
     def _load_model_metadata(self) -> None:
         """Load the metadata for the specified model and sets these properties.