diff --git a/changelog/5905.bugfix.rst b/changelog/5905.bugfix.rst
new file mode 100644
index 000000000000..4e4d3f0d70da
--- /dev/null
+++ b/changelog/5905.bugfix.rst
@@ -0,0 +1,3 @@
+Remove option ``token_pattern`` from ``CountVectorsFeaturizer``.
+Instead all tokenizers now have the option ``token_pattern``.
+If a regular expression is set, the tokenizer will apply the token pattern.
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 06ed7274c391..08437b51ea27
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -182,6 +182,8 @@ WhitespaceTokenizer
           "intent_split_symbol": "_"
           # Text will be tokenized with case sensitive as default
           "case_sensitive": True
+          # Regular expression to detect tokens
+          "token_pattern": None
 
 
 JiebaTokenizer
@@ -210,6 +212,8 @@ JiebaTokenizer
           "intent_tokenization_flag": False
           # Symbol on which intent should be split
           "intent_split_symbol": "_"
+          # Regular expression to detect tokens
+          "token_pattern": None
 
 
 MitieTokenizer
@@ -229,6 +233,8 @@ MitieTokenizer
           "intent_tokenization_flag": False
           # Symbol on which intent should be split
           "intent_split_symbol": "_"
+          # Regular expression to detect tokens
+          "token_pattern": None
 
 SpacyTokenizer
 ~~~~~~~~~~~~~~
@@ -248,6 +254,8 @@ SpacyTokenizer
           "intent_tokenization_flag": False
           # Symbol on which intent should be split
           "intent_split_symbol": "_"
+          # Regular expression to detect tokens
+          "token_pattern": None
 
 
 .. _ConveRTTokenizer:
@@ -282,6 +290,8 @@ ConveRTTokenizer
           "intent_split_symbol": "_"
           # Text will be tokenized with case sensitive as default
           "case_sensitive": True
+          # Regular expression to detect tokens
+          "token_pattern": None
 
 
 .. _LanguageModelTokenizer:
@@ -306,7 +316,6 @@ LanguageModelTokenizer
           "intent_split_symbol": "_"
 
 
-
 .. _text-featurizers:
 
 Text Featurizers
@@ -582,9 +591,6 @@ CountVectorsFeaturizer
     |                   |                         | n-grams at the edges of words are padded with space.         |
     |                   |                         | Valid values: 'word', 'char', 'char_wb'.                     |
     +-------------------+-------------------------+--------------------------------------------------------------+
-    | token_pattern     | r"(?u)\b\w\w+\b"        | Regular expression used to detect tokens.                    |
-    |                   |                         | Only used if 'analyzer' is set to 'word'.                    |
-    +-------------------+-------------------------+--------------------------------------------------------------+
     | strip_accents     | None                    | Remove accents during the pre-processing step.               |
     |                   |                         | Valid values: 'ascii', 'unicode', 'None'.                    |
     +-------------------+-------------------------+--------------------------------------------------------------+
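
Taken together, the docs changes above mean every tokenizer now accepts an optional ``token_pattern`` that further splits the tokens it produces. A minimal sketch of the resulting behaviour, assuming the Rasa 1.x NLU test API used elsewhere in this patch (``Message``, ``WhitespaceTokenizer``, the ``TEXT`` constant):

    from rasa.nlu.constants import TEXT
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from rasa.nlu.training_data import Message

    # Whitespace splitting alone keeps "role-based" as one token;
    # the token pattern then splits it further, preserving offsets.
    tokenizer = WhitespaceTokenizer({"token_pattern": r"(?u)\b\w\w+\b"})
    tokens = tokenizer.tokenize(Message("role-based access-control"), TEXT)

    print([(t.text, t.start) for t in tokens])
    # expected, per the tokenizer tests in this patch:
    # [('role', 0), ('based', 5), ('access', 11), ('control', 18)]
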
diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
index e7fef6f46e32..1d1e08cc2c34
--- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
@@ -53,11 +53,6 @@ def required_components(cls) -> List[Type[Component]]:
         # 'char_wb' creates character n-grams inside word boundaries
         # n-grams at the edges of words are padded with space.
         "analyzer": "word",  # use 'char' or 'char_wb' for character
-        # regular expression for tokens
-        # only used if analyzer == 'word'
-        # WARNING this pattern is used during training
-        # but not currently used during inference!
-        "token_pattern": r"(?u)\b\w\w+\b",
         # remove accents during the preprocessing step
         "strip_accents": None,  # {'ascii', 'unicode', None}
         # list of stop words
@@ -95,9 +90,6 @@ def _load_count_vect_params(self) -> None:
         # set analyzer
         self.analyzer = self.component_config["analyzer"]
 
-        # regular expression for tokens
-        self.token_pattern = self.component_config["token_pattern"]
-
         # remove accents during the preprocessing step
         self.strip_accents = self.component_config["strip_accents"]
 
@@ -341,7 +333,6 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]):
 
         self.vectorizers = self._create_shared_vocab_vectorizers(
             {
-                "token_pattern": self.token_pattern,
                 "strip_accents": self.strip_accents,
                 "lowercase": self.lowercase,
                 "stop_words": self.stop_words,
@@ -375,7 +366,6 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]])
 
         self.vectorizers = self._create_independent_vocab_vectorizers(
             {
-                "token_pattern": self.token_pattern,
                 "strip_accents": self.strip_accents,
                 "lowercase": self.lowercase,
                 "stop_words": self.stop_words,
@@ -605,7 +595,7 @@ def _create_shared_vocab_vectorizers(
         """Create vectorizers for all attributes with shared vocabulary"""
 
         shared_vectorizer = CountVectorizer(
-            token_pattern=parameters["token_pattern"],
+            token_pattern=r"(?u)\b\w+\b",
            strip_accents=parameters["strip_accents"],
             lowercase=parameters["lowercase"],
             stop_words=parameters["stop_words"],
@@ -637,7 +627,7 @@ def _create_independent_vocab_vectorizers(
             attribute_vocabulary = vocabulary[attribute] if vocabulary else None
 
             attribute_vectorizer = CountVectorizer(
-                token_pattern=parameters["token_pattern"],
+                token_pattern=r"(?u)\b\w+\b",
                 strip_accents=parameters["strip_accents"],
                 lowercase=parameters["lowercase"],
                 stop_words=parameters["stop_words"],
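
With the option gone, the featurizer always passes ``token_pattern=r"(?u)\b\w+\b"`` to scikit-learn's ``CountVectorizer``. Unlike scikit-learn's default of ``r"(?u)\b\w\w+\b"``, this pattern also keeps single-character words, so short tokens produced by the tokenizer are no longer silently dropped. A standalone scikit-learn sketch of the difference (no Rasa code involved):

    from sklearn.feature_extraction.text import CountVectorizer

    docs = ["a b cc"]

    # scikit-learn's default pattern only matches words of two or more
    # characters, so the single-character tokens disappear ...
    sklearn_default = CountVectorizer()  # token_pattern=r"(?u)\b\w\w+\b"
    sklearn_default.fit(docs)
    print(sorted(sklearn_default.vocabulary_))  # ['cc']

    # ... while the pattern now hard-coded in the featurizer keeps them
    single_char_friendly = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
    single_char_friendly.fit(docs)
    print(sorted(single_char_friendly.vocabulary_))  # ['a', 'b', 'cc']
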
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
index e4b63e60a6f8..acae4ade5130
--- a/rasa/nlu/tokenizers/convert_tokenizer.py
+++ b/rasa/nlu/tokenizers/convert_tokenizer.py
@@ -26,6 +26,8 @@ class ConveRTTokenizer(WhitespaceTokenizer):
         "intent_tokenization_flag": False,
         # Symbol on which intent should be split
         "intent_split_symbol": "_",
+        # Regular expression to detect tokens
+        "token_pattern": None,
         # Text will be tokenized with case sensitive as default
         "case_sensitive": True,
     }
@@ -82,7 +84,8 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
 
         return tokens_out
 
-    def _clean_tokens(self, tokens: List[bytes]):
+    @staticmethod
+    def _clean_tokens(tokens: List[bytes]) -> List[Text]:
         """Encode tokens and remove special char added by ConveRT."""
 
         tokens = [string.decode("utf-8").replace("﹏", "") for string in tokens]
diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py
index 59dd9425a404..ef034bde430f
--- a/rasa/nlu/tokenizers/jieba_tokenizer.py
+++ b/rasa/nlu/tokenizers/jieba_tokenizer.py
@@ -28,6 +28,8 @@ class JiebaTokenizer(Tokenizer):
         "intent_tokenization_flag": False,
         # Symbol on which intent should be split
         "intent_split_symbol": "_",
+        # Regular expression to detect tokens
+        "token_pattern": None,
     }  # default don't load custom dictionary
 
     def __init__(self, component_config: Dict[Text, Any] = None) -> None:
@@ -69,7 +71,7 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
         tokenized = jieba.tokenize(text)
         tokens = [Token(word, start) for (word, start, end) in tokenized]
 
-        return tokens
+        return self._apply_token_pattern(tokens)
 
     @classmethod
     def load(
diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py
index 56ac683ddf60..4edb431b5986
--- a/rasa/nlu/tokenizers/lm_tokenizer.py
+++ b/rasa/nlu/tokenizers/lm_tokenizer.py
@@ -5,10 +5,7 @@
 from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
 from rasa.nlu.training_data import Message
 
-from rasa.nlu.constants import (
-    LANGUAGE_MODEL_DOCS,
-    TOKENS,
-)
+from rasa.nlu.constants import LANGUAGE_MODEL_DOCS, TOKENS
 
 
 class LanguageModelTokenizer(Tokenizer):
diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py
index 054e3225fb10..376aa901fba0
--- a/rasa/nlu/tokenizers/mitie_tokenizer.py
+++ b/rasa/nlu/tokenizers/mitie_tokenizer.py
@@ -14,6 +14,8 @@ class MitieTokenizer(Tokenizer):
         "intent_tokenization_flag": False,
         # Symbol on which intent should be split
         "intent_split_symbol": "_",
+        # Regular expression to detect tokens
+        "token_pattern": None,
     }
 
     @classmethod
@@ -32,7 +34,7 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
             for token, offset in tokenized
         ]
 
-        return tokens
+        return self._apply_token_pattern(tokens)
 
     def _token_from_offset(
         self, text: bytes, offset: int, encoded_sentence: bytes
diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py
index b3ab4cdc6b64..3860cc274443
--- a/rasa/nlu/tokenizers/spacy_tokenizer.py
+++ b/rasa/nlu/tokenizers/spacy_tokenizer.py
@@ -25,6 +25,8 @@ def required_components(cls) -> List[Type[Component]]:
         "intent_tokenization_flag": False,
         # Symbol on which intent should be split
         "intent_split_symbol": "_",
+        # Regular expression to detect tokens
+        "token_pattern": None,
     }
 
     def get_doc(self, message: Message, attribute: Text) -> "Doc":
@@ -33,7 +35,7 @@ def get_doc(self, message: Message, attribute: Text) -> "Doc":
     def tokenize(self, message: Message, attribute: Text) -> List[Token]:
         doc = self.get_doc(message, attribute)
 
-        return [
+        tokens = [
             Token(
                 t.text, t.idx, lemma=t.lemma_, data={POS_TAG_KEY: self._tag_of_token(t)}
             )
@@ -41,6 +43,8 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
             if t.text and t.text.strip()
         ]
 
+        return self._apply_token_pattern(tokens)
+
     @staticmethod
     def _tag_of_token(token: Any) -> Text:
         import spacy
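
The tokenizer changes above all follow the same pattern: build the tokens exactly as before, then run them through the new base-class helper before returning. A custom tokenizer would hook in the same way; the class below is a hypothetical illustration (not part of this patch) that splits on commas and whitespace:

    import re
    from typing import List, Text

    from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
    from rasa.nlu.training_data import Message


    class CommaTokenizer(Tokenizer):
        """Hypothetical tokenizer that splits on commas and whitespace."""

        defaults = {
            # Flag to check whether to split intents
            "intent_tokenization_flag": False,
            # Symbol on which intent should be split
            "intent_split_symbol": "_",
            # Regular expression to detect tokens, as in the built-in tokenizers
            "token_pattern": None,
        }

        def tokenize(self, message: Message, attribute: Text) -> List[Token]:
            text = message.get(attribute)

            # crude splitting, just to produce concrete tokens with offsets
            tokens = [
                Token(match.group(), match.start())
                for match in re.finditer(r"[^,\s]+", text)
            ]

            # delegate the optional token_pattern splitting to the base class
            return self._apply_token_pattern(tokens)
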
diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py
index a1e8ccc13e17..4d3bad85e73c
--- a/rasa/nlu/tokenizers/tokenizer.py
+++ b/rasa/nlu/tokenizers/tokenizer.py
@@ -1,4 +1,5 @@
 import logging
+import re
 
 from typing import Text, List, Optional, Dict, Any
 
@@ -65,6 +66,11 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None:
         )
         # split symbol for intents
         self.intent_split_symbol = self.component_config.get("intent_split_symbol", "_")
+        # token pattern to further split tokens
+        token_pattern = self.component_config.get("token_pattern", None)
+        self.token_pattern_regex = None
+        if token_pattern:
+            self.token_pattern_regex = re.compile(token_pattern)
 
     def tokenize(self, message: Message, attribute: Text) -> List[Token]:
         """Tokenizes the text of the provided attribute of the incoming message."""
@@ -105,6 +111,42 @@ def _split_intent(self, message: Message):
 
         return self._convert_words_to_tokens(words, text)
 
+    def _apply_token_pattern(self, tokens: List[Token]) -> List[Token]:
+        """Apply the token pattern to the given tokens.
+
+        Args:
+            tokens: list of tokens to split
+
+        Returns:
+            List of tokens.
+        """
+        if not self.token_pattern_regex:
+            return tokens
+
+        final_tokens = []
+        for token in tokens:
+            new_tokens = self.token_pattern_regex.findall(token.text)
+            new_tokens = [t for t in new_tokens if t]
+
+            if not new_tokens:
+                final_tokens.append(token)
+
+            running_offset = 0
+            for new_token in new_tokens:
+                word_offset = token.text.index(new_token, running_offset)
+                word_len = len(new_token)
+                running_offset = word_offset + word_len
+                final_tokens.append(
+                    Token(
+                        new_token,
+                        token.start + word_offset,
+                        data=token.data,
+                        lemma=token.lemma,
+                    )
+                )
+
+        return final_tokens
+
     @staticmethod
     def _convert_words_to_tokens(words: List[Text], text: Text) -> List[Token]:
         running_offset = 0
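
The offset handling in ``_apply_token_pattern`` is the subtle part: ``re.findall`` returns only the matched strings, not their positions, so ``str.index`` with a running offset recovers where each match sits inside the original token text, and that local offset is added to the token's absolute ``start``. The same arithmetic in isolation (plain ``re`` and ``str``, no Rasa imports), using the values from the tests below:

    import re

    token_text, token_start = "access-control", 11
    token_pattern_regex = re.compile(r"(?u)\b\w\w+\b")

    new_tokens = [t for t in token_pattern_regex.findall(token_text) if t]

    running_offset = 0
    for new_token in new_tokens:
        # locate the match inside the original token, then map it to an
        # absolute position by adding the original token's start offset
        word_offset = token_text.index(new_token, running_offset)
        running_offset = word_offset + len(new_token)
        print(new_token, token_start + word_offset)

    # prints:
    # access 11
    # control 18
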
diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py
index 47f614f3d534..9d51f0d33701
--- a/rasa/nlu/tokenizers/whitespace_tokenizer.py
+++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py
@@ -14,6 +14,8 @@ class WhitespaceTokenizer(Tokenizer):
         "intent_tokenization_flag": False,
         # Symbol on which intent should be split
         "intent_split_symbol": "_",
+        # Regular expression to detect tokens
+        "token_pattern": None,
         # Text will be tokenized with case sensitive as default
         "case_sensitive": True,
     }
@@ -77,4 +79,6 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
         if not words:
             words = [text]
 
-        return self._convert_words_to_tokens(words, text)
+        tokens = self._convert_words_to_tokens(words, text)
+
+        return self._apply_token_pattern(tokens)
diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py
index 14bf35dc8ef9..42effc649aeb
--- a/tests/nlu/featurizers/test_count_vectors_featurizer.py
+++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py
@@ -23,7 +23,7 @@
     ],
 )
 def test_count_vector_featurizer(sentence, expected, expected_cls):
-    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
+    ftr = CountVectorsFeaturizer()
     train_message = Message(sentence)
     test_message = Message(sentence)
 
@@ -54,7 +54,7 @@ def test_count_vector_featurizer(sentence, expected, expected_cls):
 def test_count_vector_featurizer_response_attribute_featurization(
     sentence, intent, response, intent_features, response_features
 ):
-    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
+    ftr = CountVectorsFeaturizer()
     tk = WhitespaceTokenizer()
 
     train_message = Message(sentence)
@@ -104,7 +104,7 @@ def test_count_vector_featurizer_response_attribute_featurization(
 def test_count_vector_featurizer_attribute_featurization(
     sentence, intent, response, intent_features, response_features
 ):
-    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
+    ftr = CountVectorsFeaturizer()
     tk = WhitespaceTokenizer()
 
     train_message = Message(sentence)
@@ -153,9 +153,7 @@ def test_count_vector_featurizer_attribute_featurization(
 def test_count_vector_featurizer_shared_vocab(
     sentence, intent, response, text_features, intent_features, response_features
 ):
-    ftr = CountVectorsFeaturizer(
-        {"token_pattern": r"(?u)\b\w+\b", "use_shared_vocab": True}
-    )
+    ftr = CountVectorsFeaturizer({"use_shared_vocab": True})
     tk = WhitespaceTokenizer()
 
     train_message = Message(sentence)
@@ -188,9 +186,7 @@ def test_count_vector_featurizer_shared_vocab(
     ],
 )
 def test_count_vector_featurizer_oov_token(sentence, expected):
-    ftr = CountVectorsFeaturizer(
-        {"token_pattern": r"(?u)\b\w+\b", "OOV_token": "__oov__"}
-    )
+    ftr = CountVectorsFeaturizer({"OOV_token": "__oov__"})
     train_message = Message(sentence)
     WhitespaceTokenizer().process(train_message)
 
@@ -217,11 +213,7 @@ def test_count_vector_featurizer_oov_token(sentence, expected):
 def test_count_vector_featurizer_oov_words(sentence, expected):
 
     ftr = CountVectorsFeaturizer(
-        {
-            "token_pattern": r"(?u)\b\w+\b",
-            "OOV_token": "__oov__",
-            "OOV_words": ["oov_word0", "OOV_word1"],
-        }
+        {"OOV_token": "__oov__", "OOV_words": ["oov_word0", "OOV_word1"]}
     )
     train_message = Message(sentence)
     WhitespaceTokenizer().process(train_message)
@@ -251,7 +243,7 @@ def test_count_vector_featurizer_oov_words(sentence, expected):
 )
 def test_count_vector_featurizer_using_tokens(tokens, expected):
 
-    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
+    ftr = CountVectorsFeaturizer()
 
     # using empty string instead of real text string to make sure
     # count vector only can come from `tokens` feature.
@@ -307,7 +299,6 @@ def test_count_vector_featurizer_persist_load(tmp_path):
     # set non default values to config
     config = {
         "analyzer": "char",
-        "token_pattern": r"(?u)\b\w+\b",
         "strip_accents": "ascii",
         "stop_words": "stop",
         "min_df": 2,
diff --git a/tests/nlu/tokenizers/test_tokenizer.py b/tests/nlu/tokenizers/test_tokenizer.py
index b6d6de5a14a0..f250472f182f
--- a/tests/nlu/tokenizers/test_tokenizer.py
+++ b/tests/nlu/tokenizers/test_tokenizer.py
@@ -1,13 +1,14 @@
+from typing import List, Text
+
 import pytest
 
+from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.constants import TEXT, INTENT, RESPONSE, TOKENS_NAMES
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
 
 
 def test_tokens_comparison():
-    from rasa.nlu.tokenizers.tokenizer import Token
-
     x = Token("hello", 0)
     y = Token("Hello", 0)
 
@@ -83,3 +84,53 @@ def test_split_intent(text, expected_tokens):
     message.set(INTENT, text)
 
     assert [t.text for t in tk._split_intent(message)] == expected_tokens
+
+
+@pytest.mark.parametrize(
+    "token_pattern, tokens, expected_tokens",
+    [
+        (
+            None,
+            [Token("hello", 0), Token("there", 6)],
+            [Token("hello", 0), Token("there", 6)],
+        ),
+        (
+            "",
+            [Token("hello", 0), Token("there", 6)],
+            [Token("hello", 0), Token("there", 6)],
+        ),
+        (
+            r"(?u)\b\w\w+\b",
+            [Token("role-based", 0), Token("access-control", 11)],
+            [
+                Token("role", 0),
+                Token("based", 5),
+                Token("access", 11),
+                Token("control", 18),
+            ],
+        ),
+        (
+            r".*",
+            [Token("role-based", 0), Token("access-control", 11)],
+            [Token("role-based", 0), Token("access-control", 11)],
+        ),
+        (
+            r"(test)",
+            [Token("role-based", 0), Token("access-control", 11)],
+            [Token("role-based", 0), Token("access-control", 11)],
+        ),
+    ],
+)
+def test_apply_token_pattern(
+    token_pattern: Text, tokens: List[Token], expected_tokens: List[Token]
+):
+    component_config = {"token_pattern": token_pattern}
+
+    tokenizer = WhitespaceTokenizer(component_config)
+    actual_tokens = tokenizer._apply_token_pattern(tokens)
+
+    assert len(actual_tokens) == len(expected_tokens)
+    for actual_token, expected_token in zip(actual_tokens, expected_tokens):
+        assert actual_token.text == expected_token.text
+        assert actual_token.start == expected_token.start
+        assert actual_token.end == expected_token.end