From 3b51563dc49830f4e5f9a09ebd823c5f7eb563ef Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 16:26:57 +0200 Subject: [PATCH] Add use_cls_token to default dict. --- rasa/nlu/tokenizers/jieba_tokenizer.py | 2 ++ rasa/nlu/tokenizers/mitie_tokenizer.py | 5 +++++ rasa/nlu/tokenizers/spacy_tokenizer.py | 5 +++++ rasa/nlu/tokenizers/tokenizer.py | 2 +- rasa/nlu/tokenizers/whitespace_tokenizer.py | 2 ++ 5 files changed, 15 insertions(+), 1 deletion(-) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index c434a5a2e050..dfbbf2cbcb9b 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -36,6 +36,8 @@ class JiebaTokenizer(Tokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", + # add __CLS__ token to the end of the list of tokens + "use_cls_token": True, } # default don't load custom dictionary def __init__(self, component_config: Dict[Text, Any] = None) -> None: diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 6b4c6b30abdc..ec5556d5840d 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -16,6 +16,11 @@ class MitieTokenizer(Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + defaults = { + # add __CLS__ token to the end of the list of tokens + "use_cls_token": True + } + @classmethod def required_packages(cls) -> List[Text]: return ["mitie"] diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 432f283af1ce..9f061c2b29ec 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -28,6 +28,11 @@ class SpacyTokenizer(Tokenizer): for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] + defaults = { + # add __CLS__ token to the end of the list of tokens + "use_cls_token": True + } + def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any ) -> None: diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 41e04c844385..1b786590f010 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -29,7 +29,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: if "use_cls_token" in self.component_config: self.use_cls_token = self.component_config["use_cls_token"] else: - self.use_cls_token = False + self.use_cls_token = True def add_cls_token( self, tokens: List[Token], attribute: Text = MESSAGE_TEXT_ATTRIBUTE diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 3641fb909689..9be597b49a9d 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -24,6 +24,8 @@ class WhitespaceTokenizer(Tokenizer): "intent_split_symbol": "_", # Text will be tokenized with case sensitive as default "case_sensitive": True, + # add __CLS__ token to the end of the list of tokens + "use_cls_token": True, } def __init__(self, component_config: Dict[Text, Any] = None) -> None: