Commit

Add use_cls_token to default dict.
tabergma committed Oct 18, 2019
1 parent 95fe8da commit 3b51563
Showing 5 changed files with 15 additions and 1 deletion.
2 changes: 2 additions & 0 deletions rasa/nlu/tokenizers/jieba_tokenizer.py
@@ -36,6 +36,8 @@ class JiebaTokenizer(Tokenizer):
         "intent_tokenization_flag": False,
         # Symbol on which intent should be split
         "intent_split_symbol": "_",
+        # add __CLS__ token to the end of the list of tokens
+        "use_cls_token": True,
     }  # default don't load custom dictionary
 
     def __init__(self, component_config: Dict[Text, Any] = None) -> None:
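The diff above only shows the new key; to make the mechanics concrete, here is a minimal sketch, not Rasa's actual merging code, of how a class-level defaults dict like this typically combines with a user-supplied component config, so "use_cls_token" ends up True unless a pipeline entry overrides it. The resolve_config helper is illustrative only.

from typing import Any, Dict, Optional, Text

DEFAULTS: Dict[Text, Any] = {
    "intent_tokenization_flag": False,
    # Symbol on which intent should be split
    "intent_split_symbol": "_",
    # add __CLS__ token to the end of the list of tokens
    "use_cls_token": True,
}


def resolve_config(component_config: Optional[Dict[Text, Any]] = None) -> Dict[Text, Any]:
    """Start from the class defaults and let user-provided keys win."""
    config = dict(DEFAULTS)
    config.update(component_config or {})
    return config


print(resolve_config()["use_cls_token"])                          # True
print(resolve_config({"use_cls_token": False})["use_cls_token"])  # False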
5 changes: 5 additions & 0 deletions rasa/nlu/tokenizers/mitie_tokenizer.py
@@ -16,6 +16,11 @@ class MitieTokenizer(Tokenizer):
 
     provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]
 
+    defaults = {
+        # add __CLS__ token to the end of the list of tokens
+        "use_cls_token": True
+    }
+
     @classmethod
     def required_packages(cls) -> List[Text]:
         return ["mitie"]
5 changes: 5 additions & 0 deletions rasa/nlu/tokenizers/spacy_tokenizer.py
@@ -28,6 +28,11 @@ class SpacyTokenizer(Tokenizer):
         for attribute in SPACY_FEATURIZABLE_ATTRIBUTES
     ]
 
+    defaults = {
+        # add __CLS__ token to the end of the list of tokens
+        "use_cls_token": True
+    }
+
     def train(
         self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
     ) -> None:
2 changes: 1 addition & 1 deletion rasa/nlu/tokenizers/tokenizer.py
@@ -29,7 +29,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
         if "use_cls_token" in self.component_config:
             self.use_cls_token = self.component_config["use_cls_token"]
         else:
-            self.use_cls_token = False
+            self.use_cls_token = True
 
     def add_cls_token(
         self, tokens: List[Token], attribute: Text = MESSAGE_TEXT_ATTRIBUTE
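This hunk is the behavioural core of the commit: the base Tokenizer now falls back to use_cls_token = True when the key is missing from the component config. The sketch below mirrors the constructor logic from the hunk; add_cls_token is an illustrative approximation (the real method also takes an attribute argument), and the Token fields, the offset arithmetic, and the __CLS__ constant spelling are assumptions.

from typing import Any, Dict, List, Optional, Text

CLS_TOKEN = "__CLS__"  # assumed spelling of the marker token


class Token:
    def __init__(self, text: Text, offset: int) -> None:
        self.text = text
        self.offset = offset


class Tokenizer:
    def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
        self.component_config = component_config or {}
        if "use_cls_token" in self.component_config:
            self.use_cls_token = self.component_config["use_cls_token"]
        else:
            # this commit flips the fallback from False to True
            self.use_cls_token = True

    def add_cls_token(self, tokens: List[Token]) -> List[Token]:
        # Append a __CLS__ token one position past the last token when the flag is set.
        if self.use_cls_token and tokens:
            last = tokens[-1]
            tokens.append(Token(CLS_TOKEN, last.offset + len(last.text) + 1))
        return tokens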
2 changes: 2 additions & 0 deletions rasa/nlu/tokenizers/whitespace_tokenizer.py
@@ -24,6 +24,8 @@ class WhitespaceTokenizer(Tokenizer):
         "intent_split_symbol": "_",
         # Text will be tokenized with case sensitive as default
         "case_sensitive": True,
+        # add __CLS__ token to the end of the list of tokens
+        "use_cls_token": True,
     }
 
     def __init__(self, component_config: Dict[Text, Any] = None) -> None:
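Taken together, every tokenizer now appends a trailing __CLS__ token by default. A conceptual, whitespace-only illustration of the effect (token text only, offsets omitted; this is not the WhitespaceTokenizer implementation):

from typing import List


def whitespace_tokens(text: str, use_cls_token: bool = True) -> List[str]:
    # Split on whitespace and, if requested, append the __CLS__ marker
    # to the end of the list of tokens.
    tokens = text.split()
    if use_cls_token and tokens:
        tokens.append("__CLS__")
    return tokens


print(whitespace_tokens("book a flight"))
# ['book', 'a', 'flight', '__CLS__']
print(whitespace_tokens("book a flight", use_cls_token=False))
# ['book', 'a', 'flight']

Setting "use_cls_token": False in a tokenizer's component config restores the previous behaviour.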
