Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug in sparse featurizers #5172

Merged
merged 4 commits into from
Feb 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelog/5171.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fix bug ``ValueError: Cannot concatenate sparse features as sequence dimension does not match``.

When training a Rasa model that contains responses for just some of the intents, training was failing.
Fixed the featurizers to return a consistent feature vector in case no response was given for a specific message.
Original file line number Diff line number Diff line change
Expand Up @@ -269,9 +269,9 @@ def _get_processed_message_tokens_by_attribute(
"""Get processed text of attribute of a message"""

if message.get(attribute) is None:
# return empty string since sklearn countvectorizer does not like None
# return empty list since sklearn countvectorizer does not like None
# object while training and predicting
return [""]
return []

tokens = self._get_message_tokens_by_attribute(message, attribute)
tokens = self._process_tokens(tokens, attribute)
Expand Down Expand Up @@ -416,6 +416,11 @@ def _create_sequence(
if attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]:
tokens_without_cls = tokens[:-1]

if not tokens_without_cls:
# attribute is not set (e.g. response not present)
X.append(None)
continue

seq_vec = self.vectorizers[attribute].transform(tokens_without_cls)
seq_vec.sort_indices()

Expand Down Expand Up @@ -489,7 +494,6 @@ def train(

# transform for all attributes
for attribute in self._attributes:

attribute_features = self._get_featurized_attribute(
attribute, processed_attribute_tokens[attribute]
)
Expand Down
7 changes: 6 additions & 1 deletion rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,18 @@ def _add_lookup_table_regexes(

def _features_for_patterns(
self, message: Message, attribute: Text
) -> scipy.sparse.coo_matrix:
) -> Optional[scipy.sparse.coo_matrix]:
"""Checks which known patterns match the message.

Given a sentence, returns a vector of {1,0} values indicating which
regexes did match. Furthermore, if the
message is tokenized, the function will mark all tokens with a dict
relating the name of the regex to whether it was matched."""

# Attribute not set (e.g. response not present)
if not message.get(attribute):
return None

tokens = message.get(TOKENS_NAMES[attribute], [])
seq_length = len(tokens)

Expand Down
43 changes: 43 additions & 0 deletions tests/nlu/featurizers/test_count_vectors_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,49 @@ def test_count_vector_featurizer(sentence, expected, expected_cls):
assert np.all(actual[-1] == expected_cls)


@pytest.mark.parametrize(
    "sentence, intent, response, intent_features, response_features",
    [("hello", "greet", None, [[1]], None), ("hello", "greet", "hi", [[1]], [[1]])],
)
def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    """Training with and without a response should produce sparse intent
    features in both cases, and sparse response features only when a
    response is present (None otherwise)."""
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT_ATTRIBUTE, intent)
    train_message.set(RESPONSE_ATTRIBUTE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message("hello")
    second_message.set(RESPONSE_ATTRIBUTE, "hi")
    second_message.set(INTENT_ATTRIBUTE, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    if intent_features:
        # np.all collapses the element-wise array comparison to one boolean;
        # a bare `assert array == list` is ambiguous once the vector has more
        # than one entry (matches the np.all style used by the test above)
        assert np.all(
            train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]).toarray()[0]
            == intent_features
        )
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) is None

    if response_features:
        assert np.all(
            train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).toarray()[0]
            == response_features
        )
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]) is None


@pytest.mark.parametrize(
"sentence, intent, response, intent_features, response_features",
[
Expand Down