From db7f7bbfa3eb9e2854357ae4a13967e4a59a9257 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 3 Feb 2020 10:13:09 +0100
Subject: [PATCH 1/4] Fix CountVectorsFeaturizer handling of messages without a response

---
 .../count_vectors_featurizer.py               |  7 +--
 .../test_count_vectors_featurizer.py          | 44 +++++++++++++++++++
 2 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
index 4e6c547ef78f..9536737e7d63 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
@@ -271,7 +271,7 @@ def _get_processed_message_tokens_by_attribute(
         if message.get(attribute) is None:
             # return empty string since sklearn countvectorizer does not like None
             # object while training and predicting
-            return [""]
+            return []
 
         tokens = self._get_message_tokens_by_attribute(message, attribute)
         tokens = self._process_tokens(tokens, attribute)
@@ -420,7 +420,9 @@ def _create_sequence(
             seq_vec.sort_indices()
 
             if attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]:
-                tokens_text = [" ".join(tokens_without_cls)]
+                tokens_text = (
+                    [" ".join(tokens_without_cls)] if tokens_without_cls else []
+                )
                 cls_vec = self.vectorizers[attribute].transform(tokens_text)
                 cls_vec.sort_indices()
 
@@ -489,7 +491,6 @@ def train(
 
         # transform for all attributes
         for attribute in self._attributes:
-
             attribute_features = self._get_featurized_attribute(
                 attribute, processed_attribute_tokens[attribute]
             )
diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py
index 581eefc77648..e31bb89818eb 100644
--- a/tests/nlu/featurizers/test_count_vectors_featurizer.py
+++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py
@@ -52,6 +52,50 @@ def test_count_vector_featurizer(sentence, expected, expected_cls):
     assert np.all(actual[-1] == expected_cls)
 
 
+@pytest.mark.parametrize(
+    "sentence, intent, response, intent_features, response_features",
+    [("hello", "greet", None, [[1]], None), ("hello", "greet", "hi", [[1]], [[1]])],
+)
+def test_count_vector_featurizer_response_attribute_featurization(
+    sentence, intent, response, intent_features, response_features
+):
+    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
+    tk = WhitespaceTokenizer()
+
+    train_message = Message(sentence)
+    # this is needed for a valid training example
+    train_message.set(INTENT_ATTRIBUTE, intent)
+    train_message.set(RESPONSE_ATTRIBUTE, response)
+
+    second_message = Message("hello")
+    second_message.set(RESPONSE_ATTRIBUTE, "hi")
+    second_message.set(INTENT_ATTRIBUTE, "greet")
+
+    data = TrainingData([train_message, second_message])
+
+    tk.train(data)
+    ftr.train(data)
+
+    if intent_features:
+        assert (
+            train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]).toarray()[0]
+            == intent_features
+        )
+    else:
+        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) is None
+
+    if response_features:
+        assert (
+            train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).toarray()[0]
+            == response_features
+        )
+    else:
+        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).shape == (
+            0,
+            1,
+        )
+
+
 @pytest.mark.parametrize(
     "sentence, intent, response, intent_features, response_features",
     [

From f29bd1eddd4ea240bba0240e59584c72603df607 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 3 Feb 2020 10:17:03 +0100
Subject: [PATCH 2/4] add changelog

---
 changelog/5171.bugfix.rst | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 changelog/5171.bugfix.rst

diff --git a/changelog/5171.bugfix.rst b/changelog/5171.bugfix.rst
new file mode 100644
index 000000000000..84ab7abe61bf
--- /dev/null
+++ b/changelog/5171.bugfix.rst
@@ -0,0 +1,4 @@
+Fix bug ``ValueError: Cannot concatenate sparse features as sequence dimension does not match``.
+
+Training a Rasa model failed when the training data contained responses for only some of the intents.
+The sparse featurizers now consistently return ``None`` instead of a feature vector when a message has no response.
\ No newline at end of file

From 1e76bbf6de2faec85372faa9b5967c16a5992d08 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 3 Feb 2020 10:57:00 +0100
Subject: [PATCH 3/4] review comments

---
 .../sparse_featurizer/count_vectors_featurizer.py          | 7 ++++++-
 rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py | 7 ++++++-
 tests/nlu/featurizers/test_count_vectors_featurizer.py     | 7 +++----
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
index 9536737e7d63..a584517a6303 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
@@ -269,7 +269,7 @@ def _get_processed_message_tokens_by_attribute(
         """Get processed text of attribute of a message"""
 
         if message.get(attribute) is None:
-            # return empty string since sklearn countvectorizer does not like None
+            # return empty list since sklearn countvectorizer does not like None
             # object while training and predicting
             return []
 
@@ -416,6 +416,11 @@ def _create_sequence(
             if attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]:
                 tokens_without_cls = tokens[:-1]
 
+            if not tokens_without_cls:
+                # attribute is not set (e.g. response not present)
+                X.append(None)
+                continue
+
             seq_vec = self.vectorizers[attribute].transform(tokens_without_cls)
             seq_vec.sort_indices()
 
diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
index 0d44c34a6b4c..b831382f4b7c 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
@@ -81,13 +81,18 @@ def _add_lookup_table_regexes(
 
     def _features_for_patterns(
         self, message: Message, attribute: Text
-    ) -> scipy.sparse.coo_matrix:
+    ) -> Optional[scipy.sparse.coo_matrix]:
         """Checks which known patterns match the message.
 
         Given a sentence, returns a vector of {1,0} values indicating which
         regexes did match. Furthermore, if the
         message is tokenized, the function will mark all tokens with a dict
         relating the name of the regex to whether it was matched."""
+
+        # Attribute not set (e.g. response not present)
+        if not message.get(attribute):
+            return None
+
         tokens = message.get(TOKENS_NAMES[attribute], [])
         seq_length = len(tokens)
 
diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py
index e31bb89818eb..b65c29f7cd92 100644
--- a/tests/nlu/featurizers/test_count_vectors_featurizer.py
+++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py
@@ -67,6 +67,8 @@ def test_count_vector_featurizer_response_attribute_featurization(
     train_message.set(INTENT_ATTRIBUTE, intent)
     train_message.set(RESPONSE_ATTRIBUTE, response)
 
+    # add a second example that has some response, so that the vocabulary for
+    # response exists
     second_message = Message("hello")
     second_message.set(RESPONSE_ATTRIBUTE, "hi")
     second_message.set(INTENT_ATTRIBUTE, "greet")
@@ -90,10 +92,7 @@ def test_count_vector_featurizer_response_attribute_featurization(
             == response_features
         )
     else:
-        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).shape == (
-            0,
-            1,
-        )
+        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]) is None
 
 
 @pytest.mark.parametrize(

From 4fa2e5231ed72ab4183a104a1eca86f18c98e3fb Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 3 Feb 2020 10:58:40 +0100
Subject: [PATCH 4/4] clean up

---
 .../featurizers/sparse_featurizer/count_vectors_featurizer.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
index a584517a6303..26408c8525d8 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
@@ -425,9 +425,7 @@ def _create_sequence(
             seq_vec.sort_indices()
 
             if attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]:
-                tokens_text = (
-                    [" ".join(tokens_without_cls)] if tokens_without_cls else []
-                )
+                tokens_text = [" ".join(tokens_without_cls)]
                 cls_vec = self.vectorizers[attribute].transform(tokens_text)
                 cls_vec.sort_indices()