From db7f7bbfa3eb9e2854357ae4a13967e4a59a9257 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 3 Feb 2020 10:13:09 +0100 Subject: [PATCH 1/4] fix response bug --- .../count_vectors_featurizer.py | 7 +-- .../test_count_vectors_featurizer.py | 44 +++++++++++++++++++ 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 4e6c547ef78f..9536737e7d63 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -271,7 +271,7 @@ def _get_processed_message_tokens_by_attribute( if message.get(attribute) is None: # return empty string since sklearn countvectorizer does not like None # object while training and predicting - return [""] + return [] tokens = self._get_message_tokens_by_attribute(message, attribute) tokens = self._process_tokens(tokens, attribute) @@ -420,7 +420,9 @@ def _create_sequence( seq_vec.sort_indices() if attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]: - tokens_text = [" ".join(tokens_without_cls)] + tokens_text = ( + [" ".join(tokens_without_cls)] if tokens_without_cls else [] + ) cls_vec = self.vectorizers[attribute].transform(tokens_text) cls_vec.sort_indices() @@ -489,7 +491,6 @@ def train( # transform for all attributes for attribute in self._attributes: - attribute_features = self._get_featurized_attribute( attribute, processed_attribute_tokens[attribute] ) diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index 581eefc77648..e31bb89818eb 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -52,6 +52,50 @@ def test_count_vector_featurizer(sentence, expected, expected_cls): assert np.all(actual[-1] == expected_cls) +@pytest.mark.parametrize( + "sentence, intent, response, intent_features, response_features", + [("hello", "greet", None, [[1]], None), ("hello", "greet", "hi", [[1]], [[1]])], +) +def test_count_vector_featurizer_response_attribute_featurization( + sentence, intent, response, intent_features, response_features +): + ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"}) + tk = WhitespaceTokenizer() + + train_message = Message(sentence) + # this is needed for a valid training example + train_message.set(INTENT_ATTRIBUTE, intent) + train_message.set(RESPONSE_ATTRIBUTE, response) + + second_message = Message("hello") + second_message.set(RESPONSE_ATTRIBUTE, "hi") + second_message.set(INTENT_ATTRIBUTE, "greet") + + data = TrainingData([train_message, second_message]) + + tk.train(data) + ftr.train(data) + + if intent_features: + assert ( + train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]).toarray()[0] + == intent_features + ) + else: + assert train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) is None + + if response_features: + assert ( + train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).toarray()[0] + == response_features + ) + else: + assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).shape == ( + 0, + 1, + ) + + @pytest.mark.parametrize( "sentence, intent, response, intent_features, response_features", [ From f29bd1eddd4ea240bba0240e59584c72603df607 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 3 Feb 2020 10:17:03 +0100 Subject: [PATCH 2/4] add changelog --- changelog/5171.bugfix.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 changelog/5171.bugfix.rst diff --git a/changelog/5171.bugfix.rst b/changelog/5171.bugfix.rst new file mode 100644 index 000000000000..84ab7abe61bf --- /dev/null +++ b/changelog/5171.bugfix.rst @@ -0,0 +1,4 @@ +Fix bug ``ValueError: Cannot concatenate sparse features as sequence dimension does not match``. + +When training a Rasa model that contains responses for just some of the intents, training was failing. +Fixed the featurizers to return a consistent feature vector in case no response was given for a specific message. \ No newline at end of file From 1e76bbf6de2faec85372faa9b5967c16a5992d08 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 3 Feb 2020 10:57:00 +0100 Subject: [PATCH 3/4] review comments --- .../sparse_featurizer/count_vectors_featurizer.py | 7 ++++++- rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py | 7 ++++++- tests/nlu/featurizers/test_count_vectors_featurizer.py | 7 +++---- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 9536737e7d63..a584517a6303 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -269,7 +269,7 @@ def _get_processed_message_tokens_by_attribute( """Get processed text of attribute of a message""" if message.get(attribute) is None: - # return empty string since sklearn countvectorizer does not like None + # return empty list since sklearn countvectorizer does not like None # object while training and predicting return [] @@ -416,6 +416,11 @@ def _create_sequence( if attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]: tokens_without_cls = tokens[:-1] + if not tokens_without_cls: + # attribute is not set (e.g. response not present) + X.append(None) + continue + seq_vec = self.vectorizers[attribute].transform(tokens_without_cls) seq_vec.sort_indices() diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 0d44c34a6b4c..b831382f4b7c 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -81,13 +81,18 @@ def _add_lookup_table_regexes( def _features_for_patterns( self, message: Message, attribute: Text - ) -> scipy.sparse.coo_matrix: + ) -> Optional[scipy.sparse.coo_matrix]: """Checks which known patterns match the message. Given a sentence, returns a vector of {1,0} values indicating which regexes did match. Furthermore, if the message is tokenized, the function will mark all tokens with a dict relating the name of the regex to whether it was matched.""" + + # Attribute not set (e.g. response not present) + if not message.get(attribute): + return None + tokens = message.get(TOKENS_NAMES[attribute], []) seq_length = len(tokens) diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index e31bb89818eb..b65c29f7cd92 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -67,6 +67,8 @@ def test_count_vector_featurizer_response_attribute_featurization( train_message.set(INTENT_ATTRIBUTE, intent) train_message.set(RESPONSE_ATTRIBUTE, response) + # add a second example that has some response, so that the vocabulary for + # response exists second_message = Message("hello") second_message.set(RESPONSE_ATTRIBUTE, "hi") second_message.set(INTENT_ATTRIBUTE, "greet") @@ -90,10 +92,7 @@ def test_count_vector_featurizer_response_attribute_featurization( == response_features ) else: - assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).shape == ( - 0, - 1, - ) + assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]) is None @pytest.mark.parametrize( From 4fa2e5231ed72ab4183a104a1eca86f18c98e3fb Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 3 Feb 2020 10:58:40 +0100 Subject: [PATCH 4/4] clean up --- .../featurizers/sparse_featurizer/count_vectors_featurizer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index a584517a6303..26408c8525d8 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -425,9 +425,7 @@ def _create_sequence( seq_vec.sort_indices() if attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]: - tokens_text = ( - [" ".join(tokens_without_cls)] if tokens_without_cls else [] - ) + tokens_text = [" ".join(tokens_without_cls)] cls_vec = self.vectorizers[attribute].transform(tokens_text) cls_vec.sort_indices()