Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug in sparse featurizers #5172

Merged
merged 4 commits into from
Feb 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelog/5171.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fix bug ``ValueError: Cannot concatenate sparse features as sequence dimension does not match``.

When training a Rasa model that contains responses for just some of the intents, training was failing.
Fixed the featurizers to return a consistent feature vector in case no response was given for a specific message.
Original file line number Diff line number Diff line change
Expand Up @@ -269,9 +269,9 @@ def _get_processed_message_tokens_by_attribute(
"""Get processed text of attribute of a message"""

if message.get(attribute) is None:
# return empty string since sklearn countvectorizer does not like None
# return empty list since sklearn countvectorizer does not like None
# object while training and predicting
return [""]
return []

tokens = self._get_message_tokens_by_attribute(message, attribute)
tokens = self._process_tokens(tokens, attribute)
Expand Down Expand Up @@ -416,6 +416,11 @@ def _create_sequence(
if attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]:
tokens_without_cls = tokens[:-1]

if not tokens_without_cls:
# attribute is not set (e.g. response not present)
X.append(None)
continue

seq_vec = self.vectorizers[attribute].transform(tokens_without_cls)
seq_vec.sort_indices()

Expand Down Expand Up @@ -489,7 +494,6 @@ def train(

# transform for all attributes
for attribute in self._attributes:

attribute_features = self._get_featurized_attribute(
attribute, processed_attribute_tokens[attribute]
)
Expand Down
7 changes: 6 additions & 1 deletion rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,18 @@ def _add_lookup_table_regexes(

def _features_for_patterns(
self, message: Message, attribute: Text
) -> scipy.sparse.coo_matrix:
) -> Optional[scipy.sparse.coo_matrix]:
"""Checks which known patterns match the message.

Given a sentence, returns a vector of {1,0} values indicating which
regexes did match. Furthermore, if the
message is tokenized, the function will mark all tokens with a dict
relating the name of the regex to whether it was matched."""

# Attribute not set (e.g. response not present)
if not message.get(attribute):
return None

tokens = message.get(TOKENS_NAMES[attribute], [])
seq_length = len(tokens)

Expand Down
43 changes: 43 additions & 0 deletions tests/nlu/featurizers/test_count_vectors_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,49 @@ def test_count_vector_featurizer(sentence, expected, expected_cls):
assert np.all(actual[-1] == expected_cls)


@pytest.mark.parametrize(
    "sentence, intent, response, intent_features, response_features",
    [("hello", "greet", None, [[1]], None), ("hello", "greet", "hi", [[1]], [[1]])],
)
def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    """Training with and without a response should produce sparse intent
    features in both cases, and sparse response features only when a
    response is present (None otherwise)."""
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT_ATTRIBUTE, intent)
    train_message.set(RESPONSE_ATTRIBUTE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message("hello")
    second_message.set(RESPONSE_ATTRIBUTE, "hi")
    second_message.set(INTENT_ATTRIBUTE, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    if intent_features:
        # np.all collapses the element-wise array comparison to one boolean;
        # a bare `assert array == list` is ambiguous once the vector has more
        # than one entry (matches the np.all style used by the test above)
        assert np.all(
            train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]).toarray()[0]
            == intent_features
        )
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) is None

    if response_features:
        assert np.all(
            train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).toarray()[0]
            == response_features
        )
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]) is None


@pytest.mark.parametrize(
"sentence, intent, response, intent_features, response_features",
[
Expand Down