Merge pull request #7203 from RasaHQ/fallback-classifier-improvement
ignore `FallbackClassifier` predictions during evaluation
rasabot authored Nov 10, 2020
2 parents d37631e + 7c1f1ff commit e2f6d02
Showing 5 changed files with 223 additions and 33 deletions.
5 changes: 5 additions & 0 deletions changelog/6285.improvement.md
@@ -0,0 +1,5 @@
Predictions of the [`FallbackClassifier`](components.mdx#fallbackclassifier) are
ignored when
[evaluating the NLU model](testing-your-assistant.mdx#evaluating-an-nlu-model).
Note that the `FallbackClassifier` predictions still apply to
[test stories](testing-your-assistant.mdx#writing-test-stories).
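
A minimal sketch of what this changelog entry means in practice (plain dictionaries stand in for an NLU parse result; the fallback intent name `nlu_fallback` and the confidences are illustrative assumptions):

# An example annotated as "greet" whose top prediction fell below the
# FallbackClassifier threshold, so the fallback intent won:
parse_result = {
    "intent": {"name": "nlu_fallback", "confidence": 1.0},
    "intent_ranking": [
        {"name": "nlu_fallback", "confidence": 1.0},
        {"name": "greet", "confidence": 0.40},
    ],
}

# Before this change, the intent report scored target "greet" against
# prediction "nlu_fallback"; with it, the fallback is undone first and the
# report scores "greet" against "greet" with confidence 0.40.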
38 changes: 38 additions & 0 deletions rasa/nlu/classifiers/fallback_classifier.py
@@ -1,3 +1,4 @@
import copy
import logging
from typing import Any, List, Type, Text, Dict, Union, Tuple, Optional

@@ -121,3 +122,40 @@ def _fallback_intent() -> Dict[Text, Union[Text, float]]:
# TODO: Re-consider how we represent the confidence here
PREDICTED_CONFIDENCE_KEY: 1.0,
}


def is_fallback_classifier_prediction(prediction: Dict[Text, Any]) -> bool:
"""Checks if the intent was predicted by the `FallbackClassifier`.
Args:
prediction: The prediction of the NLU model.
Returns:
`True` if the top classified intent was the fallback intent.
"""
return (
prediction.get(INTENT, {}).get(INTENT_NAME_KEY)
== DEFAULT_NLU_FALLBACK_INTENT_NAME
)
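
A usage sketch for the helper above, assuming this commit's `rasa` is installed and the constants resolve to their usual values (`INTENT == "intent"`, `INTENT_NAME_KEY == "name"`, `DEFAULT_NLU_FALLBACK_INTENT_NAME == "nlu_fallback"`):

from rasa.nlu.classifiers.fallback_classifier import (
    is_fallback_classifier_prediction,
)

# True only when the top-ranked intent is the fallback intent.
assert is_fallback_classifier_prediction(
    {"intent": {"name": "nlu_fallback", "confidence": 1.0}}
)
assert not is_fallback_classifier_prediction(
    {"intent": {"name": "greet", "confidence": 0.9}}
)
# Missing keys are tolerated thanks to the chained .get() calls.
assert not is_fallback_classifier_prediction({})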


def undo_fallback_prediction(prediction: Dict[Text, Any]) -> Dict[Text, Any]:
"""Undo the prediction of the fallback intent.
Args:
prediction: The prediction of the NLU model.
Returns:
The prediction as if the `FallbackClassifier` wasn't present in the pipeline.
If the fallback intent is the only intent, return the prediction as it was
provided.
"""
intent_ranking = prediction.get(INTENT_RANKING_KEY, [])
if len(intent_ranking) < 2:
return prediction

prediction = copy.deepcopy(prediction)
prediction[INTENT] = intent_ranking[1]
prediction[INTENT_RANKING_KEY] = prediction[INTENT_RANKING_KEY][1:]

return prediction
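
And a before/after sketch for `undo_fallback_prediction` under the same assumptions. Note the guard clause returns predictions with fewer than two ranked intents unchanged, and the `copy.deepcopy` keeps the input dict unmutated:

from rasa.nlu.classifiers.fallback_classifier import undo_fallback_prediction

before = {
    "intent": {"name": "nlu_fallback", "confidence": 1.0},
    "intent_ranking": [
        {"name": "nlu_fallback", "confidence": 1.0},
        {"name": "greet", "confidence": 0.40},
    ],
}
after = undo_fallback_prediction(before)

# The shadowed intent is promoted and the fallback entry dropped.
assert after["intent"] == {"name": "greet", "confidence": 0.40}
assert after["intent_ranking"] == [{"name": "greet", "confidence": 0.40}]
# The original prediction is left as it was.
assert before["intent"]["name"] == "nlu_fallback"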
14 changes: 12 additions & 2 deletions rasa/nlu/test.py
@@ -24,6 +24,7 @@
import rasa.utils.common

from rasa.constants import TEST_DATA_FILE, TRAIN_DATA_FILE, NLG_DATA_FILE
import rasa.nlu.classifiers.fallback_classifier
from rasa.nlu.constants import (
RESPONSE_SELECTOR_DEFAULT_INTENT,
RESPONSE_SELECTOR_PROPERTY_NAME,
@@ -1270,7 +1271,7 @@ def get_eval_data(
List[IntentEvaluationResult],
List[ResponseSelectionEvaluationResult],
List[EntityEvaluationResult],
]: # pragma: no cover
]:
"""Runs the model for the test set and extracts targets and predictions.
Returns intent results (intent targets and predictions, the original
@@ -1310,7 +1311,16 @@
result = interpreter.parse(example.get(TEXT), only_output_properties=False)

if should_eval_intents:
if rasa.nlu.classifiers.fallback_classifier.is_fallback_classifier_prediction(
result
):
# Revert fallback prediction to not shadow the wrongly predicted intent
# during the test phase.
result = rasa.nlu.classifiers.fallback_classifier.undo_fallback_prediction(
result
)
intent_prediction = result.get(INTENT, {}) or {}

intent_results.append(
IntentEvaluationResult(
example.get(INTENT, ""),
@@ -1485,7 +1495,7 @@ def run_evaluation(
if output_directory:
rasa.shared.utils.io.create_directory(output_directory)

(intent_results, response_selection_results, entity_results,) = get_eval_data(
(intent_results, response_selection_results, entity_results) = get_eval_data(
interpreter, test_data
)

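Downstream, `run_evaluation` consumes these results as shown in the hunk above; a sketch of what that looks like with the new behaviour (the field name `intent_target` is an assumption here, while `intent_prediction` and `confidence` appear in the tests below):

intent_results, response_selection_results, entity_results = get_eval_data(
    interpreter, test_data
)

for r in intent_results:
    # With the revert in place, a prediction is only reported as
    # "nlu_fallback" when the fallback intent was the sole candidate.
    print(r.intent_target, r.intent_prediction, r.confidence)
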
70 changes: 69 additions & 1 deletion tests/nlu/classifiers/test_fallback_classifier.py
@@ -1,8 +1,9 @@
import copy
from typing import Dict
from typing import Dict, Text, Any

import pytest

from rasa.nlu.classifiers import fallback_classifier
from rasa.shared.constants import DEFAULT_NLU_FALLBACK_INTENT_NAME
from rasa.core.constants import DEFAULT_NLU_FALLBACK_THRESHOLD
from rasa.nlu.classifiers.fallback_classifier import (
@@ -152,3 +153,70 @@ def test_defaults():

assert classifier.component_config[THRESHOLD_KEY] == DEFAULT_NLU_FALLBACK_THRESHOLD
assert classifier.component_config[AMBIGUITY_THRESHOLD_KEY] == 0.1


@pytest.mark.parametrize(
"prediction, expected",
[
({INTENT: {INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME}}, True),
({INTENT: {INTENT_NAME_KEY: "some other intent"}}, False),
],
)
def test_is_fallback_classifier_prediction(prediction: Dict[Text, Any], expected: bool):
assert fallback_classifier.is_fallback_classifier_prediction(prediction) == expected


@pytest.mark.parametrize(
"prediction, expected",
[
(
{INTENT: {INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME}},
{INTENT: {INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME}},
),
(
{
INTENT: {INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME},
INTENT_RANKING_KEY: [],
},
{
INTENT: {INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME},
INTENT_RANKING_KEY: [],
},
),
(
{
INTENT: {INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME},
INTENT_RANKING_KEY: [
{INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME}
],
},
{
INTENT: {INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME},
INTENT_RANKING_KEY: [
{INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME}
],
},
),
(
{
INTENT: {INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME},
INTENT_RANKING_KEY: [
{INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME},
{INTENT_NAME_KEY: "other", PREDICTED_CONFIDENCE_KEY: 123},
{INTENT_NAME_KEY: "other2", PREDICTED_CONFIDENCE_KEY: 12},
],
},
{
INTENT: {INTENT_NAME_KEY: "other", PREDICTED_CONFIDENCE_KEY: 123},
INTENT_RANKING_KEY: [
{INTENT_NAME_KEY: "other", PREDICTED_CONFIDENCE_KEY: 123},
{INTENT_NAME_KEY: "other2", PREDICTED_CONFIDENCE_KEY: 12},
],
},
),
],
)
def test_undo_fallback_prediction(
prediction: Dict[Text, Any], expected: Dict[Text, Any]
):
assert fallback_classifier.undo_fallback_prediction(prediction) == expected
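
The parametrized cases above pin down the guard clause in `undo_fallback_prediction`: with no ranking, an empty ranking, or a ranking containing only the fallback intent itself, there is nothing to promote and the prediction is returned as provided; only the final case, with at least one shadowed intent, actually rewrites the result.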
129 changes: 99 additions & 30 deletions tests/nlu/test_evaluation.py
@@ -1,23 +1,28 @@
from pathlib import Path

from sanic.request import Request
from typing import Text, Iterator, List, Dict, Any

import asyncio
import datetime
import json
import os
from pathlib import Path
from typing import Text, Iterator, List, Dict, Any, Set, Optional

import pytest
from _pytest.tmpdir import TempdirFactory
from sanic.request import Request

import rasa.nlu.test
import rasa.shared.nlu.training_data.loading
import rasa.shared.utils.io
import rasa.utils.io
from rasa.shared.nlu.constants import NO_ENTITY_TAG
from rasa.nlu import train
from rasa.nlu.classifiers.diet_classifier import DIETClassifier
from rasa.nlu.classifiers.fallback_classifier import FallbackClassifier
from rasa.nlu.components import ComponentBuilder, Component
from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
from rasa.test import compare_nlu_models
from rasa.nlu.extractors.extractor import EntityExtractor
from rasa.nlu.extractors.mitie_entity_extractor import MitieEntityExtractor
from rasa.nlu.extractors.spacy_entity_extractor import SpacyEntityExtractor
from rasa.nlu.model import Interpreter, Trainer
from rasa.nlu.selectors.response_selector import ResponseSelector
from rasa.nlu.test import (
is_token_within_entity,
do_entities_overlap,
@@ -41,22 +46,28 @@
collect_incorrect_entity_predictions,
merge_confidences,
_get_entity_confidences,
is_response_selector_present,
get_eval_data,
does_token_cross_borders,
align_entity_predictions,
determine_intersection,
determine_token_labels,
)
from rasa.nlu.test import does_token_cross_borders
from rasa.nlu.test import align_entity_predictions
from rasa.nlu.test import determine_intersection
from rasa.nlu.test import determine_token_labels
from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu.tokenizers.tokenizer import Token
import json
import os
import rasa.shared.nlu.training_data.loading
from tests.nlu.conftest import DEFAULT_DATA_PATH
from rasa.nlu.selectors.response_selector import ResponseSelector
from rasa.nlu.test import is_response_selector_present, get_eval_data
from rasa.utils.tensorflow.constants import EPOCHS, ENTITY_RECOGNITION
from rasa.nlu import train
from rasa.shared.constants import DEFAULT_NLU_FALLBACK_INTENT_NAME
from rasa.shared.importers.importer import TrainingDataImporter
from rasa.shared.nlu.constants import (
NO_ENTITY_TAG,
INTENT,
INTENT_RANKING_KEY,
INTENT_NAME_KEY,
PREDICTED_CONFIDENCE_KEY,
)
from rasa.shared.nlu.training_data.message import Message
from rasa.shared.nlu.training_data.training_data import TrainingData
from rasa.test import compare_nlu_models
from rasa.utils.tensorflow.constants import EPOCHS, ENTITY_RECOGNITION
from tests.nlu.conftest import DEFAULT_DATA_PATH

# https://github.com/pytest-dev/pytest-asyncio/issues/68
# this event_loop is used by pytest-asyncio, and redefining it
@@ -346,7 +357,7 @@ def test_drop_intents_below_freq():
assert clean_td.intents == {"affirm", "restaurant_search"}


def test_run_evaluation(unpacked_trained_moodbot_path):
def test_run_evaluation(unpacked_trained_moodbot_path: Text):
result = run_evaluation(
DEFAULT_DATA_PATH,
os.path.join(unpacked_trained_moodbot_path, "nlu"),
@@ -358,7 +369,9 @@ def test_run_evaluation(unpacked_trained_moodbot_path: Text):
assert result.get("intent_evaluation")


async def test_eval_data(component_builder, tmpdir, project):
async def test_eval_data(
component_builder: ComponentBuilder, tmp_path: Path, project: Text
):
_config = RasaNLUModelConfig(
{
"pipeline": [
@@ -382,7 +395,7 @@ async def test_eval_data(component_builder, tmpdir, project):

(_, _, persisted_path) = await train(
_config,
path=tmpdir.strpath,
path=str(tmp_path),
data=data_importer,
component_builder=component_builder,
persist_nlu_training_data=True,
@@ -391,7 +404,7 @@ async def test_eval_data(component_builder, tmpdir, project):
interpreter = Interpreter.load(persisted_path, component_builder)

data = await data_importer.get_nlu_data()
(intent_results, response_selection_results, entity_results,) = get_eval_data(
(intent_results, response_selection_results, entity_results) = get_eval_data(
interpreter, data
)

@@ -401,7 +414,7 @@


@pytest.mark.timeout(240) # these can take a longer time than the default timeout
def test_run_cv_evaluation(pretrained_embeddings_spacy_config):
def test_run_cv_evaluation(pretrained_embeddings_spacy_config: RasaNLUModelConfig):
td = rasa.shared.nlu.training_data.loading.load_data(
"data/examples/rasa/demo-rasa.json"
)
@@ -673,14 +686,16 @@ def test_response_evaluation_report(tmp_path: Path):
([ResponseSelector()], set()),
],
)
def test_get_entity_extractors(components, expected_extractors):
def test_get_entity_extractors(
components: List[Component], expected_extractors: Set[Text]
):
mock_interpreter = Interpreter(components, None)
extractors = get_entity_extractors(mock_interpreter)

assert extractors == expected_extractors


def test_entity_evaluation_report(tmp_path):
def test_entity_evaluation_report(tmp_path: Path):
class EntityExtractorA(EntityExtractor):

provides = ["entities"]
@@ -866,7 +881,7 @@ def test_evaluate_entities_cv():
}, "Wrong entity prediction alignment"


def test_remove_pretrained_extractors(component_builder):
def test_remove_pretrained_extractors(component_builder: ComponentBuilder):
_config = RasaNLUModelConfig(
{
"pipeline": [
@@ -1073,7 +1088,11 @@ def test_nlu_comparison(tmp_path: Path):
],
)
def test_collect_entity_predictions(
entity_results, targets, predictions, successes, errors
entity_results: List[EntityEvaluationResult],
targets: List[Text],
predictions: List[Text],
successes: List[Dict[Text, Any]],
errors: List[Dict[Text, Any]],
):
actual = collect_successful_entity_predictions(entity_results, targets, predictions)

@@ -1084,3 +1103,53 @@

assert len(errors) == len(actual)
assert errors == actual


class ConstantInterpreter(Interpreter):
def __init__(self, prediction_to_return: Dict[Text, Any]) -> None:
# add intent classifier to make sure intents are evaluated
super().__init__([FallbackClassifier()], None)
self.prediction = prediction_to_return

def parse(
self,
text: Text,
time: Optional[datetime.datetime] = None,
only_output_properties: bool = True,
) -> Dict[Text, Any]:
return self.prediction


def test_replacing_fallback_intent():
expected_intent = "greet"
expected_confidence = 0.345
fallback_prediction = {
INTENT: {
INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME,
PREDICTED_CONFIDENCE_KEY: 1,
},
INTENT_RANKING_KEY: [
{
INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME,
PREDICTED_CONFIDENCE_KEY: 1,
},
{
INTENT_NAME_KEY: expected_intent,
PREDICTED_CONFIDENCE_KEY: expected_confidence,
},
{INTENT_NAME_KEY: "some", PREDICTED_CONFIDENCE_KEY: 0.1},
],
}

interpreter = ConstantInterpreter(fallback_prediction)
training_data = TrainingData(
[Message.build("hi", "greet"), Message.build("bye", "bye")]
)

intent_evaluations, _, _ = get_eval_data(interpreter, training_data)

assert all(
prediction.intent_prediction == expected_intent
and prediction.confidence == expected_confidence
for prediction in intent_evaluations
)
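
Note that the assertion checks only the predicted intent and confidence: the second message's target intent is `bye`, so after the revert it would still show up as a misclassification (`bye` predicted as `greet`) in a real report. The point of the test is that the fallback intent no longer shadows the underlying prediction, not that the prediction is correct.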
