Merge pull request #7203 from RasaHQ/fallback-classifier-improvement
ignore `FallbackClassifier` predictions during evaluation
rasabot authored Nov 10, 2020
2 parents d37631e + 7c1f1ff commit e2f6d02
Showing 5 changed files with 223 additions and 33 deletions.
5 changes: 5 additions & 0 deletions changelog/6285.improvement.md
@@ -0,0 +1,5 @@
Predictions of the [`FallbackClassifier`](components.mdx#fallbackclassifier) are
ignored when
[evaluating the NLU model](testing-your-assistant.mdx#evaluating-an-nlu-model).
Note that the `FallbackClassifier` predictions still apply to
[test stories](testing-your-assistant.mdx#writing-test-stories).
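
A minimal sketch of what this changelog entry means in practice (plain dictionaries stand in for an NLU parse result; the fallback intent name `nlu_fallback` and the confidences are illustrative assumptions):

# An example annotated as "greet" whose top prediction fell below the
# FallbackClassifier threshold, so the fallback intent won:
parse_result = {
    "intent": {"name": "nlu_fallback", "confidence": 1.0},
    "intent_ranking": [
        {"name": "nlu_fallback", "confidence": 1.0},
        {"name": "greet", "confidence": 0.40},
    ],
}

# Before this change, the intent report scored target "greet" against
# prediction "nlu_fallback"; with it, the fallback is undone first and the
# report scores "greet" against "greet" with confidence 0.40.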
38 changes: 38 additions & 0 deletions rasa/nlu/classifiers/fallback_classifier.py
@@ -1,3 +1,4 @@
import copy
import logging
from typing import Any, List, Type, Text, Dict, Union, Tuple, Optional

@@ -121,3 +122,40 @@ def _fallback_intent() -> Dict[Text, Union[Text, float]]:
# TODO: Re-consider how we represent the confidence here
PREDICTED_CONFIDENCE_KEY: 1.0,
}


def is_fallback_classifier_prediction(prediction: Dict[Text, Any]) -> bool:
"""Checks if the intent was predicted by the `FallbackClassifier`.
Args:
prediction: The prediction of the NLU model.
Returns:
`True` if the top classified intent was the fallback intent.
"""
return (
prediction.get(INTENT, {}).get(INTENT_NAME_KEY)
== DEFAULT_NLU_FALLBACK_INTENT_NAME
)
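
A usage sketch for the helper above, assuming this commit's `rasa` is installed and the constants resolve to their usual values (`INTENT == "intent"`, `INTENT_NAME_KEY == "name"`, `DEFAULT_NLU_FALLBACK_INTENT_NAME == "nlu_fallback"`):

from rasa.nlu.classifiers.fallback_classifier import (
    is_fallback_classifier_prediction,
)

# True only when the top-ranked intent is the fallback intent.
assert is_fallback_classifier_prediction(
    {"intent": {"name": "nlu_fallback", "confidence": 1.0}}
)
assert not is_fallback_classifier_prediction(
    {"intent": {"name": "greet", "confidence": 0.9}}
)
# Missing keys are tolerated thanks to the chained .get() calls.
assert not is_fallback_classifier_prediction({})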


def undo_fallback_prediction(prediction: Dict[Text, Any]) -> Dict[Text, Any]:
"""Undo the prediction of the fallback intent.
Args:
prediction: The prediction of the NLU model.
Returns:
The prediction as if the `FallbackClassifier` wasn't present in the pipeline.
If the fallback intent is the only intent, return the prediction as it was
provided.
"""
intent_ranking = prediction.get(INTENT_RANKING_KEY, [])
if len(intent_ranking) < 2:
return prediction

prediction = copy.deepcopy(prediction)
prediction[INTENT] = intent_ranking[1]
prediction[INTENT_RANKING_KEY] = prediction[INTENT_RANKING_KEY][1:]

return prediction
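
And a before/after sketch for `undo_fallback_prediction` under the same assumptions. Note the guard clause returns predictions with fewer than two ranked intents unchanged, and the `copy.deepcopy` keeps the input dict unmutated:

from rasa.nlu.classifiers.fallback_classifier import undo_fallback_prediction

before = {
    "intent": {"name": "nlu_fallback", "confidence": 1.0},
    "intent_ranking": [
        {"name": "nlu_fallback", "confidence": 1.0},
        {"name": "greet", "confidence": 0.40},
    ],
}
after = undo_fallback_prediction(before)

# The shadowed intent is promoted and the fallback entry dropped.
assert after["intent"] == {"name": "greet", "confidence": 0.40}
assert after["intent_ranking"] == [{"name": "greet", "confidence": 0.40}]
# The original prediction is left as it was.
assert before["intent"]["name"] == "nlu_fallback"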
14 changes: 12 additions & 2 deletions rasa/nlu/test.py
@@ -24,6 +24,7 @@
import rasa.utils.common

from rasa.constants import TEST_DATA_FILE, TRAIN_DATA_FILE, NLG_DATA_FILE
import rasa.nlu.classifiers.fallback_classifier
from rasa.nlu.constants import (
RESPONSE_SELECTOR_DEFAULT_INTENT,
RESPONSE_SELECTOR_PROPERTY_NAME,
@@ -1270,7 +1271,7 @@ def get_eval_data(
List[IntentEvaluationResult],
List[ResponseSelectionEvaluationResult],
List[EntityEvaluationResult],
]: # pragma: no cover
]:
"""Runs the model for the test set and extracts targets and predictions.
Returns intent results (intent targets and predictions, the original
@@ -1310,7 +1311,16 @@
result = interpreter.parse(example.get(TEXT), only_output_properties=False)

if should_eval_intents:
if rasa.nlu.classifiers.fallback_classifier.is_fallback_classifier_prediction(
result
):
# Revert fallback prediction to not shadow the wrongly predicted intent
# during the test phase.
result = rasa.nlu.classifiers.fallback_classifier.undo_fallback_prediction(
result
)
intent_prediction = result.get(INTENT, {}) or {}

intent_results.append(
IntentEvaluationResult(
example.get(INTENT, ""),
@@ -1485,7 +1495,7 @@ def run_evaluation(
if output_directory:
rasa.shared.utils.io.create_directory(output_directory)

(intent_results, response_selection_results, entity_results,) = get_eval_data(
(intent_results, response_selection_results, entity_results) = get_eval_data(
interpreter, test_data
)

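Downstream, `run_evaluation` consumes these results as shown in the hunk above; a sketch of what that looks like with the new behaviour (the field name `intent_target` is an assumption here, while `intent_prediction` and `confidence` appear in the tests below):

intent_results, response_selection_results, entity_results = get_eval_data(
    interpreter, test_data
)

for r in intent_results:
    # With the revert in place, a prediction is only reported as
    # "nlu_fallback" when the fallback intent was the sole candidate.
    print(r.intent_target, r.intent_prediction, r.confidence)
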
70 changes: 69 additions & 1 deletion tests/nlu/classifiers/test_fallback_classifier.py
@@ -1,8 +1,9 @@
import copy
from typing import Dict
from typing import Dict, Text, Any

import pytest

from rasa.nlu.classifiers import fallback_classifier
from rasa.shared.constants import DEFAULT_NLU_FALLBACK_INTENT_NAME
from rasa.core.constants import DEFAULT_NLU_FALLBACK_THRESHOLD
from rasa.nlu.classifiers.fallback_classifier import (
@@ -152,3 +153,70 @@ def test_defaults():

assert classifier.component_config[THRESHOLD_KEY] == DEFAULT_NLU_FALLBACK_THRESHOLD
assert classifier.component_config[AMBIGUITY_THRESHOLD_KEY] == 0.1


@pytest.mark.parametrize(
"prediction, expected",
[
({INTENT: {INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME}}, True),
({INTENT: {INTENT_NAME_KEY: "some other intent"}}, False),
],
)
def test_is_fallback_classifier_prediction(prediction: Dict[Text, Any], expected: bool):
assert fallback_classifier.is_fallback_classifier_prediction(prediction) == expected


@pytest.mark.parametrize(
"prediction, expected",
[
(
{INTENT: {INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME}},
{INTENT: {INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME}},
),
(
{
INTENT: {INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME},
INTENT_RANKING_KEY: [],
},
{
INTENT: {INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME},
INTENT_RANKING_KEY: [],
},
),
(
{
INTENT: {INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME},
INTENT_RANKING_KEY: [
{INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME}
],
},
{
INTENT: {INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME},
INTENT_RANKING_KEY: [
{INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME}
],
},
),
(
{
INTENT: {INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME},
INTENT_RANKING_KEY: [
{INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME},
{INTENT_NAME_KEY: "other", PREDICTED_CONFIDENCE_KEY: 123},
{INTENT_NAME_KEY: "other2", PREDICTED_CONFIDENCE_KEY: 12},
],
},
{
INTENT: {INTENT_NAME_KEY: "other", PREDICTED_CONFIDENCE_KEY: 123},
INTENT_RANKING_KEY: [
{INTENT_NAME_KEY: "other", PREDICTED_CONFIDENCE_KEY: 123},
{INTENT_NAME_KEY: "other2", PREDICTED_CONFIDENCE_KEY: 12},
],
},
),
],
)
def test_undo_fallback_prediction(
prediction: Dict[Text, Any], expected: Dict[Text, Any]
):
assert fallback_classifier.undo_fallback_prediction(prediction) == expected
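
The parametrized cases above pin down the guard clause in `undo_fallback_prediction`: with no ranking, an empty ranking, or a ranking containing only the fallback intent itself, there is nothing to promote and the prediction is returned as provided; only the final case, with at least one shadowed intent, actually rewrites the result.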
129 changes: 99 additions & 30 deletions tests/nlu/test_evaluation.py
@@ -1,23 +1,28 @@
from pathlib import Path

from sanic.request import Request
from typing import Text, Iterator, List, Dict, Any

import asyncio
import datetime
import json
import os
from pathlib import Path
from typing import Text, Iterator, List, Dict, Any, Set, Optional

import pytest
from _pytest.tmpdir import TempdirFactory
from sanic.request import Request

import rasa.nlu.test
import rasa.shared.nlu.training_data.loading
import rasa.shared.utils.io
import rasa.utils.io
from rasa.shared.nlu.constants import NO_ENTITY_TAG
from rasa.nlu import train
from rasa.nlu.classifiers.diet_classifier import DIETClassifier
from rasa.nlu.classifiers.fallback_classifier import FallbackClassifier
from rasa.nlu.components import ComponentBuilder, Component
from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
from rasa.test import compare_nlu_models
from rasa.nlu.extractors.extractor import EntityExtractor
from rasa.nlu.extractors.mitie_entity_extractor import MitieEntityExtractor
from rasa.nlu.extractors.spacy_entity_extractor import SpacyEntityExtractor
from rasa.nlu.model import Interpreter, Trainer
from rasa.nlu.selectors.response_selector import ResponseSelector
from rasa.nlu.test import (
is_token_within_entity,
do_entities_overlap,
@@ -41,22 +46,28 @@
collect_incorrect_entity_predictions,
merge_confidences,
_get_entity_confidences,
is_response_selector_present,
get_eval_data,
does_token_cross_borders,
align_entity_predictions,
determine_intersection,
determine_token_labels,
)
from rasa.nlu.test import does_token_cross_borders
from rasa.nlu.test import align_entity_predictions
from rasa.nlu.test import determine_intersection
from rasa.nlu.test import determine_token_labels
from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu.tokenizers.tokenizer import Token
import json
import os
import rasa.shared.nlu.training_data.loading
from tests.nlu.conftest import DEFAULT_DATA_PATH
from rasa.nlu.selectors.response_selector import ResponseSelector
from rasa.nlu.test import is_response_selector_present, get_eval_data
from rasa.utils.tensorflow.constants import EPOCHS, ENTITY_RECOGNITION
from rasa.nlu import train
from rasa.shared.constants import DEFAULT_NLU_FALLBACK_INTENT_NAME
from rasa.shared.importers.importer import TrainingDataImporter
from rasa.shared.nlu.constants import (
NO_ENTITY_TAG,
INTENT,
INTENT_RANKING_KEY,
INTENT_NAME_KEY,
PREDICTED_CONFIDENCE_KEY,
)
from rasa.shared.nlu.training_data.message import Message
from rasa.shared.nlu.training_data.training_data import TrainingData
from rasa.test import compare_nlu_models
from rasa.utils.tensorflow.constants import EPOCHS, ENTITY_RECOGNITION
from tests.nlu.conftest import DEFAULT_DATA_PATH

# https://github.com/pytest-dev/pytest-asyncio/issues/68
# this event_loop is used by pytest-asyncio, and redefining it
@@ -346,7 +357,7 @@ def test_drop_intents_below_freq():
assert clean_td.intents == {"affirm", "restaurant_search"}


def test_run_evaluation(unpacked_trained_moodbot_path):
def test_run_evaluation(unpacked_trained_moodbot_path: Text):
result = run_evaluation(
DEFAULT_DATA_PATH,
os.path.join(unpacked_trained_moodbot_path, "nlu"),
@@ -358,7 +369,9 @@ def test_run_evaluation(unpacked_trained_moodbot_path: Text):
assert result.get("intent_evaluation")


async def test_eval_data(component_builder, tmpdir, project):
async def test_eval_data(
component_builder: ComponentBuilder, tmp_path: Path, project: Text
):
_config = RasaNLUModelConfig(
{
"pipeline": [
@@ -382,7 +395,7 @@ async def test_eval_data(component_builder, tmpdir, project):

(_, _, persisted_path) = await train(
_config,
path=tmpdir.strpath,
path=str(tmp_path),
data=data_importer,
component_builder=component_builder,
persist_nlu_training_data=True,
@@ -391,7 +404,7 @@ async def test_eval_data(component_builder, tmpdir, project):
interpreter = Interpreter.load(persisted_path, component_builder)

data = await data_importer.get_nlu_data()
(intent_results, response_selection_results, entity_results,) = get_eval_data(
(intent_results, response_selection_results, entity_results) = get_eval_data(
interpreter, data
)

@@ -401,7 +414,7 @@


@pytest.mark.timeout(240) # these can take a longer time than the default timeout
def test_run_cv_evaluation(pretrained_embeddings_spacy_config):
def test_run_cv_evaluation(pretrained_embeddings_spacy_config: RasaNLUModelConfig):
td = rasa.shared.nlu.training_data.loading.load_data(
"data/examples/rasa/demo-rasa.json"
)
@@ -673,14 +686,16 @@ def test_response_evaluation_report(tmp_path: Path):
([ResponseSelector()], set()),
],
)
def test_get_entity_extractors(components, expected_extractors):
def test_get_entity_extractors(
components: List[Component], expected_extractors: Set[Text]
):
mock_interpreter = Interpreter(components, None)
extractors = get_entity_extractors(mock_interpreter)

assert extractors == expected_extractors


def test_entity_evaluation_report(tmp_path):
def test_entity_evaluation_report(tmp_path: Path):
class EntityExtractorA(EntityExtractor):

provides = ["entities"]
@@ -866,7 +881,7 @@ def test_evaluate_entities_cv():
}, "Wrong entity prediction alignment"


def test_remove_pretrained_extractors(component_builder):
def test_remove_pretrained_extractors(component_builder: ComponentBuilder):
_config = RasaNLUModelConfig(
{
"pipeline": [
@@ -1073,7 +1088,11 @@ def test_nlu_comparison(tmp_path: Path):
],
)
def test_collect_entity_predictions(
entity_results, targets, predictions, successes, errors
entity_results: List[EntityEvaluationResult],
targets: List[Text],
predictions: List[Text],
successes: List[Dict[Text, Any]],
errors: List[Dict[Text, Any]],
):
actual = collect_successful_entity_predictions(entity_results, targets, predictions)

@@ -1084,3 +1103,53 @@

assert len(errors) == len(actual)
assert errors == actual


class ConstantInterpreter(Interpreter):
def __init__(self, prediction_to_return: Dict[Text, Any]) -> None:
# add intent classifier to make sure intents are evaluated
super().__init__([FallbackClassifier()], None)
self.prediction = prediction_to_return

def parse(
self,
text: Text,
time: Optional[datetime.datetime] = None,
only_output_properties: bool = True,
) -> Dict[Text, Any]:
return self.prediction


def test_replacing_fallback_intent():
expected_intent = "greet"
expected_confidence = 0.345
fallback_prediction = {
INTENT: {
INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME,
PREDICTED_CONFIDENCE_KEY: 1,
},
INTENT_RANKING_KEY: [
{
INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME,
PREDICTED_CONFIDENCE_KEY: 1,
},
{
INTENT_NAME_KEY: expected_intent,
PREDICTED_CONFIDENCE_KEY: expected_confidence,
},
{INTENT_NAME_KEY: "some", PREDICTED_CONFIDENCE_KEY: 0.1},
],
}

interpreter = ConstantInterpreter(fallback_prediction)
training_data = TrainingData(
[Message.build("hi", "greet"), Message.build("bye", "bye")]
)

intent_evaluations, _, _ = get_eval_data(interpreter, training_data)

assert all(
prediction.intent_prediction == expected_intent
and prediction.confidence == expected_confidence
for prediction in intent_evaluations
)
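
Note that the assertion checks only the predicted intent and confidence: the second message's target intent is `bye`, so after the revert it would still show up as a misclassification (`bye` predicted as `greet`) in a real report. The point of the test is that the fallback intent no longer shadows the underlying prediction, not that the prediction is correct.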
