Response Selector in Cross Validation #4976

Merged · 18 commits · Dec 18, 2019
Changes from 6 commits
3 changes: 3 additions & 0 deletions changelog/4976.improvement.rst
@@ -0,0 +1,3 @@
``rasa test nlu --cross-validation`` now also includes an evaluation of the response selector.
As a result, the train and test F1-score, accuracy, and precision are logged for the response selector.
A report named ``response_selection_report.json`` is also generated in the ``results`` folder.
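
A rough sketch (not part of this change) of how the generated report could be read afterwards; the nested per-label layout is an assumption here, mirroring the existing intent report rather than anything stated in the changelog entry:

import json

# Assumed default output location after running `rasa test nlu --cross-validation`.
with open("results/response_selection_report.json") as f:
    report = json.load(f)

# Assumed layout: a classification-report-style dict keyed by label.
for label, scores in report.items():
    if isinstance(scores, dict) and "f1-score" in scores:
        print(f"{label}: f1={scores['f1-score']:.2f}, precision={scores['precision']:.2f}")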
81 changes: 73 additions & 8 deletions rasa/nlu/test.py
@@ -61,6 +61,7 @@

IntentMetrics = Dict[Text, List[float]]
EntityMetrics = Dict[Text, Dict[Text, List[float]]]
ResponseSelectionMetrics = Dict[Text, List[float]]


def plot_confusion_matrix(
@@ -1128,7 +1129,10 @@ def generate_folds(

skf = StratifiedKFold(n_splits=n, shuffle=True)
x = td.intent_examples
y = [example.get("intent") for example in x]

# Get labels with the response key appended to the intent name because we want a stratified
# split on all intents (including retrieval intents, if they exist)
y = [example.get_combined_intent_response_key() for example in x]
for i_fold, (train_index, test_index) in enumerate(skf.split(x, y)):
logger.debug(f"Fold: {i_fold}")
train = [x[i] for i in train_index]
@@ -1150,11 +1154,15 @@
def combine_result(
intent_metrics: IntentMetrics,
entity_metrics: EntityMetrics,
response_selection_metrics: ResponseSelectionMetrics,
interpreter: Interpreter,
data: TrainingData,
intent_results: Optional[List[IntentEvaluationResult]] = None,
entity_results: Optional[List[EntityEvaluationResult]] = None,
) -> Tuple[IntentMetrics, EntityMetrics]:
response_selection_results: Optional[
List[ResponseSelectionEvaluationResult]
] = None,
) -> Tuple[IntentMetrics, EntityMetrics, ResponseSelectionMetrics]:
"""Collects intent and entity metrics for crossvalidation folds.
If `intent_results` or `entity_results` is provided as a list, prediction results
are also collected.
@@ -1163,8 +1171,10 @@ def combine_result(
(
intent_current_metrics,
entity_current_metrics,
response_selection_current_metrics,
current_intent_results,
current_entity_results,
current_response_selection_results,
) = compute_metrics(interpreter, data)

if intent_results is not None:
@@ -1173,15 +1183,21 @@ def combine_result(
if entity_results is not None:
entity_results += current_entity_results

if response_selection_results is not None:
response_selection_results += current_response_selection_results

for k, v in intent_current_metrics.items():
intent_metrics[k] = v + intent_metrics[k]

for k, v in response_selection_current_metrics.items():
response_selection_metrics[k] = v + response_selection_metrics[k]

for extractor, extractor_metric in entity_current_metrics.items():
entity_metrics[extractor] = {
k: v + entity_metrics[extractor][k] for k, v in extractor_metric.items()
}

return intent_metrics, entity_metrics
return intent_metrics, entity_metrics, response_selection_metrics


def cross_validate(
@@ -1194,14 +1210,15 @@ def cross_validate(
confmat: Optional[Text] = None,
histogram: Optional[Text] = None,
disable_plotting: bool = False,
) -> Tuple[CVEvaluationResult, CVEvaluationResult]:
) -> Tuple[CVEvaluationResult, CVEvaluationResult, CVEvaluationResult]:

"""Stratified cross validation on data.

Args:
data: Training Data
n_folds: integer, number of cv folds
nlu_config: nlu config file
report: path to folder where reports are stored
output: path to folder where reports are stored
successes: if true successful predictions are written to a file
errors: if true incorrect predictions are written to a file
confmat: path to file that will show the confusion matrix
@@ -1226,25 +1243,43 @@
intent_test_metrics = defaultdict(list) # type: IntentMetrics
entity_train_metrics = defaultdict(lambda: defaultdict(list)) # type: EntityMetrics
entity_test_metrics = defaultdict(lambda: defaultdict(list)) # type: EntityMetrics
response_selection_train_metrics = defaultdict(
list
) # type: ResponseSelectionMetrics
response_selection_test_metrics = defaultdict(
list
) # type: ResponseSelectionMetrics

intent_test_results = [] # type: List[IntentEvaluationResult]
entity_test_results = [] # type: List[EntityEvaluationResult]
response_selection_test_results = (
[]
) # type: List[ResponseSelectionEvaluationResult]
intent_classifier_present = False
response_selector_present = False
extractors = set() # type: Set[Text]

for train, test in generate_folds(n_folds, data):
interpreter = trainer.train(train)

# calculate train accuracy
combine_result(intent_train_metrics, entity_train_metrics, interpreter, train)
combine_result(
intent_train_metrics,
entity_train_metrics,
response_selection_train_metrics,
interpreter,
train,
)
# calculate test accuracy
combine_result(
intent_test_metrics,
entity_test_metrics,
response_selection_test_metrics,
interpreter,
test,
intent_test_results,
entity_test_results,
response_selection_test_results,
)

if not extractors:
@@ -1253,6 +1288,9 @@
if is_intent_classifier_present(interpreter):
intent_classifier_present = True

if is_response_selector_present(interpreter):
response_selector_present = True

if intent_classifier_present:
logger.info("Accumulated test folds intent evaluation results:")
evaluate_intents(
@@ -1269,9 +1307,17 @@ def cross_validate(
logger.info("Accumulated test folds entity evaluation results:")
evaluate_entities(entity_test_results, extractors, output, successes, errors)

if response_selector_present:
logger.info("Accumulated test folds response selection evaluation results:")
evaluate_response_selections(response_selection_test_results, output)

return (
CVEvaluationResult(dict(intent_train_metrics), dict(intent_test_metrics)),
CVEvaluationResult(dict(entity_train_metrics), dict(entity_test_metrics)),
CVEvaluationResult(
dict(response_selection_train_metrics),
dict(response_selection_test_metrics),
),
)


@@ -1290,8 +1336,10 @@ def compute_metrics(
) -> Tuple[
IntentMetrics,
EntityMetrics,
ResponseSelectionMetrics,
List[IntentEvaluationResult],
List[EntityEvaluationResult],
List[ResponseSelectionEvaluationResult],
]:
"""Computes metrics for intent classification and entity extraction.
Returns intent and entity metrics, and prediction results.
@@ -1303,12 +1351,29 @@ def compute_metrics(

intent_results = remove_empty_intent_examples(intent_results)

response_selection_results = remove_empty_response_examples(
response_selection_results
)

intent_metrics = _compute_metrics(
intent_results, "intent_target", "intent_prediction"
)
entity_metrics = _compute_entity_metrics(entity_results, interpreter)

return (intent_metrics, entity_metrics, intent_results, entity_results)
response_selection_metrics = {}
if response_selection_results:
response_selection_metrics = _compute_metrics(
response_selection_results, "response_target", "response_prediction"
)

return (
intent_metrics,
entity_metrics,
response_selection_metrics,
intent_results,
entity_results,
response_selection_results,
)


def compare_nlu(
@@ -1407,7 +1472,7 @@ def _compute_metrics(
],
target_key: Text,
target_prediction: Text,
) -> IntentMetrics:
) -> Union[IntentMetrics, ResponseSelectionMetrics]:
"""Computes evaluation metrics for a given corpus and
returns the results
"""
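
The `generate_folds` change above stratifies on the combined intent/response key instead of the plain intent name, so retrieval intents keep their per-fold proportions. A minimal sketch of that idea, with made-up labels standing in for the output of `get_combined_intent_response_key()`:

from sklearn.model_selection import StratifiedKFold

# Made-up labels mimicking combined intent/response keys for retrieval intents.
labels = ["greet", "greet", "faq/ask_weather", "faq/ask_weather", "faq/ask_name", "faq/ask_name"]
examples = list(range(len(labels)))

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
for i_fold, (train_index, test_index) in enumerate(skf.split(examples, labels)):
    # Each retrieval intent shows up in both the train and the test split of every fold.
    print(i_fold, sorted(labels[i] for i in test_index))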
8 changes: 7 additions & 1 deletion rasa/test.py
@@ -202,7 +202,9 @@ def perform_nlu_cross_validation(
data = rasa.nlu.training_data.load_data(nlu)
data = drop_intents_below_freq(data, cutoff=folds)
kwargs = utils.minimal_kwargs(kwargs, cross_validate)
results, entity_results = cross_validate(data, folds, nlu_config, output, **kwargs)
results, entity_results, response_selection_results = cross_validate(
data, folds, nlu_config, output, **kwargs
)
logger.info(f"CV evaluation (n={folds})")

if any(results):
@@ -213,3 +215,7 @@ def perform_nlu_cross_validation(
logger.info("Entity evaluation results")
return_entity_results(entity_results.train, "train")
return_entity_results(entity_results.test, "test")
if any(response_selection_results):
logger.info("Response Selection evaluation results")
return_results(response_selection_results.train, "train")
return_results(response_selection_results.test, "test")
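
`cross_validate` now returns a third `CVEvaluationResult`, whose `train` and `test` members map metric names to per-fold values. A small, self-contained sketch (not part of the diff) of summarising such per-fold metrics; the metric names match those asserted in the existing tests, and the numbers are made up:

from statistics import mean
from typing import Dict, List

def summarise(metrics: Dict[str, List[float]], name: str) -> None:
    # Average each metric over the folds it was collected for.
    for metric, per_fold_values in metrics.items():
        if per_fold_values:
            print(f"{name} {metric}: {mean(per_fold_values):.3f} (mean over {len(per_fold_values)} folds)")

summarise({"Accuracy": [0.91, 0.88], "Precision": [0.90, 0.87]}, "response selection test")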
9 changes: 4 additions & 5 deletions tests/nlu/base/test_evaluation.py
@@ -270,7 +270,9 @@ def test_run_cv_evaluation():
nlu_config = config.load("sample_configs/config_pretrained_embeddings_spacy.yml")

n_folds = 2
intent_results, entity_results = cross_validate(td, n_folds, nlu_config)
intent_results, entity_results, response_selection_results = cross_validate(
td, n_folds, nlu_config
)

assert len(intent_results.train["Accuracy"]) == n_folds
assert len(intent_results.train["Precision"]) == n_folds
@@ -379,10 +381,7 @@ def incorrect(label: Text, _label: Text) -> IntentEvaluationResult:
"confused_with": {"C": 5, "": 5},
}

c_confused_with = {
"D": 1,
"E": 1,
}
c_confused_with = {"D": 1, "E": 1}

assert len(report.keys()) == 8
assert report["A"] == a_results