diff --git a/.gitignore b/.gitignore
index aede8bd..d162ef3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 src/massive/**/__pycache__/*
 *~
+*.pyc
+*.swp
diff --git a/conda_env.yml b/conda_env.yml
index 1d22fba..9bee24d 100644
--- a/conda_env.yml
+++ b/conda_env.yml
@@ -80,6 +80,7 @@ dependencies:
   - scikit-learn==1.0.1
   - scipy==1.7.3
   - sentencepiece==0.1.96
+  - seqeval==1.2.2
   - six==1.16.0
   - sklearn==0.0
   - tabulate==0.8.9
diff --git a/src/massive/utils/training_utils.py b/src/massive/utils/training_utils.py
index 4800cd9..7dfae27 100644
--- a/src/massive/utils/training_utils.py
+++ b/src/massive/utils/training_utils.py
@@ -24,6 +24,7 @@ from math import sqrt
 import numpy as np
 import os
+from seqeval.metrics import f1_score
 import sklearn.metrics as sklm
 import torch
 from transformers import (
@@ -374,9 +375,57 @@ def compute_metrics(p):

     return compute_metrics

+def convert_to_bio(seq_tags, outside='Other', labels_merge=None):
+    """
+    Converts a sequence of tags into BIO format. EX:
+
+        ['city', 'city', 'Other', 'country', -100, 'Other']
+        to
+        ['B-city', 'I-city', 'O', 'B-country', 'I-country', 'O']
+        where outside = 'Other' and labels_merge = [-100]
+
+    :param seq_tags: the sequence of tags that should be converted
+    :type seq_tags: list
+    :param outside: The label(s) to put outside (ignore). Default: 'Other'
+    :type outside: str or list
+    :param labels_merge: The labels to merge leftward (i.e. for tokenized inputs)
+    :type labels_merge: str or list
+    :return: a BIO-tagged sequence
+    :rtype: list
+    """
+
+    seq_tags = [str(x) for x in seq_tags]
+
+    outside = [outside] if type(outside) != list else outside
+    outside = [str(x) for x in outside]
+
+    if labels_merge:
+        labels_merge = [labels_merge] if type(labels_merge) != list else labels_merge
+        labels_merge = [str(x) for x in labels_merge]
+    else:
+        labels_merge = []
+
+    bio_tagged = []
+    prev_tag = None
+    for tag in seq_tags:
+        if tag in outside:
+            bio_tagged.append('O')
+            prev_tag = tag
+            continue
+        if tag != prev_tag and tag not in labels_merge:
+            bio_tagged.append('B-' + tag)
+            prev_tag = tag
+            continue
+        if tag == prev_tag or tag in labels_merge:
+            if prev_tag in outside:
+                bio_tagged.append('O')
+            else:
+                bio_tagged.append('I-' + prev_tag)
+
+    return bio_tagged
+
 def eval_preds(pred_intents=None, lab_intents=None, pred_slots=None, lab_slots=None,
-               eval_metrics='all', labels_ignore='Other', labels_merge=None, pad='Other',
-               slot_level_combination=True):
+               eval_metrics='all', labels_ignore='Other', labels_merge=None, pad='Other'):
     """
     Function to evaluate the predictions from a model
@@ -397,18 +446,8 @@ def eval_preds(pred_intents=None, lab_intents=None, pred_slots=None, lab_slots=N
     :type labels_merge: str or list
     :param pad: The value to use when padding slot predictions to match the length of ground truth
     :type pad: str
-    :param slot_level_combination: Whether to merge adjacent tokens with the same slot label
-    :type slot_level_combination: bool
     """
-    # convert to correct types
-    labels_ignore = [labels_ignore] if type(labels_ignore) != list else labels_ignore
-    labels_ignore = [str(x) for x in labels_ignore]
-    if labels_merge:
-        labels_merge = [labels_merge] if type(labels_merge) != list else labels_merge
-        labels_merge = [str(x) for x in labels_merge]
-    else:
-        labels_merge = []

     results = {}

     # Check lengths
@@ -424,69 +463,38 @@ def eval_preds(pred_intents=None, lab_intents=None, pred_slots=None, lab_slots=N
         results['intent_acc_stderr'] = sqrt(intent_acc*(1-intent_acc)/len(pred_intents))
     if lab_slots is not None and pred_slots is not None:
-        pruned_slot_labels, pruned_slot_preds = [], []
+        bio_slot_labels, bio_slot_preds = [], []

         for lab, pred in zip(lab_slots, pred_slots):
             # Pad or truncate prediction as needed using `pad` arg
             if type(pred) == list:
                 pred = pred[:len(lab)] + [pad]*(len(lab) - len(pred))

-            if slot_level_combination:
-                # for each prediction and label, we want to combine tokens with same slot into
-                # a single slot. So we'll make a string for each and concatenate with commas
-                new_lab, new_pred = [], []
-                prev_lab = ''
-
-                in_merge = False
-                for i in range(len(lab)):
-                    if str(lab[i]) in labels_ignore:
-                        prev_lab = str(lab[i])
-                        in_merge = False
-                    elif str(lab[i]) in labels_merge:
-                        if i != 0 and not in_merge:
-                            prev_lab = str(lab[i-1])
-                        in_merge = True
-                    # Combine slots
-                    elif str(lab[i]) == prev_lab:
-                        new_lab[-1] = new_lab[-1] + ',' + str(lab[i])
-                        new_pred[-1] = new_pred[-1] + ',' + str(pred[i])
-                        prev_lab = str(lab[i])
-                        in_merge = False
-                    else:
-                        new_lab.append(str(lab[i]))
-                        new_pred.append(str(pred[i]))
-                        prev_lab = str(lab[i])
-                        in_merge = False
-
-                pred = new_pred
-                lab = new_lab
-
-            pruned_slot_labels.append(lab)
-            pruned_slot_preds.append(pred)
+            # convert to BIO
+            bio_slot_labels.append(
+                convert_to_bio(lab, outside=labels_ignore, labels_merge=labels_merge)
+            )
+            bio_slot_preds.append(
+                convert_to_bio(pred, outside=labels_ignore, labels_merge=labels_merge)
+            )

         if ('slot_micro_f1' in eval_metrics) or ('all' in eval_metrics):

-            # Flatten list of lists to a single list
-            flat_pruned_slot_labels = [item for sublist in pruned_slot_labels for item in sublist]
-            flat_pruned_slot_preds = [item for sublist in pruned_slot_preds for item in sublist]
-
-            # Calculate globally micro averaged slot f1 (~0.2 seconds)
-            smf1 = sklm.f1_score(flat_pruned_slot_labels,
-                                 flat_pruned_slot_preds,
-                                 average='micro',
-                                 zero_division=0)
+            # from seqeval
+            smf1 = f1_score(bio_slot_labels, bio_slot_preds)
             results['slot_micro_f1'] = smf1
             # Assuming normal distribution. Multiply by z (from "z table") to get confidence int
-            results['slot_micro_f1_stderr'] = sqrt(smf1*(1-smf1)/len(flat_pruned_slot_preds))
+            total_slots = sum([len(x) for x in bio_slot_preds])
+            results['slot_micro_f1_stderr'] = sqrt(smf1*(1-smf1)/total_slots)

         if ('ex_match_acc' in eval_metrics) or ('all' in eval_metrics):
             # calculate exact match accuracy (~0.01 seconds)
             matches = 0
             denom = 0
             for p_int, p_slot, l_int, l_slot in zip(pred_intents,
-                                                     pruned_slot_preds,
+                                                     bio_slot_preds,
                                                      lab_intents,
-                                                     pruned_slot_labels):
+                                                     bio_slot_labels):

                 if (p_int == l_int) and (p_slot == l_slot):
                     matches += 1
diff --git a/test/__pycache__/test_eval_metrics.cpython-39-pytest-7.1.1.pyc b/test/__pycache__/test_eval_metrics.cpython-39-pytest-7.1.1.pyc
deleted file mode 100644
index b9fdfa2..0000000
Binary files a/test/__pycache__/test_eval_metrics.cpython-39-pytest-7.1.1.pyc and /dev/null differ
diff --git a/test/__pycache__/test_t2t.cpython-39-pytest-7.1.1.pyc b/test/__pycache__/test_t2t.cpython-39-pytest-7.1.1.pyc
deleted file mode 100644
index c011ec4..0000000
Binary files a/test/__pycache__/test_t2t.cpython-39-pytest-7.1.1.pyc and /dev/null differ
diff --git a/test/test_eval_metrics.py b/test/test_eval_metrics.py
index 8f5a855..b8bf7e1 100644
--- a/test/test_eval_metrics.py
+++ b/test/test_eval_metrics.py
@@ -15,7 +15,7 @@
 """

 import pytest
-from massive.utils.training_utils import eval_preds
+from massive.utils.training_utils import convert_to_bio, eval_preds

 cases = [
     # ---------------- Slot F1 ---------------
@@ -42,14 +42,15 @@
     ),
     (
         # Test padding
+        # 2 TP and 1 FN = 2 / (2 + (1 + 0) / 2)
         None,
         None,
-        [['X', 'X']],
-        [['X', 'X', 'Y']],
+        [['X', 'X', 'Y', 'Other']],
+        [['X', 'X', 'Y', 'Other', 'Y']],
         'Other',
         None,
         'slot_micro_f1',
-        {'slot_micro_f1': 0.5}
+        {'slot_micro_f1': 0.8}
     ),
     (
         # Test truncation
@@ -73,16 +74,50 @@
         'slot_micro_f1',
         {'slot_micro_f1': 0.5}
     ),
+    (
+        # Test prediction too long
+        None,
+        None,
+        [['X', 'X', 'X', 'Y', 'Y']],
+        [['X', 'X', 'Other', 'Y', 'Y']],
+        'Other',
+        None,
+        'slot_micro_f1',
+        {'slot_micro_f1': 0.5}
+    ),
+    (
+        # Test prediction too short
+        None,
+        None,
+        [['X', 'Other', 'Other', 'Y', 'Y']],
+        [['X', 'X', 'Other', 'Y', 'Y']],
+        'Other',
+        None,
+        'slot_micro_f1',
+        {'slot_micro_f1': 0.5}
+    ),
+    (
+        # Test prediction number mismatch
+        # 1 FN for Y and 1 TP for X = 1 / (1 + (0 + 1) / 2)
+        None,
+        None,
+        [['Other'], ['X']],
+        [['Y'], ['X']],
+        'Other',
+        None,
+        'slot_micro_f1',
+        {'slot_micro_f1': 0.67}
+    ),
     (
         # Test -100 merging
         None,
         None,
         [[50, -100, 50, -100, -100, 20, 20, 10, 10, -100, 20]],
-        [[50, -100, 50, -100, -100, 20, 0, 20, 0, -100, 20]],
+        [[50, -100, 50, -100, -100, 20, 0, 10, 0, -100, 20]],
         [0],
         [-100],
         'slot_micro_f1',
-        {'slot_micro_f1': 0.75}
+        {'slot_micro_f1': 0.5}
     ),

     # ------------- Exact match acc and intent acc ----------
@@ -107,6 +142,17 @@
         'all',
         {'ex_match_acc': 0.25, 'intent_acc': 0.75}
     ),
+    (
+        # Test prediction too long
+        ['A'],
+        ['A'],
+        [['X', 'X', 'X']],
+        [['X', 'X', 'Other']],
+        'Other',
+        None,
+        'all',
+        {'ex_match_acc': 0}
+    )
 ]

 @pytest.mark.parametrize(
@@ -131,3 +177,22 @@ def test_eval_preds(
     for key in out:
         assert key in results
         assert round(out[key], 2) == round(results[key], 2)
+
+bio_cases = [
+    (
+        ['city', 'city', 'Other', 'country', -100, 'Other'],
+        'Other',
+        -100,
+        ['B-city', 'I-city', 'O', 'B-country', 'I-country', 'O']
+    ),
+    (
+        [1, 1, 3, 3, 9, 4],
+        [3],
+        [9],
+        ['B-1', 'I-1', 'O', 'O', 'O', 'B-4']
+    )
+]
+
+@pytest.mark.parametrize('seq_tags, outside, labels_merge, out', bio_cases)
+def test_convert_to_bio(seq_tags, outside, labels_merge, out):
+    assert convert_to_bio(seq_tags, outside, labels_merge) == out
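
Reviewer note (not part of the patch): a minimal sketch of how the new BIO-based evaluation
path can be exercised locally. It assumes the `massive` package from this branch is importable
(e.g. src/ on PYTHONPATH); the intent and slot values below are invented for illustration only.

    from massive.utils.training_utils import convert_to_bio, eval_preds

    # Subword-style slot labels: -100 marks pieces that merge leftward into the
    # previous slot, and 'Other' is the outside label.
    lab_slots = [['city', -100, 'Other', 'country', -100, 'Other']]
    pred_slots = [['city', -100, 'Other', 'country', 'Other', 'Other']]

    # Ground truth in BIO form:
    # ['B-city', 'I-city', 'O', 'B-country', 'I-country', 'O']
    print(convert_to_bio(lab_slots[0], outside='Other', labels_merge=-100))

    # Entity-level (seqeval) slot micro F1 and its stderr, plus intent accuracy
    # and exact-match accuracy, all keyed in the returned dict.
    results = eval_preds(pred_intents=['intent_a'], lab_intents=['intent_a'],
                         pred_slots=pred_slots, lab_slots=lab_slots,
                         eval_metrics='all', labels_ignore='Other', labels_merge=-100)
    print(results['slot_micro_f1'], results['ex_match_acc'])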
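The updated expected values in the tests follow seqeval's entity-level micro F1 over the
BIO-converted sequences rather than sklearn's token-level score. In the padding case, for
example, the label side contains three entities and the padded prediction recovers two of
them exactly (2 TP, 0 FP, 1 FN), so F1 = 2*2 / (2*2 + 0 + 1) = 0.8. A quick cross-check
against seqeval directly, using the BIO forms that convert_to_bio produces for that case:

    from seqeval.metrics import f1_score

    # "Test padding" case after convert_to_bio: the prediction is padded with
    # 'Other' to the label length, then both sides are converted to BIO.
    y_true = [['B-X', 'I-X', 'B-Y', 'O', 'B-Y']]
    y_pred = [['B-X', 'I-X', 'B-Y', 'O', 'O']]

    print(f1_score(y_true, y_pred))  # 0.8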