From a28d256b9ab2af4373fac63375259c70f327459e Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Thu, 16 Dec 2021 00:01:15 +0100 Subject: [PATCH] Add Precision, Recall, F-measure, Confusion Matrix to Taggers (#2862) * Add Precision, Recall, F-measure, Confusion Matrix and per-tag evaluation to Taggers And add precision, recall and f-measure to ConfusionMatrix. Includes large doctests, and some small doctest fixes throughout the tag module * Move evaluation of ConfusionMatrix into nltk\metrics\confusionmatrix.py * Add self as author in significantly updated files * Deprecate tagger evaluate(gold) in favor of accuracy(gold) * Missed one case of Tagger evaluate still being used - fixed now * Deprecate ChunkParser's evaluate(gold) in favor of accuracy(gold) Co-authored-by: Steven Bird --- nltk/chunk/api.py | 5 + nltk/metrics/confusionmatrix.py | 137 ++++++++++ nltk/tag/__init__.py | 8 +- nltk/tag/api.py | 221 +++++++++++++++- nltk/tag/brill_trainer.py | 14 +- nltk/tag/crf.py | 4 +- nltk/tag/perceptron.py | 2 +- nltk/tag/tnt.py | 8 +- nltk/tbl/demo.py | 4 +- nltk/test/metrics.doctest | 21 ++ nltk/test/probability.doctest | 2 +- nltk/test/tag.doctest | 432 ++++++++++++++++++++++++++++++++ 12 files changed, 833 insertions(+), 25 deletions(-) diff --git a/nltk/chunk/api.py b/nltk/chunk/api.py index de4bb93958..56a63d2743 100644 --- a/nltk/chunk/api.py +++ b/nltk/chunk/api.py @@ -11,6 +11,7 @@ ##////////////////////////////////////////////////////// from nltk.chunk.util import ChunkScore +from nltk.internals import deprecated from nltk.parse import ParserI @@ -34,7 +35,11 @@ def parse(self, tokens): """ raise NotImplementedError() + @deprecated("Use accuracy(gold) instead.") def evaluate(self, gold): + return self.accuracy(gold) + + def accuracy(self, gold): """ Score the accuracy of the chunker against the gold standard. Remove the chunking the gold standard text, rechunk it using diff --git a/nltk/metrics/confusionmatrix.py b/nltk/metrics/confusionmatrix.py index 1dc7121082..5fbcbe3493 100644 --- a/nltk/metrics/confusionmatrix.py +++ b/nltk/metrics/confusionmatrix.py @@ -3,6 +3,7 @@ # Copyright (C) 2001-2021 NLTK Project # Author: Edward Loper # Steven Bird +# Tom Aarsen <> # URL: # For license information, see LICENSE.TXT @@ -201,6 +202,140 @@ def key(self): return str + def recall(self, value): + """Given a value in the confusion matrix, return the recall + that corresponds to this value. The recall is defined as: + + - *r* = true positive / (true positive + false negative) + + and can loosely be considered the ratio of how often ``value`` + was predicted correctly relative to how often ``value`` was + the true result. + + :param value: value used in the ConfusionMatrix + :return: the recall corresponding to ``value``. + :rtype: float + """ + # Number of times `value` was correct, and also predicted + TP = self[value, value] + # Number of times `value` was correct + TP_FN = sum(self[value, pred_value] for pred_value in self._values) + if TP_FN == 0: + return 0.0 + return TP / TP_FN + + def precision(self, value): + """Given a value in the confusion matrix, return the precision + that corresponds to this value. The precision is defined as: + + - *p* = true positive / (true positive + false positive) + + and can loosely be considered the ratio of how often ``value`` + was predicted correctly relative to the number of predictions + for ``value``. + + :param value: value used in the ConfusionMatrix + :return: the precision corresponding to ``value``.
+ :rtype: float + """ + # Number of times `value` was correct, and also predicted + TP = self[value, value] + # Number of times `value` was predicted + TP_FP = sum(self[real_value, value] for real_value in self._values) + if TP_FP == 0: + return 0.0 + return TP / TP_FP + + def f_measure(self, value, alpha=0.5): + """ + Given a value used in the confusion matrix, return the f-measure + that corresponds to this value. The f-measure is the harmonic mean + of the ``precision`` and ``recall``, weighted by ``alpha``. + In particular, given the precision *p* and recall *r* defined by: + + - *p* = true positive / (true positive + false positive) + - *r* = true positive / (true positive + false negative) + + The f-measure is: + + - *1/(alpha/p + (1-alpha)/r)* + + With ``alpha = 0.5``, this reduces to: + + - *2pr / (p + r)* + + :param value: value used in the ConfusionMatrix + :param alpha: Ratio of the cost of false negative compared to false + positives. Defaults to 0.5, where the costs are equal. + :type alpha: float + :return: the F-measure corresponding to ``value``. + :rtype: float + """ + p = self.precision(value) + r = self.recall(value) + if p == 0.0 or r == 0.0: + return 0.0 + return 1.0 / (alpha / p + (1 - alpha) / r) + + def evaluate(self, alpha=0.5, truncate=None, sort_by_count=False): + """ + Tabulate the **recall**, **precision** and **f-measure** + for each value in this confusion matrix. + + >>> reference = "DET NN VB DET JJ NN NN IN DET NN".split() + >>> test = "DET VB VB DET NN NN NN IN DET NN".split() + >>> cm = ConfusionMatrix(reference, test) + >>> print(cm.evaluate()) + Tag | Prec. | Recall | F-measure + ----+--------+--------+----------- + DET | 1.0000 | 1.0000 | 1.0000 + IN | 1.0000 | 1.0000 | 1.0000 + JJ | 0.0000 | 0.0000 | 0.0000 + NN | 0.7500 | 0.7500 | 0.7500 + VB | 0.5000 | 1.0000 | 0.6667 + + + :param alpha: Ratio of the cost of false negative compared to false + positives, as used in the f-measure computation. Defaults to 0.5, + where the costs are equal. + :type alpha: float + :param truncate: If specified, then only show the specified + number of values. Any sorting (e.g., sort_by_count) + will be performed before truncation. Defaults to None + :type truncate: int, optional + :param sort_by_count: Whether to sort the outputs on frequency + in the reference label. Defaults to False. + :type sort_by_count: bool, optional + :return: A tabulated recall, precision and f-measure string + :rtype: str + """ + tags = self._values + + # Apply keyword parameters + if sort_by_count: + tags = sorted(tags, key=lambda v: -sum(self._confusion[self._indices[v]])) + if truncate: + tags = tags[:truncate] + + tag_column_len = max(max(len(tag) for tag in tags), 3) + + # Construct the header + s = ( + f"{' ' * (tag_column_len - 3)}Tag | Prec. 
| Recall | F-measure\n" + f"{'-' * tag_column_len}-+--------+--------+-----------\n" + ) + + # Construct the body + for tag in tags: + s += ( + f"{tag:>{tag_column_len}} | " + f"{self.precision(tag):<6.4f} | " + f"{self.recall(tag):<6.4f} | " + f"{self.f_measure(tag, alpha=alpha):.4f}\n" + ) + + return s + def demo(): reference = "DET NN VB DET JJ NN NN IN DET NN".split() @@ -211,6 +346,8 @@ def demo(): print(ConfusionMatrix(reference, test)) print(ConfusionMatrix(reference, test).pretty_format(sort_by_count=True)) + print(ConfusionMatrix(reference, test).recall("VB")) + if __name__ == "__main__": demo() diff --git a/nltk/tag/__init__.py b/nltk/tag/__init__.py index ce7610e171..36446de271 100644 --- a/nltk/tag/__init__.py +++ b/nltk/tag/__init__.py @@ -21,7 +21,7 @@ An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset: >>> from nltk import pos_tag, word_tokenize - >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) + >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'), ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')] @@ -57,7 +57,7 @@ We evaluate a tagger on data that was not seen during training: - >>> tagger.evaluate(brown.tagged_sents(categories='news')[500:600]) + >>> tagger.accuracy(brown.tagged_sents(categories='news')[500:600]) 0.7... For more information, please consult chapter 5 of the NLTK Book. @@ -144,10 +144,10 @@ def pos_tag(tokens, tagset=None, lang="eng"): >>> from nltk.tag import pos_tag >>> from nltk.tokenize import word_tokenize - >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) + >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'), ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')] - >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal') + >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal') # doctest: +NORMALIZE_WHITESPACE [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'), ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')] diff --git a/nltk/tag/api.py b/nltk/tag/api.py index 9ef6513549..25ffd1e0a4 100644 --- a/nltk/tag/api.py +++ b/nltk/tag/api.py @@ -3,6 +3,7 @@ # Copyright (C) 2001-2021 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) +# Tom Aarsen <> # URL: # For license information, see LICENSE.TXT @@ -11,10 +12,12 @@ information, such as its part of speech. """ from abc import ABCMeta, abstractmethod +from functools import lru_cache from itertools import chain +from typing import Dict -from nltk.internals import overridden -from nltk.metrics import accuracy +from nltk.internals import deprecated, overridden +from nltk.metrics import ConfusionMatrix, accuracy from nltk.tag.util import untag @@ -47,20 +50,24 @@ def tag(self, tokens): def tag_sents(self, sentences): """ - Apply ``self.tag()`` to each element of *sentences*. I.e.: + Apply ``self.tag()`` to each element of *sentences*. I.e.:: return [self.tag(sent) for sent in sentences] """ return [self.tag(sent) for sent in sentences] + @deprecated("Use accuracy(gold) instead.") def evaluate(self, gold): + return self.accuracy(gold) + + def accuracy(self, gold): """ Score the accuracy of the tagger against the gold standard. 
Strip the tags from the gold standard text, retag it using the tagger, then compute the accuracy score. - :type gold: list(list(tuple(str, str))) :param gold: The list of tagged sentences to score the tagger on. + :type gold: list(list(tuple(str, str))) :rtype: float """ @@ -69,6 +76,212 @@ def evaluate(self, gold): test_tokens = list(chain.from_iterable(tagged_sents)) return accuracy(gold_tokens, test_tokens) + @lru_cache(maxsize=1) + def _confusion_cached(self, gold): + """ + Inner function used after ``gold`` is converted to a + ``tuple(tuple(tuple(str, str)))``. That way, we can use caching on + creating a ConfusionMatrix. + + :param gold: The list of tagged sentences to run the tagger with, + also used as the reference values in the generated confusion matrix. + :type gold: tuple(tuple(tuple(str, str))) + :rtype: ConfusionMatrix + """ + + tagged_sents = self.tag_sents(untag(sent) for sent in gold) + gold_tokens = [token for _word, token in chain.from_iterable(gold)] + test_tokens = [token for _word, token in chain.from_iterable(tagged_sents)] + return ConfusionMatrix(gold_tokens, test_tokens) + + def confusion(self, gold): + """ + Return a ConfusionMatrix with the tags from ``gold`` as the reference + values, with the predictions from ``tag_sents`` as the predicted values. + + >>> from nltk.tag import PerceptronTagger + >>> from nltk.corpus import treebank + >>> tagger = PerceptronTagger() + >>> gold_data = treebank.tagged_sents()[:10] + >>> print(tagger.confusion(gold_data)) + | - | + | N | + | O P | + | N J J N N P P R R V V V V V W | + | ' E C C D E I J J J M N N N O R P R B R T V B B B B B D ` | + | ' , - . C D T X N J R S D N P S S P $ B R P O B D G N P Z T ` | + -------+----------------------------------------------------------------------------------------------+ + '' | <1> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + , | .<15> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + -NONE- | . . <.> . . 2 . . . 2 . . . 5 1 . . . . 2 . . . . . . . . . . . | + . | . . .<10> . . . . . . . . . . . . . . . . . . . . . . . . . . . | + CC | . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . . . . | + CD | . . . . . <5> . . . . . . . . . . . . . . . . . . . . . . . . . | + DT | . . . . . .<20> . . . . . . . . . . . . . . . . . . . . . . . . | + EX | . . . . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . | + IN | . . . . . . . .<22> . . . . . . . . . . 3 . . . . . . . . . . . | + JJ | . . . . . . . . .<16> . . . . 1 . . . . 1 . . . . . . . . . . . | + JJR | . . . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . | + JJS | . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . . . | + MD | . . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . . | + NN | . . . . . . . . . . . . .<28> 1 1 . . . . . . . . . . . . . . . | + NNP | . . . . . . . . . . . . . .<25> . . . . . . . . . . . . . . . . | + NNS | . . . . . . . . . . . . . . .<19> . . . . . . . . . . . . . . . | + POS | . . . . . . . . . . . . . . . . <1> . . . . . . . . . . . . . . | + PRP | . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . . . | + PRP$ | . . . . . . . . . . . . . . . . . . <2> . . . . . . . . . . . . | + RB | . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | + RBR | . . . . . . . . . . 1 . . . . . . . . . <1> . . . . . . . . . . | + RP | . . . . . . . . . . . . . . . . . . . . . <1> . . . . . . . . . | + TO | . . . . . . . . . . . . . . . . . . . . . . <5> . . . . . . . . | + VB | . . . . . . . . . . . . . . . . . . . . . . . <3> . 
. . . . . . | + VBD | . . . . . . . . . . . . . 1 . . . . . . . . . . <6> . . . . . . | + VBG | . . . . . . . . . . . . . 1 . . . . . . . . . . . <4> . . . . . | + VBN | . . . . . . . . . . . . . . . . . . . . . . . . 1 . <4> . . . . | + VBP | . . . . . . . . . . . . . . . . . . . . . . . . . . . <3> . . . | + VBZ | . . . . . . . . . . . . . . . . . . . . . . . . . . . . <7> . . | + WDT | . . . . . . . . 2 . . . . . . . . . . . . . . . . . . . . <.> . | + `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <1>| + -------+----------------------------------------------------------------------------------------------+ + (row = reference; col = test) + + + :param gold: The list of tagged sentences to run the tagger with, + also used as the reference values in the generated confusion matrix. + :type gold: list(list(tuple(str, str))) + :rtype: ConfusionMatrix + """ + + return self._confusion_cached(tuple(tuple(sent) for sent in gold)) + + def recall(self, gold) -> Dict[str, float]: + """ + Compute the recall for each tag from ``gold`` or from running ``tag`` + on the tokenized sentences from ``gold``. Then, return the dictionary + with mappings from tag to recall. The recall is defined as: + + - *r* = true positive / (true positive + false negative) + + :param gold: The list of tagged sentences to score the tagger on. + :type gold: list(list(tuple(str, str))) + :return: A mapping from tags to recall + :rtype: Dict[str, float] + """ + + cm = self.confusion(gold) + return {tag: cm.recall(tag) for tag in cm._values} + + def precision(self, gold): + """ + Compute the precision for each tag from ``gold`` or from running ``tag`` + on the tokenized sentences from ``gold``. Then, return the dictionary + with mappings from tag to precision. The precision is defined as: + + - *p* = true positive / (true positive + false positive) + + :param gold: The list of tagged sentences to score the tagger on. + :type gold: list(list(tuple(str, str))) + :return: A mapping from tags to precision + :rtype: Dict[str, float] + """ + + cm = self.confusion(gold) + return {tag: cm.precision(tag) for tag in cm._values} + + def f_measure(self, gold, alpha=0.5): + """ + Compute the f-measure for each tag from ``gold`` or from running ``tag`` + on the tokenized sentences from ``gold``. Then, return the dictionary + with mappings from tag to f-measure. The f-measure is the harmonic mean + of the ``precision`` and ``recall``, weighted by ``alpha``. + In particular, given the precision *p* and recall *r* defined by: + + - *p* = true positive / (true positive + false positive) + - *r* = true positive / (true positive + false negative) + + The f-measure is: + + - *1/(alpha/p + (1-alpha)/r)* + + With ``alpha = 0.5``, this reduces to: + + - *2pr / (p + r)* + + :param gold: The list of tagged sentences to score the tagger on. + :type gold: list(list(tuple(str, str))) + :param alpha: Ratio of the cost of false negative compared to false + positives. Defaults to 0.5, where the costs are equal. + :type alpha: float + :return: A mapping from tags to f-measure + :rtype: Dict[str, float] + """ + cm = self.confusion(gold) + return {tag: cm.f_measure(tag, alpha) for tag in cm._values} + + def evaluate_per_tag(self, gold, alpha=0.5, truncate=None, sort_by_count=False): + """Tabulate the **recall**, **precision** and **f-measure** + for each tag from ``gold`` or from running ``tag`` on the tokenized + sentences from ``gold``.
+ + >>> from nltk.tag import PerceptronTagger + >>> from nltk.corpus import treebank + >>> tagger = PerceptronTagger() + >>> gold_data = treebank.tagged_sents()[:10] + >>> print(tagger.evaluate_per_tag(gold_data)) + Tag | Prec. | Recall | F-measure + -------+--------+--------+----------- + '' | 1.0000 | 1.0000 | 1.0000 + , | 1.0000 | 1.0000 | 1.0000 + -NONE- | 0.0000 | 0.0000 | 0.0000 + . | 1.0000 | 1.0000 | 1.0000 + CC | 1.0000 | 1.0000 | 1.0000 + CD | 0.7143 | 1.0000 | 0.8333 + DT | 1.0000 | 1.0000 | 1.0000 + EX | 1.0000 | 1.0000 | 1.0000 + IN | 0.9167 | 0.8800 | 0.8980 + JJ | 0.8889 | 0.8889 | 0.8889 + JJR | 0.0000 | 0.0000 | 0.0000 + JJS | 1.0000 | 1.0000 | 1.0000 + MD | 1.0000 | 1.0000 | 1.0000 + NN | 0.8000 | 0.9333 | 0.8615 + NNP | 0.8929 | 1.0000 | 0.9434 + NNS | 0.9500 | 1.0000 | 0.9744 + POS | 1.0000 | 1.0000 | 1.0000 + PRP | 1.0000 | 1.0000 | 1.0000 + PRP$ | 1.0000 | 1.0000 | 1.0000 + RB | 0.4000 | 1.0000 | 0.5714 + RBR | 1.0000 | 0.5000 | 0.6667 + RP | 1.0000 | 1.0000 | 1.0000 + TO | 1.0000 | 1.0000 | 1.0000 + VB | 1.0000 | 1.0000 | 1.0000 + VBD | 0.8571 | 0.8571 | 0.8571 + VBG | 1.0000 | 0.8000 | 0.8889 + VBN | 1.0000 | 0.8000 | 0.8889 + VBP | 1.0000 | 1.0000 | 1.0000 + VBZ | 1.0000 | 1.0000 | 1.0000 + WDT | 0.0000 | 0.0000 | 0.0000 + `` | 1.0000 | 1.0000 | 1.0000 + + + :param gold: The list of tagged sentences to score the tagger on. + :type gold: list(list(tuple(str, str))) + :param alpha: Ratio of the cost of false negative compared to false + positives, as used in the f-measure computation. Defaults to 0.5, + where the costs are equal. + :type alpha: float + :param truncate: If specified, then only show the specified + number of values. Any sorting (e.g., sort_by_count) + will be performed before truncation. Defaults to None + :type truncate: int, optional + :param sort_by_count: Whether to sort the outputs on number of + occurrences of that tag in the ``gold`` data, defaults to False + :type sort_by_count: bool, optional + :return: A tabulated recall, precision and f-measure string + :rtype: str + """ + cm = self.confusion(gold) + return cm.evaluate(alpha=alpha, truncate=truncate, sort_by_count=sort_by_count) + def _check_params(self, train, model): if (train and model) or (not train and not model): raise ValueError("Must specify either training data or trained model.") diff --git a/nltk/tag/brill_trainer.py b/nltk/tag/brill_trainer.py index 0f1c5bea8c..ff2ee760b7 100644 --- a/nltk/tag/brill_trainer.py +++ b/nltk/tag/brill_trainer.py @@ -123,9 +123,8 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): ... ]) >>> baseline = backoff #see NOTE1 - - >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS - 0.2433862... + >>> baseline.accuracy(gold_data) #doctest: +ELLIPSIS + 0.2450142... >>> # Set up templates >>> Template._cleartemplates() #clear any templates created in earlier tests @@ -174,8 +173,8 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): - >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS - 0.43833... + >>> tagger1.accuracy(gold_data) # doctest: +ELLIPSIS + 0.43996... >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) @@ -211,8 +210,9 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): 18 18 0 0 | CD->-NONE- if Pos:NN@[-1] & Word:0@[0] 18 18 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0] - >>> tagger2.evaluate(gold_data) # doctest: +ELLIPSIS - 0.43996743... + >>> tagger2.accuracy(gold_data) # doctest: +ELLIPSIS + 0.44159544... 
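(Illustrative aside, not part of the original patch: because ``BrillTagger`` derives from ``TaggerI``, the per-tag evaluation methods added in ``nltk/tag/api.py`` above are available on the trained taggers in this doctest as well. A minimal sketch reusing ``tagger2`` and ``gold_data`` from the surrounding examples; outputs are omitted, hence the skips.)

    >>> print(tagger2.evaluate_per_tag(gold_data, truncate=10, sort_by_count=True))  # doctest: +SKIP
    >>> tagger2.f_measure(gold_data)['NN']  # per-tag dictionary lookup  # doctest: +SKIP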
+ >>> tagger2.rules()[2:4] (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')])) diff --git a/nltk/tag/crf.py b/nltk/tag/crf.py index cdcb4623be..5b1964e43f 100644 --- a/nltk/tag/crf.py +++ b/nltk/tag/crf.py @@ -35,13 +35,13 @@ class CRFTagger(TaggerI): [[('dog', 'Noun'), ('is', 'Verb'), ('good', 'Adj')], [('Cat', 'Noun'), ('eat', 'Verb'), ('meat', 'Noun')]] >>> gold_sentences = [[('dog','Noun'),('is','Verb'),('good','Adj')] , [('Cat','Noun'),('eat','Verb'), ('meat','Noun')]] - >>> ct.evaluate(gold_sentences) + >>> ct.accuracy(gold_sentences) 1.0 Setting learned model file >>> ct = CRFTagger() >>> ct.set_model_file('model.crf.tagger') - >>> ct.evaluate(gold_sentences) + >>> ct.accuracy(gold_sentences) 1.0 """ diff --git a/nltk/tag/perceptron.py b/nltk/tag/perceptron.py index a18c0c2069..02ff0865b8 100644 --- a/nltk/tag/perceptron.py +++ b/nltk/tag/perceptron.py @@ -363,7 +363,7 @@ def _get_pretrain_model(): print("Size of training and testing (sentence)", len(training), len(testing)) # Train and save the model tagger.train(training, PICKLE) - print("Accuracy : ", tagger.evaluate(testing)) + print("Accuracy : ", tagger.accuracy(testing)) if __name__ == "__main__": diff --git a/nltk/tag/tnt.py b/nltk/tag/tnt.py index 9174e498c7..e4cbf74b3e 100755 --- a/nltk/tag/tnt.py +++ b/nltk/tag/tnt.py @@ -492,7 +492,7 @@ def demo2(): s.train(d[(11) * 100 :]) for i in range(10): - tacc = t.evaluate(d[i * 100 : ((i + 1) * 100)]) + tacc = t.accuracy(d[i * 100 : ((i + 1) * 100)]) tp_un = t.unknown / (t.known + t.unknown) tp_kn = t.known / (t.known + t.unknown) t.unknown = 0 @@ -504,7 +504,7 @@ def demo2(): print("Percentage unknown:", tp_un) print("Accuracy over known words:", (tacc / tp_kn)) - sacc = s.evaluate(d[i * 100 : ((i + 1) * 100)]) + sacc = s.accuracy(d[i * 100 : ((i + 1) * 100)]) sp_un = s.unknown / (s.known + s.unknown) sp_kn = s.known / (s.known + s.unknown) s.unknown = 0 @@ -550,14 +550,14 @@ def demo3(): t.train(dtrain) s.train(etrain) - tacc = t.evaluate(dtest) + tacc = t.accuracy(dtest) tp_un = t.unknown / (t.known + t.unknown) tp_kn = t.known / (t.known + t.unknown) tknown += tp_kn t.unknown = 0 t.known = 0 - sacc = s.evaluate(etest) + sacc = s.accuracy(etest) sp_un = s.unknown / (s.known + s.unknown) sp_kn = s.known / (s.known + s.unknown) sknown += sp_kn diff --git a/nltk/tbl/demo.py b/nltk/tbl/demo.py index 8bdb8d360b..4ec84cb9dc 100644 --- a/nltk/tbl/demo.py +++ b/nltk/tbl/demo.py @@ -261,7 +261,7 @@ def postag( if gold_data: print( " Accuracy on test set: {:0.4f}".format( - baseline_tagger.evaluate(gold_data) + baseline_tagger.accuracy(gold_data) ) ) @@ -274,7 +274,7 @@ def postag( brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc) print(f"Trained tbl tagger in {time.time() - tbrill:0.2f} seconds") if gold_data: - print(" Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data)) + print(" Accuracy on test set: %.4f" % brill_tagger.accuracy(gold_data)) # printing the learned rules, if learned silently if trace == 1: diff --git a/nltk/test/metrics.doctest b/nltk/test/metrics.doctest index 42df6f42e9..d4852c12f1 100644 --- a/nltk/test/metrics.doctest +++ b/nltk/test/metrics.doctest @@ -217,6 +217,27 @@ Confusion Matrix 10: h +For "e", the number of true positives should be 6, while the number of false negatives is 3. +So, the recall ought to be 6 / (6 + 3): + + >>> cm.recall("e") # doctest: +ELLIPSIS + 0.666666... 
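The true-positive count itself can be read straight off the matrix: indexing a ``ConfusionMatrix`` with a (reference, prediction) pair returns how often that pair occurred, and ``recall`` divides this diagonal entry by the total number of reference occurrences of the value (here 6 / 9). A small illustrative check, not part of the original patch:

    >>> cm["e", "e"]
    6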
+ +For "e", the false positive is just 1, so the precision should be 6 / (6 + 1): + + >>> cm.precision("e") # doctest: +ELLIPSIS + 0.857142... + +The f-measure with default value of ``alpha = 0.5`` should then be: + +* *1/(alpha/p + (1-alpha)/r) =* +* *1/(0.5/p + 0.5/r) =* +* *2pr / (p + r) =* +* *2 * 0.857142... * 0.666666... / (0.857142... + 0.666666...) =* +* *0.749999...* + + >>> cm.f_measure("e") # doctest: +ELLIPSIS + 0.749999... -------------------- Association measures diff --git a/nltk/test/probability.doctest b/nltk/test/probability.doctest index 952c03c027..7902dc9ca0 100644 --- a/nltk/test/probability.doctest +++ b/nltk/test/probability.doctest @@ -139,7 +139,7 @@ And now we can test the estimators >>> def train_and_test(est): ... hmm = trainer.train_supervised(train_corpus, estimator=est) - ... print('%.2f%%' % (100 * hmm.evaluate(test_corpus))) + ... print('%.2f%%' % (100 * hmm.accuracy(test_corpus))) Maximum Likelihood Estimation ----------------------------- diff --git a/nltk/test/tag.doctest b/nltk/test/tag.doctest index ca893b8f90..beda200e0c 100644 --- a/nltk/test/tag.doctest +++ b/nltk/test/tag.doctest @@ -1,6 +1,438 @@ .. Copyright (C) 2001-2021 NLTK Project .. For license information, see LICENSE.TXT +Evaluation of Taggers +===================== + +Evaluating the standard NLTK PerceptronTagger using Accuracy, +Precision, Recall and F-measure for each of the tags. + + >>> from nltk.tag import PerceptronTagger + >>> from nltk.corpus import treebank + >>> tagger = PerceptronTagger() + >>> gold_data = treebank.tagged_sents()[10:20] + >>> print(tagger.accuracy(gold_data)) # doctest: +ELLIPSIS + 0.885931... + + >>> print(tagger.evaluate_per_tag(gold_data)) + Tag | Prec. | Recall | F-measure + -------+--------+--------+----------- + '' | 1.0000 | 1.0000 | 1.0000 + , | 1.0000 | 1.0000 | 1.0000 + -NONE- | 0.0000 | 0.0000 | 0.0000 + . | 1.0000 | 1.0000 | 1.0000 + : | 1.0000 | 1.0000 | 1.0000 + CC | 1.0000 | 1.0000 | 1.0000 + CD | 0.7647 | 1.0000 | 0.8667 + DT | 1.0000 | 1.0000 | 1.0000 + IN | 1.0000 | 1.0000 | 1.0000 + JJ | 0.5882 | 0.8333 | 0.6897 + JJR | 1.0000 | 1.0000 | 1.0000 + JJS | 1.0000 | 1.0000 | 1.0000 + NN | 0.7647 | 0.9630 | 0.8525 + NNP | 0.8929 | 1.0000 | 0.9434 + NNS | 1.0000 | 1.0000 | 1.0000 + POS | 1.0000 | 1.0000 | 1.0000 + PRP | 1.0000 | 1.0000 | 1.0000 + RB | 0.8000 | 1.0000 | 0.8889 + RBR | 0.0000 | 0.0000 | 0.0000 + TO | 1.0000 | 1.0000 | 1.0000 + VB | 1.0000 | 1.0000 | 1.0000 + VBD | 0.8571 | 0.9231 | 0.8889 + VBG | 1.0000 | 1.0000 | 1.0000 + VBN | 0.8333 | 0.5556 | 0.6667 + VBP | 0.5714 | 0.8000 | 0.6667 + VBZ | 1.0000 | 1.0000 | 1.0000 + WP | 1.0000 | 1.0000 | 1.0000 + `` | 1.0000 | 1.0000 | 1.0000 + + +List only the 10 most common tags: + + >>> print(tagger.evaluate_per_tag(gold_data, truncate=10, sort_by_count=True)) + Tag | Prec. | Recall | F-measure + -------+--------+--------+----------- + IN | 1.0000 | 1.0000 | 1.0000 + DT | 1.0000 | 1.0000 | 1.0000 + NN | 0.7647 | 0.9630 | 0.8525 + NNP | 0.8929 | 1.0000 | 0.9434 + NNS | 1.0000 | 1.0000 | 1.0000 + -NONE- | 0.0000 | 0.0000 | 0.0000 + CD | 0.7647 | 1.0000 | 0.8667 + VBD | 0.8571 | 0.9231 | 0.8889 + JJ | 0.5882 | 0.8333 | 0.6897 + , | 1.0000 | 1.0000 | 1.0000 + + +Similarly, we can display the confusion matrix for this tagger. + + >>> print(tagger.confusion(gold_data)) + | - | + | N | + | O | + | N J J N N P P R V V V V V | + | ' E C C D I J J J N N N O R R B T V B B B B B W ` | + | ' , - . 
: C D T N J R S N P S S P B R O B D G N P Z P ` | + -------+-------------------------------------------------------------------------------------+ + '' | <3> . . . . . . . . . . . . . . . . . . . . . . . . . . . | + , | .<11> . . . . . . . . . . . . . . . . . . . . . . . . . . | + -NONE- | . . <.> . . . 4 . . 4 . . 7 2 . . . 1 . . . . . . 3 . . . | + . | . . .<10> . . . . . . . . . . . . . . . . . . . . . . . . | + : | . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . | + CC | . . . . . <5> . . . . . . . . . . . . . . . . . . . . . . | + CD | . . . . . .<13> . . . . . . . . . . . . . . . . . . . . . | + DT | . . . . . . .<28> . . . . . . . . . . . . . . . . . . . . | + IN | . . . . . . . .<34> . . . . . . . . . . . . . . . . . . . | + JJ | . . . . . . . . .<10> . . . 1 . . . . 1 . . . . . . . . . | + JJR | . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . | + JJS | . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . | + NN | . . . . . . . . . 1 . .<26> . . . . . . . . . . . . . . . | + NNP | . . . . . . . . . . . . .<25> . . . . . . . . . . . . . . | + NNS | . . . . . . . . . . . . . .<22> . . . . . . . . . . . . . | + POS | . . . . . . . . . . . . . . . <1> . . . . . . . . . . . . | + PRP | . . . . . . . . . . . . . . . . <3> . . . . . . . . . . . | + RB | . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . | + RBR | . . . . . . . . . . . . . . . . . . <.> . . . . . . . . . | + TO | . . . . . . . . . . . . . . . . . . . <2> . . . . . . . . | + VB | . . . . . . . . . . . . . . . . . . . . <1> . . . . . . . | + VBD | . . . . . . . . . . . . . . . . . . . . .<12> . 1 . . . . | + VBG | . . . . . . . . . . . . . . . . . . . . . . <3> . . . . . | + VBN | . . . . . . . . . 2 . . . . . . . . . . . 2 . <5> . . . . | + VBP | . . . . . . . . . . . . 1 . . . . . . . . . . . <4> . . . | + VBZ | . . . . . . . . . . . . . . . . . . . . . . . . . <2> . . | + WP | . . . . . . . . . . . . . . . . . . . . . . . . . . <3> . | + `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . <3>| + -------+-------------------------------------------------------------------------------------+ + (row = reference; col = test) + + +Brill Trainer with evaluation +============================= + + >>> # Perform the relevant imports. + >>> from nltk.tbl.template import Template + >>> from nltk.tag.brill import Pos, Word + >>> from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer, UnigramTagger + + >>> # Load some data + >>> from nltk.corpus import treebank + >>> training_data = treebank.tagged_sents()[:100] + >>> baseline_data = treebank.tagged_sents()[100:200] + >>> gold_data = treebank.tagged_sents()[200:300] + >>> testing_data = [untag(s) for s in gold_data] + + >>> backoff = RegexpTagger([ + ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers + ... (r'(The|the|A|a|An|an)$', 'AT'), # articles + ... (r'.*able$', 'JJ'), # adjectives + ... (r'.*ness$', 'NN'), # nouns formed from adjectives + ... (r'.*ly$', 'RB'), # adverbs + ... (r'.*s$', 'NNS'), # plural nouns + ... (r'.*ing$', 'VBG'), # gerunds + ... (r'.*ed$', 'VBD'), # past tense verbs + ... (r'.*', 'NN') # nouns (default) + ... ]) + +We've now created a simple ``RegexpTagger``, which tags according to the regular expression +rules it has been supplied. This tagger in and of itself does not have a great accuracy. + + >>> backoff.accuracy(gold_data) #doctest: +ELLIPSIS + 0.245014... + +Neither does a simple ``UnigramTagger``. This tagger is trained on some data, +and will then first try to match unigrams (i.e. 
tokens) of the sentence it has +to tag to the learned data. + + >>> unigram_tagger = UnigramTagger(baseline_data) + >>> unigram_tagger.accuracy(gold_data) #doctest: +ELLIPSIS + 0.581196... + +The lackluster accuracy here can be explained with the following example: + + >>> unigram_tagger.tag(["I", "would", "like", "this", "sentence", "to", "be", "tagged"]) + [('I', 'NNP'), ('would', 'MD'), ('like', None), ('this', 'DT'), ('sentence', None), + ('to', 'TO'), ('be', 'VB'), ('tagged', None)] + +As you can see, many tokens are tagged as ``None``, as these tokens are OOV (out of vocabulary). +The ``UnigramTagger`` has never seen them, and as a result they are not in its database of known terms. + +In practice, a ``UnigramTagger`` is exclusively used in conjunction with a *backoff*. Our real +baseline which will use such a backoff. We'll create a ``UnigramTagger`` like before, but now +the ``RegexpTagger`` will be used as a backoff for the situations where the ``UnigramTagger`` +encounters an OOV token. + + >>> baseline = UnigramTagger(baseline_data, backoff=backoff) + >>> baseline.accuracy(gold_data) #doctest: +ELLIPSIS + 0.7537647... + +That is already much better. We can investigate the performance further by running +``evaluate_per_tag``. This method will output the *Precision*, *Recall* and *F-measure* +of each tag. + + >>> print(baseline.evaluate_per_tag(gold_data, sort_by_count=True)) + Tag | Prec. | Recall | F-measure + -------+--------+--------+----------- + NNP | 0.9674 | 0.2738 | 0.4269 + NN | 0.4111 | 0.9136 | 0.5670 + IN | 0.9383 | 0.9580 | 0.9480 + DT | 0.9819 | 0.8859 | 0.9314 + JJ | 0.8167 | 0.2970 | 0.4356 + NNS | 0.7393 | 0.9630 | 0.8365 + -NONE- | 1.0000 | 0.8345 | 0.9098 + , | 1.0000 | 1.0000 | 1.0000 + . | 1.0000 | 1.0000 | 1.0000 + VBD | 0.6429 | 0.8804 | 0.7431 + CD | 1.0000 | 0.9872 | 0.9935 + CC | 1.0000 | 0.9355 | 0.9667 + VB | 0.7778 | 0.3684 | 0.5000 + VBN | 0.9375 | 0.3000 | 0.4545 + RB | 0.7778 | 0.7447 | 0.7609 + TO | 1.0000 | 1.0000 | 1.0000 + VBZ | 0.9643 | 0.6429 | 0.7714 + VBG | 0.6415 | 0.9444 | 0.7640 + PRP$ | 1.0000 | 1.0000 | 1.0000 + PRP | 1.0000 | 0.5556 | 0.7143 + MD | 1.0000 | 1.0000 | 1.0000 + VBP | 0.6471 | 0.5789 | 0.6111 + POS | 1.0000 | 1.0000 | 1.0000 + $ | 1.0000 | 0.8182 | 0.9000 + '' | 1.0000 | 1.0000 | 1.0000 + : | 1.0000 | 1.0000 | 1.0000 + WDT | 0.4000 | 0.2000 | 0.2667 + `` | 1.0000 | 1.0000 | 1.0000 + JJR | 1.0000 | 0.5000 | 0.6667 + NNPS | 0.0000 | 0.0000 | 0.0000 + RBR | 1.0000 | 1.0000 | 1.0000 + -LRB- | 0.0000 | 0.0000 | 0.0000 + -RRB- | 0.0000 | 0.0000 | 0.0000 + RP | 0.6667 | 0.6667 | 0.6667 + EX | 0.5000 | 0.5000 | 0.5000 + JJS | 0.0000 | 0.0000 | 0.0000 + WP | 1.0000 | 1.0000 | 1.0000 + PDT | 0.0000 | 0.0000 | 0.0000 + AT | 0.0000 | 0.0000 | 0.0000 + + +It's clear that although the precision of tagging `"NNP"` is high, the recall is very low. +With other words, we're missing a lot of cases where the true label is `"NNP"`. We can see +a similar effect with `"JJ"`. + +We can also see a very expected result: The precision of `"NN"` is low, while the recall +is high. If a term is OOV (i.e. ``UnigramTagger`` defers it to ``RegexpTagger``) and +``RegexpTagger`` doesn't have a good rule for it, then it will be tagged as `"NN"`. So, +we catch almost all tokens that are truly labeled as `"NN"`, but we also tag as `"NN"` +for many tokens that shouldn't be `"NN"`. + +This method gives us some insight in what parts of the tagger needs more attention, and why. 
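The numbers in this table are also available programmatically: the ``precision``, ``recall`` and ``f_measure`` methods that this patch adds to ``TaggerI`` each return a dictionary mapping every tag to its score. As a rough sketch (outputs omitted, hence the skips), the weakest tags by recall could be listed with:

    >>> tag_recall = baseline.recall(gold_data)     # doctest: +SKIP
    >>> sorted(tag_recall, key=tag_recall.get)[:5]  # doctest: +SKIP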
+However, it doesn't tell us what the terms with true label `"NNP"` or `"JJ"` are actually +tagged as. +To help that, we can create a confusion matrix. + + >>> print(baseline.confusion(gold_data)) + | - | + | - N - | + | L O R N P | + | R N R J J N N N P P P R R V V V V V W | + | ' B E B A C C D E I J J J M N N P N D O R P R B R T V B B B B B D W ` | + | $ ' , - - - . : T C D T X N J R S D N P S S T S P $ B R P O B D G N P Z T P ` | + -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ + $ | <9> . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . . . . . . . . | + '' | . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + , | . .<115> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + -LRB- | . . . <.> . . . . . . . . . . . . . . 3 . . . . . . . . . . . . . . . . . . . . | + -NONE- | . . . .<121> . . . . . . . . . . . . . 24 . . . . . . . . . . . . . . . . . . . . | + -RRB- | . . . . . <.> . . . . . . . . . . . . 3 . . . . . . . . . . . . . . . . . . . . | + . | . . . . . .<100> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + : | . . . . . . . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + AT | . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + CC | . . . . . . . . . <58> . . . . . . . . 4 . . . . . . . . . . . . . . . . . . . . | + CD | . . . . . . . . . . <77> . . . . . . . 1 . . . . . . . . . . . . . . . . . . . . | + DT | . . . . . . . . 1 . .<163> . 4 . . . . 13 . . . . . . . . . . . . . . . . . 3 . . | + EX | . . . . . . . . . . . . <1> . . . . . 1 . . . . . . . . . . . . . . . . . . . . | + IN | . . . . . . . . . . . . .<228> . . . . 8 . . . . . . . . . . . . . 2 . . . . . . | + JJ | . . . . . . . . . . . . . . <49> . . . 86 2 . 4 . . . . 6 . . . . 12 3 . 3 . . . . | + JJR | . . . . . . . . . . . . . . . <3> . . 3 . . . . . . . . . . . . . . . . . . . . | + JJS | . . . . . . . . . . . . . . . . <.> . 2 . . . . . . . . . . . . . . . . . . . . | + MD | . . . . . . . . . . . . . . . . . <19> . . . . . . . . . . . . . . . . . . . . . | + NN | . . . . . . . . . . . . . . 9 . . .<296> . . 5 . . . . . . . . 5 . 9 . . . . . . | + NNP | . . . . . . . . . . . 2 . . . . . . 199 <89> . 26 . . . . 2 . . . . 2 5 . . . . . . | + NNPS | . . . . . . . . . . . . . . . . . . . 1 <.> 3 . . . . . . . . . . . . . . . . . | + NNS | . . . . . . . . . . . . . . . . . . 5 . .<156> . . . . . . . . . . . . . 1 . . . | + PDT | . . . . . . . . . . . 1 . . . . . . . . . . <.> . . . . . . . . . . . . . . . . | + POS | . . . . . . . . . . . . . . . . . . . . . . . <14> . . . . . . . . . . . . . . . | + PRP | . . . . . . . . . . . . . . . . . . 10 . . 2 . . <15> . . . . . . . . . . . . . . | + PRP$ | . . . . . . . . . . . . . . . . . . . . . . . . . <28> . . . . . . . . . . . . . | + RB | . . . . . . . . . . . . 1 4 . . . . 6 . . . . . . . <35> . 1 . . . . . . . . . . | + RBR | . . . . . . . . . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | + RP | . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . <2> . . . . . . . . . . | + TO | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <47> . . . . . . . . . | + VB | . . . . . . . . . . . . . . 2 . . . 30 . . . . . . . 1 . . . <21> . . . 3 . . . . | + VBD | . . . . . . . . . . . . . . . . . . 10 . . . . . . . . . . . . <81> . 1 . . . . . | + VBG | . . . . . . . . . . . . . . . . . . 2 . . 
. . . . . . . . . . . <34> . . . . . . | + VBN | . . . . . . . . . . . . . . . . . . 4 . . . . . . . . . . . . 31 . <15> . . . . . | + VBP | . . . . . . . . . . . . . . . . . . 7 . . . . . . . . . . . 1 . . . <11> . . . . | + VBZ | . . . . . . . . . . . . . . . . . . . . . 15 . . . . . . . . . . . . . <27> . . . | + WDT | . . . . . . . . . . . . . 7 . . . . 1 . . . . . . . . . . . . . . . . . <2> . . | + WP | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <2> . | + `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <10>| + -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ + (row = reference; col = test) + + +Once again we can see that `"NN"` is the default if the tagger isn't sure. Beyond that, +we can see why the recall for `"NNP"` is so low: these tokens are often tagged as `"NN"`. +This effect can also be seen for `"JJ"`, where the majority of tokens that ought to be +tagged as `"JJ"` are actually tagged as `"NN"` by our tagger. + +This tagger will only serve as a baseline for the ``BrillTaggerTrainer``, which uses +templates to attempt to improve the performance of the tagger. + + >>> # Set up templates + >>> Template._cleartemplates() #clear any templates created in earlier tests + >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))] + + >>> # Construct a BrillTaggerTrainer + >>> tt = BrillTaggerTrainer(baseline, templates, trace=3) + >>> tagger1 = tt.train(training_data, max_rules=10) + TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None) + Finding initial useful rules... + Found 618 useful rules. + + B | + S F r O | Score = Fixed - Broken + c i o t | R Fixed = num tags changed incorrect -> correct + o x k h | u Broken = num tags changed correct -> incorrect + r e e e | l Other = num tags changed incorrect -> incorrect + e d n r | e + ------------------+------------------------------------------------------- + 13 14 1 4 | NN->VB if Pos:TO@[-1] + 8 8 0 0 | NN->VB if Pos:MD@[-1] + 7 10 3 22 | NN->IN if Pos:NNS@[-1] + 5 5 0 0 | NN->VBP if Pos:PRP@[-1] + 5 5 0 0 | VBD->VBN if Pos:VBZ@[-1] + 5 5 0 0 | NNS->NN if Pos:IN@[-1] & Word:asbestos@[0] + 4 4 0 0 | NN->-NONE- if Pos:WP@[-1] + 4 4 0 3 | NN->NNP if Pos:-NONE-@[-1] + 4 6 2 2 | NN->NNP if Pos:NNP@[-1] + 4 4 0 0 | NNS->VBZ if Pos:PRP@[-1] + + >>> tagger1.rules()[1:3] + (Rule('000', 'NN', 'VB', [(Pos([-1]),'MD')]), Rule('000', 'NN', 'IN', [(Pos([-1]),'NNS')])) + + >>> tagger1.print_template_statistics(printunused=False) + TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules) + TRAIN ( 2417 tokens) initial 555 0.7704 final: 496 0.7948 + #ID | Score (train) | #Rules | Template + -------------------------------------------- + 000 | 54 0.915 | 9 0.900 | Template(Pos([-1])) + 001 | 5 0.085 | 1 0.100 | Template(Pos([-1]),Word([0])) + + + + >>> tagger1.accuracy(gold_data) # doctest: +ELLIPSIS + 0.769230... + + >>> print(tagger1.evaluate_per_tag(gold_data, sort_by_count=True)) + Tag | Prec. | Recall | F-measure + -------+--------+--------+----------- + NNP | 0.8298 | 0.3600 | 0.5021 + NN | 0.4435 | 0.8364 | 0.5797 + IN | 0.8476 | 0.9580 | 0.8994 + DT | 0.9819 | 0.8859 | 0.9314 + JJ | 0.8167 | 0.2970 | 0.4356 + NNS | 0.7464 | 0.9630 | 0.8410 + -NONE- | 1.0000 | 0.8414 | 0.9139 + , | 1.0000 | 1.0000 | 1.0000 + . 
| 1.0000 | 1.0000 | 1.0000 + VBD | 0.6723 | 0.8696 | 0.7583 + CD | 1.0000 | 0.9872 | 0.9935 + CC | 1.0000 | 0.9355 | 0.9667 + VB | 0.8103 | 0.8246 | 0.8174 + VBN | 0.9130 | 0.4200 | 0.5753 + RB | 0.7778 | 0.7447 | 0.7609 + TO | 1.0000 | 1.0000 | 1.0000 + VBZ | 0.9667 | 0.6905 | 0.8056 + VBG | 0.6415 | 0.9444 | 0.7640 + PRP$ | 1.0000 | 1.0000 | 1.0000 + PRP | 1.0000 | 0.5556 | 0.7143 + MD | 1.0000 | 1.0000 | 1.0000 + VBP | 0.6316 | 0.6316 | 0.6316 + POS | 1.0000 | 1.0000 | 1.0000 + $ | 1.0000 | 0.8182 | 0.9000 + '' | 1.0000 | 1.0000 | 1.0000 + : | 1.0000 | 1.0000 | 1.0000 + WDT | 0.4000 | 0.2000 | 0.2667 + `` | 1.0000 | 1.0000 | 1.0000 + JJR | 1.0000 | 0.5000 | 0.6667 + NNPS | 0.0000 | 0.0000 | 0.0000 + RBR | 1.0000 | 1.0000 | 1.0000 + -LRB- | 0.0000 | 0.0000 | 0.0000 + -RRB- | 0.0000 | 0.0000 | 0.0000 + RP | 0.6667 | 0.6667 | 0.6667 + EX | 0.5000 | 0.5000 | 0.5000 + JJS | 0.0000 | 0.0000 | 0.0000 + WP | 1.0000 | 1.0000 | 1.0000 + PDT | 0.0000 | 0.0000 | 0.0000 + AT | 0.0000 | 0.0000 | 0.0000 + + + >>> print(tagger1.confusion(gold_data)) + | - | + | - N - | + | L O R N P | + | R N R J J N N N P P P R R V V V V V W | + | ' B E B A C C D E I J J J M N N P N D O R P R B R T V B B B B B D W ` | + | $ ' , - - - . : T C D T X N J R S D N P S S T S P $ B R P O B D G N P Z T P ` | + -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ + $ | <9> . . . . . . . . . . . . . . . . . 1 . . . . . . . . . . . 1 . . . . . . . . | + '' | . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + , | . .<115> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + -LRB- | . . . <.> . . . . . . . . . 1 . . . . 2 . . . . . . . . . . . . . . . . . . . . | + -NONE- | . . . .<122> . . . . . . . . 1 . . . . 22 . . . . . . . . . . . . . . . . . . . . | + -RRB- | . . . . . <.> . . . . . . . . . . . . 2 1 . . . . . . . . . . . . . . . . . . . | + . | . . . . . .<100> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + : | . . . . . . . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + AT | . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + CC | . . . . . . . . . <58> . . . . . . . . 2 1 . . . . . . . . . . . . . . 1 . . . . | + CD | . . . . . . . . . . <77> . . . . . . . 1 . . . . . . . . . . . . . . . . . . . . | + DT | . . . . . . . . 1 . .<163> . 5 . . . . 12 . . . . . . . . . . . . . . . . . 3 . . | + EX | . . . . . . . . . . . . <1> . . . . . 1 . . . . . . . . . . . . . . . . . . . . | + IN | . . . . . . . . . . . . .<228> . . . . 8 . . . . . . . . . . . . . 2 . . . . . . | + JJ | . . . . . . . . . . . . . 4 <49> . . . 79 4 . 4 . . . . 6 . . . 1 12 3 . 3 . . . . | + JJR | . . . . . . . . . . . . . 2 . <3> . . 1 . . . . . . . . . . . . . . . . . . . . | + JJS | . . . . . . . . . . . . . . . . <.> . 2 . . . . . . . . . . . . . . . . . . . . | + MD | . . . . . . . . . . . . . . . . . <19> . . . . . . . . . . . . . . . . . . . . . | + NN | . . . . . . . . . . . . . 7 9 . . .<271> 16 . 5 . . . . . . . . 7 . 9 . . . . . . | + NNP | . . . . . . . . . . . 2 . 7 . . . . 163<117> . 26 . . . . 2 . . . 1 2 5 . . . . . . | + NNPS | . . . . . . . . . . . . . . . . . . . 1 <.> 3 . . . . . . . . . . . . . . . . . | + NNS | . . . . . . . . . . . . . . . . . . 5 . .<156> . . . . . . . . . . . . . 1 . . . | + PDT | . . . . . . . . . . . 1 . . . . . . . . . . <.> . . . . . . . . . . . . . . . . | + POS | . 
. . . . . . . . . . . . . . . . . . . . . . <14> . . . . . . . . . . . . . . . | + PRP | . . . . . . . . . . . . . . . . . . 10 . . 2 . . <15> . . . . . . . . . . . . . . | + PRP$ | . . . . . . . . . . . . . . . . . . . . . . . . . <28> . . . . . . . . . . . . . | + RB | . . . . . . . . . . . . 1 4 . . . . 6 . . . . . . . <35> . 1 . . . . . . . . . . | + RBR | . . . . . . . . . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | + RP | . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . <2> . . . . . . . . . . | + TO | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <47> . . . . . . . . . | + VB | . . . . . . . . . . . . . . 2 . . . 4 . . . . . . . 1 . . . <47> . . . 3 . . . . | + VBD | . . . . . . . . . . . . . 1 . . . . 8 1 . . . . . . . . . . . <80> . 2 . . . . . | + VBG | . . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . <34> . . . . . . | + VBN | . . . . . . . . . . . . . . . . . . 4 . . . . . . . . . . . . 25 . <21> . . . . . | + VBP | . . . . . . . . . . . . . 2 . . . . 4 . . . . . . . . . . . 1 . . . <12> . . . . | + VBZ | . . . . . . . . . . . . . . . . . . . . . 13 . . . . . . . . . . . . . <29> . . . | + WDT | . . . . . . . . . . . . . 7 . . . . 1 . . . . . . . . . . . . . . . . . <2> . . | + WP | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <2> . | + `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <10>| + -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ + (row = reference; col = test) + + + >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) + >>> tagged[33][12:] + [('foreign', 'NN'), ('debt', 'NN'), ('of', 'IN'), ('$', '$'), ('64', 'CD'), + ('billion', 'CD'), ('*U*', '-NONE-'), ('--', ':'), ('the', 'DT'), ('third-highest', 'NN'), + ('in', 'IN'), ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')] + Regression Tests ~~~~~~~~~~~~~~~~