From a28d256b9ab2af4373fac63375259c70f327459e Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Thu, 16 Dec 2021 00:01:15 +0100 Subject: [PATCH] Add Precision, Recall, F-measure, Confusion Matrix to Taggers (#2862) * Add Precision, Recall, F-measure, Confusion Matrix and per-tag evaluation to Taggers And add precision, recall and f-measure to ConfusionMatrix. Includes large doctests, and some small doctest fixes throughout the tag module * Move evaluation of ConfusionMatrix into nltk\metrics\confusionmatrix.py * Add self as author in significantly updated files * Deprecate tagger evaluate(gold) in favor of accuracy(gold) * Missed one case of Tagger evaluate still being used - fixed now * Deprecate ChunkParser's evaluate(gold) in favor of accuracy(gold) Co-authored-by: Steven Bird --- nltk/chunk/api.py | 5 + nltk/metrics/confusionmatrix.py | 137 ++++++++++ nltk/tag/__init__.py | 8 +- nltk/tag/api.py | 221 +++++++++++++++- nltk/tag/brill_trainer.py | 14 +- nltk/tag/crf.py | 4 +- nltk/tag/perceptron.py | 2 +- nltk/tag/tnt.py | 8 +- nltk/tbl/demo.py | 4 +- nltk/test/metrics.doctest | 21 ++ nltk/test/probability.doctest | 2 +- nltk/test/tag.doctest | 432 ++++++++++++++++++++++++++++++++ 12 files changed, 833 insertions(+), 25 deletions(-) diff --git a/nltk/chunk/api.py b/nltk/chunk/api.py index de4bb93958..56a63d2743 100644 --- a/nltk/chunk/api.py +++ b/nltk/chunk/api.py @@ -11,6 +11,7 @@ ##////////////////////////////////////////////////////// from nltk.chunk.util import ChunkScore +from nltk.internals import deprecated from nltk.parse import ParserI @@ -34,7 +35,11 @@ def parse(self, tokens): """ raise NotImplementedError() + @deprecated("Use accuracy(gold) instead.") def evaluate(self, gold): + return self.accuracy(gold) + + def accuracy(self, gold): """ Score the accuracy of the chunker against the gold standard. Remove the chunking the gold standard text, rechunk it using diff --git a/nltk/metrics/confusionmatrix.py b/nltk/metrics/confusionmatrix.py index 1dc7121082..5fbcbe3493 100644 --- a/nltk/metrics/confusionmatrix.py +++ b/nltk/metrics/confusionmatrix.py @@ -3,6 +3,7 @@ # Copyright (C) 2001-2021 NLTK Project # Author: Edward Loper # Steven Bird +# Tom Aarsen <> # URL: # For license information, see LICENSE.TXT @@ -201,6 +202,140 @@ def key(self): return str + def recall(self, value): + """Given a value in the confusion matrix, return the recall + that corresponds to this value. The recall is defined as: + + - *r* = true positive / (true positive + false negative) + + and can loosely be considered the ratio of how often ``value`` + was predicted correctly relative to how often ``value`` was + the true result. + + :param value: value used in the ConfusionMatrix + :return: the recall corresponding to ``value``. + :rtype: float + """ + # Number of times `value` was correct, and also predicted + TP = self[value, value] + # Number of times `value` was correct + TP_FN = sum(self[value, pred_value] for pred_value in self._values) + if TP_FN == 0: + return 0.0 + return TP / TP_FN + + def precision(self, value): + """Given a value in the confusion matrix, return the precision + that corresponds to this value. The precision is defined as: + + - *p* = true positive / (true positive + false positive) + + and can loosely be considered the ratio of how often ``value`` + was predicted correctly relative to the number of predictions + for ``value``. + + :param value: value used in the ConfusionMatrix + :return: the precision corresponding to ``value``.
+ :rtype: float + """ + # Number of times `value` was correct, and also predicted + TP = self[value, value] + # Number of times `value` was predicted + TP_FP = sum(self[real_value, value] for real_value in self._values) + if TP_FP == 0: + return 0.0 + return TP / TP_FP + + def f_measure(self, value, alpha=0.5): + """ + Given a value used in the confusion matrix, return the f-measure + that corresponds to this value. The f-measure is the harmonic mean + of the ``precision`` and ``recall``, weighted by ``alpha``. + In particular, given the precision *p* and recall *r* defined by: + + - *p* = true positive / (true positive + false positive) + - *r* = true positive / (true positive + false negative) + + The f-measure is: + + - *1/(alpha/p + (1-alpha)/r)* + + With ``alpha = 0.5``, this reduces to: + + - *2pr / (p + r)* + + :param value: value used in the ConfusionMatrix + :param alpha: Ratio of the cost of false negative compared to false + positives. Defaults to 0.5, where the costs are equal. + :type alpha: float + :return: the F-measure corresponding to ``value``. + :rtype: float + """ + p = self.precision(value) + r = self.recall(value) + if p == 0.0 or r == 0.0: + return 0.0 + return 1.0 / (alpha / p + (1 - alpha) / r) + + def evaluate(self, alpha=0.5, truncate=None, sort_by_count=False): + """ + Tabulate the **recall**, **precision** and **f-measure** + for each value in this confusion matrix. + + >>> reference = "DET NN VB DET JJ NN NN IN DET NN".split() + >>> test = "DET VB VB DET NN NN NN IN DET NN".split() + >>> cm = ConfusionMatrix(reference, test) + >>> print(cm.evaluate()) + Tag | Prec. | Recall | F-measure + ----+--------+--------+----------- + DET | 1.0000 | 1.0000 | 1.0000 + IN | 1.0000 | 1.0000 | 1.0000 + JJ | 0.0000 | 0.0000 | 0.0000 + NN | 0.7500 | 0.7500 | 0.7500 + VB | 0.5000 | 1.0000 | 0.6667 + + + :param alpha: Ratio of the cost of false negative compared to false + positives, as used in the f-measure computation. Defaults to 0.5, + where the costs are equal. + :type alpha: float + :param truncate: If specified, then only show the specified + number of values. Any sorting (e.g., sort_by_count) + will be performed before truncation. Defaults to None + :type truncate: int, optional + :param sort_by_count: Whether to sort the outputs on frequency + in the reference label. Defaults to False. + :type sort_by_count: bool, optional + :return: A tabulated recall, precision and f-measure string + :rtype: str + """ + tags = self._values + + # Apply keyword parameters + if sort_by_count: + tags = sorted(tags, key=lambda v: -sum(self._confusion[self._indices[v]])) + if truncate: + tags = tags[:truncate] + + tag_column_len = max(max(len(tag) for tag in tags), 3) + + # Construct the header + s = ( + f"{' ' * (tag_column_len - 3)}Tag | Prec. 
| Recall | F-measure\n" + f"{'-' * tag_column_len}-+--------+--------+-----------\n" + ) + + # Construct the body + for tag in tags: + s += ( + f"{tag:>{tag_column_len}} | " + f"{self.precision(tag):<6.4f} | " + f"{self.recall(tag):<6.4f} | " + f"{self.f_measure(tag, alpha=alpha):.4f}\n" + ) + + return s + def demo(): reference = "DET NN VB DET JJ NN NN IN DET NN".split() @@ -211,6 +346,8 @@ def demo(): print(ConfusionMatrix(reference, test)) print(ConfusionMatrix(reference, test).pretty_format(sort_by_count=True)) + print(ConfusionMatrix(reference, test).recall("VB")) + if __name__ == "__main__": demo() diff --git a/nltk/tag/__init__.py b/nltk/tag/__init__.py index ce7610e171..36446de271 100644 --- a/nltk/tag/__init__.py +++ b/nltk/tag/__init__.py @@ -21,7 +21,7 @@ An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset: >>> from nltk import pos_tag, word_tokenize - >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) + >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'), ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')] @@ -57,7 +57,7 @@ We evaluate a tagger on data that was not seen during training: - >>> tagger.evaluate(brown.tagged_sents(categories='news')[500:600]) + >>> tagger.accuracy(brown.tagged_sents(categories='news')[500:600]) 0.7... For more information, please consult chapter 5 of the NLTK Book. @@ -144,10 +144,10 @@ def pos_tag(tokens, tagset=None, lang="eng"): >>> from nltk.tag import pos_tag >>> from nltk.tokenize import word_tokenize - >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) + >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'), ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')] - >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal') + >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal') # doctest: +NORMALIZE_WHITESPACE [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'), ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')] diff --git a/nltk/tag/api.py b/nltk/tag/api.py index 9ef6513549..25ffd1e0a4 100644 --- a/nltk/tag/api.py +++ b/nltk/tag/api.py @@ -3,6 +3,7 @@ # Copyright (C) 2001-2021 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) +# Tom Aarsen <> # URL: # For license information, see LICENSE.TXT @@ -11,10 +12,12 @@ information, such as its part of speech. """ from abc import ABCMeta, abstractmethod +from functools import lru_cache from itertools import chain +from typing import Dict -from nltk.internals import overridden -from nltk.metrics import accuracy +from nltk.internals import deprecated, overridden +from nltk.metrics import ConfusionMatrix, accuracy from nltk.tag.util import untag @@ -47,20 +50,24 @@ def tag(self, tokens): def tag_sents(self, sentences): """ - Apply ``self.tag()`` to each element of *sentences*. I.e.: + Apply ``self.tag()`` to each element of *sentences*. I.e.:: return [self.tag(sent) for sent in sentences] """ return [self.tag(sent) for sent in sentences] + @deprecated("Use accuracy(gold) instead.") def evaluate(self, gold): + return self.accuracy(gold) + + def accuracy(self, gold): """ Score the accuracy of the tagger against the gold standard. 
Strip the tags from the gold standard text, retag it using the tagger, then compute the accuracy score. - :type gold: list(list(tuple(str, str))) :param gold: The list of tagged sentences to score the tagger on. + :type gold: list(list(tuple(str, str))) :rtype: float """ @@ -69,6 +76,212 @@ def evaluate(self, gold): test_tokens = list(chain.from_iterable(tagged_sents)) return accuracy(gold_tokens, test_tokens) + @lru_cache(maxsize=1) + def _confusion_cached(self, gold): + """ + Inner function used after ``gold`` is converted to a + ``tuple(tuple(tuple(str, str)))``. That way, we can use caching on + creating a ConfusionMatrix. + + :param gold: The list of tagged sentences to run the tagger with, + also used as the reference values in the generated confusion matrix. + :type gold: tuple(tuple(tuple(str, str))) + :rtype: ConfusionMatrix + """ + + tagged_sents = self.tag_sents(untag(sent) for sent in gold) + gold_tokens = [token for _word, token in chain.from_iterable(gold)] + test_tokens = [token for _word, token in chain.from_iterable(tagged_sents)] + return ConfusionMatrix(gold_tokens, test_tokens) + + def confusion(self, gold): + """ + Return a ConfusionMatrix with the tags from ``gold`` as the reference + values, with the predictions from ``tag_sents`` as the predicted values. + + >>> from nltk.tag import PerceptronTagger + >>> from nltk.corpus import treebank + >>> tagger = PerceptronTagger() + >>> gold_data = treebank.tagged_sents()[:10] + >>> print(tagger.confusion(gold_data)) + | - | + | N | + | O P | + | N J J N N P P R R V V V V V W | + | ' E C C D E I J J J M N N N O R P R B R T V B B B B B D ` | + | ' , - . C D T X N J R S D N P S S P $ B R P O B D G N P Z T ` | + -------+----------------------------------------------------------------------------------------------+ + '' | <1> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + , | .<15> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + -NONE- | . . <.> . . 2 . . . 2 . . . 5 1 . . . . 2 . . . . . . . . . . . | + . | . . .<10> . . . . . . . . . . . . . . . . . . . . . . . . . . . | + CC | . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . . . . | + CD | . . . . . <5> . . . . . . . . . . . . . . . . . . . . . . . . . | + DT | . . . . . .<20> . . . . . . . . . . . . . . . . . . . . . . . . | + EX | . . . . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . | + IN | . . . . . . . .<22> . . . . . . . . . . 3 . . . . . . . . . . . | + JJ | . . . . . . . . .<16> . . . . 1 . . . . 1 . . . . . . . . . . . | + JJR | . . . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . | + JJS | . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . . . | + MD | . . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . . | + NN | . . . . . . . . . . . . .<28> 1 1 . . . . . . . . . . . . . . . | + NNP | . . . . . . . . . . . . . .<25> . . . . . . . . . . . . . . . . | + NNS | . . . . . . . . . . . . . . .<19> . . . . . . . . . . . . . . . | + POS | . . . . . . . . . . . . . . . . <1> . . . . . . . . . . . . . . | + PRP | . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . . . | + PRP$ | . . . . . . . . . . . . . . . . . . <2> . . . . . . . . . . . . | + RB | . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | + RBR | . . . . . . . . . . 1 . . . . . . . . . <1> . . . . . . . . . . | + RP | . . . . . . . . . . . . . . . . . . . . . <1> . . . . . . . . . | + TO | . . . . . . . . . . . . . . . . . . . . . . <5> . . . . . . . . | + VB | . . . . . . . . . . . . . . . . . . . . . . . <3> . 
. . . . . . | + VBD | . . . . . . . . . . . . . 1 . . . . . . . . . . <6> . . . . . . | + VBG | . . . . . . . . . . . . . 1 . . . . . . . . . . . <4> . . . . . | + VBN | . . . . . . . . . . . . . . . . . . . . . . . . 1 . <4> . . . . | + VBP | . . . . . . . . . . . . . . . . . . . . . . . . . . . <3> . . . | + VBZ | . . . . . . . . . . . . . . . . . . . . . . . . . . . . <7> . . | + WDT | . . . . . . . . 2 . . . . . . . . . . . . . . . . . . . . <.> . | + `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <1>| + -------+----------------------------------------------------------------------------------------------+ + (row = reference; col = test) + + + :param gold: The list of tagged sentences to run the tagger with, + also used as the reference values in the generated confusion matrix. + :type gold: list(list(tuple(str, str))) + :rtype: ConfusionMatrix + """ + + return self._confusion_cached(tuple(tuple(sent) for sent in gold)) + + def recall(self, gold) -> Dict[str, float]: + """ + Compute the recall for each tag from ``gold`` or from running ``tag`` + on the tokenized sentences from ``gold``. Then, return the dictionary + with mappings from tag to recall. The recall is defined as: + + - *r* = true positive / (true positive + false negative) + + :param gold: The list of tagged sentences to score the tagger on. + :type gold: list(list(tuple(str, str))) + :return: A mapping from tags to recall + :rtype: Dict[str, float] + """ + + cm = self.confusion(gold) + return {tag: cm.recall(tag) for tag in cm._values} + + def precision(self, gold): + """ + Compute the precision for each tag from ``gold`` or from running ``tag`` + on the tokenized sentences from ``gold``. Then, return the dictionary + with mappings from tag to precision. The precision is defined as: + + - *p* = true positive / (true positive + false positive) + + :param gold: The list of tagged sentences to score the tagger on. + :type gold: list(list(tuple(str, str))) + :return: A mapping from tags to precision + :rtype: Dict[str, float] + """ + + cm = self.confusion(gold) + return {tag: cm.precision(tag) for tag in cm._values} + + def f_measure(self, gold, alpha=0.5): + """ + Compute the f-measure for each tag from ``gold`` or from running ``tag`` + on the tokenized sentences from ``gold``. Then, return the dictionary + with mappings from tag to f-measure. The f-measure is the harmonic mean + of the ``precision`` and ``recall``, weighted by ``alpha``. + In particular, given the precision *p* and recall *r* defined by: + + - *p* = true positive / (true positive + false positive) + - *r* = true positive / (true positive + false negative) + + The f-measure is: + + - *1/(alpha/p + (1-alpha)/r)* + + With ``alpha = 0.5``, this reduces to: + + - *2pr / (p + r)* + + :param gold: The list of tagged sentences to score the tagger on. + :type gold: list(list(tuple(str, str))) + :param alpha: Ratio of the cost of false negative compared to false + positives. Defaults to 0.5, where the costs are equal. + :type alpha: float + :return: A mapping from tags to f-measure + :rtype: Dict[str, float] + """ + cm = self.confusion(gold) + return {tag: cm.f_measure(tag, alpha) for tag in cm._values} + + def evaluate_per_tag(self, gold, alpha=0.5, truncate=None, sort_by_count=False): + """Tabulate the **recall**, **precision** and **f-measure** + for each tag from ``gold`` or from running ``tag`` on the tokenized + sentences from ``gold``.
+ + >>> from nltk.tag import PerceptronTagger + >>> from nltk.corpus import treebank + >>> tagger = PerceptronTagger() + >>> gold_data = treebank.tagged_sents()[:10] + >>> print(tagger.evaluate_per_tag(gold_data)) + Tag | Prec. | Recall | F-measure + -------+--------+--------+----------- + '' | 1.0000 | 1.0000 | 1.0000 + , | 1.0000 | 1.0000 | 1.0000 + -NONE- | 0.0000 | 0.0000 | 0.0000 + . | 1.0000 | 1.0000 | 1.0000 + CC | 1.0000 | 1.0000 | 1.0000 + CD | 0.7143 | 1.0000 | 0.8333 + DT | 1.0000 | 1.0000 | 1.0000 + EX | 1.0000 | 1.0000 | 1.0000 + IN | 0.9167 | 0.8800 | 0.8980 + JJ | 0.8889 | 0.8889 | 0.8889 + JJR | 0.0000 | 0.0000 | 0.0000 + JJS | 1.0000 | 1.0000 | 1.0000 + MD | 1.0000 | 1.0000 | 1.0000 + NN | 0.8000 | 0.9333 | 0.8615 + NNP | 0.8929 | 1.0000 | 0.9434 + NNS | 0.9500 | 1.0000 | 0.9744 + POS | 1.0000 | 1.0000 | 1.0000 + PRP | 1.0000 | 1.0000 | 1.0000 + PRP$ | 1.0000 | 1.0000 | 1.0000 + RB | 0.4000 | 1.0000 | 0.5714 + RBR | 1.0000 | 0.5000 | 0.6667 + RP | 1.0000 | 1.0000 | 1.0000 + TO | 1.0000 | 1.0000 | 1.0000 + VB | 1.0000 | 1.0000 | 1.0000 + VBD | 0.8571 | 0.8571 | 0.8571 + VBG | 1.0000 | 0.8000 | 0.8889 + VBN | 1.0000 | 0.8000 | 0.8889 + VBP | 1.0000 | 1.0000 | 1.0000 + VBZ | 1.0000 | 1.0000 | 1.0000 + WDT | 0.0000 | 0.0000 | 0.0000 + `` | 1.0000 | 1.0000 | 1.0000 + + + :param gold: The list of tagged sentences to score the tagger on. + :type gold: list(list(tuple(str, str))) + :param alpha: Ratio of the cost of false negative compared to false + positives, as used in the f-measure computation. Defaults to 0.5, + where the costs are equal. + :type alpha: float + :param truncate: If specified, then only show the specified + number of values. Any sorting (e.g., sort_by_count) + will be performed before truncation. Defaults to None + :type truncate: int, optional + :param sort_by_count: Whether to sort the outputs on number of + occurrences of that tag in the ``gold`` data, defaults to False + :type sort_by_count: bool, optional + :return: A tabulated recall, precision and f-measure string + :rtype: str + """ + cm = self.confusion(gold) + return cm.evaluate(alpha=alpha, truncate=truncate, sort_by_count=sort_by_count) + def _check_params(self, train, model): if (train and model) or (not train and not model): raise ValueError("Must specify either training data or trained model.") diff --git a/nltk/tag/brill_trainer.py b/nltk/tag/brill_trainer.py index 0f1c5bea8c..ff2ee760b7 100644 --- a/nltk/tag/brill_trainer.py +++ b/nltk/tag/brill_trainer.py @@ -123,9 +123,8 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): ... ]) >>> baseline = backoff #see NOTE1 - - >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS - 0.2433862... + >>> baseline.accuracy(gold_data) #doctest: +ELLIPSIS + 0.2450142... >>> # Set up templates >>> Template._cleartemplates() #clear any templates created in earlier tests @@ -174,8 +173,8 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): - >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS - 0.43833... + >>> tagger1.accuracy(gold_data) # doctest: +ELLIPSIS + 0.43996... >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) @@ -211,8 +210,9 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): 18 18 0 0 | CD->-NONE- if Pos:NN@[-1] & Word:0@[0] 18 18 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0] - >>> tagger2.evaluate(gold_data) # doctest: +ELLIPSIS - 0.43996743... + >>> tagger2.accuracy(gold_data) # doctest: +ELLIPSIS + 0.44159544... 
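(Illustrative aside, not part of the original patch: because ``BrillTagger`` derives from ``TaggerI``, the per-tag evaluation methods added in ``nltk/tag/api.py`` above are available on the trained taggers in this doctest as well. A minimal sketch reusing ``tagger2`` and ``gold_data`` from the surrounding examples; outputs are omitted, hence the skips.)

    >>> print(tagger2.evaluate_per_tag(gold_data, truncate=10, sort_by_count=True))  # doctest: +SKIP
    >>> tagger2.f_measure(gold_data)['NN']  # per-tag dictionary lookup  # doctest: +SKIP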
+ >>> tagger2.rules()[2:4] (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')])) diff --git a/nltk/tag/crf.py b/nltk/tag/crf.py index cdcb4623be..5b1964e43f 100644 --- a/nltk/tag/crf.py +++ b/nltk/tag/crf.py @@ -35,13 +35,13 @@ class CRFTagger(TaggerI): [[('dog', 'Noun'), ('is', 'Verb'), ('good', 'Adj')], [('Cat', 'Noun'), ('eat', 'Verb'), ('meat', 'Noun')]] >>> gold_sentences = [[('dog','Noun'),('is','Verb'),('good','Adj')] , [('Cat','Noun'),('eat','Verb'), ('meat','Noun')]] - >>> ct.evaluate(gold_sentences) + >>> ct.accuracy(gold_sentences) 1.0 Setting learned model file >>> ct = CRFTagger() >>> ct.set_model_file('model.crf.tagger') - >>> ct.evaluate(gold_sentences) + >>> ct.accuracy(gold_sentences) 1.0 """ diff --git a/nltk/tag/perceptron.py b/nltk/tag/perceptron.py index a18c0c2069..02ff0865b8 100644 --- a/nltk/tag/perceptron.py +++ b/nltk/tag/perceptron.py @@ -363,7 +363,7 @@ def _get_pretrain_model(): print("Size of training and testing (sentence)", len(training), len(testing)) # Train and save the model tagger.train(training, PICKLE) - print("Accuracy : ", tagger.evaluate(testing)) + print("Accuracy : ", tagger.accuracy(testing)) if __name__ == "__main__": diff --git a/nltk/tag/tnt.py b/nltk/tag/tnt.py index 9174e498c7..e4cbf74b3e 100755 --- a/nltk/tag/tnt.py +++ b/nltk/tag/tnt.py @@ -492,7 +492,7 @@ def demo2(): s.train(d[(11) * 100 :]) for i in range(10): - tacc = t.evaluate(d[i * 100 : ((i + 1) * 100)]) + tacc = t.accuracy(d[i * 100 : ((i + 1) * 100)]) tp_un = t.unknown / (t.known + t.unknown) tp_kn = t.known / (t.known + t.unknown) t.unknown = 0 @@ -504,7 +504,7 @@ def demo2(): print("Percentage unknown:", tp_un) print("Accuracy over known words:", (tacc / tp_kn)) - sacc = s.evaluate(d[i * 100 : ((i + 1) * 100)]) + sacc = s.accuracy(d[i * 100 : ((i + 1) * 100)]) sp_un = s.unknown / (s.known + s.unknown) sp_kn = s.known / (s.known + s.unknown) s.unknown = 0 @@ -550,14 +550,14 @@ def demo3(): t.train(dtrain) s.train(etrain) - tacc = t.evaluate(dtest) + tacc = t.accuracy(dtest) tp_un = t.unknown / (t.known + t.unknown) tp_kn = t.known / (t.known + t.unknown) tknown += tp_kn t.unknown = 0 t.known = 0 - sacc = s.evaluate(etest) + sacc = s.accuracy(etest) sp_un = s.unknown / (s.known + s.unknown) sp_kn = s.known / (s.known + s.unknown) sknown += sp_kn diff --git a/nltk/tbl/demo.py b/nltk/tbl/demo.py index 8bdb8d360b..4ec84cb9dc 100644 --- a/nltk/tbl/demo.py +++ b/nltk/tbl/demo.py @@ -261,7 +261,7 @@ def postag( if gold_data: print( " Accuracy on test set: {:0.4f}".format( - baseline_tagger.evaluate(gold_data) + baseline_tagger.accuracy(gold_data) ) ) @@ -274,7 +274,7 @@ def postag( brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc) print(f"Trained tbl tagger in {time.time() - tbrill:0.2f} seconds") if gold_data: - print(" Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data)) + print(" Accuracy on test set: %.4f" % brill_tagger.accuracy(gold_data)) # printing the learned rules, if learned silently if trace == 1: diff --git a/nltk/test/metrics.doctest b/nltk/test/metrics.doctest index 42df6f42e9..d4852c12f1 100644 --- a/nltk/test/metrics.doctest +++ b/nltk/test/metrics.doctest @@ -217,6 +217,27 @@ Confusion Matrix 10: h +For "e", the number of true positives should be 6, while the number of false negatives is 3. +So, the recall ought to be 6 / (6 + 3): + + >>> cm.recall("e") # doctest: +ELLIPSIS + 0.666666... 
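The true-positive count itself can be read straight off the matrix: indexing a ``ConfusionMatrix`` with a (reference, prediction) pair returns how often that pair occurred, and ``recall`` divides this diagonal entry by the total number of reference occurrences of the value (here 6 / 9). A small illustrative check, not part of the original patch:

    >>> cm["e", "e"]
    6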
+ +For "e", the false positive is just 1, so the precision should be 6 / (6 + 1): + + >>> cm.precision("e") # doctest: +ELLIPSIS + 0.857142... + +The f-measure with default value of ``alpha = 0.5`` should then be: + +* *1/(alpha/p + (1-alpha)/r) =* +* *1/(0.5/p + 0.5/r) =* +* *2pr / (p + r) =* +* *2 * 0.857142... * 0.666666... / (0.857142... + 0.666666...) =* +* *0.749999...* + + >>> cm.f_measure("e") # doctest: +ELLIPSIS + 0.749999... -------------------- Association measures diff --git a/nltk/test/probability.doctest b/nltk/test/probability.doctest index 952c03c027..7902dc9ca0 100644 --- a/nltk/test/probability.doctest +++ b/nltk/test/probability.doctest @@ -139,7 +139,7 @@ And now we can test the estimators >>> def train_and_test(est): ... hmm = trainer.train_supervised(train_corpus, estimator=est) - ... print('%.2f%%' % (100 * hmm.evaluate(test_corpus))) + ... print('%.2f%%' % (100 * hmm.accuracy(test_corpus))) Maximum Likelihood Estimation ----------------------------- diff --git a/nltk/test/tag.doctest b/nltk/test/tag.doctest index ca893b8f90..beda200e0c 100644 --- a/nltk/test/tag.doctest +++ b/nltk/test/tag.doctest @@ -1,6 +1,438 @@ .. Copyright (C) 2001-2021 NLTK Project .. For license information, see LICENSE.TXT +Evaluation of Taggers +===================== + +Evaluating the standard NLTK PerceptronTagger using Accuracy, +Precision, Recall and F-measure for each of the tags. + + >>> from nltk.tag import PerceptronTagger + >>> from nltk.corpus import treebank + >>> tagger = PerceptronTagger() + >>> gold_data = treebank.tagged_sents()[10:20] + >>> print(tagger.accuracy(gold_data)) # doctest: +ELLIPSIS + 0.885931... + + >>> print(tagger.evaluate_per_tag(gold_data)) + Tag | Prec. | Recall | F-measure + -------+--------+--------+----------- + '' | 1.0000 | 1.0000 | 1.0000 + , | 1.0000 | 1.0000 | 1.0000 + -NONE- | 0.0000 | 0.0000 | 0.0000 + . | 1.0000 | 1.0000 | 1.0000 + : | 1.0000 | 1.0000 | 1.0000 + CC | 1.0000 | 1.0000 | 1.0000 + CD | 0.7647 | 1.0000 | 0.8667 + DT | 1.0000 | 1.0000 | 1.0000 + IN | 1.0000 | 1.0000 | 1.0000 + JJ | 0.5882 | 0.8333 | 0.6897 + JJR | 1.0000 | 1.0000 | 1.0000 + JJS | 1.0000 | 1.0000 | 1.0000 + NN | 0.7647 | 0.9630 | 0.8525 + NNP | 0.8929 | 1.0000 | 0.9434 + NNS | 1.0000 | 1.0000 | 1.0000 + POS | 1.0000 | 1.0000 | 1.0000 + PRP | 1.0000 | 1.0000 | 1.0000 + RB | 0.8000 | 1.0000 | 0.8889 + RBR | 0.0000 | 0.0000 | 0.0000 + TO | 1.0000 | 1.0000 | 1.0000 + VB | 1.0000 | 1.0000 | 1.0000 + VBD | 0.8571 | 0.9231 | 0.8889 + VBG | 1.0000 | 1.0000 | 1.0000 + VBN | 0.8333 | 0.5556 | 0.6667 + VBP | 0.5714 | 0.8000 | 0.6667 + VBZ | 1.0000 | 1.0000 | 1.0000 + WP | 1.0000 | 1.0000 | 1.0000 + `` | 1.0000 | 1.0000 | 1.0000 + + +List only the 10 most common tags: + + >>> print(tagger.evaluate_per_tag(gold_data, truncate=10, sort_by_count=True)) + Tag | Prec. | Recall | F-measure + -------+--------+--------+----------- + IN | 1.0000 | 1.0000 | 1.0000 + DT | 1.0000 | 1.0000 | 1.0000 + NN | 0.7647 | 0.9630 | 0.8525 + NNP | 0.8929 | 1.0000 | 0.9434 + NNS | 1.0000 | 1.0000 | 1.0000 + -NONE- | 0.0000 | 0.0000 | 0.0000 + CD | 0.7647 | 1.0000 | 0.8667 + VBD | 0.8571 | 0.9231 | 0.8889 + JJ | 0.5882 | 0.8333 | 0.6897 + , | 1.0000 | 1.0000 | 1.0000 + + +Similarly, we can display the confusion matrix for this tagger. + + >>> print(tagger.confusion(gold_data)) + | - | + | N | + | O | + | N J J N N P P R V V V V V | + | ' E C C D I J J J N N N O R R B T V B B B B B W ` | + | ' , - . 
: C D T N J R S N P S S P B R O B D G N P Z P ` | + -------+-------------------------------------------------------------------------------------+ + '' | <3> . . . . . . . . . . . . . . . . . . . . . . . . . . . | + , | .<11> . . . . . . . . . . . . . . . . . . . . . . . . . . | + -NONE- | . . <.> . . . 4 . . 4 . . 7 2 . . . 1 . . . . . . 3 . . . | + . | . . .<10> . . . . . . . . . . . . . . . . . . . . . . . . | + : | . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . | + CC | . . . . . <5> . . . . . . . . . . . . . . . . . . . . . . | + CD | . . . . . .<13> . . . . . . . . . . . . . . . . . . . . . | + DT | . . . . . . .<28> . . . . . . . . . . . . . . . . . . . . | + IN | . . . . . . . .<34> . . . . . . . . . . . . . . . . . . . | + JJ | . . . . . . . . .<10> . . . 1 . . . . 1 . . . . . . . . . | + JJR | . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . | + JJS | . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . | + NN | . . . . . . . . . 1 . .<26> . . . . . . . . . . . . . . . | + NNP | . . . . . . . . . . . . .<25> . . . . . . . . . . . . . . | + NNS | . . . . . . . . . . . . . .<22> . . . . . . . . . . . . . | + POS | . . . . . . . . . . . . . . . <1> . . . . . . . . . . . . | + PRP | . . . . . . . . . . . . . . . . <3> . . . . . . . . . . . | + RB | . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . | + RBR | . . . . . . . . . . . . . . . . . . <.> . . . . . . . . . | + TO | . . . . . . . . . . . . . . . . . . . <2> . . . . . . . . | + VB | . . . . . . . . . . . . . . . . . . . . <1> . . . . . . . | + VBD | . . . . . . . . . . . . . . . . . . . . .<12> . 1 . . . . | + VBG | . . . . . . . . . . . . . . . . . . . . . . <3> . . . . . | + VBN | . . . . . . . . . 2 . . . . . . . . . . . 2 . <5> . . . . | + VBP | . . . . . . . . . . . . 1 . . . . . . . . . . . <4> . . . | + VBZ | . . . . . . . . . . . . . . . . . . . . . . . . . <2> . . | + WP | . . . . . . . . . . . . . . . . . . . . . . . . . . <3> . | + `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . <3>| + -------+-------------------------------------------------------------------------------------+ + (row = reference; col = test) + + +Brill Trainer with evaluation +============================= + + >>> # Perform the relevant imports. + >>> from nltk.tbl.template import Template + >>> from nltk.tag.brill import Pos, Word + >>> from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer, UnigramTagger + + >>> # Load some data + >>> from nltk.corpus import treebank + >>> training_data = treebank.tagged_sents()[:100] + >>> baseline_data = treebank.tagged_sents()[100:200] + >>> gold_data = treebank.tagged_sents()[200:300] + >>> testing_data = [untag(s) for s in gold_data] + + >>> backoff = RegexpTagger([ + ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers + ... (r'(The|the|A|a|An|an)$', 'AT'), # articles + ... (r'.*able$', 'JJ'), # adjectives + ... (r'.*ness$', 'NN'), # nouns formed from adjectives + ... (r'.*ly$', 'RB'), # adverbs + ... (r'.*s$', 'NNS'), # plural nouns + ... (r'.*ing$', 'VBG'), # gerunds + ... (r'.*ed$', 'VBD'), # past tense verbs + ... (r'.*', 'NN') # nouns (default) + ... ]) + +We've now created a simple ``RegexpTagger``, which tags according to the regular expression +rules it has been supplied. This tagger in and of itself does not have a great accuracy. + + >>> backoff.accuracy(gold_data) #doctest: +ELLIPSIS + 0.245014... + +Neither does a simple ``UnigramTagger``. This tagger is trained on some data, +and will then first try to match unigrams (i.e. 
tokens) of the sentence it has +to tag to the learned data. + + >>> unigram_tagger = UnigramTagger(baseline_data) + >>> unigram_tagger.accuracy(gold_data) #doctest: +ELLIPSIS + 0.581196... + +The lackluster accuracy here can be explained with the following example: + + >>> unigram_tagger.tag(["I", "would", "like", "this", "sentence", "to", "be", "tagged"]) + [('I', 'NNP'), ('would', 'MD'), ('like', None), ('this', 'DT'), ('sentence', None), + ('to', 'TO'), ('be', 'VB'), ('tagged', None)] + +As you can see, many tokens are tagged as ``None``, as these tokens are OOV (out of vocabulary). +The ``UnigramTagger`` has never seen them, and as a result they are not in its database of known terms. + +In practice, a ``UnigramTagger`` is exclusively used in conjunction with a *backoff*. Our real +baseline which will use such a backoff. We'll create a ``UnigramTagger`` like before, but now +the ``RegexpTagger`` will be used as a backoff for the situations where the ``UnigramTagger`` +encounters an OOV token. + + >>> baseline = UnigramTagger(baseline_data, backoff=backoff) + >>> baseline.accuracy(gold_data) #doctest: +ELLIPSIS + 0.7537647... + +That is already much better. We can investigate the performance further by running +``evaluate_per_tag``. This method will output the *Precision*, *Recall* and *F-measure* +of each tag. + + >>> print(baseline.evaluate_per_tag(gold_data, sort_by_count=True)) + Tag | Prec. | Recall | F-measure + -------+--------+--------+----------- + NNP | 0.9674 | 0.2738 | 0.4269 + NN | 0.4111 | 0.9136 | 0.5670 + IN | 0.9383 | 0.9580 | 0.9480 + DT | 0.9819 | 0.8859 | 0.9314 + JJ | 0.8167 | 0.2970 | 0.4356 + NNS | 0.7393 | 0.9630 | 0.8365 + -NONE- | 1.0000 | 0.8345 | 0.9098 + , | 1.0000 | 1.0000 | 1.0000 + . | 1.0000 | 1.0000 | 1.0000 + VBD | 0.6429 | 0.8804 | 0.7431 + CD | 1.0000 | 0.9872 | 0.9935 + CC | 1.0000 | 0.9355 | 0.9667 + VB | 0.7778 | 0.3684 | 0.5000 + VBN | 0.9375 | 0.3000 | 0.4545 + RB | 0.7778 | 0.7447 | 0.7609 + TO | 1.0000 | 1.0000 | 1.0000 + VBZ | 0.9643 | 0.6429 | 0.7714 + VBG | 0.6415 | 0.9444 | 0.7640 + PRP$ | 1.0000 | 1.0000 | 1.0000 + PRP | 1.0000 | 0.5556 | 0.7143 + MD | 1.0000 | 1.0000 | 1.0000 + VBP | 0.6471 | 0.5789 | 0.6111 + POS | 1.0000 | 1.0000 | 1.0000 + $ | 1.0000 | 0.8182 | 0.9000 + '' | 1.0000 | 1.0000 | 1.0000 + : | 1.0000 | 1.0000 | 1.0000 + WDT | 0.4000 | 0.2000 | 0.2667 + `` | 1.0000 | 1.0000 | 1.0000 + JJR | 1.0000 | 0.5000 | 0.6667 + NNPS | 0.0000 | 0.0000 | 0.0000 + RBR | 1.0000 | 1.0000 | 1.0000 + -LRB- | 0.0000 | 0.0000 | 0.0000 + -RRB- | 0.0000 | 0.0000 | 0.0000 + RP | 0.6667 | 0.6667 | 0.6667 + EX | 0.5000 | 0.5000 | 0.5000 + JJS | 0.0000 | 0.0000 | 0.0000 + WP | 1.0000 | 1.0000 | 1.0000 + PDT | 0.0000 | 0.0000 | 0.0000 + AT | 0.0000 | 0.0000 | 0.0000 + + +It's clear that although the precision of tagging `"NNP"` is high, the recall is very low. +With other words, we're missing a lot of cases where the true label is `"NNP"`. We can see +a similar effect with `"JJ"`. + +We can also see a very expected result: The precision of `"NN"` is low, while the recall +is high. If a term is OOV (i.e. ``UnigramTagger`` defers it to ``RegexpTagger``) and +``RegexpTagger`` doesn't have a good rule for it, then it will be tagged as `"NN"`. So, +we catch almost all tokens that are truly labeled as `"NN"`, but we also tag as `"NN"` +for many tokens that shouldn't be `"NN"`. + +This method gives us some insight in what parts of the tagger needs more attention, and why. 
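The numbers in this table are also available programmatically: the ``precision``, ``recall`` and ``f_measure`` methods that this patch adds to ``TaggerI`` each return a dictionary mapping every tag to its score. As a rough sketch (outputs omitted, hence the skips), the weakest tags by recall could be listed with:

    >>> tag_recall = baseline.recall(gold_data)     # doctest: +SKIP
    >>> sorted(tag_recall, key=tag_recall.get)[:5]  # doctest: +SKIP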
+However, it doesn't tell us what the terms with true label `"NNP"` or `"JJ"` are actually +tagged as. +To help that, we can create a confusion matrix. + + >>> print(baseline.confusion(gold_data)) + | - | + | - N - | + | L O R N P | + | R N R J J N N N P P P R R V V V V V W | + | ' B E B A C C D E I J J J M N N P N D O R P R B R T V B B B B B D W ` | + | $ ' , - - - . : T C D T X N J R S D N P S S T S P $ B R P O B D G N P Z T P ` | + -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ + $ | <9> . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . . . . . . . . | + '' | . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + , | . .<115> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + -LRB- | . . . <.> . . . . . . . . . . . . . . 3 . . . . . . . . . . . . . . . . . . . . | + -NONE- | . . . .<121> . . . . . . . . . . . . . 24 . . . . . . . . . . . . . . . . . . . . | + -RRB- | . . . . . <.> . . . . . . . . . . . . 3 . . . . . . . . . . . . . . . . . . . . | + . | . . . . . .<100> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + : | . . . . . . . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + AT | . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + CC | . . . . . . . . . <58> . . . . . . . . 4 . . . . . . . . . . . . . . . . . . . . | + CD | . . . . . . . . . . <77> . . . . . . . 1 . . . . . . . . . . . . . . . . . . . . | + DT | . . . . . . . . 1 . .<163> . 4 . . . . 13 . . . . . . . . . . . . . . . . . 3 . . | + EX | . . . . . . . . . . . . <1> . . . . . 1 . . . . . . . . . . . . . . . . . . . . | + IN | . . . . . . . . . . . . .<228> . . . . 8 . . . . . . . . . . . . . 2 . . . . . . | + JJ | . . . . . . . . . . . . . . <49> . . . 86 2 . 4 . . . . 6 . . . . 12 3 . 3 . . . . | + JJR | . . . . . . . . . . . . . . . <3> . . 3 . . . . . . . . . . . . . . . . . . . . | + JJS | . . . . . . . . . . . . . . . . <.> . 2 . . . . . . . . . . . . . . . . . . . . | + MD | . . . . . . . . . . . . . . . . . <19> . . . . . . . . . . . . . . . . . . . . . | + NN | . . . . . . . . . . . . . . 9 . . .<296> . . 5 . . . . . . . . 5 . 9 . . . . . . | + NNP | . . . . . . . . . . . 2 . . . . . . 199 <89> . 26 . . . . 2 . . . . 2 5 . . . . . . | + NNPS | . . . . . . . . . . . . . . . . . . . 1 <.> 3 . . . . . . . . . . . . . . . . . | + NNS | . . . . . . . . . . . . . . . . . . 5 . .<156> . . . . . . . . . . . . . 1 . . . | + PDT | . . . . . . . . . . . 1 . . . . . . . . . . <.> . . . . . . . . . . . . . . . . | + POS | . . . . . . . . . . . . . . . . . . . . . . . <14> . . . . . . . . . . . . . . . | + PRP | . . . . . . . . . . . . . . . . . . 10 . . 2 . . <15> . . . . . . . . . . . . . . | + PRP$ | . . . . . . . . . . . . . . . . . . . . . . . . . <28> . . . . . . . . . . . . . | + RB | . . . . . . . . . . . . 1 4 . . . . 6 . . . . . . . <35> . 1 . . . . . . . . . . | + RBR | . . . . . . . . . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | + RP | . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . <2> . . . . . . . . . . | + TO | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <47> . . . . . . . . . | + VB | . . . . . . . . . . . . . . 2 . . . 30 . . . . . . . 1 . . . <21> . . . 3 . . . . | + VBD | . . . . . . . . . . . . . . . . . . 10 . . . . . . . . . . . . <81> . 1 . . . . . | + VBG | . . . . . . . . . . . . . . . . . . 2 . . 
. . . . . . . . . . . <34> . . . . . . | + VBN | . . . . . . . . . . . . . . . . . . 4 . . . . . . . . . . . . 31 . <15> . . . . . | + VBP | . . . . . . . . . . . . . . . . . . 7 . . . . . . . . . . . 1 . . . <11> . . . . | + VBZ | . . . . . . . . . . . . . . . . . . . . . 15 . . . . . . . . . . . . . <27> . . . | + WDT | . . . . . . . . . . . . . 7 . . . . 1 . . . . . . . . . . . . . . . . . <2> . . | + WP | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <2> . | + `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <10>| + -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ + (row = reference; col = test) + + +Once again we can see that `"NN"` is the default if the tagger isn't sure. Beyond that, +we can see why the recall for `"NNP"` is so low: these tokens are often tagged as `"NN"`. +This effect can also be seen for `"JJ"`, where the majority of tokens that ought to be +tagged as `"JJ"` are actually tagged as `"NN"` by our tagger. + +This tagger will only serve as a baseline for the ``BrillTaggerTrainer``, which uses +templates to attempt to improve the performance of the tagger. + + >>> # Set up templates + >>> Template._cleartemplates() #clear any templates created in earlier tests + >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))] + + >>> # Construct a BrillTaggerTrainer + >>> tt = BrillTaggerTrainer(baseline, templates, trace=3) + >>> tagger1 = tt.train(training_data, max_rules=10) + TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None) + Finding initial useful rules... + Found 618 useful rules. + + B | + S F r O | Score = Fixed - Broken + c i o t | R Fixed = num tags changed incorrect -> correct + o x k h | u Broken = num tags changed correct -> incorrect + r e e e | l Other = num tags changed incorrect -> incorrect + e d n r | e + ------------------+------------------------------------------------------- + 13 14 1 4 | NN->VB if Pos:TO@[-1] + 8 8 0 0 | NN->VB if Pos:MD@[-1] + 7 10 3 22 | NN->IN if Pos:NNS@[-1] + 5 5 0 0 | NN->VBP if Pos:PRP@[-1] + 5 5 0 0 | VBD->VBN if Pos:VBZ@[-1] + 5 5 0 0 | NNS->NN if Pos:IN@[-1] & Word:asbestos@[0] + 4 4 0 0 | NN->-NONE- if Pos:WP@[-1] + 4 4 0 3 | NN->NNP if Pos:-NONE-@[-1] + 4 6 2 2 | NN->NNP if Pos:NNP@[-1] + 4 4 0 0 | NNS->VBZ if Pos:PRP@[-1] + + >>> tagger1.rules()[1:3] + (Rule('000', 'NN', 'VB', [(Pos([-1]),'MD')]), Rule('000', 'NN', 'IN', [(Pos([-1]),'NNS')])) + + >>> tagger1.print_template_statistics(printunused=False) + TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules) + TRAIN ( 2417 tokens) initial 555 0.7704 final: 496 0.7948 + #ID | Score (train) | #Rules | Template + -------------------------------------------- + 000 | 54 0.915 | 9 0.900 | Template(Pos([-1])) + 001 | 5 0.085 | 1 0.100 | Template(Pos([-1]),Word([0])) + + + + >>> tagger1.accuracy(gold_data) # doctest: +ELLIPSIS + 0.769230... + + >>> print(tagger1.evaluate_per_tag(gold_data, sort_by_count=True)) + Tag | Prec. | Recall | F-measure + -------+--------+--------+----------- + NNP | 0.8298 | 0.3600 | 0.5021 + NN | 0.4435 | 0.8364 | 0.5797 + IN | 0.8476 | 0.9580 | 0.8994 + DT | 0.9819 | 0.8859 | 0.9314 + JJ | 0.8167 | 0.2970 | 0.4356 + NNS | 0.7464 | 0.9630 | 0.8410 + -NONE- | 1.0000 | 0.8414 | 0.9139 + , | 1.0000 | 1.0000 | 1.0000 + . 
| 1.0000 | 1.0000 | 1.0000 + VBD | 0.6723 | 0.8696 | 0.7583 + CD | 1.0000 | 0.9872 | 0.9935 + CC | 1.0000 | 0.9355 | 0.9667 + VB | 0.8103 | 0.8246 | 0.8174 + VBN | 0.9130 | 0.4200 | 0.5753 + RB | 0.7778 | 0.7447 | 0.7609 + TO | 1.0000 | 1.0000 | 1.0000 + VBZ | 0.9667 | 0.6905 | 0.8056 + VBG | 0.6415 | 0.9444 | 0.7640 + PRP$ | 1.0000 | 1.0000 | 1.0000 + PRP | 1.0000 | 0.5556 | 0.7143 + MD | 1.0000 | 1.0000 | 1.0000 + VBP | 0.6316 | 0.6316 | 0.6316 + POS | 1.0000 | 1.0000 | 1.0000 + $ | 1.0000 | 0.8182 | 0.9000 + '' | 1.0000 | 1.0000 | 1.0000 + : | 1.0000 | 1.0000 | 1.0000 + WDT | 0.4000 | 0.2000 | 0.2667 + `` | 1.0000 | 1.0000 | 1.0000 + JJR | 1.0000 | 0.5000 | 0.6667 + NNPS | 0.0000 | 0.0000 | 0.0000 + RBR | 1.0000 | 1.0000 | 1.0000 + -LRB- | 0.0000 | 0.0000 | 0.0000 + -RRB- | 0.0000 | 0.0000 | 0.0000 + RP | 0.6667 | 0.6667 | 0.6667 + EX | 0.5000 | 0.5000 | 0.5000 + JJS | 0.0000 | 0.0000 | 0.0000 + WP | 1.0000 | 1.0000 | 1.0000 + PDT | 0.0000 | 0.0000 | 0.0000 + AT | 0.0000 | 0.0000 | 0.0000 + + + >>> print(tagger1.confusion(gold_data)) + | - | + | - N - | + | L O R N P | + | R N R J J N N N P P P R R V V V V V W | + | ' B E B A C C D E I J J J M N N P N D O R P R B R T V B B B B B D W ` | + | $ ' , - - - . : T C D T X N J R S D N P S S T S P $ B R P O B D G N P Z T P ` | + -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ + $ | <9> . . . . . . . . . . . . . . . . . 1 . . . . . . . . . . . 1 . . . . . . . . | + '' | . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + , | . .<115> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + -LRB- | . . . <.> . . . . . . . . . 1 . . . . 2 . . . . . . . . . . . . . . . . . . . . | + -NONE- | . . . .<122> . . . . . . . . 1 . . . . 22 . . . . . . . . . . . . . . . . . . . . | + -RRB- | . . . . . <.> . . . . . . . . . . . . 2 1 . . . . . . . . . . . . . . . . . . . | + . | . . . . . .<100> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + : | . . . . . . . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + AT | . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + CC | . . . . . . . . . <58> . . . . . . . . 2 1 . . . . . . . . . . . . . . 1 . . . . | + CD | . . . . . . . . . . <77> . . . . . . . 1 . . . . . . . . . . . . . . . . . . . . | + DT | . . . . . . . . 1 . .<163> . 5 . . . . 12 . . . . . . . . . . . . . . . . . 3 . . | + EX | . . . . . . . . . . . . <1> . . . . . 1 . . . . . . . . . . . . . . . . . . . . | + IN | . . . . . . . . . . . . .<228> . . . . 8 . . . . . . . . . . . . . 2 . . . . . . | + JJ | . . . . . . . . . . . . . 4 <49> . . . 79 4 . 4 . . . . 6 . . . 1 12 3 . 3 . . . . | + JJR | . . . . . . . . . . . . . 2 . <3> . . 1 . . . . . . . . . . . . . . . . . . . . | + JJS | . . . . . . . . . . . . . . . . <.> . 2 . . . . . . . . . . . . . . . . . . . . | + MD | . . . . . . . . . . . . . . . . . <19> . . . . . . . . . . . . . . . . . . . . . | + NN | . . . . . . . . . . . . . 7 9 . . .<271> 16 . 5 . . . . . . . . 7 . 9 . . . . . . | + NNP | . . . . . . . . . . . 2 . 7 . . . . 163<117> . 26 . . . . 2 . . . 1 2 5 . . . . . . | + NNPS | . . . . . . . . . . . . . . . . . . . 1 <.> 3 . . . . . . . . . . . . . . . . . | + NNS | . . . . . . . . . . . . . . . . . . 5 . .<156> . . . . . . . . . . . . . 1 . . . | + PDT | . . . . . . . . . . . 1 . . . . . . . . . . <.> . . . . . . . . . . . . . . . . | + POS | . 
. . . . . . . . . . . . . . . . . . . . . . <14> . . . . . . . . . . . . . . . | + PRP | . . . . . . . . . . . . . . . . . . 10 . . 2 . . <15> . . . . . . . . . . . . . . | + PRP$ | . . . . . . . . . . . . . . . . . . . . . . . . . <28> . . . . . . . . . . . . . | + RB | . . . . . . . . . . . . 1 4 . . . . 6 . . . . . . . <35> . 1 . . . . . . . . . . | + RBR | . . . . . . . . . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | + RP | . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . <2> . . . . . . . . . . | + TO | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <47> . . . . . . . . . | + VB | . . . . . . . . . . . . . . 2 . . . 4 . . . . . . . 1 . . . <47> . . . 3 . . . . | + VBD | . . . . . . . . . . . . . 1 . . . . 8 1 . . . . . . . . . . . <80> . 2 . . . . . | + VBG | . . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . <34> . . . . . . | + VBN | . . . . . . . . . . . . . . . . . . 4 . . . . . . . . . . . . 25 . <21> . . . . . | + VBP | . . . . . . . . . . . . . 2 . . . . 4 . . . . . . . . . . . 1 . . . <12> . . . . | + VBZ | . . . . . . . . . . . . . . . . . . . . . 13 . . . . . . . . . . . . . <29> . . . | + WDT | . . . . . . . . . . . . . 7 . . . . 1 . . . . . . . . . . . . . . . . . <2> . . | + WP | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <2> . | + `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <10>| + -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ + (row = reference; col = test) + + + >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) + >>> tagged[33][12:] + [('foreign', 'NN'), ('debt', 'NN'), ('of', 'IN'), ('$', '$'), ('64', 'CD'), + ('billion', 'CD'), ('*U*', '-NONE-'), ('--', ':'), ('the', 'DT'), ('third-highest', 'NN'), + ('in', 'IN'), ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')] + Regression Tests ~~~~~~~~~~~~~~~~