From 9901a97e641495238b5ee8f4e1b7b262d8a98d04 Mon Sep 17 00:00:00 2001 From: Christopher Bryant Date: Tue, 10 Dec 2019 01:15:44 +0000 Subject: [PATCH] ERRANT v2.0.0 --- changelog.md => CHANGELOG.md | 26 +- LICENSE.md | 21 + MANIFEST.in | 1 + README.md | 233 ++++++++++ compare_m2.py | 330 -------------- demo/cor2.txt | 2 +- demo/orig.txt | 4 +- demo/out.m2 | 14 +- demo/readme.md | 6 +- errant/__init__.py | 24 + errant/alignment.py | 174 +++++++ errant/annotator.py | 98 ++++ {scripts => errant/commands}/__init__.py | 0 errant/commands/compare_m2.py | 381 ++++++++++++++++ errant/commands/m2_to_m2.py | 178 ++++++++ errant/commands/parallel_to_m2.py | 92 ++++ errant/edit.py | 56 +++ errant/en/__init__.py | 0 errant/en/classifier.py | 426 ++++++++++++++++++ errant/en/merger.py | 122 +++++ errant/en/resources/README.md | 20 + {resources => errant/en/resources}/en-ptb_map | 0 .../en/resources}/en_GB-large.txt | 0 m2_to_m2.py | 102 ----- parallel_to_m2.py | 79 ---- readme.md | 345 -------------- resources/readme.md | 27 -- scripts/align_text.py | 227 ---------- scripts/cat_rules.py | 376 ---------------- scripts/rdlextra.py | 341 -------------- scripts/toolbox.py | 136 ------ setup.py | 47 ++ 32 files changed, 1905 insertions(+), 1983 deletions(-) rename changelog.md => CHANGELOG.md (77%) create mode 100644 LICENSE.md create mode 100644 MANIFEST.in create mode 100644 README.md delete mode 100644 compare_m2.py create mode 100644 errant/__init__.py create mode 100644 errant/alignment.py create mode 100644 errant/annotator.py rename {scripts => errant/commands}/__init__.py (100%) create mode 100644 errant/commands/compare_m2.py create mode 100644 errant/commands/m2_to_m2.py create mode 100644 errant/commands/parallel_to_m2.py create mode 100644 errant/edit.py create mode 100644 errant/en/__init__.py create mode 100644 errant/en/classifier.py create mode 100644 errant/en/merger.py create mode 100644 errant/en/resources/README.md rename {resources => errant/en/resources}/en-ptb_map (100%) rename {resources => errant/en/resources}/en_GB-large.txt (100%) delete mode 100644 m2_to_m2.py delete mode 100644 parallel_to_m2.py delete mode 100644 readme.md delete mode 100644 resources/readme.md delete mode 100644 scripts/align_text.py delete mode 100644 scripts/cat_rules.py delete mode 100644 scripts/rdlextra.py delete mode 100644 scripts/toolbox.py create mode 100644 setup.py diff --git a/changelog.md b/CHANGELOG.md similarity index 77% rename from changelog.md rename to CHANGELOG.md index 33eb53f..92e4ee6 100644 --- a/changelog.md +++ b/CHANGELOG.md @@ -1,8 +1,20 @@ # Changelog -This document contains descriptions of all the significant changes made to ERRANT since its release. +This log describes all the significant changes made to ERRANT since its release. -## 16-11-18 +## v2.0.0 (10-12-19) + +1. ERRANT has been significantly refactored to accommodate a new API (see README). It should now also be much easier to extend to other languages. + +2. Added a `setup.py` script to make ERRANT `pip` installable. + +3. The Damerau-Levenshtein alignment code has been rewritten in a much cleaner Python implementation. This also makes ERRANT ~20% faster. + +Note: All these changes do **not** affect system output compared with the previous version. For the first `pip` release, we wanted to make sure v2.0.0 was fully compatible with the [BEA-2019 shared task](https://www.cl.cam.ac.uk/research/nl/bea2019st/) on Grammatical Error Correction. 
+ +Thanks to [@sai-prasanna](https://github.com/sai-prasanna) for inspiring some of these changes! + +## v1.4 (16-11-18) 1. The `compare_m2.py` evaluation script was refactored to make it easier to use. @@ -24,7 +36,7 @@ The differences between the old and new version are summarised in the following | CoNLL-2014.1 | 1312 | Old
New | 82.50
84.04 | 82.73
82.85 | 82.61
**83.44** | 385
**50** | | NUCLE | 57151 | Old
New | 70.14
73.20 | 80.27
81.16 | 71.95
**76.97** | 7565
**725** | -## 23-08-18 +## v1.3 (23-08-18) Fix arbitrary reordering of edits with the same start and end span; e.g. S I am happy . @@ -37,21 +49,21 @@ S I am happy . A 2 2|||M:ADV|||very|||REQUIRED|||-NONE-|||0 A 2 2|||M:ADV|||really|||REQUIRED|||-NONE-|||0 -## 10-08-18 +## v1.2 (10-08-18) Added support for multiple annotators in `parallel_to_m2.py`. Before: `python3 parallel_to_m2.py -orig -cor -out ` After: `python3 parallel_to_m2.py -orig -cor [ ...] -out ` This is helpful if you have multiple annotations for the same orig file. -## 17-12-17 +## News (17-12-17) In November, spaCy changed significantly when it became version 2.0.0. Although we have not tested ERRANT with this new version, the main change seemed to be a slight increase in performance (pos tagging and parsing etc.) at a significant cost to speed. Consequently, we still recommend spaCy 1.9.0 for use with ERRANT. -## 22-11-17 +## v1.1 (22-11-17) ERRANT would sometimes run into memory problems if sentences were long and very different. We hence changed the default alignment from breadth-first to depth-first. This bypassed the memory problems, made ERRANT faster and barely affected results. -## 10-05-17 +## v1.0 (10-05-17) ERRANT v1.0 released. \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..2377854 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +# MIT License + +Copyright (c) 2017 Christopher Bryant, Mariano Felice + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..2d823e4 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include errant/en/resources/* \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..a2bae7d --- /dev/null +++ b/README.md @@ -0,0 +1,233 @@ +# ERRANT v2.0.0 + +This repository contains the grammatical ERRor ANnotation Toolkit (ERRANT) described in: + +> Christopher Bryant, Mariano Felice, and Ted Briscoe. 2017. [**Automatic annotation and evaluation of error types for grammatical error correction**](https://www.aclweb.org/anthology/P17-1074/). In Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). Vancouver, Canada. + +> Mariano Felice, Christopher Bryant, and Ted Briscoe. 2016. [**Automatic extraction of learner errors in ESL sentences using linguistically enhanced alignments**](https://www.aclweb.org/anthology/C16-1079/). 
In Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers. Osaka, Japan. + +If you make use of this code, please cite the above papers. More information about ERRANT can be found [here](https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-938.html). + +# Overview + +The main aim of ERRANT is to automatically annotate parallel English sentences with error type information. Specifically, given an original and corrected sentence pair, ERRANT will extract the edits that transform the former to the latter and classify them according to a rule-based error type framework. This can be used to standardise parallel datasets or facilitate detailed error type evaluation. Annotated output files are in M2 format and an evaluation script is provided. + +### Example: +**Original**: This are gramamtical sentence . +**Corrected**: This is a grammatical sentence . +**Output M2**: +S This are gramamtical sentence . +A 1 2|||R:VERB:SVA|||is|||REQUIRED|||-NONE-|||0 +A 2 2|||M:DET|||a|||REQUIRED|||-NONE-|||0 +A 2 3|||R:SPELL|||grammatical|||REQUIRED|||-NONE-|||0 +A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||1 + +In M2 format, a line preceded by S denotes an original sentence while a line preceded by A indicates an edit annotation. Each edit line consists of the start and end token offset of the edit, the error type, and the tokenized correction string. The next two fields are included for historical reasons (see the CoNLL-2014 shared task) while the last field is the annotator id. + +A "noop" edit is a special kind of edit that explicitly indicates an annotator/system made no changes to the original sentence. If there is only one annotator, noop edits are optional, otherwise a noop edit should be included whenever at least 1 out of n annotators considered the original sentence to be correct. This is something to be aware of when combining individual M2 files, as missing noops can affect evaluation. + +# Installation + +## Pip Install + +The easiest way to install ERRANT and its dependencies is using `pip`. We also recommend installing it in a clean virtual environment (e.g. with `venv`). ERRANT only supports Python >= 3.3. +``` +python3 -m venv errant_env +source errant_env/bin/activate +pip3 install errant +python3 -m spacy download en +``` +This will create and activate a new python3 environment called `errant_env` in the current directory. `pip` will then install ERRANT, [spaCy v1.9.0](https://spacy.io/), [NLTK](http://www.nltk.org/) and spaCy's default English model in this environment. You can deactivate the environment at any time by running `deactivate`, but must remember to activate it again whenever you want to use ERRANT. + +**Note: ERRANT does not support spaCy 2 at this time**. spaCy 2 POS tags are slightly different from spaCy 1 POS tags and so ERRANT rules, which were designed for spaCy 1, may not always work with spaCy 2. + +### BEA-2019 Shared Task + +ERRANT v2.0.0 was designed to be fully compatible with the [BEA-2019 Shared Task](https://www.cl.cam.ac.uk/research/nl/bea2019st/). If you want to directly compare against the results in the shared task, you should make sure to install ERRANT v2.0.0 as newer versions may produce slightly different scores. +``` +pip3 install errant==2.0.0 +``` + +## Source Install + +If you prefer to install ERRANT from source, you can instead run the following commands: +``` +git clone https://github.com/chrisjbryant/errant.git +cd errant +python3 -m venv errant_env +source errant_env/bin/activate +pip3 install -e . 
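+# (-e installs ERRANT in editable mode, so local code changes take effect without reinstalling)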
+python3 -m spacy download en +``` +This will clone the github ERRANT source into the current directory, build and activate a python environment inside it, and then install ERRANT and all its dependencies. If you wish to modify ERRANT code, this is the recommended way to install it. + +# Usage + +## CLI + +Three main commands are provided with ERRANT: `errant_parallel`, `errant_m2` and `errant_compare`. You can run them from anywhere on the command line without having to invoke a specific python script. + +1. `errant_parallel` + + This is the main annotation command that takes an original text file and at least one parallel corrected text file as input, and outputs an annotated M2 file. By default, it is assumed that the original and corrected text files are word tokenised with one sentence per line. + Example: + ``` + errant_parallel -orig -cor [ ...] -out + ``` + +2. `errant_m2` + + This is a variant of `errant_parallel` that operates on an M2 file instead of parallel text files. This makes it easier to reprocess existing M2 files. You must also specify whether you want to use gold or auto edits; i.e. `-gold` will only classify the existing edits, while `-auto` will extract and classify automatic edits. In both settings, uncorrected edits and noops are preserved. + Example: + ``` + errant_m2 {-auto|-gold} m2_file -out + ``` + +3. `errant_compare` + + This is the evaluation command that compares a hypothesis M2 file against a reference M2 file. The default behaviour evaluates the hypothesis overall in terms of span-based correction. The `-cat {1,2,3}` flag can be used to evaluate error types at increasing levels of granularity, while the `-ds` or `-dt` flag can be used to evaluate in terms of span-based or token-based detection (i.e. ignoring the correction). All scores are presented in terms of Precision, Recall and F-score (default: F0.5), and counts for True Positives (TP), False Positives (FP) and False Negatives (FN) are also shown. + Examples: + ``` + errant_compare -hyp -ref + errant_compare -hyp -ref -cat {1,2,3} + errant_compare -hyp -ref -ds + errant_compare -hyp -ref -ds -cat {1,2,3} + ``` + +All these scripts also have additional advanced command line options which can be displayed using the `-h` flag. + +#### Runtime + +In terms of speed, ERRANT processes ~155 sents/sec in the fully automatic edit extraction and classification setting, but ~1000 sents/sec in the classification setting alone. These figures were calculated on an Intel Core i5-6600 @ 3.30GHz machine, but results will vary depending on how different/long the original and corrected sentences are. + +## API + +As of v2.0.0, ERRANT now also comes with an API. + +### Quick Start + +``` +import errant + +annotator = errant.load('en') +orig = annotator.parse('This are gramamtical sentence .') +cor = annotator.parse('This is a grammatical sentence .') +edits = annotator.annotate(orig, cor) +for e in edits: + print(e.o_start, e.o_end, e.o_str, e.c_start, e.c_end, e.c_str, e.type) +``` + +### Loading + +`errant`.**load**(lang, nlp=None) +Create an ERRANT Annotator object. The `lang` parameter currently only accepts `'en'` for English, but we hope to extend it for other languages in the future. The optional `nlp` parameter can be used if you have already preloaded spacy and do not want ERRANT to load it again. + +``` +import errant +import spacy + +nlp = spacy.load('en') +annotator = errant.load('en', nlp) +``` + +### Annotator Objects + +An Annotator object is the main interface for ERRANT. 
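+
+For illustration, a minimal sketch that runs the whole pipeline and prints each edit as an M2 annotation line (only the methods documented below are used):
+```
+import errant
+
+annotator = errant.load('en')
+orig = annotator.parse('This are gramamtical sentence .')
+cor = annotator.parse('This is a grammatical sentence .')
+for e in annotator.annotate(orig, cor):
+    print(e.to_m2())
+```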
+ +#### Methods + +`annotator`.**parse**(string, tokenise=False) +Lemmatise, POS tag, and parse a text string with spacy. Set `tokenise` to True to also word tokenise with spacy. Returns a spacy Doc object. + +`annotator`.**align**(orig, cor, lev=False) +Align spacy-parsed original and corrected text. The default uses a linguistically-enhanced Damerau-Levenshtein alignment, but the `lev` flag can be used for a standard Levenshtein alignment. Returns an Alignment object. + +`annotator`.**merge**(alignment, merging='rules') +Extract edits from the optimum alignment in an Alignment object. Four different merging strategies are available: +1. rules: Use a rule-based merging strategy (default) +2. all-split: Merge nothing: MSSDI -> M, S, S, D, I +3. all-merge: Merge adjacent non-matches: MSSDI -> M, SSDI +4. all-equal: Merge adjacent same-type non-matches: MSSDI -> M, SS, D, I + +Returns a list of Edit objects. + +`annotator`.**classify**(edit) +Classify an edit. Sets the `edit.type` attribute in an Edit object and returns the same Edit object. + +`annotator`.**annotate**(orig, cor, lev=False, merging='rules') +Run the full annotation pipeline to align two sequences and extract and classify the edits. Equivalent to running `annotator.align`, `annotator.merge` and `annotator.classify` in sequence. Returns a list of Edit objects. + +``` +import errant + +annotator = errant.load('en') +orig = annotator.parse('This are gramamtical sentence .') +cor = annotator.parse('This is a grammatical sentence .') +alignment = annotator.align(orig, cor) +edits = annotator.merge(alignment) +for e in edits: + e = annotator.classify(e) +``` + +`annotator`.**import_edit**(orig, cor, edit, min=True, old_cat=False) +Load an Edit object from a list. `orig` and `cor` must be spacy-parsed Doc objects and the edit must be of the form: `[o_start, o_end, c_start, c_end(, type)]`. The values must be integers that correspond to the token start and end offsets in the original and corrected Doc objects. The `type` value is an optional string that denotes the error type of the edit (if known). Set `min` to True to minimise the edit (e.g. [a b -> a c] = [b -> c]) and `old_cat` to True to preserve the old error type category (i.e. turn off the classifier). + +``` +import errant + +annotator = errant.load('en') +orig = annotator.parse('This are gramamtical sentence .') +cor = annotator.parse('This is a grammatical sentence .') +edit = [1, 2, 1, 2, 'SVA'] # are -> is +edit = annotator.import_edit(orig, cor, edit) +print(edit.to_m2()) +``` + +### Alignment Objects + +An Alignment object is created from two spacy-parsed text sequences. + +#### Attributes + +`alignment`.**orig** +`alignment`.**cor** +The spacy-parsed original and corrected text sequences. + +`alignment`.**cost_matrix** +`alignment`.**op_matrix** +The cost matrix and operation matrix produced by the alignment. + +`alignment`.**align_seq** +The first cheapest alignment between the two sequences. + +### Edit Objects + +An Edit object represents a transformation between two text sequences. + +#### Attributes + +`edit`.**o_start** +`edit`.**o_end** +`edit`.**o_toks** +`edit`.**o_str** +The start and end offsets, the spacy tokens, and the string for the edit in the *original* text. + +`edit`.**c_start** +`edit`.**c_end** +`edit`.**c_toks** +`edit`.**c_str** +The start and end offsets, the spacy tokens, and the string for the edit in the *corrected* text. + +`edit`.**type** +The error type string. 
+ +#### Methods + +`edit`.**to_m2**(id=0) +Format the edit for an output M2 file. `id` is the annotator id. + +# Contact + +If you have any questions, suggestions or bug reports, you can contact the authors at: +christopher d0t bryant at cl.cam.ac.uk +mariano d0t felice at cl.cam.ac.uk \ No newline at end of file diff --git a/compare_m2.py b/compare_m2.py deleted file mode 100644 index 7238679..0000000 --- a/compare_m2.py +++ /dev/null @@ -1,330 +0,0 @@ -import argparse -from collections import Counter - -# Input: Command line args -def main(args): - # Open hypothesis and reference m2 files and split into chunks - hyp_m2 = open(args.hyp).read().strip().split("\n\n") - ref_m2 = open(args.ref).read().strip().split("\n\n") - # Make sure they have the same number of sentences - assert len(hyp_m2) == len(ref_m2) - - # Store global corpus level best counts here - best_dict = Counter({"tp":0, "fp":0, "fn":0}) - best_cats = {} - # Process each sentence - sents = zip(hyp_m2, ref_m2) - for sent_id, sent in enumerate(sents): - # Simplify the edits into lists of lists - hyp_edits = simplifyEdits(sent[0]) - ref_edits = simplifyEdits(sent[1]) - # Process the edits for detection/correction based on args - hyp_dict = processEdits(hyp_edits, args) - ref_dict = processEdits(ref_edits, args) - # Evaluate the edits and get the best TP, FP, FN counts for the best hyp+ref combo. - count_dict, cat_dict = evaluateEdits(hyp_dict, ref_dict, best_dict, sent_id, args) - # Merge these dicts with best_dict and best_cats - best_dict += Counter(count_dict) - best_cats = mergeDict(best_cats, cat_dict) - # Print results - printResults(best_dict, best_cats, args) - -# Input: An m2 format sentence with edits. -# Output: A list of lists. Each edit: [start, end, cat, cor, coder] -def simplifyEdits(sent): - out_edits = [] - # Get the edit lines from an m2 block. - edits = sent.split("\n")[1:] - # Loop through the edits - for edit in edits: - # Preprocessing - edit = edit[2:].split("|||") # Ignore "A " then split. - span = edit[0].split() - start = int(span[0]) - end = int(span[1]) - cat = edit[1] - cor = edit[2] - coder = int(edit[-1]) - out_edit = [start, end, cat, cor, coder] - out_edits.append(out_edit) - return out_edits - -# Input 1: A list of edits. Each edit: [start, end, cat, cor, coder] -# Input 2: Command line args -# Output: A dict; key is coder, value is edit dict. edit dict format varies based on args. -def processEdits(edits, args): - coder_dict = {} - # Add an explicit noop edit if there are no edits. - if not edits: edits = [[-1, -1, "noop", "-NONE-", 0]] - # Loop through the edits - for edit in edits: - # Name the edit elements for clarity - start = edit[0] - end = edit[1] - cat = edit[2] - cor = edit[3] - coder = edit[4] - # Add the coder to the coder_dict if necessary - if coder not in coder_dict: coder_dict[coder] = {} - - # Optionally apply filters based on args - # 1. UNK type edits are only useful for detection, not correction. - if not args.dt and not args.ds and cat == "UNK": continue - # 2. Only evaluate single token edits; i.e. 0:1, 1:0 or 1:1 - if args.single and (end-start >= 2 or len(cor.split()) >= 2): continue - # 3. Only evaluate multi token edits; i.e. 2+:n or n:2+ - if args.multi and end-start < 2 and len(cor.split()) < 2: continue - # 4. If there is a filter, ignore the specified error types - if args.filt and cat in args.filt: continue - - # Token Based Detection - if args.dt: - # Preserve noop edits. 
- if start == -1: - if (start, start) in coder_dict[coder].keys(): - coder_dict[coder][(start, start)].append(cat) - else: - coder_dict[coder][(start, start)] = [cat] - # Insertions defined as affecting the token on the right - elif start == end and start >= 0: - if (start, start+1) in coder_dict[coder].keys(): - coder_dict[coder][(start, start+1)].append(cat) - else: - coder_dict[coder][(start, start+1)] = [cat] - # Edit spans are split for each token in the range. - else: - for tok_id in range(start, end): - if (tok_id, tok_id+1) in coder_dict[coder].keys(): - coder_dict[coder][(tok_id, tok_id+1)].append(cat) - else: - coder_dict[coder][(tok_id, tok_id+1)] = [cat] - - # Span Based Detection - elif args.ds: - if (start, end) in coder_dict[coder].keys(): - coder_dict[coder][(start, end)].append(cat) - else: - coder_dict[coder][(start, end)] = [cat] - - # Span Based Correction - else: - # With error type classification - if args.cse: - if (start, end, cat, cor) in coder_dict[coder].keys(): - coder_dict[coder][(start, end, cat, cor)].append(cat) - else: - coder_dict[coder][(start, end, cat, cor)] = [cat] - # Without error type classification - else: - if (start, end, cor) in coder_dict[coder].keys(): - coder_dict[coder][(start, end, cor)].append(cat) - else: - coder_dict[coder][(start, end, cor)] = [cat] - return coder_dict - -# Input 1: A hyp dict; key is coder_id, value is dict of processed hyp edits. -# Input 2: A ref dict; key is coder_id, value is dict of processed ref edits. -# Input 3: A dictionary of the best corpus level TP, FP and FN counts so far. -# Input 4: Sentence ID (for verbose output only) -# Input 5: Command line args -# Output 1: A dict of the best corpus level TP, FP and FN counts for the input sentence. -# Output 2: A dict of the equivalent error types for the best corpus level TP, FP and FNs. -def evaluateEdits(hyp_dict, ref_dict, best, sent_id, args): - # Store the best sentence level scores and hyp+ref combination IDs - # best_f is initialised as -1 cause 0 is a valid result. - best_tp, best_fp, best_fn, best_f, best_hyp, best_ref = 0, 0, 0, -1, 0, 0 - best_cat = {} - # Compare each hyp and ref combination - for hyp_id in hyp_dict.keys(): - for ref_id in ref_dict.keys(): - # Get the local counts for the current combination. - tp, fp, fn, cat_dict = compareEdits(hyp_dict[hyp_id], ref_dict[ref_id]) - # Compute the local sentence scores (for verbose output only) - loc_p, loc_r, loc_f = computeFScore(tp, fp, fn, args.beta) - # Compute the global sentence scores - p, r, f = computeFScore(tp+best["tp"], fp+best["fp"], fn+best["fn"], args.beta) - # Save the scores if they are the current best hyp+ref combo in terms of: - # 1. Higher F-score - # 2. Same F-score, higher TP - # 3. Same F-score and TP, lower FP - # 4. Same F-score, TP and FP, lower FN - if (f > best_f) or \ - (f == best_f and tp > best_tp) or \ - (f == best_f and tp == best_tp and fp < best_fp) or \ - (f == best_f and tp == best_tp and fp == best_fp and fn < best_fn): - best_tp, best_fp, best_fn, best_f, best_hyp, best_ref = tp, fp, fn, f, hyp_id, ref_id - best_cat = cat_dict - # Verbose output - if args.verbose: - # Prepare verbose output edits. 
- hyp_verb = list(sorted(hyp_dict[hyp_id].keys())) - ref_verb = list(sorted(ref_dict[ref_id].keys())) - # Ignore noop edits - if not hyp_verb or hyp_verb[0][0] == -1: hyp_verb = [] - if not ref_verb or ref_verb[0][0] == -1: ref_verb = [] - # Print verbose info - print('{:-^40}'.format("")) - print("SENTENCE "+str(sent_id)+" - HYP "+str(hyp_id)+" - REF "+str(ref_id)) - print("HYPOTHESIS EDITS :", hyp_verb) - print("REFERENCE EDITS :", ref_verb) - print("Local TP/FP/FN :", str(tp), str(fp), str(fn)) - print("Local P/R/F"+str(args.beta)+" :", str(loc_p), str(loc_r), str(loc_f)) - print("Global TP/FP/FN :", str(tp+best["tp"]), str(fp+best["fp"]), str(fn+best["fn"])) - print("Global P/R/F"+str(args.beta)+" :", str(p), str(r), str(f)) - # Verbose output: display the best hyp+ref combination - if args.verbose: - print('{:-^40}'.format("")) - print("^^ HYP "+str(best_hyp)+", REF "+str(best_ref)+" chosen for sentence "+str(sent_id)) - # Save the best TP, FP and FNs as a dict, and return this and the best_cat dict - best_dict = {"tp":best_tp, "fp":best_fp, "fn":best_fn} - return best_dict, best_cat - -# Input 1: A dictionary of hypothesis edits for a single system. -# Input 2: A dictionary of reference edits for a single annotator. -# Output 1-3: The TP, FP and FN for the hyp vs the given ref annotator. -# Output 4: A dictionary of the error type counts. -def compareEdits(hyp_edits, ref_edits): - tp = 0 # True Positives - fp = 0 # False Positives - fn = 0 # False Negatives - cat_dict = {} # {cat: [tp, fp, fn], ...} - - for h_edit, h_cats in hyp_edits.items(): - # noop hyp edits cannot be TP or FP - if h_cats[0] == "noop": continue - # TRUE POSITIVES - if h_edit in ref_edits.keys(): - # On occasion, multiple tokens at same span. - for h_cat in ref_edits[h_edit]: # Use ref dict for TP - tp += 1 - # Each dict value [TP, FP, FN] - if h_cat in cat_dict.keys(): - cat_dict[h_cat][0] += 1 - else: - cat_dict[h_cat] = [1, 0, 0] - # FALSE POSITIVES - else: - # On occasion, multiple tokens at same span. - for h_cat in h_cats: - fp += 1 - # Each dict value [TP, FP, FN] - if h_cat in cat_dict.keys(): - cat_dict[h_cat][1] += 1 - else: - cat_dict[h_cat] = [0, 1, 0] - for r_edit, r_cats in ref_edits.items(): - # noop ref edits cannot be FN - if r_cats[0] == "noop": continue - # FALSE NEGATIVES - if r_edit not in hyp_edits.keys(): - # On occasion, multiple tokens at same span. - for r_cat in r_cats: - fn += 1 - # Each dict value [TP, FP, FN] - if r_cat in cat_dict.keys(): - cat_dict[r_cat][2] += 1 - else: - cat_dict[r_cat] = [0, 0, 1] - return tp, fp, fn, cat_dict - -# Input 1-3: True positives, false positives, false negatives -# Input 4: Value of beta in F-score. -# Output 1-3: Precision, Recall and F-score rounded to 4dp. -def computeFScore(tp, fp, fn, beta): - p = float(tp)/(tp+fp) if fp else 1.0 - r = float(tp)/(tp+fn) if fn else 1.0 - f = float((1+(beta**2))*p*r)/(((beta**2)*p)+r) if p+r else 0.0 - return round(p, 4), round(r, 4), round(f, 4) - -# Input 1-2: Two error category dicts. Key is cat, value is list of TP, FP, FN. -# Output: The dictionaries combined with cumulative TP, FP, FN. -def mergeDict(dict1, dict2): - for cat, stats in dict2.items(): - if cat in dict1.keys(): - dict1[cat] = [x+y for x, y in zip(dict1[cat], stats)] - else: - dict1[cat] = stats - return dict1 - -# Input 1: A dict; key is error cat, value is counts for [tp, fp, fn] -# Input 2: Integer value denoting level of error category granularity. -# Specifically, 1: Operation tier; e.g. M, R, U. 2: Main tier; e.g. NOUN, VERB 3: Everything. 
-# Output: A dictionary of category TP, FP and FN based on Input 2. -def processCategories(cat_dict, setting): - # Otherwise, do some processing. - proc_cat_dict = {} - for cat, cnt in cat_dict.items(): - if cat == "UNK": - proc_cat_dict[cat] = cnt - continue - # M, U, R or UNK combined only. - if setting == 1: - if cat[0] in proc_cat_dict.keys(): - proc_cat_dict[cat[0]] = [x+y for x, y in zip(proc_cat_dict[cat[0]], cnt)] - else: - proc_cat_dict[cat[0]] = cnt - # Everything without M, U or R. - elif setting == 2: - if cat[2:] in proc_cat_dict.keys(): - proc_cat_dict[cat[2:]] = [x+y for x, y in zip(proc_cat_dict[cat[2:]], cnt)] - else: - proc_cat_dict[cat[2:]] = cnt - # All error category combinations - else: - return cat_dict - return proc_cat_dict - -# Input 1: A dict of global best TP, FP and FNs -# Input 2: A dict of error types and counts for those TP, FP and FNs -# Input 3: Command line args -def printResults(best, best_cats, args): - # Prepare output title. - if args.dt: title = " Token-Based Detection " - elif args.ds: title = " Span-Based Detection " - elif args.cse: title = " Span-Based Correction + Classification " - else: title = " Span-Based Correction " - - # Category Scores - if args.cat: - best_cats = processCategories(best_cats, args.cat) - print("") - print('{:=^66}'.format(title)) - print("Category".ljust(14), "TP".ljust(8), "FP".ljust(8), "FN".ljust(8), "P".ljust(8), "R".ljust(8), "F"+str(args.beta)) - for cat, cnts in sorted(best_cats.items()): - cat_p, cat_r, cat_f = computeFScore(cnts[0], cnts[1], cnts[2], args.beta) - print(cat.ljust(14), str(cnts[0]).ljust(8), str(cnts[1]).ljust(8), str(cnts[2]).ljust(8), str(cat_p).ljust(8), str(cat_r).ljust(8), cat_f) - - # Print the overall results. - print("") - print('{:=^46}'.format(title)) - print("\t".join(["TP", "FP", "FN", "Prec", "Rec", "F"+str(args.beta)])) - print("\t".join(map(str, [best["tp"], best["fp"], best["fn"]]+list(computeFScore(best["tp"], best["fp"], best["fn"], args.beta))))) - print('{:=^46}'.format("")) - print("") - -if __name__ == "__main__": - # Define and parse program input - parser = argparse.ArgumentParser(description="Calculate F-scores for error detection and/or correction.\n" - "Flags let you evaluate error types at different levels of granularity.", - formatter_class=argparse.RawTextHelpFormatter, - usage="%(prog)s [options] -hyp HYP -ref REF") - parser.add_argument("-hyp", help="A hypothesis M2 file", required=True) - parser.add_argument("-ref", help="A reference M2 file", required=True) - parser.add_argument("-b", "--beta", help="Value of beta in F-score. (default: 0.5)", default=0.5, type=float) - parser.add_argument("-v", "--verbose", help="Print verbose output.", action="store_true") - eval_type = parser.add_mutually_exclusive_group() - eval_type.add_argument("-dt", help="Evaluate Detection in terms of Tokens.", action="store_true") - eval_type.add_argument("-ds", help="Evaluate Detection in terms of Spans.", action="store_true") - eval_type.add_argument("-cs", help="Evaluate Correction in terms of Spans. (default)", action="store_true") - eval_type.add_argument("-cse", help="Evaluate Correction in terms of Spans and Error types.", action="store_true") - parser.add_argument("-single", help="Only evaluate single token edits; i.e. 0:1, 1:0 or 1:1", action="store_true") - parser.add_argument("-multi", help="Only evaluate multi token edits; i.e. 
2+:n or n:2+", action="store_true") - parser.add_argument("-filt", help="Do not evaluate the specified error types", default=[], nargs="+") - parser.add_argument("-cat", help="Show error category scores.\n" - "1: Only show operation tier scores; e.g. R.\n" - "2: Only show main tier scores; e.g. NOUN.\n" - "3: Show all category scores; e.g. R:NOUN.", - choices=[1, 2, 3], type=int) - args = parser.parse_args() - # Run the program - main(args) \ No newline at end of file diff --git a/demo/cor2.txt b/demo/cor2.txt index bc63215..df86996 100644 --- a/demo/cor2.txt +++ b/demo/cor2.txt @@ -1,4 +1,4 @@ These are great sentences . -Can you see the sea from where you live ? +Can you see the sea from where live you ? Let us talk about the software problems you 've been having recently . This sentence contains no errors . diff --git a/demo/orig.txt b/demo/orig.txt index b04cf77..9a0b9b7 100644 --- a/demo/orig.txt +++ b/demo/orig.txt @@ -1,5 +1,5 @@ This are a great sentences . -Can you seen the sea from where you live . -Let us discuss about all the softwares problems you 've been having recently . +Can you seen the sea from where live you . +Let us discuss about all softwares problems you 've been having recently . This sentence contains no errors . \ No newline at end of file diff --git a/demo/out.m2 b/demo/out.m2 index 49bf18b..4eb63ef 100644 --- a/demo/out.m2 +++ b/demo/out.m2 @@ -4,18 +4,20 @@ A 4 5|||R:NOUN:NUM|||sentence|||REQUIRED|||-NONE-|||0 A 0 1|||R:DET|||These|||REQUIRED|||-NONE-|||1 A 2 3|||U:DET||||||REQUIRED|||-NONE-|||1 -S Can you seen the sea from where you live . +S Can you seen the sea from where live you . A 2 3|||R:VERB:FORM|||see|||REQUIRED|||-NONE-|||0 +A 7 9|||R:WO|||you live|||REQUIRED|||-NONE-|||0 A 9 10|||R:PUNCT|||?|||REQUIRED|||-NONE-|||0 A 2 3|||R:VERB:FORM|||see|||REQUIRED|||-NONE-|||1 A 9 10|||R:PUNCT|||?|||REQUIRED|||-NONE-|||1 -S Let us discuss about all the softwares problems you 've been having recently . -A 3 4|||U:PREP||||||REQUIRED|||-NONE-|||0 -A 6 7|||R:NOUN:INFL|||software|||REQUIRED|||-NONE-|||0 +S Let us discuss about all softwares problems you 've been having recently . +A 3 4|||U:ADV||||||REQUIRED|||-NONE-|||0 +A 5 5|||M:DET|||the|||REQUIRED|||-NONE-|||0 +A 5 6|||R:NOUN:INFL|||software|||REQUIRED|||-NONE-|||0 A 2 3|||R:VERB|||talk|||REQUIRED|||-NONE-|||1 -A 4 5|||U:DET||||||REQUIRED|||-NONE-|||1 -A 6 7|||R:NOUN:INFL|||software|||REQUIRED|||-NONE-|||1 +A 4 5|||R:DET|||the|||REQUIRED|||-NONE-|||1 +A 5 6|||R:NOUN:INFL|||software|||REQUIRED|||-NONE-|||1 S This sentence contains no errors . A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0 diff --git a/demo/readme.md b/demo/readme.md index 75b46eb..1dd461b 100644 --- a/demo/readme.md +++ b/demo/readme.md @@ -1,9 +1,7 @@ ## ERRANT Demo -Assuming you have read the main readme and downloaded all the dependencies etc., you can try running ERRANT on the sample text in this folder to make sure it's running properly. +Assuming you have read the main readme and installed ERRANT successfully, you can try running it on the sample text in this directory to make sure it's running properly: -Specifically, from this demo directory run: - -`python3 ../parallel_to_m2.py -orig orig.txt -cor cor1.txt cor2.txt -out test.m2` +`errant_parallel -orig orig.txt -cor cor1.txt cor2.txt -out test.m2` This should produce a file called `test.m2` which is identical to `out.m2`. 
\ No newline at end of file diff --git a/errant/__init__.py b/errant/__init__.py new file mode 100644 index 0000000..a56627a --- /dev/null +++ b/errant/__init__.py @@ -0,0 +1,24 @@ +from importlib import import_module +import spacy +from errant.annotator import Annotator + +# Load an ERRANT Annotator object for a given language +def load(lang, nlp=None): + # Make sure the language is supported + supported = {"en"} + if lang not in supported: + raise Exception("%s is an unsupported or unknown language" % lang) + + # Load spacy + nlp = nlp or spacy.load(lang, disable=["ner"]) + + # Load language edit merger + merger = import_module("errant.%s.merger" % lang) + + # Load language edit classifier + classifier = import_module("errant.%s.classifier" % lang) + # The English classifier needs spacy + if lang == "en": classifier.nlp = nlp + + # Return a configured ERRANT annotator + return Annotator(lang, nlp, merger, classifier) \ No newline at end of file diff --git a/errant/alignment.py b/errant/alignment.py new file mode 100644 index 0000000..eceb57a --- /dev/null +++ b/errant/alignment.py @@ -0,0 +1,174 @@ +from difflib import SequenceMatcher +from itertools import groupby +import spacy.parts_of_speech as POS +from errant.edit import Edit + +class Alignment: + # Protected class resource + _open_pos = {POS.ADJ, POS.ADV, POS.NOUN, POS.VERB} + + # Input 1: An original text string parsed by spacy + # Input 2: A corrected text string parsed by spacy + # Input 3: A flag for standard Levenshtein alignment + def __init__(self, orig, cor, lev=False): + # Set orig and cor + self.orig = orig + self.cor = cor + # Align orig and cor and get the cost and op matrices + self.cost_matrix, self.op_matrix = self.align(lev) + # Get the cheapest align sequence from the op matrix + self.align_seq = self.get_cheapest_align_seq() + + # Input: A flag for standard Levenshtein alignment + # Output: The cost matrix and the operation matrix of the alignment + def align(self, lev): + # Sentence lengths + o_len = len(self.orig) + c_len = len(self.cor) + # Lower case token IDs (for transpositions) + o_low = [o.lower for o in self.orig] + c_low = [c.lower for c in self.cor] + # Create the cost_matrix and the op_matrix + cost_matrix = [[0.0 for j in range(c_len+1)] for i in range(o_len+1)] + op_matrix = [["O" for j in range(c_len+1)] for i in range(o_len+1)] + # Fill in the edges + for i in range(1, o_len+1): + cost_matrix[i][0] = cost_matrix[i-1][0] + 1 + op_matrix[i][0] = "D" + for j in range(1, c_len+1): + cost_matrix[0][j] = cost_matrix[0][j-1] + 1 + op_matrix[0][j] = "I" + + # Loop through the cost_matrix + for i in range(o_len): + for j in range(c_len): + # Matches + if self.orig[i].orth == self.cor[j].orth: + cost_matrix[i+1][j+1] = cost_matrix[i][j] + op_matrix[i+1][j+1] = "M" + # Non-matches + else: + del_cost = cost_matrix[i][j+1] + 1 + ins_cost = cost_matrix[i+1][j] + 1 + trans_cost = float("inf") + # Standard Levenshtein (S = 1) + if lev: sub_cost = cost_matrix[i][j] + 1 + # Linguistic Damerau-Levenshtein + else: + # Custom substitution + sub_cost = cost_matrix[i][j] + \ + self.get_sub_cost(self.orig[i], self.cor[j]) + # Transpositions require >=2 tokens + # Traverse the diagonal while there is not a Match. 
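+                    # A transposition covering k+1 tokens is only proposed when the
+                    # lowercased spans are permutations of each other; its cost is
+                    # the cost of the cell before the block plus k.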
+ k = 1 + while i-k >= 0 and j-k >= 0 and \ + cost_matrix[i-k+1][j-k+1]-cost_matrix[i-k][j-k] > 0: + if sorted(o_low[i-k:i+1]) == sorted(c_low[j-k:j+1]): + trans_cost = cost_matrix[i-k][j-k] + k + break + k += 1 + # Costs + costs = [trans_cost, sub_cost, ins_cost, del_cost] + # Get the index of the cheapest (first cheapest if tied) + l = costs.index(min(costs)) + # Save the cost and the op in the matrices + cost_matrix[i+1][j+1] = costs[l] + if l == 0: op_matrix[i+1][j+1] = "T"+str(k+1) + elif l == 1: op_matrix[i+1][j+1] = "S" + elif l == 2: op_matrix[i+1][j+1] = "I" + else: op_matrix[i+1][j+1] = "D" + # Return the matrices + return cost_matrix, op_matrix + + # Input 1: A spacy orig Token + # Input 2: A spacy cor Token + # Output: A linguistic cost between 0 < x < 2 + def get_sub_cost(self, o, c): + # Short circuit if the only difference is case + if o.lower == c.lower: return 0 + # Lemma cost + if o.lemma == c.lemma: lemma_cost = 0 + else: lemma_cost = 0.499 + # POS cost + if o.pos == c.pos: pos_cost = 0 + elif o.pos in self._open_pos and c.pos in self._open_pos: pos_cost = 0.25 + else: pos_cost = 0.5 + # Char cost + char_cost = 1-SequenceMatcher(None, o.text, c.text).ratio() + # Combine the costs + return lemma_cost + pos_cost + char_cost + + # Get the cheapest alignment sequence and indices from the op matrix + # align_seq = [(op, o_start, o_end, c_start, c_end), ...] + def get_cheapest_align_seq(self): + i = len(self.op_matrix)-1 + j = len(self.op_matrix[0])-1 + align_seq = [] + # Work backwards from bottom right until we hit top left + while i + j != 0: + # Get the edit operation in the current cell + op = self.op_matrix[i][j] + # Matches and substitutions + if op in {"M", "S"}: + align_seq.append((op, i-1, i, j-1, j)) + i -= 1 + j -= 1 + # Deletions + elif op == "D": + align_seq.append((op, i-1, i, j, j)) + i -= 1 + # Insertions + elif op == "I": + align_seq.append((op, i, i, j-1, j)) + j -= 1 + # Transpositions + else: + # Get the size of the transposition + k = int(op[1:]) + align_seq.append((op, i-k, i, j-k, j)) + i -= k + j -= k + # Reverse the list to go from left to right and return + align_seq.reverse() + return align_seq + + # all-split: Don't merge anything + def get_all_split_edits(self): + edits = [] + for align in self.align_seq: + if align[0] != "M": + edits.append(Edit(self.orig, self.cor, align[1:])) + return edits + + # all-merge: Merge all adjacent non-match ops + def get_all_merge_edits(self): + edits = [] + for op, group in groupby(self.align_seq, + lambda x: True if x[0] == "M" else False): + if not op: + merged = self.merge_edits(list(group)) + edits.append(Edit(self.orig, self.cor, merged[0][1:])) + return edits + + # all-equal: Merge all edits of the same operation type. 
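+    # e.g. MSSDI -> M, SS, D, I (only the non-match groups become edits)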
+ def get_all_equal_edits(self): + edits = [] + for op, group in groupby(self.align_seq, lambda x: x[0]): + if op != "M": + merged = self.merge_edits(list(group)) + edits.append(Edit(self.orig, self.cor, merged[0][1:])) + return edits + + # Merge the input alignment sequence to a single edit span + def merge_edits(self, seq): + if seq: return [("X", seq[0][1], seq[-1][2], seq[0][3], seq[-1][4])] + else: return seq + + # Alignment object string representation + def __str__(self): + orig = " ".join(["Orig:"]+[tok.text for tok in self.orig]) + cor = " ".join(["Cor:"]+[tok.text for tok in self.cor]) + cost_matrix = "\n".join(["Cost Matrix:"]+[str(row) for row in self.cost_matrix]) + op_matrix = "\n".join(["Operation Matrix:"]+[str(row) for row in self.op_matrix]) + seq = "Best alignment: "+str([a[0] for a in self.align_seq]) + return "\n".join([orig, cor, cost_matrix, op_matrix, seq]) \ No newline at end of file diff --git a/errant/annotator.py b/errant/annotator.py new file mode 100644 index 0000000..fb5bbbb --- /dev/null +++ b/errant/annotator.py @@ -0,0 +1,98 @@ +from errant.alignment import Alignment +from errant.edit import Edit + +# Main ERRANT Annotator class +class Annotator: + + # Input 1: A string language id: e.g. "en" + # Input 2: A spacy processing object for the language + # Input 3: A merging module for the language + # Input 4: A classifier module for the language + def __init__(self, lang, nlp=None, merger=None, classifier=None): + self.lang = lang + self.nlp = nlp + self.merger = merger + self.classifier = classifier + + # Input 1: A text string + # Input 2: A flag for word tokenisation + # Output: The input string parsed by spacy + def parse(self, text, tokenise=False): + if tokenise: + text = self.nlp(text) + else: + text = self.nlp.tokenizer.tokens_from_list(text.split()) + self.nlp.tagger(text) + self.nlp.parser(text) + return text + + # Input 1: An original text string parsed by spacy + # Input 2: A corrected text string parsed by spacy + # Input 3: A flag for standard Levenshtein alignment + # Output: An Alignment object + def align(self, orig, cor, lev=False): + return Alignment(orig, cor, lev) + + # Input 1: An Alignment object + # Input 2: A flag for merging strategy + # Output: A list of Edit objects + def merge(self, alignment, merging="rules"): + # rules: Rule-based merging + if merging == "rules": + edits = self.merger.get_rule_edits(alignment) + # all-split: Don't merge anything + elif merging == "all-split": + edits = alignment.get_all_split_edits() + # all-merge: Merge all adjacent non-match ops + elif merging == "all-merge": + edits = alignment.get_all_merge_edits() + # all-equal: Merge all edits of the same operation type + elif merging == "all-equal": + edits = alignment.get_all_equal_edits() + # Unknown + else: + raise Exception("Unknown merging strategy. 
Choose from: " + "rules, all-split, all-merge, all-equal.") + return edits + + # Input: An Edit object + # Output: The same Edit object with an updated error type + def classify(self, edit): + return self.classifier.classify(edit) + + # Input 1: An original text string parsed by spacy + # Input 2: A corrected text string parsed by spacy + # Input 3: A flag for standard Levenshtein alignment + # Input 4: A flag for merging strategy + # Output: A list of automatically extracted, typed Edit objects + def annotate(self, orig, cor, lev=False, merging="rules"): + alignment = self.align(orig, cor, lev) + edits = self.merge(alignment, merging) + for edit in edits: + edit = self.classify(edit) + return edits + + # Input 1: An original text string parsed by spacy + # Input 2: A corrected text string parsed by spacy + # Input 3: A token span edit list; [o_start, o_end, c_start, c_end, (cat)] + # Input 4: A flag for gold edit minimisation; e.g. [a b -> a c] = [b -> c] + # Input 5: A flag to preserve the old error category (i.e. turn off classifier) + # Output: An Edit object + def import_edit(self, orig, cor, edit, min=True, old_cat=False): + # Undefined error type + if len(edit) == 4: + edit = Edit(orig, cor, edit) + # Existing error type + elif len(edit) == 5: + edit = Edit(orig, cor, edit[:4], edit[4]) + # Unknown edit format + else: + raise Exception("Edit not of the form: " + "[o_start, o_end, c_start, c_end, (cat)]") + # Minimise edit + if min: + edit = edit.minimise() + # Classify edit + if not old_cat: + edit = self.classify(edit) + return edit diff --git a/scripts/__init__.py b/errant/commands/__init__.py similarity index 100% rename from scripts/__init__.py rename to errant/commands/__init__.py diff --git a/errant/commands/compare_m2.py b/errant/commands/compare_m2.py new file mode 100644 index 0000000..6231e6c --- /dev/null +++ b/errant/commands/compare_m2.py @@ -0,0 +1,381 @@ +import argparse +from collections import Counter + +def main(): + # Parse command line args + args = parse_args() + # Open hypothesis and reference m2 files and split into chunks + hyp_m2 = open(args.hyp).read().strip().split("\n\n") + ref_m2 = open(args.ref).read().strip().split("\n\n") + # Make sure they have the same number of sentences + assert len(hyp_m2) == len(ref_m2) + + # Store global corpus level best counts here + best_dict = Counter({"tp":0, "fp":0, "fn":0}) + best_cats = {} + # Process each sentence + sents = zip(hyp_m2, ref_m2) + for sent_id, sent in enumerate(sents): + # Simplify the edits into lists of lists + hyp_edits = simplify_edits(sent[0]) + ref_edits = simplify_edits(sent[1]) + # Process the edits for detection/correction based on args + hyp_dict = process_edits(hyp_edits, args) + ref_dict = process_edits(ref_edits, args) + # Evaluate edits and get best TP, FP, FN hyp+ref combo. 
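+        # (i.e. the hyp/ref annotator pairing that maximises the cumulative F-score)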
+ count_dict, cat_dict = evaluate_edits( + hyp_dict, ref_dict, best_dict, sent_id, args) + # Merge these dicts with best_dict and best_cats + best_dict += Counter(count_dict) + best_cats = merge_dict(best_cats, cat_dict) + # Print results + print_results(best_dict, best_cats, args) + +# Parse command line args +def parse_args(): + parser = argparse.ArgumentParser( + description="Calculate F-scores for error detection and/or correction.\n" + "Flags let you evaluate at different levels of granularity.", + formatter_class=argparse.RawTextHelpFormatter, + usage="%(prog)s [options] -hyp HYP -ref REF") + parser.add_argument( + "-hyp", + help="A hypothesis M2 file.", + required=True) + parser.add_argument( + "-ref", + help="A reference M2 file.", + required=True) + parser.add_argument( + "-b", + "--beta", + help="Value of beta in F-score. (default: 0.5)", + default=0.5, + type=float) + parser.add_argument( + "-v", + "--verbose", + help="Print verbose output.", + action="store_true") + eval_type = parser.add_mutually_exclusive_group() + eval_type.add_argument( + "-dt", + help="Evaluate Detection in terms of Tokens.", + action="store_true") + eval_type.add_argument( + "-ds", + help="Evaluate Detection in terms of Spans.", + action="store_true") + eval_type.add_argument( + "-cs", + help="Evaluate Correction in terms of Spans. (default)", + action="store_true") + eval_type.add_argument( + "-cse", + help="Evaluate Correction in terms of Spans and Error types.", + action="store_true") + parser.add_argument( + "-single", + help="Only evaluate single token edits; i.e. 0:1, 1:0 or 1:1", + action="store_true") + parser.add_argument( + "-multi", + help="Only evaluate multi token edits; i.e. 2+:n or n:2+", + action="store_true") + parser.add_argument( + "-filt", + help="Do not evaluate the specified error types.", + nargs="+", + default=[]) + parser.add_argument( + "-cat", + help="Show error category scores.\n" + "1: Only show operation tier scores; e.g. R.\n" + "2: Only show main tier scores; e.g. NOUN.\n" + "3: Show all category scores; e.g. R:NOUN.", + choices=[1, 2, 3], + type=int) + args = parser.parse_args() + return args + +# Input: An m2 format sentence with edits. +# Output: A list of lists. Each edit: [start, end, cat, cor, coder] +def simplify_edits(sent): + out_edits = [] + # Get the edit lines from an m2 block. + edits = sent.split("\n")[1:] + # Loop through the edits + for edit in edits: + # Preprocessing + edit = edit[2:].split("|||") # Ignore "A " then split. + span = edit[0].split() + start = int(span[0]) + end = int(span[1]) + cat = edit[1] + cor = edit[2] + coder = int(edit[-1]) + out_edit = [start, end, cat, cor, coder] + out_edits.append(out_edit) + return out_edits + +# Input 1: A list of edits. Each edit: [start, end, cat, cor, coder] +# Input 2: Command line args +# Output: A dict; key is coder, value is edit dict. +def process_edits(edits, args): + coder_dict = {} + # Add an explicit noop edit if there are no edits. + if not edits: edits = [[-1, -1, "noop", "-NONE-", 0]] + # Loop through the edits + for edit in edits: + # Name the edit elements for clarity + start = edit[0] + end = edit[1] + cat = edit[2] + cor = edit[3] + coder = edit[4] + # Add the coder to the coder_dict if necessary + if coder not in coder_dict: coder_dict[coder] = {} + + # Optionally apply filters based on args + # 1. UNK type edits are only useful for detection, not correction. + if not args.dt and not args.ds and cat == "UNK": continue + # 2. Only evaluate single token edits; i.e. 
0:1, 1:0 or 1:1 + if args.single and (end-start >= 2 or len(cor.split()) >= 2): continue + # 3. Only evaluate multi token edits; i.e. 2+:n or n:2+ + if args.multi and end-start < 2 and len(cor.split()) < 2: continue + # 4. If there is a filter, ignore the specified error types + if args.filt and cat in args.filt: continue + + # Token Based Detection + if args.dt: + # Preserve noop edits. + if start == -1: + if (start, start) in coder_dict[coder].keys(): + coder_dict[coder][(start, start)].append(cat) + else: + coder_dict[coder][(start, start)] = [cat] + # Insertions defined as affecting the token on the right + elif start == end and start >= 0: + if (start, start+1) in coder_dict[coder].keys(): + coder_dict[coder][(start, start+1)].append(cat) + else: + coder_dict[coder][(start, start+1)] = [cat] + # Edit spans are split for each token in the range. + else: + for tok_id in range(start, end): + if (tok_id, tok_id+1) in coder_dict[coder].keys(): + coder_dict[coder][(tok_id, tok_id+1)].append(cat) + else: + coder_dict[coder][(tok_id, tok_id+1)] = [cat] + + # Span Based Detection + elif args.ds: + if (start, end) in coder_dict[coder].keys(): + coder_dict[coder][(start, end)].append(cat) + else: + coder_dict[coder][(start, end)] = [cat] + + # Span Based Correction + else: + # With error type classification + if args.cse: + if (start, end, cat, cor) in coder_dict[coder].keys(): + coder_dict[coder][(start, end, cat, cor)].append(cat) + else: + coder_dict[coder][(start, end, cat, cor)] = [cat] + # Without error type classification + else: + if (start, end, cor) in coder_dict[coder].keys(): + coder_dict[coder][(start, end, cor)].append(cat) + else: + coder_dict[coder][(start, end, cor)] = [cat] + return coder_dict + +# Input 1: A hyp dict; key is coder_id, value is dict of processed hyp edits. +# Input 2: A ref dict; key is coder_id, value is dict of processed ref edits. +# Input 3: A dictionary of the best corpus level TP, FP and FN counts so far. +# Input 4: Sentence ID (for verbose output only) +# Input 5: Command line args +# Output 1: A dict of the best corpus level TP, FP and FN for the input sentence. +# Output 2: The corresponding error type dict for the above dict. +def evaluate_edits(hyp_dict, ref_dict, best, sent_id, args): + # Store the best sentence level scores and hyp+ref combination IDs + # best_f is initialised as -1 cause 0 is a valid result. + best_tp, best_fp, best_fn, best_f, best_hyp, best_ref = 0, 0, 0, -1, 0, 0 + best_cat = {} + # Compare each hyp and ref combination + for hyp_id in hyp_dict.keys(): + for ref_id in ref_dict.keys(): + # Get the local counts for the current combination. + tp, fp, fn, cat_dict = compareEdits(hyp_dict[hyp_id], ref_dict[ref_id]) + # Compute the local sentence scores (for verbose output only) + loc_p, loc_r, loc_f = computeFScore(tp, fp, fn, args.beta) + # Compute the global sentence scores + p, r, f = computeFScore( + tp+best["tp"], fp+best["fp"], fn+best["fn"], args.beta) + # Save the scores if they are better in terms of: + # 1. Higher F-score + # 2. Same F-score, higher TP + # 3. Same F-score and TP, lower FP + # 4. Same F-score, TP and FP, lower FN + if (f > best_f) or \ + (f == best_f and tp > best_tp) or \ + (f == best_f and tp == best_tp and fp < best_fp) or \ + (f == best_f and tp == best_tp and fp == best_fp and fn < best_fn): + best_tp, best_fp, best_fn = tp, fp, fn + best_f, best_hyp, best_ref = f, hyp_id, ref_id + best_cat = cat_dict + # Verbose output + if args.verbose: + # Prepare verbose output edits. 
+ hyp_verb = list(sorted(hyp_dict[hyp_id].keys())) + ref_verb = list(sorted(ref_dict[ref_id].keys())) + # Ignore noop edits + if not hyp_verb or hyp_verb[0][0] == -1: hyp_verb = [] + if not ref_verb or ref_verb[0][0] == -1: ref_verb = [] + # Print verbose info + print('{:-^40}'.format("")) + print("SENTENCE "+str(sent_id)+" - HYP "+str(hyp_id)+" - REF "+str(ref_id)) + print("HYPOTHESIS EDITS :", hyp_verb) + print("REFERENCE EDITS :", ref_verb) + print("Local TP/FP/FN :", str(tp), str(fp), str(fn)) + print("Local P/R/F"+str(args.beta)+" :", str(loc_p), str(loc_r), str(loc_f)) + print("Global TP/FP/FN :", str(tp+best["tp"]), str(fp+best["fp"]), str(fn+best["fn"])) + print("Global P/R/F"+str(args.beta)+" :", str(p), str(r), str(f)) + # Verbose output: display the best hyp+ref combination + if args.verbose: + print('{:-^40}'.format("")) + print("^^ HYP "+str(best_hyp)+", REF "+str(best_ref)+" chosen for sentence "+str(sent_id)) + # Save the best TP, FP and FNs as a dict, and return this and the best_cat dict + best_dict = {"tp":best_tp, "fp":best_fp, "fn":best_fn} + return best_dict, best_cat + +# Input 1: A dictionary of hypothesis edits for a single system. +# Input 2: A dictionary of reference edits for a single annotator. +# Output 1-3: The TP, FP and FN for the hyp vs the given ref annotator. +# Output 4: A dictionary of the error type counts. +def compareEdits(hyp_edits, ref_edits): + tp = 0 # True Positives + fp = 0 # False Positives + fn = 0 # False Negatives + cat_dict = {} # {cat: [tp, fp, fn], ...} + + for h_edit, h_cats in hyp_edits.items(): + # noop hyp edits cannot be TP or FP + if h_cats[0] == "noop": continue + # TRUE POSITIVES + if h_edit in ref_edits.keys(): + # On occasion, multiple tokens at same span. + for h_cat in ref_edits[h_edit]: # Use ref dict for TP + tp += 1 + # Each dict value [TP, FP, FN] + if h_cat in cat_dict.keys(): + cat_dict[h_cat][0] += 1 + else: + cat_dict[h_cat] = [1, 0, 0] + # FALSE POSITIVES + else: + # On occasion, multiple tokens at same span. + for h_cat in h_cats: + fp += 1 + # Each dict value [TP, FP, FN] + if h_cat in cat_dict.keys(): + cat_dict[h_cat][1] += 1 + else: + cat_dict[h_cat] = [0, 1, 0] + for r_edit, r_cats in ref_edits.items(): + # noop ref edits cannot be FN + if r_cats[0] == "noop": continue + # FALSE NEGATIVES + if r_edit not in hyp_edits.keys(): + # On occasion, multiple tokens at same span. + for r_cat in r_cats: + fn += 1 + # Each dict value [TP, FP, FN] + if r_cat in cat_dict.keys(): + cat_dict[r_cat][2] += 1 + else: + cat_dict[r_cat] = [0, 0, 1] + return tp, fp, fn, cat_dict + +# Input 1-3: True positives, false positives, false negatives +# Input 4: Value of beta in F-score. +# Output 1-3: Precision, Recall and F-score rounded to 4dp. +def computeFScore(tp, fp, fn, beta): + p = float(tp)/(tp+fp) if fp else 1.0 + r = float(tp)/(tp+fn) if fn else 1.0 + f = float((1+(beta**2))*p*r)/(((beta**2)*p)+r) if p+r else 0.0 + return round(p, 4), round(r, 4), round(f, 4) + +# Input 1-2: Two error category dicts. Key is cat, value is list of TP, FP, FN. +# Output: The dictionaries combined with cumulative TP, FP, FN. +def merge_dict(dict1, dict2): + for cat, stats in dict2.items(): + if cat in dict1.keys(): + dict1[cat] = [x+y for x, y in zip(dict1[cat], stats)] + else: + dict1[cat] = stats + return dict1 + +# Input 1: A dict; key is error cat, value is counts for [tp, fp, fn] +# Input 2: Integer value denoting level of error category granularity. +# 1: Operation tier; e.g. M, R, U. 2: Main tier; e.g. NOUN, VERB 3: Everything. 
+# Output: A dictionary of category TP, FP and FN based on Input 2. +def processCategories(cat_dict, setting): + # Otherwise, do some processing. + proc_cat_dict = {} + for cat, cnt in cat_dict.items(): + if cat == "UNK": + proc_cat_dict[cat] = cnt + continue + # M, U, R or UNK combined only. + if setting == 1: + if cat[0] in proc_cat_dict.keys(): + proc_cat_dict[cat[0]] = [x+y for x, y in zip(proc_cat_dict[cat[0]], cnt)] + else: + proc_cat_dict[cat[0]] = cnt + # Everything without M, U or R. + elif setting == 2: + if cat[2:] in proc_cat_dict.keys(): + proc_cat_dict[cat[2:]] = [x+y for x, y in zip(proc_cat_dict[cat[2:]], cnt)] + else: + proc_cat_dict[cat[2:]] = cnt + # All error category combinations + else: + return cat_dict + return proc_cat_dict + +# Input 1: A dict of global best TP, FP and FNs +# Input 2: A dict of error types and counts for those TP, FP and FNs +# Input 3: Command line args +def print_results(best, best_cats, args): + # Prepare output title. + if args.dt: title = " Token-Based Detection " + elif args.ds: title = " Span-Based Detection " + elif args.cse: title = " Span-Based Correction + Classification " + else: title = " Span-Based Correction " + + # Category Scores + if args.cat: + best_cats = processCategories(best_cats, args.cat) + print("") + print('{:=^66}'.format(title)) + print("Category".ljust(14), "TP".ljust(8), "FP".ljust(8), "FN".ljust(8), + "P".ljust(8), "R".ljust(8), "F"+str(args.beta)) + for cat, cnts in sorted(best_cats.items()): + cat_p, cat_r, cat_f = computeFScore(cnts[0], cnts[1], cnts[2], args.beta) + print(cat.ljust(14), str(cnts[0]).ljust(8), str(cnts[1]).ljust(8), + str(cnts[2]).ljust(8), str(cat_p).ljust(8), str(cat_r).ljust(8), cat_f) + + # Print the overall results. + print("") + print('{:=^46}'.format(title)) + print("\t".join(["TP", "FP", "FN", "Prec", "Rec", "F"+str(args.beta)])) + print("\t".join(map(str, [best["tp"], best["fp"], + best["fn"]]+list(computeFScore(best["tp"], best["fp"], best["fn"], args.beta))))) + print('{:=^46}'.format("")) + print("") + +if __name__ == "__main__": + # Run the program + main() \ No newline at end of file diff --git a/errant/commands/m2_to_m2.py b/errant/commands/m2_to_m2.py new file mode 100644 index 0000000..a89d239 --- /dev/null +++ b/errant/commands/m2_to_m2.py @@ -0,0 +1,178 @@ +import argparse +import errant + +def main(): + # Parse command line args + args = parse_args() + print("Loading resources...") + # Load Errant + annotator = errant.load("en") + # Open output M2 file + out_m2 = open(args.out, "w") + + print("Processing M2 file...") + # Open the m2 file and split it into text+edit blocks + m2 = open(args.m2_file).read().strip().split("\n\n") + # Loop through the blocks + for m2_block in m2: + m2_block = m2_block.strip().split("\n") + # Write the original text to the output M2 file + out_m2.write(m2_block[0]+"\n") + # Parse orig with spacy + orig = annotator.parse(m2_block[0][2:]) + # Simplify the edits and sort by coder id + edit_dict = simplify_edits(m2_block[1:]) + # Loop through coder ids + for id, raw_edits in sorted(edit_dict.items()): + # If the first edit is a noop + if raw_edits[0][2] == "noop": + # Write the noop and continue + out_m2.write(noop_edit(id)+"\n") + continue + # Apply the edits to generate the corrected text + # Also redefine the edits as orig and cor token offsets + cor, gold_edits = get_cor_and_edits(m2_block[0][2:], raw_edits) + # Parse cor with spacy + cor = annotator.parse(cor) + # Save detection edits here for auto + det_edits = [] + # Loop through the gold edits + 
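+            # Um and UNK edits denote detected but uncorrected errors; they are handled separately below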
for gold_edit in gold_edits: + # Do not minimise detection edits + if gold_edit[-2] in {"Um", "UNK"}: + edit = annotator.import_edit(orig, cor, gold_edit[:-1], + min=False, old_cat=args.old_cats) + # Overwrite the pseudo correction and set it in the edit + edit.c_toks = annotator.parse(gold_edit[-1]) + # Save the edit for auto + det_edits.append(edit) + # Write the edit for gold + if args.gold: + # Write the edit + out_m2.write(edit.to_m2(id)+"\n") + # Gold annotation + elif args.gold: + edit = annotator.import_edit(orig, cor, gold_edit[:-1], + not args.no_min, args.old_cats) + # Write the edit + out_m2.write(edit.to_m2(id)+"\n") + # Auto annotations + if args.auto: + # Auto edits + edits = annotator.annotate(orig, cor, args.lev, args.merge) + # Combine detection and auto edits and sort by orig offsets + edits = sorted(det_edits+edits, key=lambda e:(e.o_start, e.o_end)) + # Write the edits to the output M2 file + for edit in edits: + out_m2.write(edit.to_m2(id)+"\n") + # Write a newline when there are no more edits + out_m2.write("\n") + +# Parse command line args +def parse_args(): + parser = argparse.ArgumentParser( + description = "Automatically extract and/or classify edits in an m2 file.", + formatter_class = argparse.RawTextHelpFormatter, + usage = "%(prog)s [-h] (-auto | -gold) [options] m2_file -out OUT") + parser.add_argument( + "m2_file", + help = "The path to an m2 file.") + type_group = parser.add_mutually_exclusive_group(required = True) + type_group.add_argument( + "-auto", + help = "Extract edits automatically.", + action = "store_true") + type_group.add_argument( + "-gold", + help = "Use existing edit alignments.", + action = "store_true") + parser.add_argument( + "-out", + help = "The output filepath.", + required = True) + parser.add_argument( + "-no_min", + help = "Do not minimise edit spans (gold only).", + action = "store_true") + parser.add_argument( + "-old_cats", + help = "Preserve old error types (gold only); i.e. 
turn off the classifier.", + action = "store_true") + parser.add_argument( + "-lev", + help = "Align using standard Levenshtein.", + action = "store_true") + parser.add_argument( + "-merge", + help = "Choose a merging strategy for automatic alignment.\n" + "rules: Use a rule-based merging strategy (default)\n" + "all-split: Merge nothing: MSSDI -> M, S, S, D, I\n" + "all-merge: Merge adjacent non-matches: MSSDI -> M, SSDI\n" + "all-equal: Merge adjacent same-type non-matches: MSSDI -> M, SS, D, I", + choices = ["rules", "all-split", "all-merge", "all-equal"], + default = "rules") + args = parser.parse_args() + return args + +# Input: A list of edit lines from an m2 file +# Output: An edit dictionary; key is coder id, value is a list of edits +def simplify_edits(edits): + edit_dict = {} + for edit in edits: + edit = edit.split("|||") + span = edit[0][2:].split() # [2:] ignore the leading "A " + start = int(span[0]) + end = int(span[1]) + cat = edit[1] + cor = edit[2] + id = edit[-1] + # Save the useful info as a list + proc_edit = [start, end, cat, cor] + # Save the proc_edit inside the edit_dict using coder id + if id in edit_dict.keys(): + edit_dict[id].append(proc_edit) + else: + edit_dict[id] = [proc_edit] + return edit_dict + +# Input 1: A tokenised original text string +# Input 2: A list of edits; [o_start, o_end, cat, cor] +# Output 1: A tokenised corrected text string +# Output 2: A list of edits; [o_start, o_end, c_start, c_end, cat, cor] +def get_cor_and_edits(orig, edits): + # Copy orig; we will apply edits to it to make cor + cor = orig.split() + new_edits = [] + offset = 0 + # Sort the edits by offsets before processing them + edits = sorted(edits, key=lambda e:(e[0], e[1])) + # Loop through edits: [o_start, o_end, cat, cor_str] + for edit in edits: + o_start = edit[0] + o_end = edit[1] + cat = edit[2] + cor_toks = edit[3].split() + # Detection edits + if cat in {"Um", "UNK"}: + # Save the pseudo correction + det_toks = cor_toks[:] + # But temporarily overwrite it to be the same as orig + cor_toks = orig.split()[o_start:o_end] + # Apply the edits + cor[o_start+offset:o_end+offset] = cor_toks + # Get the cor token start and end offsets in cor + c_start = o_start+offset + c_end = c_start+len(cor_toks) + # Keep track of how this affects orig edit offsets + offset = offset-(o_end-o_start)+len(cor_toks) + # Detection edits: Restore the pseudo correction + if cat in {"Um", "UNK"}: cor_toks = det_toks + # Update the edit with cor span and save + new_edit = [o_start, o_end, c_start, c_end, cat, " ".join(cor_toks)] + new_edits.append(new_edit) + return " ".join(cor), new_edits + +# Input: A coder id +# Output: A noop edit; i.e. text contains no edits +def noop_edit(id=0): + return "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||"+str(id) \ No newline at end of file diff --git a/errant/commands/parallel_to_m2.py b/errant/commands/parallel_to_m2.py new file mode 100644 index 0000000..c7cc091 --- /dev/null +++ b/errant/commands/parallel_to_m2.py @@ -0,0 +1,92 @@ +import argparse +from contextlib import ExitStack +import errant + +def main(): + # Parse command line args + args = parse_args() + print("Loading resources...") + # Load Errant + annotator = errant.load("en") + # Open output m2 file + out_m2 = open(args.out, "w") + + print("Processing parallel files...") + # Process an arbitrary number of files line by line simultaneously. 
Python 3.3+ + # See https://tinyurl.com/y4cj4gth + with ExitStack() as stack: + in_files = [stack.enter_context(open(i)) for i in [args.orig]+args.cor] + # Process each line of all input files + for line in zip(*in_files): + # Get the original and all the corrected texts + orig = line[0].strip() + cors = line[1:] + # Skip the line if orig is empty + if not orig: continue + # Parse orig with spacy + orig = annotator.parse(orig, args.tok) + # Write orig to the output m2 file + out_m2.write(" ".join(["S"]+[token.text for token in orig])+"\n") + # Loop through the corrected texts + for cor_id, cor in enumerate(cors): + cor = cor.strip() + # If the texts are the same, write a noop edit + if orig.text.strip() == cor: + out_m2.write(noop_edit(cor_id)+"\n") + # Otherwise, do extra processing + else: + # Parse cor with spacy + cor = annotator.parse(cor, args.tok) + # Align the texts and extract and classify the edits + edits = annotator.annotate(orig, cor, args.lev, args.merge) + # Loop through the edits + for edit in edits: + # Write the edit to the output m2 file + out_m2.write(edit.to_m2(cor_id)+"\n") + # Write a newline when we have processed all corrections for each line + out_m2.write("\n") + +# Parse command line args +def parse_args(): + parser=argparse.ArgumentParser( + description="Align parallel text files and extract and classify the edits.\n", + formatter_class=argparse.RawTextHelpFormatter, + usage="%(prog)s [-h] [options] -orig ORIG -cor COR [COR ...] -out OUT") + parser.add_argument( + "-orig", + help="The path to the original text file.", + required=True) + parser.add_argument( + "-cor", + help="The paths to >= 1 corrected text files.", + nargs="+", + default=[], + required=True) + parser.add_argument( + "-out", + help="The output filepath.", + required=True) + parser.add_argument( + "-tok", + help="Word tokenise the text using spacy (default: False).", + action="store_true") + parser.add_argument( + "-lev", + help="Align using standard Levenshtein (default: False).", + action="store_true") + parser.add_argument( + "-merge", + help="Choose a merging strategy for automatic alignment.\n" + "rules: Use a rule-based merging strategy (default)\n" + "all-split: Merge nothing: MSSDI -> M, S, S, D, I\n" + "all-merge: Merge adjacent non-matches: MSSDI -> M, SSDI\n" + "all-equal: Merge adjacent same-type non-matches: MSSDI -> M, SS, D, I", + choices=["rules", "all-split", "all-merge", "all-equal"], + default="rules") + args=parser.parse_args() + return args + +# Input: A coder id +# Output: A noop edit; i.e. 
text contains no edits +def noop_edit(id=0): + return "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||"+str(id) \ No newline at end of file diff --git a/errant/edit.py b/errant/edit.py new file mode 100644 index 0000000..8c29096 --- /dev/null +++ b/errant/edit.py @@ -0,0 +1,56 @@ +# ERRANT edit class +class Edit: + + # Input 1: An original text string parsed by spacy + # Input 2: A corrected text string parsed by spacy + # Input 3: A token span edit list: [o_start, o_end, c_start, c_end] + # Input 4: An error type string, if known + def __init__(self, orig, cor, edit, type="NA"): + # Orig offsets, spacy tokens and string + self.o_start = edit[0] + self.o_end = edit[1] + self.o_toks = orig[self.o_start:self.o_end] + self.o_str = self.o_toks.text if self.o_toks else "" + # Cor offsets, spacy tokens and string + self.c_start = edit[2] + self.c_end = edit[3] + self.c_toks = cor[self.c_start:self.c_end] + self.c_str = self.c_toks.text if self.c_toks else "" + # Error type + self.type = type + + # Minimise the edit; e.g. [a b -> a c] = [b -> c] + def minimise(self): + # While the first token is the same on both sides + while self.o_toks and self.c_toks and \ + self.o_toks[0].text == self.c_toks[0].text: + # Remove that token from the span, and adjust the start offsets + self.o_toks = self.o_toks[1:] + self.c_toks = self.c_toks[1:] + self.o_start += 1 + self.c_start += 1 + # Do the same for the last token + while self.o_toks and self.c_toks and \ + self.o_toks[-1].text == self.c_toks[-1].text: + self.o_toks = self.o_toks[:-1] + self.c_toks = self.c_toks[:-1] + self.o_end -= 1 + self.c_end -= 1 + # Update the strings + self.o_str = self.o_toks.text if self.o_toks else "" + self.c_str = self.c_toks.text if self.c_toks else "" + return self + + # Input: An id for the annotator + # Output: An edit string formatted for an M2 file + def to_m2(self, id=0): + span = " ".join(["A", str(self.o_start), str(self.o_end)]) + cor_toks_str = " ".join([tok.text for tok in self.c_toks]) + return "|||".join([span, self.type, cor_toks_str, "REQUIRED", "-NONE-", str(id)]) + + # Edit object string representation + def __str__(self): + orig = "Orig: "+str([self.o_start, self.o_end, self.o_str]) + cor = "Cor: "+str([self.c_start, self.c_end, self.c_str]) + type = "Type: "+repr(self.type) + return ", ".join([orig, cor, type]) \ No newline at end of file diff --git a/errant/en/__init__.py b/errant/en/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/errant/en/classifier.py b/errant/en/classifier.py new file mode 100644 index 0000000..fa972f9 --- /dev/null +++ b/errant/en/classifier.py @@ -0,0 +1,426 @@ +from difflib import SequenceMatcher +from nltk.stem import LancasterStemmer +from pathlib import Path +import spacy.parts_of_speech as POS + +# Load Hunspell word list +def load_word_list(path): + with open(path) as word_list: + return set([word.strip() for word in word_list]) + +# Load Universal Dependency POS Tags map file. +# https://universaldependencies.org/tagset-conversion/en-penn-uposf.html +def load_pos_map(path): + map_dict = {} + with open(path) as map_file: + for line in map_file: + line = line.strip().split("\t") + # Change ADP to PREP for readability + if line[1].strip() == "ADP": map_dict[line[0]] = "PREP" + # Also change PROPN to NOUN; we don't need a prop noun tag + elif line[1].strip() == "PROPN": map_dict[line[0]] = "NOUN" + # Otherwise + else: map_dict[line[0]] = line[1].strip() + # Add some spacy PTB tags not in the original mapping. 
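+    # These are spacy-specific tags; most of them map to X (other)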
+ map_dict['""'] = "PUNCT" + map_dict["SP"] = "SPACE" + map_dict["ADD"] = "X" + map_dict["GW"] = "X" + map_dict["NFP"] = "X" + map_dict["XX"] = "X" + return map_dict + +# Classifier resources +base_dir = Path(__file__).resolve().parent +# Spacy +nlp = None +# Lancaster Stemmer +stemmer = LancasterStemmer() +# GB English word list (inc -ise and -ize) +spell = load_word_list(base_dir/"resources"/"en_GB-large.txt") +# Part of speech map file +pos_map = load_pos_map(base_dir/"resources"/"en-ptb_map") +# Open class coarse Spacy POS tags +open_pos1 = {POS.ADJ, POS.ADV, POS.NOUN, POS.VERB} +# Open class coarse Spacy POS tags (strings) +open_pos2 = {"ADJ", "ADV", "NOUN", "VERB"} +# Rare POS tags that make uninformative error categories +rare_pos = {"INTJ", "NUM", "SYM", "X"} +# Contractions +conts = {"'d", "'ll", "'m", "n't", "'re", "'s", "'ve"} +# Special auxiliaries in contractions. +aux_conts = {"ca": "can", "sha": "shall", "wo": "will"} +# Some dep labels that map to pos tags. +dep_map = { + "acomp": "ADJ", + "amod": "ADJ", + "advmod": "ADV", + "det": "DET", + "prep": "PREP", + "prt": "PART", + "punct": "PUNCT"} + +# Input: An Edit object +# Output: The same Edit object with an updated error type +def classify(edit): + # Nothing to nothing is a detected but not corrected edit + if not edit.o_toks and not edit.c_toks: + edit.type = "UNK" + # Missing + elif not edit.o_toks and edit.c_toks: + op = "M:" + cat = get_one_sided_type(edit.c_toks) + edit.type = op+cat + # Unnecessary + elif edit.o_toks and not edit.c_toks: + op = "U:" + cat = get_one_sided_type(edit.o_toks) + edit.type = op+cat + # Replacement and special cases + else: + # Same to same is a detected but not corrected edit + if edit.o_str == edit.c_str: + edit.type = "UNK" + # Special: Ignore case change at the end of multi token edits + # E.g. [Doctor -> The doctor], [, since -> . Since] + # Classify the edit as if the last token wasn't there + elif edit.o_toks[-1].lower == edit.c_toks[-1].lower and \ + (len(edit.o_toks) > 1 or len(edit.c_toks) > 1): + # Store a copy of the full orig and cor toks + all_o_toks = edit.o_toks[:] + all_c_toks = edit.c_toks[:] + # Truncate the instance toks for classification + edit.o_toks = edit.o_toks[:-1] + edit.c_toks = edit.c_toks[:-1] + # Classify the truncated edit + edit = classify(edit) + # Restore the full orig and cor toks + edit.o_toks = all_o_toks + edit.c_toks = all_c_toks + # Replacement + else: + op = "R:" + cat = get_two_sided_type(edit.o_toks, edit.c_toks) + edit.type = op+cat + return edit + +# Input: Spacy tokens +# Output: A list of token, pos and dep tag strings +def get_edit_info(toks): + str = [] + pos = [] + dep = [] + for tok in toks: + str.append(tok.text) + pos.append(pos_map[tok.tag_]) + dep.append(tok.dep_) + return str, pos, dep + +# Input: Spacy tokens +# Output: An error type string based on input tokens from orig or cor +# When one side of the edit is null, we can only use the other side +def get_one_sided_type(toks): + # Extract strings, pos tags and parse info from the toks + str_list, pos_list, dep_list = get_edit_info(toks) + + # Special cases + if len(toks) == 1: + # Possessive noun suffixes; e.g. ' -> 's + if toks[0].tag_ == "POS": + return "NOUN:POSS" + # Contractions. 
Rule must come after possessive + if toks[0].lower_ in conts: + return "CONTR" + # Infinitival "to" is treated as part of a verb form + if toks[0].lower_ == "to" and toks[0].pos == POS.PART and \ + toks[0].dep_ != "prep": + return "VERB:FORM" + # Auxiliary verbs + if set(dep_list).issubset({"aux", "auxpass"}): + return "VERB:TENSE" + # POS-based tags. Ignores rare, uninformative categories + if len(set(pos_list)) == 1 and pos_list[0] not in rare_pos: + return pos_list[0] + # More POS-based tags using special dependency labels + if len(set(dep_list)) == 1 and dep_list[0] in dep_map.keys(): + return dep_map[dep_list[0]] + # To-infinitives and phrasal verbs + if set(pos_list) == {"PART", "VERB"}: + return "VERB" + # Tricky cases + else: + return "OTHER" + +# Input 1: Spacy orig tokens +# Input 2: Spacy cor tokens +# Output: An error type string based on orig AND cor +def get_two_sided_type(o_toks, c_toks): + # Extract strings, pos tags and parse info from the toks as lists + orig_str, orig_pos, orig_dep = get_edit_info(o_toks) + cor_str, cor_pos, cor_dep = get_edit_info(c_toks) + + # Orthography; i.e. whitespace and/or case errors. + if only_orth_change(orig_str, cor_str): + return "ORTH" + # Word Order; only matches exact reordering. + if exact_reordering(orig_str, cor_str): + return "WO" + + # 1:1 replacements (very common) + if len(orig_str) == len(cor_str) == 1: + # 1. SPECIAL CASES + # Possessive noun suffixes; e.g. ' -> 's + if o_toks[0].tag_ == "POS" or c_toks[0].tag_ == "POS": + return "NOUN:POSS" + # Contraction. Rule must come after possessive. + if (orig_str[0].lower() in conts or \ + cor_str[0].lower() in conts) and \ + orig_pos == cor_pos: + return "CONTR" + # Special auxiliaries in contractions (1); e.g. ca -> can, wo -> will + # Rule was broken in V1. Turned off this fix for compatibility. +# if (orig_str[0].lower() in res_dict["aux_conts"] and \ +# cor_str[0].lower() == res_dict["aux_conts"][orig_str[0].lower()]) or \ +# (cor_str[0].lower() in res_dict["aux_conts"] and \ +# orig_str[0].lower() == res_dict["aux_conts"][cor_str[0].lower()]): +# return "CONTR" + # Special auxiliaries in contractions (2); e.g. ca -> could, wo -> should + if orig_str[0].lower() in aux_conts or \ + cor_str[0].lower() in aux_conts: + return "VERB:TENSE" + # Special: "was" and "were" are the only past tense SVA + if {orig_str[0].lower(), cor_str[0].lower()} == {"was", "were"}: + return "VERB:SVA" + + # 2. SPELLING AND INFLECTION + # Only check alphabetical strings on the original side + # Spelling errors take precedence over POS errors; this rule is ordered + if orig_str[0].isalpha(): + # Check a GB English dict for both orig and lower case. + # E.g. "cat" is in the dict, but "Cat" is not. + if orig_str[0] not in spell and \ + orig_str[0].lower() not in spell: + # Check if both sides have a common lemma + if same_lemma(o_toks[0], c_toks[0]): + # Inflection; often count vs mass nouns or e.g. got vs getted + if orig_pos == cor_pos and orig_pos[0] in {"NOUN", "VERB"}: + return orig_pos[0]+":INFL" + # Unknown morphology; i.e. we cannot be more specific. + else: + return "MORPH" + # Use string similarity to detect true spelling errors. + else: + char_ratio = SequenceMatcher(None, orig_str[0], cor_str[0]).ratio() + # Ratio > 0.5 means both side share at least half the same chars. + # WARNING: THIS IS AN APPROXIMATION. + if char_ratio > 0.5: + return "SPELL" + # If ratio is <= 0.5, the error is more complex e.g. tolk -> say + else: + # If POS is the same, this takes precedence over spelling. 
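+                        # e.g. [tolk -> say] is labelled VERB rather than SPELL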
+ if orig_pos == cor_pos and \ + orig_pos[0] not in rare_pos: + return orig_pos[0] + # Tricky cases. + else: + return "OTHER" + + # 3. MORPHOLOGY + # Only ADJ, ADV, NOUN and VERB can have inflectional changes. + if same_lemma(o_toks[0], c_toks[0]) and \ + orig_pos[0] in open_pos2 and \ + cor_pos[0] in open_pos2: + # Same POS on both sides + if orig_pos == cor_pos: + # Adjective form; e.g. comparatives + if orig_pos[0] == "ADJ": + return "ADJ:FORM" + # Noun number + if orig_pos[0] == "NOUN": + return "NOUN:NUM" + # Verbs - various types + if orig_pos[0] == "VERB": + # NOTE: These rules are carefully ordered. + # Use the dep parse to find some form errors. + # Main verbs preceded by aux cannot be tense or SVA. + if preceded_by_aux(o_toks, c_toks): + return "VERB:FORM" + # Use fine PTB tags to find various errors. + # FORM errors normally involve VBG or VBN. + if o_toks[0].tag_ in {"VBG", "VBN"} or \ + c_toks[0].tag_ in {"VBG", "VBN"}: + return "VERB:FORM" + # Of what's left, TENSE errors normally involved VBD. + if o_toks[0].tag_ == "VBD" or c_toks[0].tag_ == "VBD": + return "VERB:TENSE" + # Of what's left, SVA errors normally involve VBZ. + if o_toks[0].tag_ == "VBZ" or c_toks[0].tag_ == "VBZ": + return "VERB:SVA" + # Any remaining aux verbs are called TENSE. + if orig_dep[0].startswith("aux") and \ + cor_dep[0].startswith("aux"): + return "VERB:TENSE" + # Use dep labels to find some more ADJ:FORM + if set(orig_dep+cor_dep).issubset({"acomp", "amod"}): + return "ADJ:FORM" + # Adj to plural noun is usually noun number; e.g. musical -> musicals. + if orig_pos[0] == "ADJ" and c_toks[0].tag_ == "NNS": + return "NOUN:NUM" + # For remaining verb errors (rare), rely on cor_pos + if c_toks[0].tag_ in {"VBG", "VBN"}: + return "VERB:FORM" + if c_toks[0].tag_ == "VBD": + return "VERB:TENSE" + if c_toks[0].tag_ == "VBZ": + return "VERB:SVA" + # Tricky cases that all have the same lemma. + else: + return "MORPH" + # Derivational morphology. + if stemmer.stem(orig_str[0]) == stemmer.stem(cor_str[0]) and \ + orig_pos[0] in open_pos2 and \ + cor_pos[0] in open_pos2: + return "MORPH" + + # 4. GENERAL + # Auxiliaries with different lemmas + if orig_dep[0].startswith("aux") and cor_dep[0].startswith("aux"): + return "VERB:TENSE" + # POS-based tags. Some of these are context sensitive mispellings. + if orig_pos == cor_pos and orig_pos[0] not in rare_pos: + return orig_pos[0] + # Some dep labels map to POS-based tags. + if orig_dep == cor_dep and orig_dep[0] in dep_map.keys(): + return dep_map[orig_dep[0]] + # Phrasal verb particles. + if set(orig_pos+cor_pos) == {"PART", "PREP"} or \ + set(orig_dep+cor_dep) == {"prt", "prep"}: + return "PART" + # Can use dep labels to resolve DET + PRON combinations. + if set(orig_pos+cor_pos) == {"DET", "PRON"}: + # DET cannot be a subject or object. + if cor_dep[0] in {"nsubj", "nsubjpass", "dobj", "pobj"}: + return "PRON" + # "poss" indicates possessive determiner + if cor_dep[0] == "poss": + return "DET" + # Tricky cases. + else: + return "OTHER" + + # Multi-token replacements (uncommon) + # All auxiliaries + if set(orig_dep+cor_dep).issubset({"aux", "auxpass"}): + return "VERB:TENSE" + # All same POS + if len(set(orig_pos+cor_pos)) == 1: + # Final verbs with the same lemma are tense; e.g. eat -> has eaten + if orig_pos[0] == "VERB" and \ + same_lemma(o_toks[-1], c_toks[-1]): + return "VERB:TENSE" + # POS-based tags. + elif orig_pos[0] not in rare_pos: + return orig_pos[0] + # All same special dep labels. 
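+        # e.g. all tokens on both sides are parsed as det -> DET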
+ if len(set(orig_dep+cor_dep)) == 1 and \ + orig_dep[0] in dep_map.keys(): + return dep_map[orig_dep[0]] + # Infinitives, gerunds, phrasal verbs. + if set(orig_pos+cor_pos) == {"PART", "VERB"}: + # Final verbs with the same lemma are form; e.g. to eat -> eating + if same_lemma(o_toks[-1], c_toks[-1]): + return "VERB:FORM" + # Remaining edits are often verb; e.g. to eat -> consuming, look at -> see + else: + return "VERB" + # Possessive nouns; e.g. friends -> friend 's + if (orig_pos == ["NOUN", "PART"] or cor_pos == ["NOUN", "PART"]) and \ + same_lemma(o_toks[0], c_toks[0]): + return "NOUN:POSS" + # Adjective forms with "most" and "more"; e.g. more free -> freer + if (orig_str[0].lower() in {"most", "more"} or \ + cor_str[0].lower() in {"most", "more"}) and \ + same_lemma(o_toks[-1], c_toks[-1]) and \ + len(orig_str) <= 2 and len(cor_str) <= 2: + return "ADJ:FORM" + + # Tricky cases. + else: + return "OTHER" + +# Input 1: A list of original token strings +# Input 2: A list of corrected token strings +# Output: Boolean; the difference between orig and cor is only whitespace or case +def only_orth_change(o_str, c_str): + o_join = "".join(o_str).lower() + c_join = "".join(c_str).lower() + if o_join == c_join: + return True + return False + +# Input 1: A list of original token strings +# Input 2: A list of corrected token strings +# Output: Boolean; the tokens are exactly the same but in a different order +def exact_reordering(o_str, c_str): + # Sorting lets us keep duplicates. + o_set = sorted([tok.lower() for tok in o_str]) + c_set = sorted([tok.lower() for tok in c_str]) + if o_set == c_set: + return True + return False + +# Input 1: A spacy orig token +# Input 2: A spacy cor token +# Output: Boolean; the tokens have the same lemma +# Spacy only finds lemma for its predicted POS tag. Sometimes these are wrong, +# so we also consider alternative POS tags to improve chance of a match. +def same_lemma(o_tok, c_tok): + o_lemmas = [] + c_lemmas = [] + for pos in open_pos1: + # Lemmatise the lower cased form of the word. + o_lemmas.append(nlp.vocab.morphology.lemmatize( + pos, o_tok.lower, nlp.vocab.morphology.tag_map)) + c_lemmas.append(nlp.vocab.morphology.lemmatize( + pos, c_tok.lower, nlp.vocab.morphology.tag_map)) + if set(o_lemmas).intersection(set(c_lemmas)): + return True + return False + +# Input 1: An original text spacy token. +# Input 2: A corrected text spacy token. +# Output: Boolean; both tokens have a dependant auxiliary verb. +def preceded_by_aux(o_tok, c_tok): + # If the toks are aux, we need to check if they are the first aux. + if o_tok[0].dep_.startswith("aux") and c_tok[0].dep_.startswith("aux"): + # Find the parent verb + o_head = o_tok[0].head + c_head = c_tok[0].head + # Find the children of the parent + o_children = o_head.children + c_children = c_head.children + # Check the orig children. + for o_child in o_children: + # Look at the first aux... + if o_child.dep_.startswith("aux"): + # Check if the string matches o_tok + if o_child.text != o_tok[0].text: + # If it doesn't, o_tok is not first so check cor + for c_child in c_children: + # Find the first aux in cor... + if c_child.dep_.startswith("aux"): + # If that doesn't match either, neither are first aux + if c_child.text != c_tok[0].text: + return True + # Break after the first cor aux + break + # Break after the first orig aux. + break + # Otherwise, the toks are main verbs so we need to look for any aux. 
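+    # i.e. check whether both main verbs have an aux or auxpass dependant among their children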
+ else: + o_deps = [o_dep.dep_ for o_dep in o_tok[0].children] + c_deps = [c_dep.dep_ for c_dep in c_tok[0].children] + if "aux" in o_deps or "auxpass" in o_deps: + if "aux" in c_deps or "auxpass" in c_deps: + return True + return False \ No newline at end of file diff --git a/errant/en/merger.py b/errant/en/merger.py new file mode 100644 index 0000000..202239d --- /dev/null +++ b/errant/en/merger.py @@ -0,0 +1,122 @@ +from difflib import SequenceMatcher +from itertools import combinations, groupby +from re import sub +from string import punctuation +import spacy.parts_of_speech as POS +from errant.edit import Edit + +# Merger resources +open_pos = {POS.ADJ, POS.ADV, POS.NOUN, POS.VERB} + +# Input: An Alignment object +# Output: A list of Edit objects +def get_rule_edits(alignment): + edits = [] + # Split alignment into groups of M, T and rest. (T has a number after it) + for op, group in groupby(alignment.align_seq, + lambda x: x[0][0] if x[0][0] in {"M", "T"} else False): + group = list(group) + # Ignore M + if op == "M": continue + # T is always split + elif op == "T": + for seq in group: + edits.append(Edit(alignment.orig, alignment.cor, seq[1:])) + # Process D, I and S subsequence + else: + processed = process_seq(group, alignment) + # Turn the processed sequence into edits + for seq in processed: + edits.append(Edit(alignment.orig, alignment.cor, seq[1:])) + return edits + +# Input 1: A sequence of adjacent D, I and/or S alignments +# Input 2: An Alignment object +# Output: A sequence of merged/split alignments +def process_seq(seq, alignment): + # Return single alignments + if len(seq) <= 1: return seq + # Get the ops for the whole sequence + ops = [op[0] for op in seq] + # Merge all D xor I ops. (95% of human multi-token edits contain S). + if set(ops) == {"D"} or set(ops) == {"I"}: return merge_edits(seq) + + content = False # True if edit includes a content word + # Get indices of all start-end combinations in the seq: 012 = 01, 02, 12 + combos = list(combinations(range(0, len(seq)), 2)) + # Sort them starting with largest spans first + combos.sort(key = lambda x: x[1]-x[0], reverse=True) + # Loop through combos + for start, end in combos: + # Ignore ranges that do NOT contain a substitution. + if "S" not in ops[start:end+1]: continue + # Get the tokens in orig and cor. They will now never be empty. + o = alignment.orig[seq[start][1]:seq[end][2]] + c = alignment.cor[seq[start][3]:seq[end][4]] + # Merge possessive suffixes: [friends -> friend 's] + if o[-1].tag_ == "POS" or c[-1].tag_ == "POS": + return process_seq(seq[:end-1], alignment) + \ + merge_edits(seq[end-1:end+1]) + \ + process_seq(seq[end+1:], alignment) + # Case changes + if o[-1].lower == c[-1].lower: + # Merge first token I or D: [Cat -> The big cat] + if start == 0 and (len(o) == 1 and c[0].text[0].isupper()) or \ + (len(c) == 1 and o[0].text[0].isupper()): + return merge_edits(seq[start:end+1]) + \ + process_seq(seq[end+1:], alignment) + # Merge with previous punctuation: [, we -> . We], [we -> . 
We] + if (len(o) > 1 and is_punct(o[-2])) or \ + (len(c) > 1 and is_punct(c[-2])): + return process_seq(seq[:end-1], alignment) + \ + merge_edits(seq[end-1:end+1]) + \ + process_seq(seq[end+1:], alignment) + # Merge whitespace/hyphens: [acat -> a cat], [sub - way -> subway] + s_str = sub("['-]", "", "".join([tok.lower_ for tok in o])) + t_str = sub("['-]", "", "".join([tok.lower_ for tok in c])) + if s_str == t_str: + return process_seq(seq[:start], alignment) + \ + merge_edits(seq[start:end+1]) + \ + process_seq(seq[end+1:], alignment) + # Merge same POS or infinitive/phrasal verbs: + # [to eat -> eating], [watch -> look at] + pos_set = set([tok.pos for tok in o]+[tok.pos for tok in c]) + if (len(pos_set) == 1 and len(o) != len(c)) or \ + pos_set == {POS.PART, POS.VERB}: + return process_seq(seq[:start], alignment) + \ + merge_edits(seq[start:end+1]) + \ + process_seq(seq[end+1:], alignment) + # Split rules take effect when we get to smallest chunks + if end-start < 2: + # Split adjacent substitutions + if len(o) == len(c) == 2: + return process_seq(seq[:start+1], alignment) + \ + process_seq(seq[start+1:], alignment) + # Split similar substitutions at sequence boundaries + if (ops[start] == "S" and char_cost(o[0], c[0]) < 0.25) or \ + (ops[end] == "S" and char_cost(o[-1], c[-1]) < 0.25): + return process_seq(seq[:start+1], alignment) + \ + process_seq(seq[start+1:], alignment) + # Split final determiners + if end == len(seq)-1 and ((ops[-1] in {"D", "S"} and \ + o[-1].pos == POS.DET) or (ops[-1] in {"I", "S"} and \ + c[-1].pos == POS.DET)): + return process_seq(seq[:-1], alignment) + [seq[-1]] + # Set content word flag + if not pos_set.isdisjoint(open_pos): content = True + # Merge sequences that contain content words + if content: return merge_edits(seq) + else: return seq + +# Check whether token is punctuation +def is_punct(token): + return token.pos == POS.PUNCT or token.text in punctuation + +# Calculate the cost of character alignment; i.e. char similarity +def char_cost(a, b): + return 1-SequenceMatcher(None, a.text, b.text).ratio() + +# Merge the input alignment sequence to a single edit span +def merge_edits(seq): + if seq: return [("X", seq[0][1], seq[-1][2], seq[0][3], seq[-1][4])] + else: return seq \ No newline at end of file diff --git a/errant/en/resources/README.md b/errant/en/resources/README.md new file mode 100644 index 0000000..cb9958f --- /dev/null +++ b/errant/en/resources/README.md @@ -0,0 +1,20 @@ +# Resources + +## en-ptb_map + +en-ptb_map is a mapping file that converts spacy Penn Treebank (PTB) style part-of-speech (POS) tags to Universal Dependency (UD) tags. + +This file is necessary because spacy is inconsistent in how it maps fine-grained tags to coarse-grained tags and does not always follow UD guidelines. For example, spacy maps the fine-grained WDT tag (denoting a Wh-determiner such as "_which_ book") to PRON (pronoun) even though it is a determiner by definition. + +The original UD mapping file was obtained [here](http://universaldependencies.org/tagset-conversion/en-penn-uposf.html). I note that some of the mappings have changed since I originally downloaded the file (namely EX, LS and RP), and so I may update this in the future. + +Spacy also includes some custom POS tags that are not part of the original PTB tagset, so I used the recommended mapping (available [here](https://github.com/explosion/spaCy/blob/master/spacy/lang/en/tag_map.py)) in these cases. 
It is also worth mentioning that spacy occasionally updates the mapping, but this only applies that later version of spacy and not spacy 1.9 on which ERRANT currently depends. + +## en_GB-large.txt + +en_GB-large.txt is a list of valid British English words according to the latest Hunspell dictionary. + +It was obtained [here](https://sourceforge.net/projects/wordlist/files/speller/2017.08.24/). + +The specific file bundled with this release is: wordlist-en_GB-large-2017.08.24.zip. + diff --git a/resources/en-ptb_map b/errant/en/resources/en-ptb_map similarity index 100% rename from resources/en-ptb_map rename to errant/en/resources/en-ptb_map diff --git a/resources/en_GB-large.txt b/errant/en/resources/en_GB-large.txt similarity index 100% rename from resources/en_GB-large.txt rename to errant/en/resources/en_GB-large.txt diff --git a/m2_to_m2.py b/m2_to_m2.py deleted file mode 100644 index 18c32b9..0000000 --- a/m2_to_m2.py +++ /dev/null @@ -1,102 +0,0 @@ -import argparse -import os -import spacy -from nltk.stem.lancaster import LancasterStemmer -import scripts.align_text as align_text -import scripts.cat_rules as cat_rules -import scripts.toolbox as toolbox - -def main(args): - # Get base working directory. - basename = os.path.dirname(os.path.realpath(__file__)) - print("Loading resources...") - # Load Tokenizer and other resources - nlp = spacy.load("en") - # Lancaster Stemmer - stemmer = LancasterStemmer() - # GB English word list (inc -ise and -ize) - gb_spell = toolbox.loadDictionary(basename+"/resources/en_GB-large.txt") - # Part of speech map file - tag_map = toolbox.loadTagMap(basename+"/resources/en-ptb_map") - # Setup output m2 file - out_m2 = open(args.out, "w") - - print("Processing files...") - # Open the m2 file and split into sentence+edit chunks. - m2_file = open(args.m2).read().strip().split("\n\n") - for info in m2_file: - # Get the original and corrected sentence + edits for each annotator. - orig_sent, coder_dict = toolbox.processM2(info) - # Write the orig_sent to the output m2 file. - out_m2.write("S "+" ".join(orig_sent)+"\n") - # Only process sentences with edits. - if coder_dict: - # Save marked up original sentence here, if required. - proc_orig = "" - # Loop through the annotators - for coder, coder_info in sorted(coder_dict.items()): - cor_sent = coder_info[0] - gold_edits = coder_info[1] - # If there is only 1 edit and it is noop, just write it. - if gold_edits[0][2] == "noop": - out_m2.write(toolbox.formatEdit(gold_edits[0], coder)+"\n") - continue - # Markup the orig and cor sentence with spacy (assume tokenized) - # Orig is marked up only once for the first coder that needs it. - proc_orig = toolbox.applySpacy(orig_sent, nlp) if not proc_orig else proc_orig - proc_cor = toolbox.applySpacy(cor_sent, nlp) - # Loop through gold edits. - for gold_edit in gold_edits: - # Um and UNK edits (uncorrected errors) are always preserved. - if gold_edit[2] in {"Um", "UNK"}: - # Um should get changed to UNK unless using old categories. - if gold_edit[2] == "Um" and not args.old_cats: gold_edit[2] = "UNK" - out_m2.write(toolbox.formatEdit(gold_edit, coder)+"\n") - # Gold edits - elif args.gold: - # Minimise the edit; e.g. [has eaten -> was eaten] = [has -> was] - if not args.max_edits: - gold_edit = toolbox.minimiseEdit(gold_edit, proc_orig, proc_cor) - # If minimised to nothing, the edit disappears. - if not gold_edit: continue - # Give the edit an automatic error type. 
- if not args.old_cats: - cat = cat_rules.autoTypeEdit(gold_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer) - gold_edit[2] = cat - # Write the edit to the output m2 file. - out_m2.write(toolbox.formatEdit(gold_edit, coder)+"\n") - # Auto edits - if args.auto: - # Auto align the parallel sentences and extract the edits. - auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, args) - # Loop through the edits. - for auto_edit in auto_edits: - # Give each edit an automatic error type. - cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer) - auto_edit[2] = cat - # Write the edit to the output m2 file. - out_m2.write(toolbox.formatEdit(auto_edit, coder)+"\n") - # Write a newline when there are no more coders. - out_m2.write("\n") - -if __name__ == "__main__": - # Define and parse program input - parser = argparse.ArgumentParser(description="Automatically extract and/or type edits in an m2 file.", - formatter_class=argparse.RawTextHelpFormatter, - usage="%(prog)s [-h] (-auto | -gold) [options] m2 -out OUT") - parser.add_argument("m2", help="A path to an m2 file.") - type_group = parser.add_mutually_exclusive_group(required=True) - type_group.add_argument("-auto", help="Extract edits automatically.", action="store_true") - type_group.add_argument("-gold", help="Use existing edit alignments.", action="store_true") - parser.add_argument("-out", help="The output filepath.", required=True) - parser.add_argument("-max_edits", help="Do not minimise edit spans. (gold only)", action="store_true") - parser.add_argument("-old_cats", help="Do not reclassify the edits. (gold only)", action="store_true") - parser.add_argument("-lev", help="Use standard Levenshtein to align sentences.", action="store_true") - parser.add_argument("-merge", choices=["rules", "all-split", "all-merge", "all-equal"], default="rules", - help="Choose a merging strategy for automatic alignment.\n" - "rules: Use a rule-based merging strategy (default)\n" - "all-split: Merge nothing; e.g. MSSDI -> M, S, S, D, I\n" - "all-merge: Merge adjacent non-matches; e.g. MSSDI -> M, SSDI\n" - "all-equal: Merge adjacent same-type non-matches; e.g. MSSDI -> M, SS, D, I") - args = parser.parse_args() - main(args) \ No newline at end of file diff --git a/parallel_to_m2.py b/parallel_to_m2.py deleted file mode 100644 index 6ee4d13..0000000 --- a/parallel_to_m2.py +++ /dev/null @@ -1,79 +0,0 @@ -import argparse -import os -import spacy -from contextlib import ExitStack -from nltk.stem.lancaster import LancasterStemmer -import scripts.align_text as align_text -import scripts.cat_rules as cat_rules -import scripts.toolbox as toolbox - -def main(args): - # Get base working directory. - basename = os.path.dirname(os.path.realpath(__file__)) - print("Loading resources...") - # Load Tokenizer and other resources - nlp = spacy.load("en") - # Lancaster Stemmer - stemmer = LancasterStemmer() - # GB English word list (inc -ise and -ize) - gb_spell = toolbox.loadDictionary(basename+"/resources/en_GB-large.txt") - # Part of speech map file - tag_map = toolbox.loadTagMap(basename+"/resources/en-ptb_map") - # Setup output m2 file - out_m2 = open(args.out, "w") - - # ExitStack lets us process an arbitrary number of files line by line simultaneously. 
- # See https://stackoverflow.com/questions/24108769/how-to-read-and-process-multiple-files-simultaneously-in-python - print("Processing files...") - with ExitStack() as stack: - in_files = [stack.enter_context(open(i)) for i in [args.orig]+args.cor] - # Process each line of all input files. - for line_id, line in enumerate(zip(*in_files)): - orig_sent = line[0].strip() - cor_sents = line[1:] - # If orig sent is empty, skip the line - if not orig_sent: continue - # Write the original sentence to the output m2 file. - out_m2.write("S "+orig_sent+"\n") - # Markup the original sentence with spacy (assume tokenized) - proc_orig = toolbox.applySpacy(orig_sent.split(), nlp) - # Loop through the corrected sentences - for cor_id, cor_sent in enumerate(cor_sents): - cor_sent = cor_sent.strip() - # Identical sentences have no edits, so just write noop. - if orig_sent == cor_sent: - out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||"+str(cor_id)+"\n") - # Otherwise, do extra processing. - else: - # Markup the corrected sentence with spacy (assume tokenized) - proc_cor = toolbox.applySpacy(cor_sent.strip().split(), nlp) - # Auto align the parallel sentences and extract the edits. - auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, args) - # Loop through the edits. - for auto_edit in auto_edits: - # Give each edit an automatic error type. - cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer) - auto_edit[2] = cat - # Write the edit to the output m2 file. - out_m2.write(toolbox.formatEdit(auto_edit, cor_id)+"\n") - # Write a newline when we have processed all corrections for a given sentence. - out_m2.write("\n") - -if __name__ == "__main__": - # Define and parse program input - parser = argparse.ArgumentParser(description="Convert parallel original and corrected text files (1 sentence per line) into M2 format.\nThe default uses Damerau-Levenshtein and merging rules and assumes tokenized text.", - formatter_class=argparse.RawTextHelpFormatter, - usage="%(prog)s [-h] [options] -orig ORIG -cor COR [COR ...] -out OUT") - parser.add_argument("-orig", help="The path to the original text file.", required=True) - parser.add_argument("-cor", help="The paths to >= 1 corrected text files.", nargs="+", default=[], required=True) - parser.add_argument("-out", help="The output filepath.", required=True) - parser.add_argument("-lev", help="Use standard Levenshtein to align sentences.", action="store_true") - parser.add_argument("-merge", choices=["rules", "all-split", "all-merge", "all-equal"], default="rules", - help="Choose a merging strategy for automatic alignment.\n" - "rules: Use a rule-based merging strategy (default)\n" - "all-split: Merge nothing; e.g. MSSDI -> M, S, S, D, I\n" - "all-merge: Merge adjacent non-matches; e.g. MSSDI -> M, SSDI\n" - "all-equal: Merge adjacent same-type non-matches; e.g. MSSDI -> M, SS, D, I") - args = parser.parse_args() - # Run the program. - main(args) \ No newline at end of file diff --git a/readme.md b/readme.md deleted file mode 100644 index 26240bd..0000000 --- a/readme.md +++ /dev/null @@ -1,345 +0,0 @@ -# ERRANT - -This repository contains the grammatical ERRor ANnotation Toolkit (ERRANT) described in: - -> Christopher Bryant, Mariano Felice, and Ted Briscoe. 2017. [**Automatic annotation and evaluation of Error Types for Grammatical Error Correction**](http://aclweb.org/anthology/P/P17/P17-1074.pdf). 
In Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). Vancouver, Canada. - -> Mariano Felice, Christopher Bryant, and Ted Briscoe. 2016. [**Automatic extraction of learner errors in esl sentences using linguistically enhanced alignments**](http://aclweb.org/anthology/C/C16/C16-1079.pdf). In Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers. Osaka, Japan. - -If you make use of this code, please cite the above papers. - -# Overview - -The main aim of ERRANT is to automatically annotate parallel English sentences with error type information. Specifically, given an original and corrected sentence pair, ERRANT will extract the edits that transform the former to the latter and then classify them according to a rule-based error type framework. This can be used to standardise parallel datasets or facilitate detailed error type evaluation. The annotated output file is in M2 format and an evaluation script is provided. - -### Example: -**Original**: This are gramamtical sentence . -**Corrected**: This is a grammatical sentence . -**Output M2**: -S This are gramamtical sentence . -A 1 2|||R:VERB:SVA|||is|||REQUIRED|||-NONE-|||0 -A 2 2|||M:DET|||a|||REQUIRED|||-NONE-|||0 -A 2 3|||R:SPELL|||grammatical|||REQUIRED|||-NONE-|||0 -A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||1 - -In M2 format, a line preceded by S denotes an original sentence while a line preceded by A indicates an edit annotation. Each edit line consists of the start and end token offset of the edit, the error type, and the tokenized correction string. The next two fields are included for historical reasons (see the CoNLL-2014 shared task) while the last field is the annotator id. - -A "noop" edit is a special kind of edit that explicitly indicates an annotator/system made no changes to the original sentence. If there is only one annotator, noop edits are optional, otherwise a noop edit should be included whenever at least 1 out of n annotators considered the original sentence to be correct. This is something to be aware of when combining individual m2 files, as missing noops can affect results. - -# Pre-requisites - -We only support Python 3. It is safest to install everything in a clean [virtualenv](https://docs.python-guide.org/dev/virtualenvs/#lower-level-virtualenv). - -## spaCy - -spaCy is a natural language processing (NLP) toolkit available here: https://spacy.io/. - -It can be installed for Python 3 as follows: -``` -pip3 install -U spacy==1.9.0 -python3 -m spacy download en -``` -This installs both spaCy itself and the default English language model. We do not recommend spaCy 2.0 at this time because it is slower and less compatible with ERRANT. More information on how to install spaCy can be found on its website. We used spaCy 1.7.3 in our original paper. - -## NLTK - -NLTK is another well-known NLP library: http://www.nltk.org/. We use it only for the Lancaster Stemmer. - -It can be installed for Python 3 as follows: -``` -pip3 install -U nltk -``` - -# Usage - -Three main scripts are provided with ERRANT: `parallel_to_m2.py`, `m2_to_m2.py` and `compare_m2.py`. - -1. `parallel_to_m2.py` - - Extract and classify edits from parallel sentences automatically. This is the simplest annotation script, which requires an original text file, at least one corrected text file, and an output filename. The original and corrected text files must have one sentence per line and be word tokenized. 
- Example: - ``` - python3 parallel_to_m2.py -orig -cor [ ...] -out - ``` - -2. `m2_to_m2.py` - - This is a more sophisticated version of `parallel_to_m2.py` that operates on an m2 file instead of parallel text files. This makes it easier to process multiple sets of corrections simultaneously. In addition to an input m2 file, you must also specify whether you want to use gold or auto edits: `-gold` will only classify the existing edits, while `-auto` will extract and classify edits automatically. In both settings, uncorrected edits and noops are preserved in the original input file. - Example: - ``` - python3 m2_to_m2.py {-auto|-gold} m2_file -out - ``` - -3. `compare_m2.py` - - This is the script to evaluate a hypothesis m2 file against a reference m2 file. The default behaviour evaluates the hypothesis overall in terms of correction. The `-cat {1,2,3}` flag is used to evaluate error types at increasing levels of granularity while the `-ds` or `-dt` flag is used to evaluate in terms of span-based or token-based detection (i.e. ignoring the correction). All scores are presented in terms of Precision, Recall and F-score (default: F0.5), and counts for True Positives (TP), False Positives (FP) and False Negatives (FN) are also shown. - Examples: - ``` - python3 compare_m2.py -hyp -ref - python3 compare_m2.py -hyp -ref -cat {1,2,3} - python3 compare_m2.py -hyp -ref -ds - python3 compare_m2.py -hyp -ref -ds -cat {1,2,3} - ``` - -All these scripts also have additional advanced command line options which can be displayed using the `-h` flag. - -#### Runtime - -In terms of speed, ERRANT processes ~70 sents/sec in the fully automatic edit extraction and classification setting, but ~350 sents/sec in the classification setting alone. These figures were calculated on an Intel Xeon E5-2630 v4 @ 2.20GHz machine, but results will vary depending on how different the original and corrected sentences are. - -# Edit Extraction - -For more information about the edit extraction phase of annotation, we refer the reader to the following paper: - -> Mariano Felice, Christopher Bryant, and Ted Briscoe. 2016. [**Automatic extraction of learner errors in esl sentences using linguistically enhanced alignments**](http://aclweb.org/anthology/C/C16/C16-1079.pdf). In Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers. Osaka, Japan. - -Note that ERRANT has been updated since the release of this paper and that the alignment cost and merging rules have also changed slightly. See `scripts/align_text.py` for more information. - -# Error Type Classification - -A brief description of some of the rules used to classify error types is described in Section 3.1 of the ERRANT paper. In this section, we describe all the rules in more detail. Although we present the rules for each error type individually, the reader should be aware that some rules interract and there are several constraints on rule order. Rule order is discussed at the end of this section. - -## Operation Tier - -All edits are minimally classified in terms of edit operation, i.e. Missing, Replacement or Unnecessary, depending on whether tokens are inserted, substituted or deleted respectively. - -| Type | Form -|-------------|-------- -| Missing | Ø -> B -| Replacement | A -> B -| Unnecessary | A -> Ø - -A special case concerns edits such as [Man -> The man] or [The man -> Man]. While these look like substitution edits, the main intention of the edit is actually to insert or delete a word. 
We hence treat them as such and ignore the case change. They are detected by the following rule: - -* The number of tokens on each side of the edit is not equal, the lower cased form of the last token is the same, and removing the last token on both sides results in an empty string on one side. - -Finally, any gold edit that changes A -> A or Ø -> Ø is labelled Unknown (UNK), since it ultimately has no effect on the text. These are normally gold edits that humans detected, but were unable or unsure how to correct. UNK edits are analogous to *Um* (Unclear Meaning) edits in the NUCLE framework. - -## Token Tier - -### Part-Of-Speech - -POS-based error types are assigned based primarily on the POS tags of the edited tokens according to the [Stanford Universal Dependency](http://universaldependencies.org/tagset-conversion/en-penn-uposf.html) framework. These tags are sometimes too detailed for error annotation however, so we do not use: interjections (INTJ), numerals (NUM), symbols (SYM) or other (X). We also renamed adpositions (ADP) to prepositions (PREP) and treat proper nouns (PROPN) as regular nouns (NOUN). - -In the majority of cases, an edit may be assigned a POS error category if it meets the following condition: - -* All tokens on both sides of the edit have the same POS tag and do not meet the criteria for a more specific type. - -This is not always sufficient however, and so we also make use of other information to determine certain POS-based edits. For example, there are several dependency parse labels that map to specific parts-of-speech. - -| Dep Label | POS | -|-----------|-------| -| acomp | ADJ | -| amod | ADJ | -| advmod | ADV | -| det | DET | -| prep | PREP | -| prt | PART | -| punct | PUNCT | - -* The tokens on both sides of the edit may have different POS tags but they all have the same dep label which appears in the above table. - -Finally, there are also several special cases of POS error types: - -#### VERB -* All tokens on both sides of the edit are either PART or VERB and the last token on each side has a different lemma; e.g. [*to eat* -> Ø], [*consuming* -> *to eat*], [*look at* -> *see*] - -#### PART -* There is exactly one token on both sides of the edit and the combined set of POS tags is PREP and PART or the combined set of dep labels is *prep* and *prt*; e.g. [(look) *at* -> (look) *for*]. - -#### DET/PRON -* There is exactly one token on both sides of the edit and the combined set of POS tags is DET and PRON. If the corrected token dep label is *poss*, this is a possessive determiner which means the edit is DET. If the corrected dep label is *nsubj*, *nsubjpass*, *dobj* or *pobj* however, the edit is PRON because determiners cannot be subjects or objects. - -#### PUNCT -* The lower cased form of the last token on both sides of the edit is the same and all remaining tokens are punctuation; e.g. [*. Because* -> *, because*] - -### Contractions: CONTR - -* At least one side of the edit is a contraction (*'d*, *'ll*, *'m*, *n't*, *'re*, *'s*, or *'ve*), there is not more than one token on both sides of the edit and all tokens have the same POS. - -During auto alignment, contractions may get separated from the word they depend on; e.g. [*did n't* -> *did not*] becomes [*n't* -> *not*]. *ca n't*, *wo n't* and *sha n't* are special cases where *can*, *will* and *shall* are shortened to *ca*, *wo* and *sha*. 
To prevent them being flagged as spelling errors, they are handled by a separate rule: - -* There is exactly one token on both sides of the edit and they are *ca* and *can*, *wo* and *will*, or *sha* and *shall*. - -### Morphology: MORPH - -* There is exactly one token on both sides of the edit and the tokens have the same lemma or stem, but nothing else in common. - -The morphology category captures derivational morphology errors, e.g. [*quick* (ADJ) -> *quickly* (ADV)], and cases where the POS tagger makes a mistake; e.g. [*catch* (NOUN) -> *catching* (VERB)] - -### Other: OTHER - -Edits that are not captured by any rules are classified as OTHER. They are typically edits such as [*at* (PREP) -> *the* (DET)], which have perhaps been improperly aligned, or else multi-token edits such as [*at his best* -> *well*] which are much more semantic in nature. - -### Orthography: ORTH - -* The lower cased form of both sides of the edit with all whitespace removed results in the same string; e.g. [*firstly* -> *Firstly*], [*bestfriend* -> *best friend*]. - -Although the definition of orthography can be quite broad, we use it here to refer only to edits that involve case and/or whitespace changes. - -### Spelling: SPELL - -We use the latest [British English Hunspell dictionary word list](https://sourceforge.net/projects/wordlist/files/speller/2017.01.22/) to identify spelling errors. Alternative English dictionaries can also be used. It is assumed humans did not misspell their corrections. - -Spelling edits must meet the following conditions: - -1. There is exactly one token on both sides of the edit. -2. The original token is entirely alphabetical (no numbers or punctuation). -3. The original token is not in the dictionary. -4. The lower cased original token is not in the dictionary. -5. The original and corrected tokens do not have the same lemma. -6. The original and corrected tokens share at least 50% of the same characters in the same relative order. - -We check the dictionary twice because casing produces false positives. For example *Cat* is not in the dictionary but *cat* is; we do not want to call *Cat* a spelling error however if the correction is [*Cat* -> *Cats*]. It's also worth noting some words require upper case to be valid; e.g. *iPhone*. - -The character comparison condition is an approximation. In general, spelling errors involve tokens that have very similar original and corrected forms. This is not always the case however, and there are also edits such as [*greatful* -> *pleased*]. While *greatful* is a misspelling of *grateful*, the correction ultimately replaces it entirely with a synonym. It hence seems more appropriate to call this a replacement adjective error rather than a spelling error: - -* The original token meets criteria 1-5, but not 6. If both sides of the edit have the same POS tag, use that as the error type, otherwise OTHER. - -### Word Order: WO - -* Alphabetically sorted lists of lower cased tokens on both sides of the edit are identical; e.g. [*house white* -> *white house*] - -Sorted lists are used instead of sets as sets do not allow duplicates. We also investigated relaxing the exact-match constraint to allow majority-matches, e.g. [*I saw the man* -> *The man saw me*], but ultimately preferred exact matches. - -## Morphology Tier - -### Adjective Form: ADJ:FORM - -* There is exactly one token on both sides of the edit and both tokens have the same lemma. The tokens themselves are either both ADJ or parsed as *acomp* or *amod*; e.g. [*big* -> *biggest*]. 
- -A second rule captures multi-token adjective form errors: - -* There are no more than two tokens on both sides of the edit, the first token on either side is *more* or *most* and the last token on both sides has the same lemma; e.g. [*more big* -> *bigger*]. - -### Noun Inflection: NOUN:INFL - -Noun inflection errors are usually count-mass noun errors, e.g. [*advices* -> *advice*], but also include cases such as [*countrys* -> *countries*] and [*childs* -> *children*]. They are a special kind of non-word error that must meet the following criteria: - -1. There is exactly one token on both sides of the edit. -2. The original token is entirely alphabetical (no numbers or punctuation). -3. The original token is not in the dictionary. -4. The lower cased original token is not in the dictionary. -5. The original and corrected tokens have the same lemma. -6. The original and corrected tokens are both NOUN. - -### Noun Number: NOUN:NUM - -* There is exactly one token on both sides of the edit, both tokens have the same lemma and both tokens are NOUN; e.g. [*cat* -> *cats*]. - -A fairly common POS tagger error concerns nouns that look like adjectives; e.g. [*musical* -> *musicals*]. These are handled by a separate rule that also makes use of fine PTB-style POS tags. - -* There is exactly one token on both sides of the edit, both tokens have the same lemma, the original token is ADJ and the corrected token is a plural noun (NNS). - -This second rule was only found to be effective in the singular to plural direction and not the other way around. - -### Noun Possessive: NOUN:POSS - -* There is not more than one token on both sides of the edit and at least one side is given the fine PTB tag POS. In this instance, POS indicates possessive rather than part-of-speech. - -Since possessive suffixes are separated from their dependent nouns, edits such as [*teacher* -> *teacher 's*] are minimised to [Ø -> *'s*]. Multi-token possessive edits are handled by a separate rule. - -* Either the original tokens or the corrected tokens consist of the POS sequence NOUN PART and the first token on both sides has the same lemma; e.g. [*friends* -> *friend 's*]. - -### Verb Form: VERB:FORM - -Verb form errors typically involve bare infinitive, *to*-infinitive, gerund and participle forms. To give an example, any edit between members of the following set would likely be considered a verb form error: {*eat*, *to eat*, *eating*, *eaten*}. To make things more complicated, *eat* is also a non-3rd-person present tense form (e.g. *I eat food*), which is usually not a verb form error. In light of this ambiguity, we use fine PTB-style POS tags, rather than coarse Universal Dependency tags, to classify verb form errors. A verb form error must meet one of the following criteria. - -  A. The edit is a missing or unnecessary *to*, it is tagged PART and is not parsed *prep*. -  B. There is exactly one token on both sides of the edit, they both have the same lemma, are both VERB and are both preceded by a dependent auxiliary verb. -  C. There is exactly one token on both sides of the edit, they both have the same lemma, are both VERB and at least one is a gerund (VBG) or past participle (VBN). -  D. There is exactly one token on both sides of the edit, they both have the same lemma, do not have the same POS tag, but the corrected token is a gerund (VBG) or past participle (VBN). -  E. All the tokens on both sides of the edit are PART or VERB and the last tokens have the same lemma. - -#### Explanation - -  A. 
We treat infinitival *to* as part of a verb form; e.g. [Ø -> *to*].
-  B. In a verb phrase, tense and agreement fall on the first auxiliary, if any. Consequently, if both edited verbs are preceded by auxiliaries, they can only be form errors; e.g. [(has) *eating* -> (has) *eaten*], [(have) *be* (sleeping) -> (have) *been* (sleeping)].
-  C. In general, we found that edits with a VBG or VBN on one side were form errors.
-  D. If the POS tags are different, we rely only on the corrected POS tag.
-  E. Multi-token form errors typically involve infinitival *to*; e.g. [*to eat* -> *eating*].
-
-### Verb Inflection: VERB:INFL
-
-Verb inflection errors are classified in a similar manner to noun inflection errors. Examples include [*getted* -> *got*], [*danceing* -> *dancing*] and [*fliped* -> *flipped*]. They must meet the following criteria:
-
-1. There is exactly one token on both sides of the edit.
-2. The original token is entirely alphabetical (no numbers or punctuation).
-3. The original token is not in the dictionary.
-4. The lower cased original token is not in the dictionary.
-5. The original and corrected tokens have the same lemma.
-6. The original and corrected tokens are both VERB.
-
-### Subject-Verb Agreement: VERB:SVA
-
-SVA errors must meet one of the following criteria:
-
-  A. There is exactly one token on both sides of the edit and they are *was* and *were*.
-  B. There is exactly one token on both sides of the edit, they both have the same lemma, are both VERB and at least one side is a 3rd-person verb form (VBZ).
-  C. There is exactly one token on both sides of the edit, they both have the same lemma, do not have the same POS tag, but the corrected token is a 3rd-person verb form (VBZ).
-
-#### Explanation
-
-  A. *was* and *were* are the only past tense forms that have agreement morphology.
-  B. In general, we found that edits with VBZ on one side were SVA errors.
-  C. If the POS tags are different, we rely only on the corrected POS tag.
-
-### Tense: VERB:TENSE
-
-Tense errors are complicated. The simplest tense errors are inflectional, e.g. [*eat* -> *ate*], but tense can also be expressed periphrastically by means of auxiliary verbs; e.g. [*eat* -> *has eaten*]. This does not mean we can label all auxiliary verbs tense errors however, as auxiliary verbs can also be form or agreement errors; e.g. [(is) *be* (eaten) -> (is) *being* (eaten)] and [(it) *are* (eaten) -> (it) *is* (eaten)]. Consequently, errors involving auxiliary verbs are only considered tense errors if they are not already classified as form or agreement errors. They must also meet one of the following criteria:
-
-  A. All tokens are parsed as missing or unnecessary auxiliary verbs (*aux*/*auxpass*); e.g. [Ø (eaten) -> *has* (eaten)].
-  B. There is exactly one token on both sides of the edit. If one side is *ca*, the other side is not *can*; if one side is *wo*, the other side is not *will*; or if one side is *sha*, the other side is not *shall*. E.g. [*ca* (n't) -> *could* (n't)].
-  C. There is exactly one token on both sides of the edit, they both have the same lemma, are both VERB and at least one side is a past tense verb form (VBD).
-  D. There is exactly one token on both sides of the edit, they both have the same lemma, are both VERB, and are both parsed *aux* or *auxpass*.
-  E. There is exactly one token on both sides of the edit, they both have the same lemma, do not have the same POS tag, but the corrected token is a past tense verb form (VBD).
-  F. 
There is exactly one token on both sides of the edit, they do not have the same lemma, but are both parsed *aux* or *auxpass*; e.g. [*was* (eaten) -> *has* (eaten)].
-  G. All tokens on both sides of the edit are parsed *aux* or *auxpass*; e.g. [*has been* (eaten) -> *is* (eaten)].
-  H. All tokens on both sides of the edit are VERB and the last token on both sides has the same lemma; e.g. [*has eaten* -> *was eating*].
-
-#### Explanation
-
-  A. A missing or unnecessary auxiliary verb cannot be a form or agreement error, so it must be a tense error.
-  B. As mentioned previously, certain contractions require a special rule.
-  C. In general, we found that edits with VBD on one side were tense errors.
-  D. In some situations, auxiliaries might be tagged as infinitives (VB) or non-3rd-person forms (VBP). Nevertheless, if they are auxiliaries, they are likely to be tense errors.
-  E. If the POS tags are different, we rely only on the corrected POS tag.
-  F. Auxiliary verbs with different lemmas are all likely to be tense errors.
-  G. Sequences of auxiliaries on both sides of the edit are likely to be tense errors.
-  H. Multi-token edits with the same VERB lemma are likely to be inflectional-to-periphrastic tense errors or vice versa.
-
-It is worth mentioning that although auxiliaries can be further subdivided in terms of tense, aspect, mood or voice, this distinction seems too narrow for the purposes of error type classification.
-
-## Rule Order
-
-As mentioned at the start of this section, the above rules have been described in isolation when in fact they sometimes interact and so must be carefully ordered. The most complex example of this is verb morphology errors: while errors involving gerunds (VBG) or participles (VBN) are usually considered FORM, and errors involving past tense forms (VBD) are usually considered TENSE, edits such as VBG -> VBD, or vice versa, are more ambiguous (FORM or TENSE?). Similarly, SVA errors normally involve a 3rd-person form (VBZ), but there are also cases of VBZ -> VBN (SVA or FORM?). Although such cases are normally the result of a POS tagging error, we ultimately resolved this issue by manually inspecting the data to determine an order of precedence. Specifically, ambiguous errors were first considered FORM if one side was VBG or VBN, second considered SVA if one side was VBP or VBZ, and third considered TENSE if one side was VBD. In our experiments, this order seemed to produce the most reliable results, but it must still be recognised as an approximation.
-
-Ultimately, since the interactions between our rules are very difficult to describe in words, we refer the reader to the code for more information concerning rule order. Specifically, look at `getOneSidedType` and `getTwoSidedType` in `scripts/cat_rules.py`. In general, the rules presented in this section mirror the order in which they occur in these functions.
-
-# MIT License
-
-Copyright (c) 2017 Christopher Bryant, Mariano Felice
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -# Contact - -If you have any questions, suggestions or bug reports, you can contact the authors at: -christopher d0t bryant at cl.cam.ac.uk -mariano d0t felice at cl.cam.ac.uk \ No newline at end of file diff --git a/resources/readme.md b/resources/readme.md deleted file mode 100644 index 27e450b..0000000 --- a/resources/readme.md +++ /dev/null @@ -1,27 +0,0 @@ -# Resources - -## en-ptb_map - -en-ptb_map is a mapping file that converts spacy Penn Treebank (PTB) style part of speech tags to stanford universal dependency tags. - -The mapping file was obtained [here](http://universaldependencies.org/tagset-conversion/en-penn-uposf.html). - -Spacy includes some custom POS tags that were not part of the original PTB tagset. The authors of spacy suggested the following mapping for these tags: - -| PTB-Style | Universal -|-----------|-------- -| "" | PUNCT -| ADD | X -| GW | X -| NFP | X -| SP | SPACE -| XX | X - -## en_GB-large.txt - -en_GB-large.txt is a list of valid British English words according to the latest Hunspell dictionary. - -It was obtained [here](https://sourceforge.net/projects/wordlist/files/speller/2017.08.24/). - -The specific file bundled with this release is: wordlist-en_GB-large-2017.08.24.zip. - diff --git a/scripts/align_text.py b/scripts/align_text.py deleted file mode 100644 index e9b68da..0000000 --- a/scripts/align_text.py +++ /dev/null @@ -1,227 +0,0 @@ -from difflib import SequenceMatcher -from itertools import combinations, groupby -from string import punctuation -import re -import spacy.parts_of_speech as POS -import scripts.rdlextra as DL - -# Some global variables -CONTENT_POS = {POS.ADJ, POS.ADV, POS.NOUN, POS.VERB} - -### FUNCTIONS ### - -def get_opcodes(alignment): - s_start = 0 - s_end = 0 - t_start = 0 - t_end = 0 - opcodes = [] - for op in alignment: - if op[0] == "D": # Deletion - s_end += 1 - elif op[0] == "I": # Insertion - t_end += 1 - elif op[0].startswith("T"): # Transposition - # Extract number of elements involved (default is 2) - k = int(op[1:] or 2) - s_end += k - t_end += k - else: # Match or substitution - s_end += 1 - t_end += 1 - # Save - opcodes.append((op, s_start, s_end, t_start, t_end)) - # Start from here - s_start = s_end - t_start = t_end - return opcodes - -def merge_edits(edits): - if edits: - return [("X", edits[0][1], edits[-1][2], edits[0][3], edits[-1][4])] - else: - return edits - -# Input 1: Spacy source sentence -# Input 2: Spacy target sentence -# Input 3: The alignment between the 2; [e.g. M, M, S ,S M] -# Output: A list of processed edits that have been merged or split. -def get_edits(source, target, edits): - out_edits = [] - # Start: Split alignment intro groups of M, T and rest. T has a number after it. 
- for op, group in groupby(edits, lambda x: x[0][0] if x[0][0] in {"M", "T"} else False): - # Convert the generator to a list - group = list(group) - # Ignore M - if op == "M": continue - # Do not merge T - elif op == "T": out_edits.extend(group) - # Further processing required - else: out_edits.extend(process_edits(source, target, group)) - return out_edits - -# Input 1: Spacy source sentence -# Input 2: Spacy target sentence -# Input 3: A list of non-matching alignments: D, I and/or S -# Output: A list of processed edits that have been merged or split. -def process_edits(source, target, edits): - # Return single alignments - if len(edits) <= 1: return edits - # Get the ops for the whole edit sequence - ops = [op[0] for op in edits] - # Merge ops that are all D xor I. (95% of human multi-token edits contain S). - if set(ops) == {"D"} or set(ops) == {"I"}: return merge_edits(edits) - - content = False # True if edit includes a content word - # Get indices of all combinations of start and end ranges in the edits: 012 -> 01, 02, 12 - combos = list(combinations(range(0, len(edits)), 2)) - # Sort them starting with largest spans first - combos.sort(key = lambda x: x[1]-x[0], reverse=True) - # Loop through combos - for start, end in combos: - # Ignore ranges that do NOT contain a substitution. - if "S" not in ops[start:end+1]: continue - # Get the tokens in orig and cor. They will never be empty due to above rule. - s = source[edits[start][1]:edits[end][2]] - t = target[edits[start][3]:edits[end][4]] - # Possessive suffixes merged with previous token: [friends -> friend 's] - if s[-1].tag_ == "POS" or t[-1].tag_ == "POS": - return process_edits(source, target, edits[:end-1]) + merge_edits(edits[end-1:end+1]) + process_edits(source, target, edits[end+1:]) - # Case changes - if s[-1].lower_ == t[-1].lower_: - # Merge first token I or D of arbitrary length: [Cat -> The big cat] - if start == 0 and ((len(s) == 1 and t[0].text[0].isupper()) or (len(t) == 1 and s[0].text[0].isupper())): - return merge_edits(edits[start:end+1]) + process_edits(source, target, edits[end+1:]) - # Merge with previous punctuation: [, we -> . We], [we -> . 
We] - if (len(s) > 1 and is_punct(s[-2])) or (len(t) > 1 and is_punct(t[-2])): - return process_edits(source, target, edits[:end-1]) + merge_edits(edits[end-1:end+1]) + process_edits(source, target, edits[end+1:]) - # Whitespace/hyphens: [bestfriend -> best friend], [sub - way -> subway] - s_str = re.sub("['-]", "", "".join([tok.lower_ for tok in s])) - t_str = re.sub("['-]", "", "".join([tok.lower_ for tok in t])) - if s_str == t_str: - return process_edits(source, target, edits[:start]) + merge_edits(edits[start:end+1]) + process_edits(source, target, edits[end+1:]) - # POS-based merging: Same POS or infinitive/phrasal verbs: [to eat -> eating], [watch -> look at] - pos_set = set([tok.pos for tok in s]+[tok.pos for tok in t]) - if (len(pos_set) == 1 and len(s) != len(t)) or pos_set == {POS.PART, POS.VERB}: - return process_edits(source, target, edits[:start]) + merge_edits(edits[start:end+1]) + process_edits(source, target, edits[end+1:]) - # Split rules take effect when we get to smallest chunks - if end-start < 2: - # Split adjacent substitutions - if len(s) == len(t) == 2: - return process_edits(source, target, edits[:start+1]) + process_edits(source, target, edits[start+1:]) - # Similar substitutions at start or end - if (ops[start] == "S" and char_cost(s[0].text, t[0].text) < 0.25) or \ - (ops[end] == "S" and char_cost(s[-1].text, t[-1].text) < 0.25): - return process_edits(source, target, edits[:start+1]) + process_edits(source, target, edits[start+1:]) - # Split final determiners - if end == len(edits)-1 and ((ops[-1] in {"D", "S"} and s[-1].pos == POS.DET) or \ - (ops[-1] in {"I", "S"} and t[-1].pos == POS.DET)): - return process_edits(source, target, edits[:-1]) + [edits[-1]] - # Set content word flag - if not pos_set.isdisjoint(CONTENT_POS): content = True - # If all else fails, merge edits that contain content words - if content: return merge_edits(edits) - else: return edits - -# Is the token a content word? -def is_content(A): - return A.pos in CONTENT_POS - -# Check whether token is punctuation -def is_punct(token): - return token.pos == POS.PUNCT or token.text in punctuation - -# all-split: No edits are ever merged. Everything is 1:1, 1:0 or 0:1 only. -def get_edits_split(edits): - new_edits = [] - for edit in edits: - op = edit[0] - if op != "M": - new_edits.append(edit) - return new_edits - -# all-merge: Merge all adjacent edits of any operation type, except M. -def get_edits_group_all(edits): - new_edits = [] - for op, group in groupby(edits, lambda x: True if x[0] == "M" else False): - if not op: - new_edits.extend(merge_edits(list(group))) - return new_edits - -# all-equal: Merge all edits of the same operation type. -def get_edits_group_type(edits): - new_edits = [] - for op, group in groupby(edits, lambda x: x[0]): - if op != "M": - new_edits.extend(merge_edits(list(group))) - return new_edits - -# Cost is 0 if lemmas are the same, otherwise 0.499. Maximum S cost is 1.999. -# This prevents unintuitive transpositions. -def lemma_cost(A, B): - if A.lemma == B.lemma: - return 0 - else: - return 0.499 - -# Cost is 0 if POS are the same, else 0.25 if both are content, else 0.5. -# Content words more likely to align to other content words. -def pos_cost(A, B): - if A.pos == B.pos: - return 0 - elif is_content(A) and is_content(B): - return 0.25 - else: - return 0.5 - -# Calculate the cost of character alignment; i.e. char similarity -def char_cost(A, B): - return 1-SequenceMatcher(None, A, B).ratio() - -# If there is a substitution, calculate the more informative cost. 
-def token_substitution(A, B, A_extra, B_extra): - # If lower case strings are the same, don't bother checking pos etc. - # This helps catch case marking substitution errors. - if A.lower() == B.lower(): - return 0 - cost = lemma_cost(A_extra, B_extra) + pos_cost(A_extra, B_extra) + char_cost(A, B) - return cost - -# Change cost of Transpositions to be the same as Levenshtein. -def levTransposition(a,b,c,d): - return float("inf") - -# Change cost of Substitution to be the same as Levenshtein. -def levSubstitution(a,b,c,d): - return 1 - -# Input 1: A Spacy annotated original sentence. -# Input 2: A Spacy annotated corrected sentence. -# Input 3: Command line args. -# Output: A list of lists. Each sublist is an edit of the form: -# edit = [orig_start, orig_end, cat, cor, cor_start, cor_end] -def getAutoAlignedEdits(orig, cor, args): - # Get a list of strings from the spacy objects. - orig_toks = [tok.text for tok in orig] - cor_toks = [tok.text for tok in cor] - # Align using Levenshtein. - if args.lev: alignments = DL.WagnerFischer(orig_toks, cor_toks, orig, cor, substitution=levSubstitution, transposition=levTransposition) - # Otherwise, use linguistically enhanced Damerau-Levenshtein - else: alignments = DL.WagnerFischer(orig_toks, cor_toks, orig, cor, substitution=token_substitution) - # Get the alignment with the highest score. There is usually only 1 best in DL due to custom costs. - alignment = next(alignments.alignments(True)) # True uses Depth-first search. - # Convert the alignment into edits; choose merge strategy - if args.merge == "rules": edits = get_edits(orig, cor, get_opcodes(alignment)) - elif args.merge == "all-split": edits = get_edits_split(get_opcodes(alignment)) - elif args.merge == "all-merge": edits = get_edits_group_all(get_opcodes(alignment)) - elif args.merge == "all-equal": edits = get_edits_group_type(get_opcodes(alignment)) - proc_edits = [] - for edit in edits: - orig_start = edit[1] - orig_end = edit[2] - cat = "NA" # Auto edits do not have human types. - cor_start = edit[3] - cor_end = edit[4] - cor_str = " ".join(cor_toks[cor_start:cor_end]) - proc_edits.append([orig_start, orig_end, cat, cor_str, cor_start, cor_end]) - return proc_edits diff --git a/scripts/cat_rules.py b/scripts/cat_rules.py deleted file mode 100644 index 196f76d..0000000 --- a/scripts/cat_rules.py +++ /dev/null @@ -1,376 +0,0 @@ -from difflib import SequenceMatcher -from string import punctuation -import spacy.parts_of_speech as spos - -# Contractions -conts = {"'d", "'ll", "'m", "n't", "'re", "'s", "'ve"} -# Rare POS tags that make uninformative error categories -rare_tags = {"INTJ", "NUM", "SYM", "X"} -# Special auxiliaries in contractions. -special_aux1 = ({"ca", "can"}, {"sha", "shall"}, {"wo", "will"}) -special_aux2 = {"ca", "sha", "wo"} -# Open class spacy POS tag objects -open_pos = (spos.ADJ, spos.ADV, spos.NOUN, spos.VERB) -# Open class POS tags -open_tags = {"ADJ", "ADV", "NOUN", "VERB"} -# Some dep labels that map to pos tags. -dep_map = { "acomp": "ADJ", - "amod": "ADJ", - "advmod": "ADV", - "det": "DET", - "prep": "PREP", - "prt": "PART", - "punct": "PUNCT" } - -# Input 1: An edit list. [orig_start, orig_end, cat, cor, cor_start, cor_end] -# Input 2: An original SpaCy sentence. -# Input 3: A corrected SpaCy sentence. -# Input 4: A set of valid GB English words. -# Input 5: A dictionary to map PTB tags to Stanford Universal Dependency tags. -# Input 6: A preloaded spacy processing object. -# Input 7: The Lancaster stemmer in NLTK. 
-# Output: The input edit with new error tag, in M2 edit format. -def autoTypeEdit(edit, orig_sent, cor_sent, gb_spell, tag_map, nlp, stemmer): - # Get the tokens in the edit. - orig_toks = orig_sent[edit[0]:edit[1]] - cor_toks = cor_sent[edit[4]:edit[5]] - # Nothing to nothing is a detected, but not corrected edit. - if not orig_toks and not cor_toks: - return "UNK" - # Missing - elif not orig_toks and cor_toks: - op = "M:" - cat = getOneSidedType(cor_toks, tag_map) - # Unnecessary - elif orig_toks and not cor_toks: - op = "U:" - cat = getOneSidedType(orig_toks, tag_map) - # Replacement and special cases - else: - # Same to same is a detected, but not corrected edit. - if orig_toks.text == cor_toks.text: - return "UNK" - # Special: Orthographic errors at the end of multi-token edits are ignored. - # E.g. [Doctor -> The doctor], [The doctor -> Dcotor], [, since -> . Since] - # Classify the edit as if the last token weren't there. - elif orig_toks[-1].lower_ == cor_toks[-1].lower_ and \ - (len(orig_toks) > 1 or len(cor_toks) > 1): - min_edit = edit[:] - min_edit[1] -= 1 - min_edit[5] -= 1 - return autoTypeEdit(min_edit, orig_sent, cor_sent, gb_spell, tag_map, nlp, stemmer) - # Replacement - else: - op = "R:" - cat = getTwoSidedType(orig_toks, cor_toks, gb_spell, tag_map, nlp, stemmer) - return op+cat - -# Input 1: Spacy tokens -# Input 2: A map dict from PTB to universal dependency pos tags. -# Output: A list of token, pos and dep tag strings. -def getEditInfo(toks, tag_map): - str = [] - pos = [] - dep = [] - for tok in toks: - str.append(tok.text) - pos.append(tag_map[tok.tag_]) - dep.append(tok.dep_) - return str, pos, dep - -# Input 1: Spacy tokens. -# Input 2: A map dict from PTB to universal dependency pos tags. -# Output: An error type string. -# When one side of the edit is null, we can only use the other side. -def getOneSidedType(toks, tag_map): - # Extract strings, pos tags and parse info from the toks. - str_list, pos_list, dep_list = getEditInfo(toks, tag_map) - - # Special cases. - if len(toks) == 1: - # Possessive noun suffixes; e.g. ' -> 's - if toks[0].tag_ == "POS": - return "NOUN:POSS" - # Contraction. Rule must come after possessive. - if toks[0].lower_ in conts: - return "CONTR" - # Infinitival "to" is treated as part of a verb form. - if toks[0].lower_ == "to" and toks[0].pos_ == "PART" and toks[0].dep_ != "prep": - return "VERB:FORM" - # Auxiliary verbs. - if set(dep_list).issubset({"aux", "auxpass"}): - return "VERB:TENSE" - # POS-based tags. Ignores rare, uninformative categories. - if len(set(pos_list)) == 1 and pos_list[0] not in rare_tags: - return pos_list[0] - # More POS-based tags using special dependency labels. - if len(set(dep_list)) == 1 and dep_list[0] in dep_map.keys(): - return dep_map[dep_list[0]] - # To-infinitives and phrasal verbs. - if set(pos_list) == {"PART", "VERB"}: - return "VERB" - # Tricky cases - else: - return "OTHER" - -# Input 1: Original text spacy tokens. -# Input 2: Corrected text spacy tokens. -# Input 3: A set of valid GB English words. -# Input 4: A map from PTB to universal dependency pos tags. -# Input 5: A preloaded spacy processing object. -# Input 6: The Lancaster stemmer in NLTK. -# Output: An error type string. -def getTwoSidedType(orig_toks, cor_toks, gb_spell, tag_map, nlp, stemmer): - # Extract strings, pos tags and parse info from the toks. - orig_str, orig_pos, orig_dep = getEditInfo(orig_toks, tag_map) - cor_str, cor_pos, cor_dep = getEditInfo(cor_toks, tag_map) - - # Orthography; i.e. whitespace and/or case errors. 
- if onlyOrthChange(orig_str, cor_str): - return "ORTH" - # Word Order; only matches exact reordering. - if exactReordering(orig_str, cor_str): - return "WO" - - # 1:1 replacements (very common) - if len(orig_str) == len(cor_str) == 1: - # 1. SPECIAL CASES - # Possessive noun suffixes; e.g. ' -> 's - if orig_toks[0].tag_ == "POS" or cor_toks[0].tag_ == "POS": - return "NOUN:POSS" - # Contraction. Rule must come after possessive. - if (orig_str[0].lower() in conts or cor_str[0].lower() in conts) and orig_pos == cor_pos: - return "CONTR" - # Special auxiliaries in contractions (1); e.g. ca -> can - if set(orig_str[0].lower()+cor_str[0].lower()) in special_aux1: - return "CONTR" - # Special auxiliaries in contractions (2); e.g. ca -> could - if orig_str[0].lower() in special_aux2 or cor_str[0].lower() in special_aux2: - return "VERB:TENSE" - # Special: "was" and "were" are the only past tense SVA. - if {orig_str[0].lower(), cor_str[0].lower()} == {"was", "were"}: - return "VERB:SVA" - - # 2. SPELLING AND INFLECTION - # Only check alphabetical strings on the original side. - # Spelling errors take precendece over POS errors so this rule is ordered. - if orig_str[0].isalpha(): - # Check a GB English dict for both orig and lower case. - # "cat" is in the dict, but "Cat" is not. - if orig_str[0] not in gb_spell and orig_str[0].lower() not in gb_spell: - # Check if both sides have a common lemma - if sameLemma(orig_toks[0], cor_toks[0], nlp): - # Inflection; Usually count vs mass nouns or e.g. got vs getted - if orig_pos == cor_pos and orig_pos[0] in {"NOUN", "VERB"}: - return orig_pos[0]+":INFL" - # Unknown morphology; i.e. we cannot be more specific. - else: - return "MORPH" - # Use string similarity to detect true spelling errors. - else: - char_ratio = SequenceMatcher(None, orig_str[0], cor_str[0]).ratio() - # Ratio > 0.5 means both side share at least half the same chars. - # WARNING: THIS IS AN APPROXIMATION. - if char_ratio > 0.5: - return "SPELL" - # If ratio is <= 0.5, this may be a spelling+other error; e.g. tolk -> say - else: - # If POS is the same, this takes precedence over spelling. - if orig_pos == cor_pos and orig_pos[0] not in rare_tags: - return orig_pos[0] - # Tricky cases. - else: - return "OTHER" - - # 3. MORPHOLOGY - # Only ADJ, ADV, NOUN and VERB with same lemma can have inflectional changes. - if sameLemma(orig_toks[0], cor_toks[0], nlp) and \ - orig_pos[0] in open_tags and cor_pos[0] in open_tags: - # Same POS on both sides - if orig_pos == cor_pos: - # Adjective form; e.g. comparatives - if orig_pos[0] == "ADJ": - return "ADJ:FORM" - # Noun number - if orig_pos[0] == "NOUN": - return "NOUN:NUM" - # Verbs - various types - if orig_pos[0] == "VERB": - # NOTE: These rules are carefully ordered. - # Use the dep parse to find some form errors. - # Main verbs preceded by aux cannot be tense or SVA. - if precededByAux(orig_toks, cor_toks): - return "VERB:FORM" - # Use fine PTB tags to find various errors. - # FORM errors normally involve VBG or VBN. - if orig_toks[0].tag_ in {"VBG", "VBN"} or cor_toks[0].tag_ in {"VBG", "VBN"}: - return "VERB:FORM" - # Of what's left, TENSE errors normally involved VBD. - if orig_toks[0].tag_ == "VBD" or cor_toks[0].tag_ == "VBD": - return "VERB:TENSE" - # Of what's left, SVA errors normally involve VBZ. - if orig_toks[0].tag_ == "VBZ" or cor_toks[0].tag_ == "VBZ": - return "VERB:SVA" - # Any remaining aux verbs are called TENSE. 
- if orig_dep[0].startswith("aux") and cor_dep[0].startswith("aux"): - return "VERB:TENSE" - # Use dep labels to find some more ADJ:FORM - if set(orig_dep+cor_dep).issubset({"acomp", "amod"}): - return "ADJ:FORM" - # Adj to plural noun is usually a noun number error; e.g. musical -> musicals. - if orig_pos[0] == "ADJ" and cor_toks[0].tag_ == "NNS": - return "NOUN:NUM" - # For remaining verb errors (rare), rely on cor_pos - if cor_toks[0].tag_ in {"VBG", "VBN"}: - return "VERB:FORM" - # Cor VBD = TENSE - if cor_toks[0].tag_ == "VBD": - return "VERB:TENSE" - # Cor VBZ = SVA - if cor_toks[0].tag_ == "VBZ": - return "VERB:SVA" - # Tricky cases that all have the same lemma. - else: - return "MORPH" - # Derivational morphology. - if stemmer.stem(orig_str[0]) == stemmer.stem(cor_str[0]) and \ - orig_pos[0] in open_tags and cor_pos[0] in open_tags: - return "MORPH" - - # 4. GENERAL - # Auxiliaries with different lemmas - if orig_dep[0].startswith("aux") and cor_dep[0].startswith("aux"): - return "VERB:TENSE" - # POS-based tags. Some of these are context sensitive mispellings. - if orig_pos == cor_pos and orig_pos[0] not in rare_tags: - return orig_pos[0] - # Some dep labels map to POS-based tags. - if orig_dep == cor_dep and orig_dep[0] in dep_map.keys(): - return dep_map[orig_dep[0]] - # Phrasal verb particles. - if set(orig_pos+cor_pos) == {"PART", "PREP"} or set(orig_dep+cor_dep) == {"prt", "prep"}: - return "PART" - # Can use dep labels to resolve DET + PRON combinations. - if set(orig_pos+cor_pos) == {"DET", "PRON"}: - # DET cannot be a subject or object. - if cor_dep[0] in {"nsubj", "nsubjpass", "dobj", "pobj"}: - return "PRON" - # "poss" indicates possessive determiner - if cor_dep[0] == "poss": - return "DET" - # Tricky cases. - else: - return "OTHER" - - # Multi-token replacements (uncommon) - # All auxiliaries - if set(orig_dep+cor_dep).issubset({"aux", "auxpass"}): - return "VERB:TENSE" - # All same POS - if len(set(orig_pos+cor_pos)) == 1: - # Final verbs with the same lemma are tense; e.g. eat -> has eaten - if orig_pos[0] == "VERB" and sameLemma(orig_toks[-1], cor_toks[-1], nlp): - return "VERB:TENSE" - # POS-based tags. - elif orig_pos[0] not in rare_tags: - return orig_pos[0] - # All same special dep labels. - if len(set(orig_dep+cor_dep)) == 1 and orig_dep[0] in dep_map.keys(): - return dep_map[orig_dep[0]] - # Infinitives, gerunds, phrasal verbs. - if set(orig_pos+cor_pos) == {"PART", "VERB"}: - # Final verbs with the same lemma are form; e.g. to eat -> eating - if sameLemma(orig_toks[-1], cor_toks[-1], nlp): - return "VERB:FORM" - # Remaining edits are often verb; e.g. to eat -> consuming, look at -> see - else: - return "VERB" - # Possessive nouns; e.g. friends -> friend 's - if (orig_pos == ["NOUN", "PART"] or cor_pos == ["NOUN", "PART"]) and \ - sameLemma(orig_toks[0], cor_toks[0], nlp): - return "NOUN:POSS" - # Adjective forms with "most" and "more"; e.g. more free -> freer - if (orig_str[0].lower() in {"most", "more"} or cor_str[0].lower() in {"most", "more"}) and \ - sameLemma(orig_toks[-1], cor_toks[-1], nlp) and len(orig_str) <= 2 and len(cor_str) <= 2: - return "ADJ:FORM" - - # Tricky cases. - else: - return "OTHER" - -# Input 1: A list of original token strings -# Input 2: A list of corrected token strings -# Output: Boolean; the difference between the inputs is only whitespace or case. 
-def onlyOrthChange(orig_str, cor_str): - orig_join = "".join(orig_str).lower() - cor_join = "".join(cor_str).lower() - if orig_join == cor_join: - return True - return False - -# Input 1: A list of original token strings -# Input 2: A list of corrected token strings -# Output: Boolean; the tokens are exactly the same but in a different order. -def exactReordering(orig_str, cor_str): - # Sorting lets us keep duplicates. - orig_set = sorted([tok.lower() for tok in orig_str]) - cor_set = sorted([tok.lower() for tok in cor_str]) - if orig_set == cor_set: - return True - return False - -# Input 1: An original text spacy token. -# Input 2: A corrected text spacy token. -# Input 3: A spaCy processing object. -# Output: Boolean; the tokens have the same lemma. -# Spacy only finds lemma for its predicted POS tag. Sometimes these are wrong, -# so we also consider alternative POS tags to improve chance of a match. -def sameLemma(orig_tok, cor_tok, nlp): - orig_lemmas = [] - cor_lemmas = [] - for pos in open_pos: - # Pass the lower cased form of the word for lemmatization; improves accuracy. - orig_lemmas.append(nlp.vocab.morphology.lemmatize(pos, orig_tok.lower, nlp.vocab.morphology.tag_map)) - cor_lemmas.append(nlp.vocab.morphology.lemmatize(pos, cor_tok.lower, nlp.vocab.morphology.tag_map)) - if set(orig_lemmas).intersection(set(cor_lemmas)): - return True - return False - -# Input 1: An original text spacy token. -# Input 2: A corrected text spacy token. -# Output: Boolean; both tokens have a dependant auxiliary verb. -def precededByAux(orig_tok, cor_tok): - # If the toks are aux, we need to check if they are the first aux. - if orig_tok[0].dep_.startswith("aux") and cor_tok[0].dep_.startswith("aux"): - # Find the parent verb - orig_head = orig_tok[0].head - cor_head = cor_tok[0].head - # Find the children of the parent - orig_children = orig_head.children - cor_children = cor_head.children - # Check the orig children. - for orig_child in orig_children: - # Look at the first aux... - if orig_child.dep_.startswith("aux"): - # Check if the string matches orig_tok - if orig_child.text != orig_tok[0].text: - # If it doesn't, orig_tok is not the first aux so check the cor children - for cor_child in cor_children: - # Find the first aux in cor... - if cor_child.dep_.startswith("aux"): - # If that doesn't match cor_tok, there cor_tok also isnt first aux. - if cor_child.text != cor_tok[0].text: - # Therefore, both orig and cor are not first aux. - return True - # Break after the first cor aux - break - # Break after the first orig aux. - break - # Otherwise, the toks are main verbs so we need to look for any aux. - else: - orig_deps = [orig_dep.dep_ for orig_dep in orig_tok[0].children] - cor_deps = [cor_dep.dep_ for cor_dep in cor_tok[0].children] - if "aux" in orig_deps or "auxpass" in orig_deps: - if "aux" in cor_deps or "auxpass" in cor_deps: - return True - return False \ No newline at end of file diff --git a/scripts/rdlextra.py b/scripts/rdlextra.py deleted file mode 100644 index 48a12bc..0000000 --- a/scripts/rdlextra.py +++ /dev/null @@ -1,341 +0,0 @@ -# Copyright (c) 2016 Mariano Felice and Christopher Bryant -# -# This file contains an implementation of the Damerau-Levenshtein -# algorithm (restricted edit distance version) to align two sentences, -# as described in the following paper: -# -# Mariano Felice, Christopher Bryant and Ted Briscoe. 2016. -# Automatic extraction of learner errors in ESL sentences using -# linguistically enhanced alignments. 
In Proceedings of the 26th -# International Conference on Computational Linguistics (COLING 2016), -# pp. 825-835, Osaka, Japan. Japanese Association for Natural Language -# Processing. -# -# Please, cite this paper when using this script in your work. -# -# This code is based on an original implementation of the Wagner-Fischer -# algorithm by Kyle Gorman, available at: https://gist.github.com/kylebgorman/8034009 -# The original license and description are included below. -# -# This implementation adds support for token transpositions of arbitrary -# length, e.g. A B C --> B C A. -# -# ORIGINAL LICENSE: -# -# Copyright (c) 2013-2016 Kyle Gorman -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the -# "Software"), to deal in the Software without restriction, including -# without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of the Software, and to -# permit persons to whom the Software is furnished to do so, subject to -# the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# wagnerfischer.py: efficient computation of Levenshtein distance and -# all optimal alignments with arbitrary edit costs. The algorithm for -# computing the dynamic programming table used has been discovered many -# times, but is described most clearly in: -# -# R.A. Wagner & M.J. Fischer. 1974. The string-to-string correction -# problem. Journal of the ACM, 21(1): 168-173. -# -# Wagner & Fischer also describe an algorithm ("Algorithm Y") to find the -# alignment path (i.e., list of edit operations involved in the optimal -# alignment), but it it is specified such that in fact it only generates -# one such path, whereas many such paths may exist, particularly when -# multiple edit operations have the same cost. For example, when all edit -# operations have the same cost, there are two equal-cost alignments of -# "TGAC" and "GCAC": -# -# TGAC TGxAC -# ss== d=i== -# GCAC xGCAC -# -# However, all such paths can be generated efficiently, as follows. First, -# the dynamic programming table "cells" are defined as tuples of (partial -# cost, set of all operations reaching this cell with minimal cost). As a -# result, the completed table can be thought of as an unweighted, directed -# graph (or FSA). The bottom right cell (the one containing the Levenshtein -# distance) is the start state and the origin as end state. The set of arcs -# are the set of operations in each cell as arcs. (Many of the cells of the -# table, those which are not visited by any optimal alignment, are under -# the graph interpretation unconnected vertices, and can be ignored. Every -# path between the bottom right cell and the origin cell is an optimal -# alignment. These paths can be efficiently enumerated using breadth-first -# traversal. 
The trick here is that elements in deque must not only contain -# indices but also partial paths. Averaging over all such paths, we can -# come up with an estimate of the number of insertions, deletions, and -# substitutions involved as well; in the example above, we say S = 1 and -# D, I = 0.5. -# -# Thanks to Christoph Weidemann (ctw@cogsci.info), who added support for -# arbitrary cost functions. - - -import collections -import doctest -import pprint - - -# Default cost functions. - -def INSERTION(A, A_extra=None, cost=1): - return cost - -def DELETION(A, A_extra=None, cost=1): - return cost - -def SUBSTITUTION(A, B, A_extra=None, B_extra=None, cost=1): - return cost - -def TRANSPOSITION(A, B, A_extra=None, B_extra=None): - # Change to cost=float('inf') to have standard edit distance by default - # A and B should be the same length - cost = len(A) - 1 # or len(B) -1 - return cost - -Trace = collections.namedtuple("Trace", ["cost", "ops"]) - -class WagnerFischer(object): - - """ - An object representing a (set of) Levenshtein alignments between two - iterable objects (they need not be strings). The cost of the optimal - alignment is scored in `self.cost`, and all Levenshtein alignments can - be generated using self.alignments()`. - - Basic tests: - - >>> WagnerFischer("god", "gawd").cost - 2 - >>> WagnerFischer("sitting", "kitten").cost - 3 - >>> WagnerFischer("bana", "banananana").cost - 6 - >>> WagnerFischer("bana", "bana").cost - 0 - >>> WagnerFischer("banana", "angioplastical").cost - 11 - >>> WagnerFischer("angioplastical", "banana").cost - 11 - >>> WagnerFischer("Saturday", "Sunday").cost - 3 - - IDS tests: - - >>> WagnerFischer("doytauvab", "doyvautab").IDS() == {"S": 2.0} - True - >>> WagnerFischer("kitten", "sitting").IDS() == {"I": 1.0, "S": 2.0} - True - - Detect insertion vs. deletion: - - >>> thesmalldog = "the small dog".split() - >>> thebigdog = "the big dog".split() - >>> bigdog = "big dog".split() - >>> sub_inf = lambda A, B: float("inf") - - # Deletion. - >>> wf = WagnerFischer(thebigdog, bigdog, substitution=sub_inf) - >>> wf.IDS() == {"D": 1.0} - True - - # Insertion. - >>> wf = WagnerFischer(bigdog, thebigdog, substitution=sub_inf) - >>> wf.IDS() == {"I": 1.0} - True - - # Neither. - >>> wf = WagnerFischer(thebigdog, thesmalldog, substitution=sub_inf) - >>> wf.IDS() == {"I": 1.0, "D": 1.0} - True - """ - - # Initializes pretty printer (shared across all class instances). - pprinter = pprint.PrettyPrinter(width=75) - - def __init__(self, A, B, A_extra=None, B_extra=None, insertion=INSERTION, deletion=DELETION, - substitution=SUBSTITUTION, transposition=TRANSPOSITION): - # Stores cost functions in a dictionary for programmatic access. - self.costs = {"I": insertion, "D": deletion, "S": substitution, "T":transposition} - # Keep lowercased versions for transpositions - Al = [x.lower() for x in A] - Bl = [x.lower() for x in B] - # Initializes table. - self.asz = len(A) - self.bsz = len(B) - self._table = [[None for _ in range(self.bsz + 1)] for - _ in range(self.asz + 1)] - # From now on, all indexing done using self.__getitem__. - ## Fills in edges. - self[0][0] = Trace(0, {"O"}) # Start cell. - for i in range(1, self.asz + 1): - self[i][0] = Trace(self[i - 1][0].cost + self.costs["D"](A[i - 1], A_extra[i - 1] if A_extra else None), - {"D"}) - for j in range(1, self.bsz + 1): - self[0][j] = Trace(self[0][j - 1].cost + self.costs["I"](B[j - 1], B_extra[j - 1] if B_extra else None), - {"I"}) - - ## Fills in rest. 
- for i in range(len(A)): - for j in range(len(B)): - # Cleans it up in case there are more than one check for match - # first, as it is always the cheapest option. - if A[i] == B[j]: - self[i + 1][j + 1] = Trace(self[i][j].cost, {"M"}) - # Checks for other types. - else: - costD = self[i][j + 1].cost + self.costs["D"](A[i], A_extra[i] if A_extra else None) - costI = self[i + 1][j].cost + self.costs["I"](B[j], B_extra[j] if B_extra else None) - costS = self[i][j].cost + self.costs["S"](A[i], B[j], A_extra[i] if A_extra else None, B_extra[j] if B_extra else None) - costT = float("inf") # We don't know it yet - min_val = min(costI, costD, costS) - - # Multiword transpositions: - # Find a sequence of equal elements in different order - # We only need to check diagonally because we require the same number of elements - k = 1 - #while i > 0 and j > 0 and (i - k) >= 0 and (j - k) >= 0 and any(x in ["D", "I", "S"] for x in self[i-k+1][j-k+1].ops): - while i > 0 and j > 0 and (i - k) >= 0 and (j - k) >= 0 and self[i-k+1][j-k+1].cost - self[i-k][j-k].cost > 0: # An operation that has a cost (i.e. I, D or S > 0) - if collections.Counter(Al[i-k:i+1]) == collections.Counter(Bl[j-k:j+1]): - costT = self[i-k][j-k].cost + self.costs["T"](A[i-k:i+1], B[j-k:j+1], A_extra[i-k:i+1] if A_extra else None, B_extra[j-k:j+1] if B_extra else None) - min_val = min(min_val, costT) - break - k += 1 - - trace = Trace(min_val, []) # Use a list to preserve the order - # Adds _all_ operations matching minimum value. - if costD == min_val: - trace.ops.append("D") - if costI == min_val: - trace.ops.append("I") - if costS == min_val: - trace.ops.append("S") - if costT == min_val: - trace.ops.append("T" + str(k+1)) - self[i + 1][j + 1] = trace - - # Stores optimum cost as a property. - self.cost = self[-1][-1].cost - - def __repr__(self): - return self.pprinter.pformat(self._table) - - def __iter__(self): - for row in self._table: - yield row - - def __getitem__(self, i): - """ - Returns the i-th row of the table, which is a list and so - can be indexed. Therefore, e.g., self[2][3] == self._table[2][3] - """ - return self._table[i] - - # Stuff for generating alignments. - - def _stepback(self, i, j, trace, path_back): - """ - Given a cell location (i, j) and a Trace object trace, generate - all traces they point back to in the table - """ - for op in trace.ops: - if op == "M": - yield i - 1, j - 1, self[i - 1][j - 1], path_back + ["M"] - elif op == "I": - yield i, j - 1, self[i][j - 1], path_back + ["I"] - elif op == "D": - yield i - 1, j, self[i - 1][j], path_back + ["D"] - elif op == "S": - yield i - 1, j - 1, self[i - 1][j - 1], path_back + ["S"] - elif op.startswith("T"): - # Extract stepback (default is a transposition of 2 elements) - k = int(op[1:] or 2) - yield i - k, j - k, self[i - k][j - k], path_back + [op] - elif op == "O": - return # Origin cell, so we're done. - else: - raise ValueError("Unknown op {!r}".format(op)) - - def alignments(self, dfirst=False): - """ - Generate all alignments with optimal cost by traversing an - implicit graph on the dynamic programming table. Use - breadth-first traversal by default. - """ - # Each cell of the queue is a tuple of (i, j, trace, path_back) - # where i, j is the current index, trace is the trace object at - # this cell - if dfirst: - return self._dfirst_alignments() - else: - return self._bfirst_alignments() - - def _dfirst_alignments(self): - """ - Generate alignments via depth-first traversal. 
- """ - stack = list(self._stepback(self.asz, self.bsz, self[-1][-1], [])) - while stack: - (i, j, trace, path_back) = stack.pop() - if trace.ops == {"O"}: - yield path_back[::-1] - continue - stack.extend(self._stepback(i, j, trace, path_back)) - - def _bfirst_alignments(self): - """ - Generate alignments via breadth-first traversal. - """ - # Each cell of the queue is a tuple of (i, j, trace, path_back) - # where i, j is the current index, trace is the trace object at - # this cell, and path_back is a reversed list of edit operations - # which is initialized as an empty list. - queue = collections.deque(self._stepback(self.asz, self.bsz, - self[-1][-1], [])) - while queue: - (i, j, trace, path_back) = queue.popleft() - if trace.ops == {"O"}: - # We have reached the origin, the end of a reverse path, so - # yield the list of edit operations in reverse. - yield path_back[::-1] - continue - queue.extend(self._stepback(i, j, trace, path_back)) - - def IDS(self): - """ - Estimates insertions, deletions, and substitution _count_ (not - costs). Non-integer values arise when there are multiple possible - alignments with the same cost. - """ - npaths = 0 - opcounts = collections.Counter() - for alignment in self.alignments(): - # Counts edit types for this path, ignoring "M" (which is free). - opcounts += collections.Counter(op for op in alignment if op != "M") - npaths += 1 - # Averages over all paths. - return collections.Counter({o: c / npaths for (o, c) in - opcounts.items()}) - - -if __name__ == "__main__": - #doctest.testmod() - a = raw_input("A: ").split() - b = raw_input("B: ").split() - al = WagnerFischer(a, b).alignments() - for a in al: - print(a) - diff --git a/scripts/toolbox.py b/scripts/toolbox.py deleted file mode 100644 index 02b6c08..0000000 --- a/scripts/toolbox.py +++ /dev/null @@ -1,136 +0,0 @@ -from operator import itemgetter - -# Load latest Hunspell dictionaries: -def loadDictionary(path): - return set(open(path).read().split()) - -# Load Stanford Universal Tags map file. -def loadTagMap(path): - map_dict = {} - open_file = open(path).readlines() - for line in open_file: - line = line.strip().split("\t") - # Change ADP to PREP; makes it clearer - if line[1].strip() == "ADP": - map_dict[line[0]] = "PREP" - # Also change PROPN to NOUN; we don't need a prop noun tag - elif line[1].strip() == "PROPN": - map_dict[line[0]] = "NOUN" - else: - map_dict[line[0]] = line[1].strip() - # Add some spacy PTB tags not in the original mapping. - map_dict['""'] = "PUNCT" - map_dict["SP"] = "SPACE" - map_dict["ADD"] = "X" - map_dict["GW"] = "X" - map_dict["NFP"] = "X" - map_dict["XX"] = "X" - return map_dict - -# Input: A sentence + edit block in an m2 file. -# Output 1: The original sentence (a list of tokens) -# Output 2: A dictionary; key is coder id, value is a tuple. -# tuple[0] is the corrected sentence (a list of tokens), tuple[1] is the edits. -# Process M2 to extract sentences and edits. -def processM2(info): - info = info.split("\n") - orig_sent = info[0][2:].split() # [2:] ignore the leading "S " - all_edits = info[1:] - # Simplify the edits and group by coder id. - edit_dict = processEdits(all_edits) - out_dict = {} - # Loop through each coder and their edits. - for coder, edits in edit_dict.items(): - # Copy orig_sent. We will apply the edits to it to make cor_sent - cor_sent = orig_sent[:] - gold_edits = [] - offset = 0 - # Sort edits by start and end offset only. If they are the same, do not reorder. 
- edits = sorted(edits, key=itemgetter(0)) # Sort by start offset - edits = sorted(edits, key=itemgetter(1)) # Sort by end offset - for edit in edits: - # Do not apply noop or Um edits, but save them - if edit[2] in {"noop", "Um"}: - gold_edits.append(edit+[-1,-1]) - continue - orig_start = edit[0] - orig_end = edit[1] - cor_toks = edit[3].split() - # Apply the edit. - cor_sent[orig_start+offset:orig_end+offset] = cor_toks - # Get the cor token start and end positions in cor_sent - cor_start = orig_start+offset - cor_end = cor_start+len(cor_toks) - # Keep track of how this affects orig edit offsets. - offset = offset-(orig_end-orig_start)+len(cor_toks) - # Save the edit with cor_start and cor_end - gold_edits.append(edit+[cor_start]+[cor_end]) - # Save the cor_sent and gold_edits for each annotator in the out_dict. - out_dict[coder] = (cor_sent, gold_edits) - return orig_sent, out_dict - -# Input: A list of edit lines for a sentence in an m2 file. -# Output: An edit dictionary; key is coder id, value is a list of edits. -def processEdits(edits): - edit_dict = {} - for edit in edits: - edit = edit.split("|||") - span = edit[0][2:].split() # [2:] ignore the leading "A " - start = int(span[0]) - end = int(span[1]) - cat = edit[1] - cor = edit[2] - id = edit[-1] - # Save the useful info as a list - proc_edit = [start, end, cat, cor] - # Save the proc edit inside the edit_dict using coder id. - if id in edit_dict.keys(): - edit_dict[id].append(proc_edit) - else: - edit_dict[id] = [proc_edit] - return edit_dict - -# Input 1: A list of token strings in a sentence. -# Input 2: A preloaded Spacy processing object. -# Annotate tokens with POS, lemma and parse info. -def applySpacy(sent, nlp): - # Convert tokens to spacy tokens and POS tag and parse. - sent = nlp.tokenizer.tokens_from_list(sent) - nlp.tagger(sent) - nlp.parser(sent) - return sent - -# Input 1: An edit list. [orig_start, orig_end, cat, cor, cor_start, cor_end] -# Input 2: An original SpaCy sentence. -# Input 3: A corrected SpaCy sentence. -# Output: A minimised edit with duplicate words on both sides removed. -# E.g. [was eaten -> has eaten] becomes [was -> has] -def minimiseEdit(edit, orig, cor): - # edit = [orig_start, orig_end, cat, cor, cor_start, cor_end] - orig_toks = orig[edit[0]:edit[1]] - cor_toks = cor[edit[4]:edit[5]] - # While the first token is the same string in both (and both are not null) - while orig_toks and cor_toks and orig_toks[0].text == cor_toks[0].text: - # Remove that token from the span, and adjust the start offset. - orig_toks = orig_toks[1:] - cor_toks = cor_toks[1:] - edit[0] += 1 - edit[4] += 1 - # Then do the same from the last token. - while orig_toks and cor_toks and orig_toks[-1].text == cor_toks[-1].text: - # Remove that token from the span, and adjust the start offset. - orig_toks = orig_toks[:-1] - cor_toks = cor_toks[:-1] - edit[1] -= 1 - edit[5] -= 1 - # If both sides are not null, save the new correction string. - if orig_toks or cor_toks: - edit[3] = " ".join([tok.text for tok in cor_toks]) - return edit - -# Input 1: An edit list = [orig_start, orig_end, cat, cor, cor_start, cor_end] -# Input 2: A coder id for the specific annotator. -# Output: An edit in m2 file format. 
-def formatEdit(edit, coder_id=0): - span = " ".join(["A", str(edit[0]), str(edit[1])]) - return "|||".join([span, edit[2], edit[3], "REQUIRED", "-NONE-", str(coder_id)]) \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..dd7e253 --- /dev/null +++ b/setup.py @@ -0,0 +1,47 @@ +from pathlib import Path +from setuptools import setup, find_packages + +# Get base working directory. +base_dir = Path(__file__).resolve().parent + +# Readme text for long description +with open(base_dir/"README.md") as f: + readme = f.read() + +setup( + name = "errant", + version = "2.0.0", + license = "MIT", + description = "The ERRor ANnotation Toolkit (ERRANT). Automatically extract and classify edits in parallel sentences.", + long_description = readme, + long_description_content_type = "text/markdown", + author = "Christopher Bryant, Mariano Felice", + author_email = "christopher.bryant@cl.cam.ac.uk", + url = "https://github.com/chrisjbryant/errant", + keywords = ["automatic annotation", "grammatical errors", "natural language processing"], + python_requires = ">= 3.3", + install_requires = ["spacy == 1.9.0", "nltk >= 3.0"], + packages = find_packages(), + include_package_data=True, + entry_points = { + "console_scripts": [ + "errant_compare = errant.commands.compare_m2:main", + "errant_m2 = errant.commands.m2_to_m2:main", + "errant_parallel = errant.commands.parallel_to_m2:main"]}, + classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Operating System :: OS Independent", + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + "Topic :: Education", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Text Processing :: Linguistic"] +)
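
For illustration only, and not part of the patch itself: the `setup.py` above makes the package importable as `errant` and installs three console scripts. The snippet below is a minimal sketch of the kind of programmatic workflow the refactored layout (`errant/__init__.py`, `errant/annotator.py`, `errant/edit.py`) is intended to support. The names `errant.load`, `parse`, `annotate` and the `Edit` attributes are assumptions to be checked against the new README, not a definitive specification.

```python
# Minimal usage sketch (assumed API; the new README is the authoritative reference).
import errant

# Load an English annotator; assumes errant/__init__.py exposes a load() helper
# that wraps spaCy together with the en classifier and merger added in this patch.
annotator = errant.load("en")

# Parse a pre-tokenised original and corrected sentence pair.
orig = annotator.parse("This are gramatical sentence .")
cor = annotator.parse("This is a grammatical sentence .")

# Extract and classify the edits between the two sentences.
for e in annotator.annotate(orig, cor):
    # Each edit is assumed to carry token offsets, the edit strings and an error type.
    print(e.o_start, e.o_end, e.o_str, "->", e.c_str, e.type)
```

The three console scripts declared under `entry_points` (`errant_compare`, `errant_m2`, `errant_parallel`) simply call the `main()` functions in `errant/commands/`, so the same functionality is also available from the command line after installation.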