Commit 1853f53: Multilingual extension (German, Spanish, Romanian, Russian)

fstahlberg committed Sep 24, 2024
1 parent 6f576c6 commit 1853f53
Showing 12 changed files with 720 additions and 6 deletions.
40 changes: 39 additions & 1 deletion README.md
@@ -118,12 +118,33 @@ tax on sales of stores for non residents are set at 21% for 2014 and 20% in 2015

Again, repeat for the remaining nine shards.

## Multilingual C4\_200M

In our [BEA 2024 paper](https://aclanthology.org/2024.bea-1.2/) we introduced variants of our original English dataset in German, Spanish, Romanian, and Russian. The multilingual datasets are generated with the same recipe, but you need to provide the language ID to `c4200m_get_target_sentences.py`:

```
python c4200m_get_target_sentences.py multilingual/ro.tsv ro.target_sentences.tsv ro &> ro.get_target_sentences.log
```
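
The same recipe applies to the other languages. For instance, for German (an illustrative invocation, assuming `multilingual/de.tsv` has been decompressed from the `multilingual/de.tsv.bz2` shipped in this repository):

```
python c4200m_get_target_sentences.py multilingual/de.tsv de.target_sentences.tsv de &> de.get_target_sentences.log
```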

The entry point to the multilingual annotation toolkit is `annotate.py`:

```
$ echo -e "I goed to the storr.\tI went to the store." | python3 -m merrant.annotate
S I goed to the storr.
A 2 6|||R:VERB:INFL|||went|||REQUIRED|||-NONE-|||0
A 14 19|||R:SPELL|||store|||REQUIRED|||-NONE-|||0
```
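
`annotate.py` also accepts `--spacy_model` and `--output_format` flags (see `multilingual/merrant/annotate.py` below); the spell-checker language is derived from the first two characters of the spaCy model name. A hypothetical German invocation, assuming the spaCy model `de_core_news_sm` is installed:

```
echo -e "Ich habe in den Laden gegangen.\tIch bin in den Laden gegangen." | \
  python3 -m merrant.annotate --spacy_model=de_core_news_sm
```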

## License
The corruption edits in this dataset are licensed under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/).


## BibTeX
If you found this dataset useful, please cite our papers:

Original English dataset ([BEA 2021 paper](https://www.aclweb.org/anthology/2021.bea-1.4/)):

```
@inproceedings{stahlberg-kumar-2021-synthetic,
    title = "Synthetic Data Generation for Grammatical Error Correction with Tagged Corruption Models",
    author = "Stahlberg, Felix  and
      Kumar, Shankar",
    booktitle = "Proceedings of the 16th Workshop on Innovative Use of NLP for Building Educational Applications",
    month = apr,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/2021.bea-1.4",
    pages = "37--47",
}
```

Multilingual dataset in German, Spanish, Romanian, and Russian ([BEA 2024 paper](https://aclanthology.org/2024.bea-1.2/)):

```
@inproceedings{stahlberg-kumar-2024-synthetic,
    title = "Synthetic Data Generation for Low-resource Grammatical Error Correction with Tagged Corruption Models",
    author = "Stahlberg, Felix  and
      Kumar, Shankar",
    booktitle = "Proceedings of the 19th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2024)",
    month = jun,
    year = "2024",
    address = "Mexico City, Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.bea-1.2",
    pages = "11--16",
}
```

**This is not an officially supported Google product.**

13 changes: 8 additions & 5 deletions c4200m_get_target_sentences.py
@@ -8,21 +8,24 @@

LOGGING_STEPS = 100000


def main(argv):
  if len(argv) != 3 and len(argv) != 4:
    raise app.UsageError(
        "python3 c4200m_get_target_sentences.py <edits-tsv> <output-tsv> [<lang>]")
  edits_tsv_path = argv[1]
  output_tsv_path = argv[2]

  tfds_name = "c4/en:2.2.1"
  if len(argv) == 4 and argv[3] != "en":
    tfds_name = "c4/multilingual/" + argv[3]

  print("Loading C4_200M target sentence hashes from %r..." % edits_tsv_path)
  remaining_hashes = set()
  with open(edits_tsv_path) as edits_tsv_reader:
    for tsv_line in edits_tsv_reader:
      remaining_hashes.add(tsv_line.split("\t", 1)[0])
  print("Searching for %d target sentences in the dataset %r..." %
        (len(remaining_hashes), tfds_name))
  target_sentences = []
  for num_done_examples, example in enumerate(
      tfds.load(tfds_name, split="train")):
Binary file added multilingual/de.tsv.bz2
Binary file not shown.
Binary file added multilingual/es.tsv.bz2
Binary file not shown.
Empty file.
49 changes: 49 additions & 0 deletions multilingual/merrant/annotate.py
@@ -0,0 +1,49 @@
r"""Command-line tool for running mERRANT.
This script reads TSV data from stdin and writes formatted annotations to
stdout.
Example:
echo -e "I goed to the storr.\tI went to the store." | \
python3 -m merrant.annotate
Output (M2_CHAR format):
S I goed to the storr.
A 2 6|||R:VERB:INFL|||went|||REQUIRED|||-NONE-|||0
A 14 19|||R:SPELL|||store|||REQUIRED|||-NONE-|||0
"""

import sys
from typing import Sequence

from absl import app
from absl import flags

from merrant import api
from merrant import io

_SPACY_MODEL = flags.DEFINE_string("spacy_model", "en_core_web_sm",
                                   "Tagging model.")

_OUTPUT_FORMAT = flags.DEFINE_enum(
    "output_format", "M2_CHAR", ["M2_CHAR", "M2_TOK", "TSV_TAGGED_CORRUPTION"],
    "Output format.")


def main(argv: Sequence[str]) -> None:
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  annotator = api.Annotator(_SPACY_MODEL.value,
                            aspell_lang=_SPACY_MODEL.value[:2])
  annotator.initialize()
  formatter = io.make_formatter(_OUTPUT_FORMAT.value)
  for line in sys.stdin:
    parts = line.strip("\n").split("\t")
    annotation = annotator.annotate(parts[0], parts[1:])
    print(formatter.format(annotation).decode("utf-8"))


if __name__ == "__main__":
  app.run(main)

64 changes: 64 additions & 0 deletions multilingual/merrant/api.py
@@ -0,0 +1,64 @@
"""Main API for mERRANT."""

from typing import Optional, Sequence

from merrant import classification
from merrant import utils


class Annotator:
  """Main interface to mERRANT.

  Example usage:
    annotator = api.Annotator("en_core_web_sm-3.0.0a1", aspell_lang="en")
    annotator.initialize()
    annotation = annotator.annotate("I goed to the storr.",
                                    ["I went to the store."])

  The returned `utils.Annotation` contains tagged `utils.EditsSpans`. If
  `aspell_lang` is not set, no spell checker will be used. Edits can still be
  classified as `SPELL` based on character Levenshtein distance.
  """

  def __init__(self, spacy_model: str, aspell_lang: Optional[str] = None):
    self._spacy_model = spacy_model
    self._aspell_lang = aspell_lang
    self._initialized = False
    self._nlp = None
    self._classifier = None

  def initialize(self):
    """Initializes the interface. Must be called before `annotate()`."""
    self._nlp = utils.load_spacy_from_google3(self._spacy_model)
    self._classifier = classification.GenericClassifier(
        aspell_lang=self._aspell_lang)
    self._classifier.initialize()
    self._initialized = True

  def annotate(self, source_sentence: str,
               target_sentences: Sequence[str]) -> utils.Annotation:
    """Annotates the edits between a source sentence and a set of target sentences.

    Args:
      source_sentence: Untokenized source (original) sentence.
      target_sentences: A list of untokenized target (corrected) sentences.

    Returns:
      A `utils.Annotation` with tagged edit spans.
    """
    if not self._initialized:
      raise ValueError("Annotator not initialized.")

    if isinstance(target_sentences, str):
      raise ValueError("target_sentences must be a list, not a string.")

    source_doc = self._nlp(source_sentence)
    annotation = utils.Annotation(source_doc=source_doc)
    for target_sentence in target_sentences:
      target_doc = self._nlp(target_sentence)
      annotation.target_sentences.append(
          utils.TargetSentence(
              doc=target_doc,
              edit_spans=self._classifier.classify(source_doc, target_doc)))
    return annotation
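
A hedged usage sketch based only on the fields visible in this file (`source_doc`, `target_sentences`, and each target's `edit_spans`); the `utils` and `classification` modules are not shown in this commit view, so the printed representation is an assumption:

```
from merrant import api

annotator = api.Annotator("en_core_web_sm", aspell_lang="en")
annotator.initialize()
annotation = annotator.annotate("I goed to the storr.",
                                ["I went to the store."])
for target in annotation.target_sentences:
  # Each `target.edit_spans` comes from GenericClassifier.classify();
  # its element type is defined in the omitted utils module.
  print(target.doc, target.edit_spans)
```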

