Commit 1853f53: Multilingual extension (German, Spanish, Romanian, Russian)

fstahlberg committed Sep 24, 2024
1 parent 6f576c6 commit 1853f53
Showing 12 changed files with 720 additions and 6 deletions.
40 changes: 39 additions & 1 deletion README.md
@@ -118,12 +118,33 @@ tax on sales of stores for non residents are set at 21% for 2014 and 20% in 2015

Again, repeat for the remaining nine shards.

## Multilingual C4\_200M

In our [BEA 2024 paper](https://aclanthology.org/2024.bea-1.2/) we introduced variants of our original English dataset in German, Spanish, Romanian, and Russian. The multilingual datasets are generated with the same recipe, but you need to provide the language ID to `c4200m_get_target_sentences.py`:

```
python c4200m_get_target_sentences.py multilingual/ro.tsv ro.target_sentences.tsv ro &> ro.get_target_sentences.log
```
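
The same recipe applies to the other languages. For instance, for German (an illustrative invocation, assuming `multilingual/de.tsv` has been decompressed from the `multilingual/de.tsv.bz2` shipped in this repository):

```
python c4200m_get_target_sentences.py multilingual/de.tsv de.target_sentences.tsv de &> de.get_target_sentences.log
```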

The entry point to the multilingual annotation toolkit is `annotate.py`:

```
$ echo -e "I goed to the storr.\tI went to the store." | python3 -m merrant.annotate
S I goed to the storr.
A 2 6|||R:VERB:INFL|||went|||REQUIRED|||-NONE-|||0
A 14 19|||R:SPELL|||store|||REQUIRED|||-NONE-|||0
```
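
`annotate.py` also accepts `--spacy_model` and `--output_format` flags (see `multilingual/merrant/annotate.py` below); the spell-checker language is derived from the first two characters of the spaCy model name. A hypothetical German invocation, assuming the spaCy model `de_core_news_sm` is installed:

```
echo -e "Ich habe in den Laden gegangen.\tIch bin in den Laden gegangen." | \
  python3 -m merrant.annotate --spacy_model=de_core_news_sm
```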

## License
The corruption edits in this dataset are licensed under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/).


## BibTeX
If you found this dataset useful, please cite our papers:

Original English dataset ([BEA 2021 paper](https://www.aclweb.org/anthology/2021.bea-1.4/)):

```
@inproceedings{stahlberg-kumar-2021-synthetic,
    title = "Synthetic Data Generation for Grammatical Error Correction with Tagged Corruption Models",
    author = "Stahlberg, Felix  and
      Kumar, Shankar",
    booktitle = "Proceedings of the 16th Workshop on Innovative Use of NLP for Building Educational Applications",
    month = apr,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/2021.bea-1.4",
    pages = "37--47",
}
```

Multilingual dataset in German, Spanish, Romanian, and Russian ([BEA 2024 paper](https://aclanthology.org/2024.bea-1.2/)):

```
@inproceedings{stahlberg-kumar-2024-synthetic,
    title = "Synthetic Data Generation for Low-resource Grammatical Error Correction with Tagged Corruption Models",
    author = "Stahlberg, Felix  and
      Kumar, Shankar",
    booktitle = "Proceedings of the 19th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2024)",
    month = jun,
    year = "2024",
    address = "Mexico City, Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.bea-1.2",
    pages = "11--16",
}
```

**This is not an officially supported Google product.**

13 changes: 8 additions & 5 deletions c4200m_get_target_sentences.py
@@ -8,21 +8,24 @@

LOGGING_STEPS = 100000


def main(argv):
  if len(argv) != 3 and len(argv) != 4:
    raise app.UsageError(
        "python3 c4200m_get_target_sentences.py <edits-tsv> <output-tsv> [<lang>]")
  edits_tsv_path = argv[1]
  output_tsv_path = argv[2]

  tfds_name = "c4/en:2.2.1"
  if len(argv) == 4 and argv[3] != "en":
    tfds_name = "c4/multilingual/" + argv[3]

  print("Loading C4_200M target sentence hashes from %r..." % edits_tsv_path)
  remaining_hashes = set()
  with open(edits_tsv_path) as edits_tsv_reader:
    for tsv_line in edits_tsv_reader:
      remaining_hashes.add(tsv_line.split("\t", 1)[0])
  print("Searching for %d target sentences in the dataset %r..." %
        (len(remaining_hashes), tfds_name))
  target_sentences = []
  for num_done_examples, example in enumerate(
      tfds.load(tfds_name, split="train")):
Binary file added multilingual/de.tsv.bz2
Binary file not shown.
Binary file added multilingual/es.tsv.bz2
Binary file not shown.
Empty file.
49 changes: 49 additions & 0 deletions multilingual/merrant/annotate.py
@@ -0,0 +1,49 @@
r"""Command-line tool for running mERRANT.
This script reads TSV data from stdin and writes formatted annotations to
stdout.
Example:
echo -e "I goed to the storr.\tI went to the store." | \
python3 -m merrant.annotate
Output (M2_CHAR format):
S I goed to the storr.
A 2 6|||R:VERB:INFL|||went|||REQUIRED|||-NONE-|||0
A 14 19|||R:SPELL|||store|||REQUIRED|||-NONE-|||0
"""

import sys
from typing import Sequence

from absl import app
from absl import flags

from merrant import api
from merrant import io

_SPACY_MODEL = flags.DEFINE_string("spacy_model", "en_core_web_sm",
                                   "Tagging model.")

_OUTPUT_FORMAT = flags.DEFINE_enum(
    "output_format", "M2_CHAR", ["M2_CHAR", "M2_TOK", "TSV_TAGGED_CORRUPTION"],
    "Output format.")


def main(argv: Sequence[str]) -> None:
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  annotator = api.Annotator(_SPACY_MODEL.value,
                            aspell_lang=_SPACY_MODEL.value[:2])
  annotator.initialize()
  formatter = io.make_formatter(_OUTPUT_FORMAT.value)
  for line in sys.stdin:
    parts = line.strip("\n").split("\t")
    annotation = annotator.annotate(parts[0], parts[1:])
    print(formatter.format(annotation).decode("utf-8"))


if __name__ == "__main__":
  app.run(main)

64 changes: 64 additions & 0 deletions multilingual/merrant/api.py
@@ -0,0 +1,64 @@
"""Main API for mERRANT."""

from typing import Optional, Sequence

from merrant import classification
from merrant import utils


class Annotator:
  """Main interface to mERRANT.

  Example usage:
    annotator = api.Annotator("en_core_web_sm-3.0.0a1", aspell_lang="en")
    annotator.initialize()
    annotation = annotator.annotate("I goed to the storr.",
                                    ["I went to the store."])

  The returned `utils.Annotation` contains tagged `utils.EditsSpans`. If
  `aspell_lang` is not set, no spell checker will be used. Edits can still be
  classified as `SPELL` based on character Levenshtein distance.
  """

  def __init__(self, spacy_model: str, aspell_lang: Optional[str] = None):
    self._spacy_model = spacy_model
    self._aspell_lang = aspell_lang
    self._initialized = False
    self._nlp = None
    self._classifier = None

  def initialize(self):
    """Initializes the interface. Must be called before `annotate()`."""
    self._nlp = utils.load_spacy_from_google3(self._spacy_model)
    self._classifier = classification.GenericClassifier(
        aspell_lang=self._aspell_lang)
    self._classifier.initialize()
    self._initialized = True

  def annotate(self, source_sentence: str,
               target_sentences: Sequence[str]) -> utils.Annotation:
    """Annotates the edits between a source sentence and a set of target sentences.

    Args:
      source_sentence: Untokenized source (original) sentence.
      target_sentences: A list of untokenized target (corrected) sentences.

    Returns:
      A `utils.Annotation` with tagged edit spans.
    """
    if not self._initialized:
      raise ValueError("Annotator not initialized.")

    if isinstance(target_sentences, str):
      raise ValueError("target_sentences must be a list, not a string.")

    source_doc = self._nlp(source_sentence)
    annotation = utils.Annotation(source_doc=source_doc)
    for target_sentence in target_sentences:
      target_doc = self._nlp(target_sentence)
      annotation.target_sentences.append(
          utils.TargetSentence(
              doc=target_doc,
              edit_spans=self._classifier.classify(source_doc, target_doc)))
    return annotation
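
A hedged usage sketch based only on the fields visible in this file (`source_doc`, `target_sentences`, and each target's `edit_spans`); the `utils` and `classification` modules are not shown in this commit view, so the printed representation is an assumption:

```
from merrant import api

annotator = api.Annotator("en_core_web_sm", aspell_lang="en")
annotator.initialize()
annotation = annotator.annotate("I goed to the storr.",
                                ["I went to the store."])
for target in annotation.target_sentences:
  # Each `target.edit_spans` comes from GenericClassifier.classify();
  # its element type is defined in the omitted utils module.
  print(target.doc, target.edit_spans)
```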

