Add option to use custom Python tokenizer to split sentences if needed #150

Merged: 5 commits, Jul 18, 2021
116 changes: 115 additions & 1 deletion Cargo.lock


5 changes: 3 additions & 2 deletions Cargo.toml
@@ -1,7 +1,7 @@
[package]
name = "common_voice_sentence_collector"
version = "1.0.0"
authors = ["Florian Merz <[email protected]>"]
version = "1.1.0"
authors = ["Florian Merz <[email protected]>", "Michael Kohler <[email protected]>"]
edition = "2018"

[lib]
@@ -23,3 +23,4 @@ punkt = "1.0.5"
rand = "0.8.3"
toml = "0.5.8"
serde = { version = "1.0.126", features = ["derive"] }
inline-python = "0.7.0"
57 changes: 54 additions & 3 deletions README.md
@@ -4,15 +4,16 @@

Right now this tool supports extractions from the following sources:

* Wikipedia - max 3 sentences per articles
* Wikisource - max 3 sentences per articles
* Wikipedia - max 3 sentences per article
* Wikisource - max 3 sentences per article
* Simple files with one sentence per line

For a source to be added, the dataset needs to be vetted by Mozilla to check license compatibility. If you know about a good source, please start a topic on [Discourse](https://discourse.mozilla.org/c/voice/). Once it's been verified that a source can be used, check the "Adding another scrape target" section further below.

## Setup

- [Rust Nightly](https://rustup.rs/) (follow the instructions and customize the install to select the `nightly` channel)
- Install [`pip3`](https://pip.pypa.io/en/stable/installing/) in case it's not installed on your system already

Note: as long as we're using the current `punkt` dependency, we need to use the Nightly version of Rust.

@@ -59,6 +60,7 @@ python WikiExtractor.py --json ../enwiki-latest-pages-articles-multistream.xml

```bash
cd ../cv-sentence-extractor
pip3 install -r requirements.txt # can be skipped if your language doesn't use the Python segmenter
cargo run --release -- extract -l en -d ../wikiextractor/text/ >> wiki.en.txt
```

@@ -91,6 +93,7 @@ python WikiExtractor.py --json ../enwikisource-latest-pages-articles.xml

```bash
cd ../cv-sentence-extractor
pip3 install -r requirements.txt # can be skipped if your language doesn't use the Python segmenter
cargo run --release -- extract-wikisource -l en -d ../wikiextractor/text/ >> wiki.en.txt
```

@@ -100,7 +103,8 @@ cargo run --release -- extract-wikisource -l en -d ../wikiextractor/text/ >> wik

If you have one or more files with one sentence per line, you can use this extractor to extract sentences from these files, applying the defined language rules. This is useful if you have a large list of sentences and only want to keep the ones that match the rules.

```
```bash
pip3 install -r requirements.txt # can be skipped if your language doesn't use the Python segmenter
cargo run --release -- extract-file -l en -d ../texts/ >> file.en.txt
```

@@ -128,6 +132,7 @@ The following rules can be configured per language. Add a `<language>.toml` file
| other_patterns | Rust regex to disallow anything else | Rust Regex Array | all other patterns allowed
| quote_start_with_letter | If a quote needs to start with a letter | boolean | true
| replacements | Replaces abbreviations or other words according to configuration. This happens before any other rules are checked. | Array of replacement configurations: each configuration is an Array of two values: `["search", "replacement"]`. See example below. | nothing gets replaced
| segmenter | Segmenter to use for this language. See below for more information. | "python" | using `rust-punkt` by default

### Example for `matching_symbols`

@@ -225,6 +230,52 @@ In order to get your language rules and blocklist incorporated in this repo, you

Once we have your rules in the repo, we will run an automatic extraction and submit those sentences to Common Voice. This means that you can't manually adjust the sample output you've used for review, as these changes would be lost.

## Using a different segmenter to split sentences

By default we use the `rust-punkt` segmenter to split sentences. However, this leads to several issues if `rust-punkt` does not support a given language; more info on that can be found in issue #11. Therefore we introduced a way to add your own Python-based segmenter if needed. Note that using a Python-based segmenter slows down the extraction considerably.

If `rust-punkt` is not working well for a language rule file you are implementing, you can use your own custom segmenter written in Python. While English doesn't use a Python-based segmenter, there is an English example available in `src/segmenter.rs` (sketched below) which you can use as a base to write your own segmenter in Python.

This is currently experimental.
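For orientation, here is a sketch of what such an English helper in `src/segmenter.rs` could look like when built on `nltk`, the package listed in `requirements.txt`. The exact function in the repository may differ; the `nltk.download` call and the choice of `sent_tokenize` are assumptions made for illustration.

```rust
// Sketch only: an English segmenter built on nltk's sent_tokenize; the real
// example in src/segmenter.rs may differ in details. Downloading the punkt
// model on every call is an assumption made to keep the sketch self-contained.
use inline_python::{python, Context};

pub fn split_sentences_with_python_en(text: &str) -> Vec<String> {
    let ctx = Context::new();

    // Inside the block, 'text refers to the Rust `text` argument.
    ctx.run(python! {
        import nltk
        from nltk.tokenize import sent_tokenize

        nltk.download("punkt", quiet=True)
        split_sentences = sent_tokenize('text)
    });

    // Read the Python list back into a Rust Vec<String>.
    ctx.get("split_sentences")
}
```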

### Changes needed to add your own segmenter in Python

First you will need to add the `segmenter` rule to the rules file:

```toml
segmenter = "python"
```

This will direct our extraction script to use the Python-based segmentation instead of `rust-punkt`.
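For context, a hypothetical rules file for a language using the Python segmenter could combine this setting with other rules from the table above; the file name and concrete values here are made up for illustration:

```toml
# Hypothetical es.toml - values are illustrative only
segmenter = "python"
quote_start_with_letter = true
replacements = [
  ["Dr.", "Doctor"],
]
```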

Then you will need to add a new function to `src/segmenter.rs` named `split_sentences_with_python_xx`, replacing `xx` with the language code you also use for the rules file. You can copy `split_sentences_with_python_en` and adjust it to your needs. Using Spanish as an example, your new function might look like this:

```rust
pub fn split_sentences_with_python_es(text: &str) -> Vec<String> {
    let ctx = Context::new();

    ctx.run(python! {
        import someLibraryWeNeed

        split_sentences = doTheNecessaryWorkToSplitSentences('text)
    });

    ctx.get("split_sentences")
}
```

Note that the function gets passed the full text as `text`, but you need to use `'text` to reference it within the Python block. This is a single string containing all the sentences to be split. The split sentences need to be assigned to the `split_sentences` variable, as our script reads this variable back to continue the extraction.

Additionally you need to make sure that this function is called for your language, otherwise you will get an error that there is no matching function. For this, add a new match case to the `split_sentences_with_python` function. To add Spanish, for example, add the following:

```rust
"es" => split_sentences_with_python_es(text),
```
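For reference, here is a sketch of how the `split_sentences_with_python` dispatcher ties these pieces together. The panic message and the exact set of match arms are assumptions for illustration; the function in `src/segmenter.rs` may differ.

```rust
// Sketch of the dispatcher in src/segmenter.rs; extractor.rs calls it as
// split_sentences_with_python(language, text), so the signature matches that.
pub fn split_sentences_with_python(language: &str, text: &str) -> Vec<String> {
    match language {
        "en" => split_sentences_with_python_en(text),
        "es" => split_sentences_with_python_es(text),
        _ => panic!("No Python segmenter implemented for language {}", language),
    }
}
```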

**Make sure you add all the required Python packages to `requirements.txt`, as these will need to be installed by everyone running the repository locally as well as by the extraction pipelines on GitHub.**

As this is experimental, there are certain parts that could be improved, such as moving each language out into its own file and automatically importing the needed file so there is no need to manually add a case to the match. PRs are certainly welcome!

## Adding another scrape target

If you find a new open data source that provides a lot of sentences ([Example](https://discourse.mozilla.org/t/using-the-europarl-dataset-with-sentences-from-speeches-from-the-european-parliament/50184/36)), we suggest not going through the Sentence Collector but rather adding a scrape target here. Before you do so, let's discuss it on [Discourse](https://discourse.mozilla.org/c/voice/) first!
1 change: 1 addition & 0 deletions requirements.txt
@@ -0,0 +1 @@
nltk
3 changes: 3 additions & 0 deletions scripts/extraction.sh
@@ -18,6 +18,9 @@ mkdir -p $OUTPUT_PATH

source $HERE/providers/common.sh

echo "Installing Python dependencies"
pip3 install -r requirements.txt

if [ $TYPE == "sample" ]; then
source $HERE/providers/wiki.sh

10 changes: 10 additions & 0 deletions scripts/providers/wiki.sh
@@ -18,6 +18,16 @@ function run {
fi
rm listing.html

if [ $TYPE == "sample" ]; then
# For a sample extract we only want to run it for the first file
ARCHIVE_FILE_NAME=${LANGUAGE_CODE}${ARCHIVE_FILE_NAME_MATCHES/%?/}
echo "Starting sample extraction for $ARCHIVE_FILE_NAME"
_downloadAndDecompressDump
extract
cleanup
exit $?
fi

for archive in "${ARCHIVE_FILE_NAME_MATCHES[@]}"
do
ARCHIVE_FILE_NAME=${LANGUAGE_CODE}${archive/%?/}
24 changes: 21 additions & 3 deletions src/extractor.rs
@@ -3,6 +3,7 @@ use crate::replacer;
use crate::checker;
use crate::loaders::Loader;
use crate::rules::{load_rules, Rules};
use crate::segmenter::split_sentences_with_python;
use glob::glob;
use punkt::params::Standard;
use punkt::{SentenceTokenizer, TrainingData};
@@ -33,6 +34,7 @@ pub fn extract(loader: impl Loader, no_check: bool) -> Result<(), String> {
checker::check,
replacer::replace_strings,
no_check,
&config.language,
);

for sentence in sentences {
@@ -57,17 +59,33 @@ fn choose(
predicate: impl FnMut(&Rules, &str) -> bool,
mut replacer: impl FnMut(&Rules, &str) -> String,
no_check: bool,
language: &str,
) -> Vec<String> {
let sentences_replaced_abbreviations: Vec<String> = SentenceTokenizer::<Standard>::new(text, training_data)
let sentences: Vec<String>;

if rules.segmenter != String::from("") {
if rules.segmenter == "python" {
sentences = split_sentences_with_python(language, text);
} else {
panic!("Segmenter {} is not yet supported!", rules.segmenter);
}
} else {
// we use rust-punkt as segmenter by default
sentences = SentenceTokenizer::<Standard>::new(text, training_data)
.map(|item| { String::from(item) })
.collect();
}

let sentences_pool = sentences.iter()
.map(|item| { replacer(rules, item) })
.collect();

if no_check {
sentences_replaced_abbreviations
sentences_pool
} else {
pick_sentences(
rules,
sentences_replaced_abbreviations,
sentences_pool,
existing_sentences,
config.max_sentences_per_text,
predicate,
1 change: 1 addition & 0 deletions src/lib.rs
@@ -14,3 +14,4 @@ mod replacer;
mod rules;
mod config;
mod loaders;
mod segmenter;