diff --git a/README.md b/README.md index df1e7b7f..a3b8fabb 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ The [Sentence Collector](https://commonvoice.mozilla.org/sentence-collector/) is - Is everything working as expected? If not, submit [a new issue](https://github.com/Common-Voice/sentence-collector/issues/new). - Review the pending issues in the [project](https://github.com/Common-Voice/sentence-collector/projects/2). - Create a [new PR](https://github.com/Common-Voice/sentence-collector/compare) to fix any of the existing issues in the project. -- To add or adjust validation and cleanup for a language see [VALIDATION.md](https://github.com/common-voice/sentence-collector/blob/main/server/lib/validation/VALIDATION.md) and [CLEANUP.md](https://github.com/common-voice/sentence-collector/blob/main/server/lib/cleanup/CLEANUP.md). Cleanup is optional and should only be used in rare cases. +- To add or adjust normalization, validation and cleanup for a language see [VALIDATION.md](https://github.com/common-voice/sentence-collector/blob/main/server/lib/validation/VALIDATION.md) and [CLEANUP.md](https://github.com/common-voice/sentence-collector/blob/main/server/lib/cleanup/CLEANUP.md). Cleanup is optional and should only be used in rare cases. ## Prerequisites diff --git a/server/lib/validation/VALIDATION.md b/server/lib/validation/VALIDATION.md index 4e0939ea..f3815b43 100644 --- a/server/lib/validation/VALIDATION.md +++ b/server/lib/validation/VALIDATION.md @@ -48,3 +48,11 @@ In this example we are defining one function and one regex: * Using a regex: the second validation uses a regex. If the sentence contains any numbers, the regex matches and we will mark the sentence as invalid. The user will see `Sentence contains numbers` in the frontend. If no numbers are found (the regex doesn't match), the sentence will be marked as valid. 
You can return the same error message for multiple invalidation rules if appropriate, however try to be as specific as possible. In the frontend the errors will be grouped by this error message. + +## Normalization + +For certain languages there are benefits of normalizing the sentence to NFC before running it through validation. This can be enabled by adding the language code to the `USE_NFC_NORMALIZATION` array in `index.js`. Activating normalization means that any further steps will get the normalized sentence. This includes the validation rules, as well as saving it to the database and then later on exporting it to the Common Voice repository. + +**Example:** In Korean you can either type `"ᄏ", "ᅩ" and "ᆯ"` which results in `콜` of length 3 (when checked with `.length`), or `콜` which is one code point. + +If we apply NFC, the validation process gets easier to define. This topic came up in [this PR](https://github.com/common-voice/sentence-collector/pull/630#issuecomment-1201099593). diff --git a/server/lib/validation/index.js b/server/lib/validation/index.js index d1660deb..f5c0a603 100644 --- a/server/lib/validation/index.js +++ b/server/lib/validation/index.js @@ -35,6 +35,13 @@ const VALIDATORS = { yue, }; +// For certain languages we want to normalize before we validate. +// This then also means that the returned sentence is normalized +// and therefore will be saved to the database in normalized form. 
+const USE_NFC_NORMALIZATION = [ + 'ko', +]; + module.exports = { validateSentences, }; @@ -42,20 +49,24 @@ module.exports = { function validateSentences(language, sentences) { const validator = getValidatorFor(language); - return runValidation(validator, sentences); + return runValidation(validator, { + sentences, + normalize: USE_NFC_NORMALIZATION.includes(language), + }); } -function runValidation(validator, sentences = { unreviewed: [], validated: [] }) { +function runValidation(validator, { sentences = { unreviewed: [], validated: [] }, normalize }) { let filtered = []; const validate = (validSentences, sentence) => { - const validationResult = validateSentence(validator, sentence); + const sentenceToValidate = normalize ? sentence.normalize('NFC') : sentence; + const validationResult = validateSentence(validator, sentenceToValidate); if (validationResult.error) { filtered.push(validationResult); return validSentences; } - validSentences.push(sentence); + validSentences.push(sentenceToValidate); return validSentences; }; diff --git a/server/tests/lib/validation/index.test.js b/server/tests/lib/validation/index.test.js index d323048f..f1bb915c 100644 --- a/server/tests/lib/validation/index.test.js +++ b/server/tests/lib/validation/index.test.js @@ -2,17 +2,23 @@ import test from 'ava'; import validation from '../../../lib/validation'; function validate(t, language, sentences, expected) { + const validationResult = validation.validateSentences(language, sentences); + t.log(validationResult.valid); + t.deepEqual(validationResult.valid, expected); +} + +function validateFiltered(t, language, sentences, expected) { const validationResult = validation.validateSentences(language, sentences); t.log(validationResult.filtered); t.deepEqual(validationResult.filtered, expected); } -test('validates valid sentences', validate, 'en', { +test('validates valid sentences', validateFiltered, 'en', { unreviewed: ['This is valid'], validated: ['This is valid too'], }, []); 
-test('validates invalid sentences - too long', validate, 'en', { +test('validates invalid sentences - too long', validateFiltered, 'en', { unreviewed: ['This is very very very very very very very very very very very very very very very very very very very very long'], validated: ['This is very very very very very very very very very very very very very very very very very very very very long too'], }, [{ @@ -23,7 +29,7 @@ test('validates invalid sentences - too long', validate, 'en', { error: 'Number of words must be between 1 and 14 (inclusive)', }]); -test('validates invalid sentences - contains numbers', validate, 'en', { +test('validates invalid sentences - contains numbers', validateFiltered, 'en', { unreviewed: ['This is 2valid'], validated: ['This is 3valid'], }, [{ @@ -34,7 +40,7 @@ test('validates invalid sentences - contains numbers', validate, 'en', { error: 'Sentence should not contain numbers', }]); -test('validates invalid sentences - contains abbreviation', validate, 'en', { +test('validates invalid sentences - contains abbreviation', validateFiltered, 'en', { unreviewed: ['This is A.B.C.'], validated: ['This ABC too'], }, [{ @@ -45,7 +51,7 @@ test('validates invalid sentences - contains abbreviation', validate, 'en', { error: 'Sentence should not contain abbreviations', }]); -test('validates invalid sentences - contains symbols', validate, 'en', { +test('validates invalid sentences - contains symbols', validateFiltered, 'en', { unreviewed: ['This is # test'], validated: ['This is @ test', 'This is / test'], }, [{ @@ -59,7 +65,7 @@ test('validates invalid sentences - contains symbols', validate, 'en', { error: 'Sentence should not contain symbols', }]); -test('validates invalid sentences - multiple sentences', validate, 'it', { +test('validates invalid sentences - multiple sentences', validateFiltered, 'it', { unreviewed: ['This is test. And more.'], validated: ['This is one. 
This is two.'], }, [{ @@ -70,7 +76,7 @@ test('validates invalid sentences - multiple sentences', validate, 'it', { error: 'Sentence should not contain sentence punctuation inside a sentence', }]); -test('validates invalid sentences - english chars', validate, 'ru', { +test('validates invalid sentences - english chars', validateFiltered, 'ru', { unreviewed: ['This is test'], validated: ['This too'], }, [{ @@ -81,7 +87,7 @@ test('validates invalid sentences - english chars', validate, 'ru', { error: 'Sentence should not contain latin alphabet characters', }]); -test('validates invalid sentences - other rules', validate, 'bas', { +test('validates invalid sentences - other rules', validateFiltered, 'bas', { unreviewed: ['This is valid', 'This is wrong .', 'This as well!.', 'No;', 'Definitely not,'], validated: ['This too'], }, [{ @@ -97,3 +103,8 @@ test('validates invalid sentences - other rules', validate, 'bas', { sentence: 'Definitely not,', error: 'Sentence should not end with a comma', }]); + +test('normalizes', validate, 'ko', { + unreviewed: ['콜'], + validated: [], +}, ['콜']);