add korean sentence validation #630 (#630)

common-voice · Aug 8, 2022 · 70183de · 70183de
1 parent 5a86a81
commit 70183de
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 0 deletions.
diff --git a/server/lib/validation/index.js b/server/lib/validation/index.js
@@ -7,6 +7,7 @@ const eo = require('./languages/eo');
 const ig = require('./languages/ig');
 const it = require('./languages/it');
 const kab = require( './languages/kab');
+const ko = require( './languages/ko');
 const ne = require('./languages/ne');
 const or = require('./languages/or');
 const ru = require('./languages/ru');
@@ -25,6 +26,7 @@ const VALIDATORS = {
   ig,
   it,
   kab,
+  ko,
   ne,
   or,
   ru,

diff --git a/server/lib/validation/languages/ko.js b/server/lib/validation/languages/ko.js
@@ -0,0 +1,55 @@
+// Minimum number of characters that qualify as a sentence.
+const MIN_CHARACTERS = 1;
+
+// Maximum number of characters allowed per sentence, to keep recordings at a manageable duration.
+const MAX_CHARACTERS = 50;
+
+const INVALIDATIONS = [{
+  fn: (sentence) => {
+    // Properly tokenizing Korean requires heavy tokenizers (e.g. mecab-ko, nori, ...),
+    // but those are not necessary for counting characters, so the string length is used directly.
+    return sentence.length < MIN_CHARACTERS || sentence.length > MAX_CHARACTERS;
+  },
+  error: `문장의 글자 수는 ${MIN_CHARACTERS}글자 이상, ${MAX_CHARACTERS}글자 이하여야 합니다.`,
+}, {
+  // One Korean syllable block is composed of two or three letters (jamo),
+  // in the order consonant (1st) - vowel (2nd) - consonant (3rd, optional).
+  // Using them separately must not be allowed, since that could cause various pronunciation issues.
+  //
+  // This regex matches standalone consonants (ㄱ-ㅎ) and vowels (ㅏ-ㅣ), i.e. letters that are not part of a composed "Hangul Syllable" (U+AC00–U+D7A3, see below).
+  regex: /[ㄱ-ㅎㅏ-ㅣ]/,
+  error: '문장에는 자음이나 모음만 따로 있는 글자가 있어서는 안 됩니다.',
+},
+{
+  // Korean letters (Hangul) have two types of Unicode code points.
+  //
+  // - Composed form (Unicode "Hangul Syllables": U+AC00–U+D7A3)
+  //   - One Unicode code point contains two or three letters arranged in a rectangular shape.
+  //   - These are the code points normally used.
+  // - Other forms
+  //   - Other Unicode code points represent Korean letters as separate vowels and consonants.
+  //   - This takes double the space in bytes.
+  //   - This only appears when a contributor is using the keyboard layout called "Sebeolsik", which is akin to Dvorak.
+  //   - After NFC normalization ( 5a86a81 ),
+  //     composable combinations of two or three characters (1st - 2nd - 3rd (optional)) will become
+  //     the composed form ("Hangul Syllables"). Characters that cannot be combined may remain.
+  //
+  // This regex matches the Hangul code points outside "Hangul Syllables" (U+AC00–U+D7A3): U+1100–U+11FF, U+A960–U+A97F and U+3130–U+318F.
+  regex: /[\u1100-\u11FF\uA960-\uA97F\u3130-\u318F]/u,
+  error: '문장에는 첫가끝 형태의 분해된 글자가 있어서는 안 됩니다. 완성형 글자를 입력해주세요.',
+}, {
+  // Since there are so many kinds of letters that should not be allowed,
+  // it is more convenient to allow only certain types of characters.
+  // Examples of what is rejected: CJK Chinese characters, Japanese letters, Korean-specific Chinese characters (aka hanja),
+  //           unused symbols (semicolon, colon - native Korean sentences do not contain them),
+  //           symbols that are better excluded (quote, tilde, ...),
+  //           characters that can be normalized into normal characters with destructive NFKC normalization (ⓐ, ㈜, ...),
+  //           historical Korean letters (aka 옛한글 - ㆆ, ㅿ, ㆁ, ...)
+  //           ...
+  regex: /[^가-힣.,?! ]/u,
+  error: '문장에는 한글과 마침표, 쉼표, 느낌표, 물음표, 공백만 들어있어야 합니다.',
+}];
+
+module.exports = {
+  INVALIDATIONS,
+};