From 65bb29955846bffb3f8e631ef964ee7a0f3b2cda Mon Sep 17 00:00:00 2001 From: sftblw Date: Mon, 8 Aug 2022 23:51:10 +0900 Subject: [PATCH] add korean sentence validation #630 --- server/lib/validation/index.js | 2 + server/lib/validation/languages/ko.js | 55 +++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 server/lib/validation/languages/ko.js diff --git a/server/lib/validation/index.js b/server/lib/validation/index.js index f5c0a603..cbead814 100644 --- a/server/lib/validation/index.js +++ b/server/lib/validation/index.js @@ -7,6 +7,7 @@ const eo = require('./languages/eo'); const ig = require('./languages/ig'); const it = require('./languages/it'); const kab = require( './languages/kab'); +const ko = require( './languages/ko'); const ne = require('./languages/ne'); const or = require('./languages/or'); const ru = require('./languages/ru'); @@ -25,6 +26,7 @@ const VALIDATORS = { ig, it, kab, + ko, ne, or, ru, diff --git a/server/lib/validation/languages/ko.js b/server/lib/validation/languages/ko.js new file mode 100644 index 00000000..67ab88cf --- /dev/null +++ b/server/lib/validation/languages/ko.js @@ -0,0 +1,55 @@ +// Minimum of characters that qualify as a sentence. +const MIN_CHARACTERS = 1; + +// Maximum of characters allowed per sentence to keep recordings in a manageable duration. +const MAX_CHARACTERS = 50; + +const INVALIDATIONS = [{ + fn: (sentence) => { + // To properly tokenize Korean, We need some heavy tokenizers (ex: mecab-ko, nori, ...), + // For counting letters those tokenizers are not necessary. + return sentence.length < MIN_CHARACTERS || sentence.length > MAX_CHARACTERS; + }, + error: `문장의 글자 수는 ${MIN_CHARACTERS}글자 이상, ${MAX_CHARACTERS}글자 이하여야 합니다.`, +}, { + // One Korean letter is composed with two or three letters, + // in order of (consonant(1st) - vowel(2nd) - consonant(3rd, optional)). + // It shouldn't be allowed to use them separately, since that could cause various pronunciation issues. + // + // This regex is for Unicode "Hangul Syllables" (U+AC00–U+D7A3), Which are composed form (see below). + regex: /[ㄱ-ㅎㅏ-ㅣ]/, + error: '문장에는 자음이나 모음만 따로 있는 글자가 있어서는 안 됩니다.', +}, +{ + // Korean letters (Hangul) have two type of Unicode code points. + // + // - Composed form (Unicode "Hangul Syllables" : U+AC00–U+D7A3) + // - One Unicode codepoint contains three or two letters in rectangular shape. + // - This is normally used codepoints. + // - Other forms + // - Other Unicode codepoints deal korean letters as separated vowels and consonants. + // - This takes doubled space in bytes. + // - This only appears when a contributor is using keyboard layout called "Sebeolsik", which is akin to Dvorak. + // - After NFC normalization ( 5a86a81 ), + // Composible combination of two or three characters (1st - 2nd - 3rd (optional)) will become + // Composed form ("Hangul Syllables"). Characters that cannot be combined may remain. + // + // This regex is for codepoints other than "Hangul Syllables" (U+AC00–U+D7A3). + regex: /[\u1100-\u11FF\uA960-\uA97F\u3130-\u318F]/u, + error: '문장에는 첫가끝 형태의 분해된 글자가 있어서는 안 됩니다. 완성형 글자를 입력해주세요.', +}, { + // Since there are so may kinds of "should not be allowd" letters, + // It would be convenient to allow only certain type of characters. + // examples: CJK chinese letters, Japanese letters, Korean specific chinese letters (aka hanja), + // not-used symbols (semicolon, colon - native korean sentences do not contain them), + // better to be excluded symbols (quote, tilda, ...), + // characters that can be normalized into normal characters with destructive NFKC normalization (ⓐ, ㈜, ...), + // historical korean letters (aka 옛한글 - ㆆ, ㅿ, ㆁ, ...) + // ... + regex: /[^가-힣.,?! ]/u, + error: '문장에는 한글과 마침표, 쉼표, 느낌표, 물음표, 공백만 들어있어야 합니다.', +}]; + +module.exports = { + INVALIDATIONS, +};