Skip to content

Commit

Permalink
[ja] add preprocessor for width of alphabetic characters (#964)
Browse files Browse the repository at this point in the history
* add japanese text preprocessor for variants in width of alphabetic characters

* try combining with numeric to improve performance

* Update ext/js/language/ja/japanese.js

Co-authored-by: Kuuuube <[email protected]>
Signed-off-by: StefanVukovic99 <[email protected]>

* Update ext/js/language/ja/japanese.js

Co-authored-by: Kuuuube <[email protected]>
Signed-off-by: StefanVukovic99 <[email protected]>

* fix tests

---------

Signed-off-by: StefanVukovic99 <[email protected]>
Co-authored-by: Kuuuube <[email protected]>
  • Loading branch information
StefanVukovic99 and Kuuuube authored May 22, 2024
1 parent 125cde3 commit d19b898
Show file tree
Hide file tree
Showing 12 changed files with 585 additions and 35 deletions.
31 changes: 21 additions & 10 deletions ext/js/language/ja/japanese-text-preprocessors.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@ import {basicTextProcessorOptions} from '../text-processors.js';
import {convertAlphabeticToKana} from './japanese-wanakana.js';
import {
collapseEmphaticSequences as collapseEmphaticSequencesFunction,
convertAlphanumericToFullWidth,
convertFullWidthAlphanumericToNormal,
convertHalfWidthKanaToFullWidth,
convertHiraganaToKatakana as convertHiraganaToKatakanaFunction,
convertKatakanaToHiragana as convertKatakanaToHiraganaFunction,
convertNumericToFullWidth
convertKatakanaToHiragana as convertKatakanaToHiraganaFunction
} from './japanese.js';

/** @type {import('language').TextProcessor<boolean>} */
Expand All @@ -33,22 +34,32 @@ export const convertHalfWidthCharacters = {
process: (str, setting) => (setting ? convertHalfWidthKanaToFullWidth(str) : str)
};

/**
 * Text preprocessor that optionally rewrites numeric characters as their
 * full-width forms by delegating to `convertNumericToFullWidth`.
 * When the setting is falsy the input string is returned unchanged.
 * @type {import('language').TextProcessor<boolean>}
 */
export const convertNumericCharacters = {
    name: 'Convert numeric characters to full width',
    // The right-hand side must use full-width digits; otherwise the example shows no change.
    description: '1234 → １２３４',
    options: basicTextProcessorOptions,
    process: (str, setting) => (setting ? convertNumericToFullWidth(str) : str)
};

/** @type {import('language').TextProcessor<boolean>} */
export const convertAlphabeticCharacters = {
export const alphabeticToHiragana = {
name: 'Convert alphabetic characters to hiragana',
description: 'yomichan → よみちゃん',
options: basicTextProcessorOptions,
process: (str, setting) => (setting ? convertAlphabeticToKana(str) : str)
};

/**
 * Text preprocessor that converts letters and digits between their
 * full-width and normal-width forms.
 *
 * Settings:
 * - 'off': return the text unchanged.
 * - 'direct': full-width → normal width (`convertFullWidthAlphanumericToNormal`).
 * - 'inverse': normal width → full-width (`convertAlphanumericToFullWidth`).
 *
 * Any other setting value leaves the text unchanged so that `process`
 * always returns a string.
 * @type {import('language').BidirectionalConversionPreprocessor}
 */
export const alphanumericWidthVariants = {
    name: 'Convert between alphabetic width variants',
    // The left-hand side is intentionally written with full-width letters.
    description: 'ｙｏｍｉｔａｎ → yomitan and vice versa',
    options: ['off', 'direct', 'inverse'],
    process: (str, setting) => {
        switch (setting) {
            case 'off':
                return str;
            case 'direct':
                return convertFullWidthAlphanumericToNormal(str);
            case 'inverse':
                return convertAlphanumericToFullWidth(str);
            default:
                // Unknown setting: behave like 'off' instead of returning undefined.
                return str;
        }
    }
};

/** @type {import('language').BidirectionalConversionPreprocessor} */
export const convertHiraganaToKatakana = {
name: 'Convert hiragana to katakana',
Expand Down
32 changes: 28 additions & 4 deletions ext/js/language/ja/japanese.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/


const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063;
const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3;
const KATAKANA_SMALL_KA_CODE_POINT = 0x30f5;
Expand Down Expand Up @@ -523,16 +524,39 @@ export function convertHiraganaToKatakana(text) {
* @param {string} text
* @returns {string}
*/
export function convertNumericToFullWidth(text) {
export function convertAlphanumericToFullWidth(text) {
let result = '';
for (const char of text) {
let c = /** @type {number} */ (char.codePointAt(0));
if (c >= 0x30 && c <= 0x39) { // ['0', '9']
c += 0xff10 - 0x30; // 0xff10 = '0' full width
result += String.fromCodePoint(c);
} else {
result += char;
} else if (c >= 0x41 && c <= 0x5a) { // ['A', 'Z']
c += 0xff21 - 0x41; // 0xff21 = 'A' full width
} else if (c >= 0x61 && c <= 0x7a) { // ['a', 'z']
c += 0xff41 - 0x61; // 0xff41 = 'a' full width
}
result += String.fromCodePoint(c);
}
return result;
}

/**
 * Replaces full-width digits (U+FF10–U+FF19) and full-width Latin letters
 * (U+FF21–U+FF3A, U+FF41–U+FF5A) with their ASCII counterparts.
 * Every other character is copied through unchanged.
 * @param {string} text
 * @returns {string}
 */
export function convertFullWidthAlphanumericToNormal(text) {
    // Distance between each full-width range and its ASCII range.
    const digitOffset = 0xff10 - 0x30; // full width '0' -> '0'
    const upperOffset = 0xff21 - 0x41; // full width 'A' -> 'A'
    const lowerOffset = 0xff41 - 0x61; // full width 'a' -> 'a'

    const pieces = [];
    for (const char of text) {
        const codePoint = /** @type {number} */ (char.codePointAt(0));
        let offset = 0;
        if (codePoint >= 0xff10 && codePoint <= 0xff19) {
            offset = digitOffset;
        } else if (codePoint >= 0xff21 && codePoint <= 0xff3a) {
            offset = upperOffset;
        } else if (codePoint >= 0xff41 && codePoint <= 0xff5a) {
            offset = lowerOffset;
        }
        pieces.push(offset === 0 ? char : String.fromCodePoint(codePoint - offset));
    }
    return pieces.join('');
}
Expand Down
12 changes: 9 additions & 3 deletions ext/js/language/language-descriptors.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,13 @@ import {removeArabicScriptDiacritics} from './ar/arabic-text-preprocessors.js';
import {eszettPreprocessor} from './de/german-text-preprocessors.js';
import {germanTransforms} from './de/german-transforms.js';
import {englishTransforms} from './en/english-transforms.js';
import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidthCharacters, convertHiraganaToKatakana, convertNumericCharacters} from './ja/japanese-text-preprocessors.js';
import {
alphabeticToHiragana,
alphanumericWidthVariants,
collapseEmphaticSequences,
convertHalfWidthCharacters,
convertHiraganaToKatakana
} from './ja/japanese-text-preprocessors.js';
import {japaneseTransforms} from './ja/japanese-transforms.js';
import {isStringPartiallyJapanese} from './ja/japanese.js';
import {disassembleHangul, reassembleHangul} from './ko/korean-text-processors.js';
Expand Down Expand Up @@ -143,8 +149,8 @@ const languageDescriptors = [
isTextLookupWorthy: isStringPartiallyJapanese,
textPreprocessors: {
convertHalfWidthCharacters,
convertNumericCharacters,
convertAlphabeticCharacters,
alphabeticToHiragana,
alphanumericWidthVariants,
convertHiraganaToKatakana,
collapseEmphaticSequences
},
Expand Down
94 changes: 93 additions & 1 deletion test/data/anki-note-builder-test-results.json
Original file line number Diff line number Diff line change
Expand Up @@ -3874,7 +3874,99 @@
]
},
{
"name": "Test text preprocessors - convertAlphabeticCharacters",
"name": "Test text preprocessors - alphabeticWidthVariants 1",
"results": [
{
"audio": "",
"clipboard-image": "",
"clipboard-text": "",
"cloze-body": "English",
"cloze-body-kana": "English",
"cloze-prefix": "cloze-prefix",
"cloze-suffix": "cloze-suffix",
"conjugation": "",
"dictionary": "Test Dictionary 2",
"document-title": "title",
"expression": "English",
"frequencies": "",
"frequency-harmonic-rank": "9999999",
"frequency-harmonic-occurrence": "0",
"frequency-average-rank": "9999999",
"frequency-average-occurrence": "0",
"furigana": "English",
"furigana-plain": "English",
"glossary": "<div style=\"text-align: left;\"><i>(n, Test Dictionary 2)</i> English definition</div>",
"glossary-brief": "<div style=\"text-align: left;\">English definition</div>",
"glossary-no-dictionary": "<div style=\"text-align: left;\"><i>(n)</i> English definition</div>",
"glossary-first": "<div style=\"text-align: left;\"><i>(n, Test Dictionary 2)</i> English definition</div>",
"glossary-first-brief": "<div style=\"text-align: left;\">English definition</div>",
"glossary-first-no-dictionary": "<div style=\"text-align: left;\"><i>(n)</i> English definition</div>",
"part-of-speech": "Noun",
"pitch-accents": "",
"pitch-accent-graphs": "",
"pitch-accent-graphs-jj": "",
"pitch-accent-positions": "",
"pitch-accent-categories": "",
"phonetic-transcriptions": "",
"reading": "English",
"screenshot": "",
"search-query": "fullQuery",
"selection-text": "",
"sentence": "cloze-prefixEnglishcloze-suffix",
"sentence-furigana": "cloze-prefixEnglishcloze-suffix",
"tags": "n",
"url": "<a href=\"url:\">url:</a>"
}
]
},
{
"name": "Test text preprocessors - alphabeticWidthVariants 2",
"results": [
{
"audio": "",
"clipboard-image": "",
"clipboard-text": "",
"cloze-body": "USB",
"cloze-body-kana": "USB",
"cloze-prefix": "cloze-prefix",
"cloze-suffix": "cloze-suffix",
"conjugation": "",
"dictionary": "Test Dictionary 2",
"document-title": "title",
"expression": "USB",
"frequencies": "",
"frequency-harmonic-rank": "9999999",
"frequency-harmonic-occurrence": "0",
"frequency-average-rank": "9999999",
"frequency-average-occurrence": "0",
"furigana": "<ruby>USB<rt>ユーエスビー</rt></ruby>",
"furigana-plain": "USB[ユーエスビー]",
"glossary": "<div style=\"text-align: left;\"><i>(n, Test Dictionary 2)</i> USB definition</div>",
"glossary-brief": "<div style=\"text-align: left;\">USB definition</div>",
"glossary-no-dictionary": "<div style=\"text-align: left;\"><i>(n)</i> USB definition</div>",
"glossary-first": "<div style=\"text-align: left;\"><i>(n, Test Dictionary 2)</i> USB definition</div>",
"glossary-first-brief": "<div style=\"text-align: left;\">USB definition</div>",
"glossary-first-no-dictionary": "<div style=\"text-align: left;\"><i>(n)</i> USB definition</div>",
"part-of-speech": "Noun",
"pitch-accents": "",
"pitch-accent-graphs": "",
"pitch-accent-graphs-jj": "",
"pitch-accent-positions": "",
"pitch-accent-categories": "",
"phonetic-transcriptions": "",
"reading": "ユーエスビー",
"screenshot": "",
"search-query": "fullQuery",
"selection-text": "",
"sentence": "cloze-prefixUSBcloze-suffix",
"sentence-furigana": "cloze-prefixUSBcloze-suffix",
"tags": "n",
"url": "<a href=\"url:\">url:</a>"
}
]
},
{
"name": "Test text preprocessors - alphabeticToHiragana",
"results": [
{
"audio": "",
Expand Down
6 changes: 3 additions & 3 deletions test/data/database-test-cases.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
"ipa": 1
},
"terms": {
"total": 30
"total": 31
}
}
},
Expand All @@ -36,7 +36,7 @@
{
"kanji": 2,
"kanjiMeta": 6,
"terms": 30,
"terms": 31,
"termMeta": 39,
"tagMeta": 15,
"media": 6
Expand All @@ -45,7 +45,7 @@
"total": {
"kanji": 2,
"kanjiMeta": 6,
"terms": 30,
"terms": 31,
"termMeta": 39,
"tagMeta": 15,
"media": 6
Expand Down
3 changes: 2 additions & 1 deletion test/data/dictionaries/valid-dictionary1/term_bank_1.json
Original file line number Diff line number Diff line change
Expand Up @@ -344,5 +344,6 @@
["凄い", "すごい", "adj-i", "adj-i", 1, ["sugoi definition"], 18, ""],
["English", "", "n", "n", 1, ["English definition"], 19, ""],
["language", "", "n", "n", 1, ["language definition"], 20, ""],
["마시다", "", "v", "v", 1, ["masida definition"], 21, ""]
["USB", "ユーエスビー", "n", "n", 1, ["USB definition"], 21, ""],
["마시다", "", "v", "v", 1, ["masida definition"], 22, ""]
]
28 changes: 27 additions & 1 deletion test/data/translator-test-inputs.json
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,33 @@
]
},
{
"name": "Test text preprocessors - convertAlphabeticCharacters",
"name": "Test text preprocessors - alphabeticWidthVariants 1",
"func": "findTerms",
"mode": "split",
"text": "English",
"options": [
"default",
{
"type": "terms",
"removeNonJapaneseCharacters": false
}
]
},
{
"name": "Test text preprocessors - alphabeticWidthVariants 2",
"func": "findTerms",
"mode": "split",
"text": "USB",
"options": [
"default",
{
"type": "terms",
"removeNonJapaneseCharacters": false
}
]
},
{
"name": "Test text preprocessors - alphabeticToHiragana",
"func": "findTerms",
"mode": "split",
"text": "utsu",
Expand Down
Loading

0 comments on commit d19b898

Please sign in to comment.