Skip to content

Commit

Permalink
Merge branch 'master' into lingua-libre-audio-source
Browse files Browse the repository at this point in the history
  • Loading branch information
Casheeew authored Jun 26, 2024
2 parents b04fe38 + 4e3f23e commit 0d61e83
Show file tree
Hide file tree
Showing 6 changed files with 288 additions and 17 deletions.
51 changes: 34 additions & 17 deletions ext/js/display/display.js
Original file line number Diff line number Diff line change
Expand Up @@ -1229,28 +1229,45 @@ export class Display extends EventDispatcher {
* @returns {Promise<import('dictionary').DictionaryEntry[]>}
*/
async _findDictionaryEntries(isKanji, source, wildcardsEnabled, optionsContext) {
/** @type {import('dictionary').DictionaryEntry[]} */
let dictionaryEntries = [];
const {findDetails, source: source2} = this._getFindDetails(source, wildcardsEnabled);
if (isKanji) {
return await this._application.api.kanjiFind(source, optionsContext);
dictionaryEntries = await this._application.api.kanjiFind(source, optionsContext);
if (dictionaryEntries.length > 0) { return dictionaryEntries; }

dictionaryEntries = (await this._application.api.termsFind(source2, findDetails, optionsContext)).dictionaryEntries;
} else {
/** @type {import('api').FindTermsDetails} */
const findDetails = {};
if (wildcardsEnabled) {
const match = /^([*\uff0a]*)([\w\W]*?)([*\uff0a]*)$/.exec(source);
if (match !== null) {
if (match[1]) {
findDetails.matchType = 'suffix';
findDetails.deinflect = false;
} else if (match[3]) {
findDetails.matchType = 'prefix';
findDetails.deinflect = false;
}
source = match[2];
dictionaryEntries = (await this._application.api.termsFind(source2, findDetails, optionsContext)).dictionaryEntries;
if (dictionaryEntries.length > 0) { return dictionaryEntries; }

dictionaryEntries = await this._application.api.kanjiFind(source, optionsContext);
}
return dictionaryEntries;
}

/**
* @param {string} source
* @param {boolean} wildcardsEnabled
* @returns {{findDetails: import('api').FindTermsDetails, source: string}}
*/
_getFindDetails(source, wildcardsEnabled) {
/** @type {import('api').FindTermsDetails} */
const findDetails = {};
if (wildcardsEnabled) {
const match = /^([*\uff0a]*)([\w\W]*?)([*\uff0a]*)$/.exec(source);
if (match !== null) {
if (match[1]) {
findDetails.matchType = 'suffix';
findDetails.deinflect = false;
} else if (match[3]) {
findDetails.matchType = 'prefix';
findDetails.deinflect = false;
}
source = match[2];
}

const {dictionaryEntries} = await this._application.api.termsFind(source, findDetails, optionsContext);
return dictionaryEntries;
}
return {findDetails, source};
}

/**
Expand Down
9 changes: 9 additions & 0 deletions ext/js/language/ja/japanese-text-preprocessors.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import {
convertHalfWidthKanaToFullWidth,
convertHiraganaToKatakana as convertHiraganaToKatakanaFunction,
convertKatakanaToHiragana as convertKatakanaToHiraganaFunction,
normalizeCombiningCharacters as normalizeCombiningCharactersFunction,
} from './japanese.js';

/** @type {import('language').TextProcessor<boolean>} */
Expand Down Expand Up @@ -90,3 +91,11 @@ export const collapseEmphaticSequences = {
return str;
},
};

/**
 * Optional text preprocessor: when the setting is enabled the text is passed through
 * the `normalizeCombiningCharacters` helper imported from `japanese.js`; otherwise the
 * text is returned untouched.
 * @type {import('language').TextProcessor<boolean>}
 */
export const normalizeCombiningCharacters = {
    name: 'Normalize combining characters',
    description: 'ド → ド (U+30C8 U+3099 → U+30C9)',
    options: basicTextProcessorOptions,
    process: (str, setting) => {
        // Disabled: hand the input back unchanged.
        if (!setting) { return str; }
        return normalizeCombiningCharactersFunction(str);
    },
};
60 changes: 60 additions & 0 deletions ext/js/language/ja/japanese.js
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ const JAPANESE_RANGES = [
const SMALL_KANA_SET = new Set('ぁぃぅぇぉゃゅょゎァィゥェォャュョヮ');

const HALFWIDTH_KATAKANA_MAPPING = new Map([
['・', '・--'],
['ヲ', 'ヲヺ-'],
['ァ', 'ァ--'],
['ィ', 'ィ--'],
Expand Down Expand Up @@ -560,6 +561,65 @@ export function getKanaDiacriticInfo(character) {
return typeof info !== 'undefined' ? {character: info.character, type: info.type} : null;
}

/**
 * Reports whether a combining dakuten (U+3099) can be merged into the kana at `codePoint`
 * by adding 1 to the code point (which is how the caller composes the voiced kana).
 *
 * Accepted are exactly the unvoiced kana that have a voiced neighbour:
 * かきくけこさしすせそたちつてとはひふへほ and カキクケコサシスセソタチツテトハヒフヘホ.
 * Already voiced kana (が, ば, ぱ, ...) and small っ/ッ are rejected: adding 1 to them
 * would yield an unrelated character (e.g. が + U+3099 would turn into き).
 * @param {number} codePoint
 * @returns {boolean}
 */
function dakutenAllowed(codePoint) {
    // Katakana カ..ホ mirror hiragana か..ほ exactly 0x60 code points higher, so fold them down.
    const kana = (codePoint >= 0x30AB && codePoint <= 0x30DB) ? codePoint - 0x60 : codePoint;
    // か (U+304B) .. ち (U+3061): unvoiced kana sit on odd code points, voiced ones on even.
    if (kana >= 0x304B && kana <= 0x3061) { return (kana & 1) === 1; }
    // つ (U+3064), て (U+3066), と (U+3068): small っ (U+3063) flips the parity for this run.
    if (kana >= 0x3064 && kana <= 0x3068) { return (kana & 1) === 0; }
    // は (U+306F) .. ほ (U+307B): laid out as plain / dakuten / handakuten triples.
    if (kana >= 0x306F && kana <= 0x307B) { return (kana - 0x306F) % 3 === 0; }
    return false;
}

/**
 * Reports whether a combining handakuten (U+309A) can be merged into the kana at `codePoint`
 * by adding 2 to the code point (which is how the caller composes the semi-voiced kana).
 *
 * Accepted are exactly はひふへほ and ハヒフヘホ. Their voiced and semi-voiced forms
 * (ば, ぱ, ...) are rejected: adding 2 to them would yield an unrelated character
 * (e.g. ば + U+309A would turn into ひ).
 * @param {number} codePoint
 * @returns {boolean}
 */
function handakutenAllowed(codePoint) {
    // Katakana ハ..ホ mirror hiragana は..ほ exactly 0x60 code points higher, so fold them down.
    const kana = (codePoint >= 0x30CF && codePoint <= 0x30DB) ? codePoint - 0x60 : codePoint;
    // は (U+306F) .. ほ (U+307B) is laid out as plain / dakuten / handakuten triples;
    // only the first member of each triple is a valid base.
    return kana >= 0x306F && kana <= 0x307B && (kana - 0x306F) % 3 === 0;
}

/**
 * Replaces each kana that is followed by a combining dakuten (U+3099) or combining
 * handakuten (U+309A) with the single precomposed kana, provided `dakutenAllowed` /
 * `handakutenAllowed` accepts the base character. All other text is copied through.
 * @param {string} text
 * @returns {string}
 */
export function normalizeCombiningCharacters(text) {
    const COMBINING_DAKUTEN = '\u3099';
    const COMBINING_HANDAKUTEN = '\u309A';

    // The string is scanned from its end, so the pieces are collected in reverse order.
    /** @type {string[]} */
    const reversedPieces = [];
    let index = text.length - 1;

    // Index 0 is deliberately excluded: the first character has nothing before it to combine with.
    for (; index > 0; index--) {
        const unit = text[index];
        const base = text[index - 1].codePointAt(0);

        /** @type {string|null} */
        let composed = null;
        if (base) {
            if (unit === COMBINING_DAKUTEN && dakutenAllowed(base)) {
                // The voiced kana directly follows its unvoiced base.
                composed = String.fromCodePoint(base + 1);
            } else if (unit === COMBINING_HANDAKUTEN && handakutenAllowed(base)) {
                // The semi-voiced kana is two code points after its base.
                composed = String.fromCodePoint(base + 2);
            }
        }

        if (composed === null) {
            reversedPieces.push(unit);
        } else {
            reversedPieces.push(composed);
            // The base was consumed too; together with the loop step this moves back two units.
            index--;
        }
    }

    // index is -1 when the first two characters were merged; 0 means text[0] is still pending.
    if (index === 0) {
        reversedPieces.push(text[0]);
    }
    return reversedPieces.reverse().join('');
}

// Furigana distribution

Expand Down
2 changes: 2 additions & 0 deletions ext/js/language/language-descriptors.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import {
collapseEmphaticSequences,
convertHalfWidthCharacters,
convertHiraganaToKatakana,
normalizeCombiningCharacters,
} from './ja/japanese-text-preprocessors.js';
import {japaneseTransforms} from './ja/japanese-transforms.js';
import {isStringPartiallyJapanese} from './ja/japanese.js';
Expand Down Expand Up @@ -170,6 +171,7 @@ const languageDescriptors = [
textPreprocessors: {
convertHalfWidthCharacters,
alphabeticToHiragana,
normalizeCombiningCharacters,
alphanumericWidthVariants,
convertHiraganaToKatakana,
collapseEmphaticSequences,
Expand Down
182 changes: 182 additions & 0 deletions test/japanese-util.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -863,3 +863,185 @@ describe('Japanese utility functions', () => {
});
});
});

describe('combining dakuten/handakuten normalization', () => {
    const COMBINING_DAKUTEN = '\u3099';
    const COMBINING_HANDAKUTEN = '\u309A';

    /**
     * Builds `[base + mark, expected]` pairs, matching the characters of `bases` and
     * `expected` position by position.
     * @param {string} bases
     * @param {string} mark
     * @param {string} expected
     * @returns {string[][]}
     */
    const composedCases = (bases, mark, expected) => {
        const expectedCharacters = [...expected];
        return [...bases].map((base, index) => [base + mark, expectedCharacters[index]]);
    };

    /**
     * Builds pairs whose expected output equals the input: every character of `bases`
     * followed first by the combining dakuten, then by the combining handakuten.
     * @param {string} bases
     * @returns {string[][]}
     */
    const unchangedCases = (bases) => [...bases].flatMap((base) => (
        [COMBINING_DAKUTEN, COMBINING_HANDAKUTEN].map((mark) => [base + mark, base + mark])
    ));

    // Kana that take a dakuten: hiragana first, then katakana
    const testCasesDakuten = [
        ...composedCases('かきくけこさしすせそたちつてとはひふへほ', COMBINING_DAKUTEN, 'がぎぐげござじずぜぞだぢづでどばびぶべぼ'),
        ...composedCases('カキクケコサシスセソタチツテトハヒフヘホ', COMBINING_DAKUTEN, 'ガギグゲゴザジズゼゾダヂヅデドバビブベボ'),
    ];

    // Kana that take a handakuten: hiragana first, then katakana
    const testCasesHandakuten = [
        ...composedCases('はひふへほ', COMBINING_HANDAKUTEN, 'ぱぴぷぺぽ'),
        ...composedCases('ハヒフヘホ', COMBINING_HANDAKUTEN, 'パピプペポ'),
    ];

    // Kana without a voiced or semi-voiced form must be left exactly as they are
    const testCasesIgnored = [
        ...unchangedCases('なにぬねのまみむめもゃやゅゆょよらりるれろゎわゐゑをん'),
        ...unchangedCases('ナニヌネノマミムメモャヤュユョヨラリルレロヮワヰヱヲン'),
    ];

    // Empty input, a leading combining mark, and marks inside longer words
    const testCasesMisc = [
        ['', ''],
        ['\u3099ハ', '\u3099ハ'],
        ['\u309Aハ', '\u309Aハ'],
        ['さくらし\u3099また\u3099いこん', 'さくらじまだいこん'],
        ['いっほ\u309Aん', 'いっぽん'],
    ];

    const testCases = [
        ...testCasesDakuten,
        ...testCasesHandakuten,
        ...testCasesIgnored,
        ...testCasesMisc,
    ];
    test.each(testCases)('%s normalizes to %s', (input, expected) => {
        expect(jp.normalizeCombiningCharacters(input)).toStrictEqual(expected);
    });
});
1 change: 1 addition & 0 deletions types/ext/language-descriptors.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ type AllTextProcessors = {
pre: {
convertHalfWidthCharacters: TextProcessor<boolean>;
alphabeticToHiragana: TextProcessor<boolean>;
normalizeCombiningCharacters: TextProcessor<boolean>;
alphanumericWidthVariants: BidirectionalConversionPreprocessor;
convertHiraganaToKatakana: BidirectionalConversionPreprocessor;
collapseEmphaticSequences: TextProcessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>;
Expand Down

0 comments on commit 0d61e83

Please sign in to comment.