From 3ed9235eccc4e576239c5bbe8d54b8eaab9766da Mon Sep 17 00:00:00 2001 From: Jason Dent Date: Tue, 16 Jul 2024 08:59:43 +0200 Subject: [PATCH] fix: support emojis in triev3 files (#5923) --- .../cspell-trie-lib/Samples/sampleV3.trie | 3 + .../cspell-trie-lib/Samples/sampleV4.trie | 42 +++-- .../__snapshots__/importExportV3.test.ts.snap | 30 ++++ .../__snapshots__/importExportV4.test.ts.snap | 168 +++++++++++------- .../src/lib/io/importExportV3.test.ts | 85 +++------ .../src/lib/io/importExportV4.test.ts | 85 ++------- .../src/lib/io/importV3.test.ts | 65 +------ .../cspell-trie-lib/src/lib/io/importV3.ts | 4 +- .../src/lib/io/importV3FastBlob.test.ts | 78 ++------ .../src/lib/io/test/sampleData.ts | 114 ++++++++++++ 10 files changed, 336 insertions(+), 338 deletions(-) create mode 100644 packages/cspell-trie-lib/src/lib/io/test/sampleData.ts diff --git a/packages/cspell-trie-lib/Samples/sampleV3.trie b/packages/cspell-trie-lib/Samples/sampleV3.trie index 1a6e286f39d..bd946070c0c 100644 --- a/packages/cspell-trie-lib/Samples/sampleV3.trie +++ b/packages/cspell-trie-lib/Samples/sampleV3.trie @@ -18,3 +18,6 @@ lift#58;<3ong w#87;<6 ref \#$5 t#63;< wa#64;<2 +ᐊᓂᔑᓈᐯᒧᐎᓐ$8 +ᓀᐦᐃᔭᐍᐏᐣ$7 +😀😃😄😁😆🥹😅😂🤣🥲☺️😊😇🙂🙃😉$9<8 diff --git a/packages/cspell-trie-lib/Samples/sampleV4.trie b/packages/cspell-trie-lib/Samples/sampleV4.trie index a7eb7e02b6f..8a52061f46c 100644 --- a/packages/cspell-trie-lib/Samples/sampleV4.trie +++ b/packages/cspell-trie-lib/Samples/sampleV4.trie @@ -5,7 +5,7 @@ base=10 # Data: __DATA__ [ -0,27,57,59,104,55,62,63,86,87,102 +0,27,58,60,104,45,56,63,64,86,87 ] B /* B */ @@ -23,11 +23,13 @@ w York$7 u /* Nu */ mbers \0\1\2\3\4\5\6\7\8\9$9<9 +\\\ +/* \\\ */ +\\\ +/* \\\\\\ */ +\\\$3 a /* a */ -\/ -/* a\/ */ -b$c$2 p#6 { + beforeAll(async () => { + if (updateSampleFile) { + await createSampleFile(); + } + }); + test('tests serialize / deserialize small sample', () => { const trie = Trie.buildTrie(smallSample).root; const expected = toTree(trie); @@ -48,7 +55,6 @@ 
describe('Import/Export', () => { const root = importTrie(data.split('\n').map((a) => (a ? a + '\n' : a))); const words = [...Trie.iteratorTrieWords(root)]; expect(words).toEqual([...sampleWords].sort()); - // await writeFile(sampleFile, data); }); test('tests deserialize from file', async () => { @@ -125,65 +131,14 @@ function toTree(root: TrieNode): string { return ['\n', ...walk(root, '')].join(''); } -const specialCharacters = [ - 'arrow <', - 'escape \\', - '\\\\\\', - 'eol \n', - 'eow $', - 'ref #', - 'Numbers 0123456789', - 'Braces: {}[]()', -]; - -const smallSample = genSequence(['lift', 'talk', 'walk', 'turn', 'burn', 'chalk', 'churn']) - .concatMap(applyEndings) - .toArray(); - -const sampleWords = [ - 'journal', - 'journalism', - 'journalist', - 'journalistic', - 'journals', - 'journey', - 'journeyer', - 'journeyman', - 'journeymen', - 'joust', - 'jouster', - 'jousting', - 'jovial', - 'joviality', - 'jowl', - 'jowly', - 'joy', - 'joyful', - 'joyfuller', - 'joyfullest', - 'joyfulness', - 'joyless', - 'joylessness', - 'joyous', - 'joyousness', - 'joyridden', - 'joyride', - 'joyrider', - 'joyriding', - 'joyrode', - 'joystick', - 'Big Apple', - 'New York', - 'apple', - 'big apple', - 'fun journey', - 'long walk', - 'fun walk', - ...specialCharacters, - ...smallSample, -]; - -function applyEndings(s: string): string[] { - const endings = ['', 'ed', 'er', 'ing', 's']; - return endings.map((e) => s + e); +async function createSampleFile() { + const trie = Trie.buildTrie(sampleWords).root; + const data = [ + ...serializeTrie(consolidate(trie), { + base: 10, + comment: 'Sample Words', + addLineBreaksToImproveDiffs: false, + }), + ].join(''); + await writeFile(sampleFile, data); } diff --git a/packages/cspell-trie-lib/src/lib/io/importExportV4.test.ts b/packages/cspell-trie-lib/src/lib/io/importExportV4.test.ts index d20830786d3..fa388fab1cb 100644 --- a/packages/cspell-trie-lib/src/lib/io/importExportV4.test.ts +++ 
b/packages/cspell-trie-lib/src/lib/io/importExportV4.test.ts @@ -1,7 +1,6 @@ -import { readFile } from 'node:fs/promises'; +import { readFile, writeFile } from 'node:fs/promises'; -import { genSequence } from 'gensequence'; -import { describe, expect, test } from 'vitest'; +import { beforeAll, describe, expect, test } from 'vitest'; import { resolveSample as resolveSamplePath } from '../../test/samples.js'; import { consolidate } from '../consolidate.js'; @@ -9,10 +8,18 @@ import * as Trie from '../index.js'; import type { TrieNode } from '../TrieNode/TrieNode.js'; import * as v3 from './importExportV3.js'; import { __testing__, importTrie, serializeTrie } from './importExportV4.js'; +import { sampleWords, smallSample, specialCharacters } from './test/sampleData.js'; const sampleFile = resolveSamplePath('sampleV4.trie'); +const updateSampleFile = false; describe('Import/Export', () => { + beforeAll(async () => { + if (updateSampleFile) { + await createSampleFile(); + } + }); + test('tests serialize / deserialize small sample', () => { const trie = Trie.buildTrie(smallSample).root; const expected = toTree(trie); @@ -154,67 +161,13 @@ function toTree(root: TrieNode): string { return ['\n', ...walk(root, '')].join(''); } -const specialCharacters = [ - 'arrow <', - 'escape \\', - 'eol \n', - 'eow $', - 'ref #', - 'Numbers 0123456789', - 'Braces: {}[]()', - 'slash /', - 'a/b', - 'a/c', -]; - -const smallSample = genSequence(['lift', 'talk', 'walk', 'turn', 'burn', 'chalk', 'churn']) - .concatMap(applyEndings) - .toArray(); - -const sampleWords = [ - 'journal', - 'journalism', - 'journalist', - 'journalistic', - 'journals', - 'journey', - 'journeyer', - 'journeyman', - 'journeymen', - 'joust', - 'jouster', - 'jousting', - 'jovial', - 'joviality', - 'jowl', - 'jowly', - 'joy', - 'joyful', - 'joyfuller', - 'joyfullest', - 'joyfulness', - 'joyless', - 'joylessness', - 'joyous', - 'joyousness', - 'joyridden', - 'joyride', - 'joyrider', - 'joyriding', - 'joyrode', - 
'joystick', - 'Big Apple', - 'New York', - 'apple', - 'big apple', - 'fun journey', - 'long walk', - 'fun walk', - ...specialCharacters, - ...smallSample, -]; - -function applyEndings(s: string): string[] { - const endings = ['', 'ed', 'er', 'ing', 's']; - return endings.map((e) => s + e); +async function createSampleFile() { + const trie = Trie.buildTrie(sampleWords).root; + const data = [ + ...serializeTrie(consolidate(trie), { + base: 10, + comment: 'Sample Words', + }), + ].join(''); + await writeFile(sampleFile, data); } diff --git a/packages/cspell-trie-lib/src/lib/io/importV3.test.ts b/packages/cspell-trie-lib/src/lib/io/importV3.test.ts index 2e4e0f9ff8c..55caa1c0844 100644 --- a/packages/cspell-trie-lib/src/lib/io/importV3.test.ts +++ b/packages/cspell-trie-lib/src/lib/io/importV3.test.ts @@ -1,6 +1,5 @@ import { readFile } from 'node:fs/promises'; -import { genSequence } from 'gensequence'; import { describe, expect, test } from 'vitest'; import { resolveSample as resolveSamplePath } from '../../test/samples.js'; @@ -11,6 +10,7 @@ import { trieRootToITrieRoot } from '../TrieNode/trie.js'; import { TrieNodeBuilder } from '../TrieNode/TrieNodeBuilder.js'; import { serializeTrie } from './importExportV3.js'; import { importTrieV3AsTrieRoot, importTrieV3WithBuilder } from './importV3.js'; +import { sampleWords, smallSample, specialCharacters } from './test/sampleData.js'; const sampleFile = resolveSamplePath('sampleV3.trie'); @@ -152,66 +152,3 @@ function toTree(root: ITrieNode): string { return ['\n', ...walk(root, '')].join(''); } - -const specialCharacters = [ - 'arrow <', - 'escape \\', - '\\\\\\', - 'eol \n', - 'eow $', - 'ref #', - 'Numbers 0123456789', - 'Braces: {}[]()', -]; - -const smallSample = genSequence(['lift', 'talk', 'walk', 'turn', 'burn', 'chalk', 'churn']) - .concatMap(applyEndings) - .toArray(); - -const sampleWords = [ - 'journal', - 'journalism', - 'journalist', - 'journalistic', - 'journals', - 'journey', - 'journeyer', - 'journeyman', 
- 'journeymen', - 'joust', - 'jouster', - 'jousting', - 'jovial', - 'joviality', - 'jowl', - 'jowly', - 'joy', - 'joyful', - 'joyfuller', - 'joyfullest', - 'joyfulness', - 'joyless', - 'joylessness', - 'joyous', - 'joyousness', - 'joyridden', - 'joyride', - 'joyrider', - 'joyriding', - 'joyrode', - 'joystick', - 'Big Apple', - 'New York', - 'apple', - 'big apple', - 'fun journey', - 'long walk', - 'fun walk', - ...specialCharacters, - ...smallSample, -]; - -function applyEndings(s: string): string[] { - const endings = ['', 'ed', 'er', 'ing', 's']; - return endings.map((e) => s + e); -} diff --git a/packages/cspell-trie-lib/src/lib/io/importV3.ts b/packages/cspell-trie-lib/src/lib/io/importV3.ts index d2c9f1b0ca2..a186a69797b 100644 --- a/packages/cspell-trie-lib/src/lib/io/importV3.ts +++ b/packages/cspell-trie-lib/src/lib/io/importV3.ts @@ -91,8 +91,8 @@ export function importTrieV3WithBuilder( for (let i = startOfData + 1; i < dataLines.length; ++i) { const line = dataLines[i]; - for (let j = 0; j < line.length; ++j) { - node = parser(node, line[j]); + for (const c of line) { + node = parser(node, c); } } timerParse(); diff --git a/packages/cspell-trie-lib/src/lib/io/importV3FastBlob.test.ts b/packages/cspell-trie-lib/src/lib/io/importV3FastBlob.test.ts index 589db6021d3..54e364b89fd 100644 --- a/packages/cspell-trie-lib/src/lib/io/importV3FastBlob.test.ts +++ b/packages/cspell-trie-lib/src/lib/io/importV3FastBlob.test.ts @@ -1,6 +1,5 @@ import { readFile } from 'node:fs/promises'; -import { genSequence } from 'gensequence'; import { describe, expect, test } from 'vitest'; import { resolveSample as resolveSamplePath } from '../../test/samples.js'; @@ -11,6 +10,14 @@ import { FastTrieBlob } from '../TrieBlob/FastTrieBlob.js'; import { trieRootToITrieRoot } from '../TrieNode/trie.js'; import { serializeTrie } from './importExportV3.js'; import { importTrieV3AsFastTrieBlob } from './importV3FastBlob.js'; +import { + filterUnique, + mixedLanguageWords, + 
sampleWords, + sampleWordsExt, + smallSample, + specialCharacters, +} from './test/sampleData.js'; const sampleFile = resolveSamplePath('sampleV3.trie'); @@ -32,14 +39,16 @@ describe('Import/Export', () => { }); test('tests serialize / deserialize specialCharacters', () => { - const trie = Trie.buildTrie(specialCharacters).root; + const sampleWords = [...specialCharacters, ...mixedLanguageWords].filter(filterUnique()); + const trie = Trie.buildTrie(sampleWords).root; const data = [...serializeTrie(consolidate(trie), 10)]; const ft = importTrieV3AsFastTrieBlob(data); const words = [...ft.words()]; - expect(words.sort()).toEqual([...specialCharacters].sort()); + expect(words.sort()).toEqual([...sampleWords].sort()); }); test('tests serialize / deserialize', async () => { + const sampleWords = sampleWordsExt; const trie = Trie.buildTrie(sampleWords).root; const data = [ ...serializeTrie(consolidate(trie), { @@ -130,66 +139,3 @@ function toTree(root: ITrieNode): string { return ['\n', ...walk(root, '')].join(''); } - -const specialCharacters = [ - 'arrow <', - 'escape \\', - '\\\\\\', - 'eol \n', - 'eow $', - 'ref #', - 'Numbers 0123456789', - 'Braces: {}[]()', -]; - -const smallSample = genSequence(['lift', 'talk', 'walk', 'turn', 'burn', 'chalk', 'churn']) - .concatMap(applyEndings) - .toArray(); - -const sampleWords = [ - 'journal', - 'journalism', - 'journalist', - 'journalistic', - 'journals', - 'journey', - 'journeyer', - 'journeyman', - 'journeymen', - 'joust', - 'jouster', - 'jousting', - 'jovial', - 'joviality', - 'jowl', - 'jowly', - 'joy', - 'joyful', - 'joyfuller', - 'joyfullest', - 'joyfulness', - 'joyless', - 'joylessness', - 'joyous', - 'joyousness', - 'joyridden', - 'joyride', - 'joyrider', - 'joyriding', - 'joyrode', - 'joystick', - 'Big Apple', - 'New York', - 'apple', - 'big apple', - 'fun journey', - 'long walk', - 'fun walk', - ...specialCharacters, - ...smallSample, -]; - -function applyEndings(s: string): string[] { - const endings = ['', 'ed', 
'er', 'ing', 's']; - return endings.map((e) => s + e); -} diff --git a/packages/cspell-trie-lib/src/lib/io/test/sampleData.ts b/packages/cspell-trie-lib/src/lib/io/test/sampleData.ts new file mode 100644 index 00000000000..de3705601d2 --- /dev/null +++ b/packages/cspell-trie-lib/src/lib/io/test/sampleData.ts @@ -0,0 +1,114 @@ +export const specialCharacters = [ + 'arrow <', + 'escape \\', + '\\\\\\', + 'eol \n', + 'eow $', + 'ref #', + 'Numbers 0123456789', + 'Braces: {}[]()', +]; + +export const smallSample = ['lift', 'talk', 'walk', 'turn', 'burn', 'chalk', 'churn'].flatMap(applyEndings); + +// cspell:disable +export const mixedLanguageWords = [ + 'Here are a few words to use as a dictionary. They just need to be split. ', + 'walk walked walking walker ', + 'talk talked talking talker ', + 'play played playing player ', + 'red green blue yellow orange ', + 'on the first day of ', + 'on a dark and ', + 'ted red bed reed bees', + 'fëé', + 'café', + 'cat béat', + 'féé', + 'téé', + 'ትኛ', + 'አኛ', + 'ትግርኛ', + 'አማርኛ', + 'ພາສາລາວ', + 'ꦧꦱꦗꦮ', + 'ᐃᓄᒃᑎᑐᑦ', + 'ᐊᓂᔑᓈᐯᒧᐎᓐ', + 'ᓀᐦᐃᔭᐍᐏᐣ', + '😀😃😄😁😆🥹😅😂🤣🥲☺️😊😇🙂🙃😉', + '😌😍🥰😘😗😙😚😋😛😝😜🤪🤨🧐🤓😎', + '🥸🤩🥳😏😒😞😔😟😕🙁☹️😣😖😫😩🥺', + '😢😭😤😠😡🤬🤯😳🥵🥶😶‍🌫️😱😨😰😥😓', + '🤗🤔🫣🤭🫢🫡🤫🫠🤥😶🫥😐🫤😑🫨😬', + '🙄😯😦😧😮😲🥱😴🤤😪😮‍💨😵😵‍💫🤐🥴🤢', + '🤮🤧😷🤒🤕🤑🤠😈 ', +] // cspell:enable + .flatMap((a) => a.split(' ')) + .map((a) => a.normalize('NFC')) + .filter((a) => !!a); +// cspell:enable + +export const sampleWords = [ + 'journal', + 'journalism', + 'journalist', + 'journalistic', + 'journals', + 'journey', + 'journeyer', + 'journeyman', + 'journeymen', + 'joust', + 'jouster', + 'jousting', + 'jovial', + 'joviality', + 'jowl', + 'jowly', + 'joy', + 'joyful', + 'joyfuller', + 'joyfullest', + 'joyfulness', + 'joyless', + 'joylessness', + 'joyous', + 'joyousness', + 'joyridden', + 'joyride', + 'joyrider', + 'joyriding', + 'joyrode', + 'joystick', + 'Big Apple', + 'New York', + 'apple', + 'big apple', + 'fun journey', + 'long walk', + 'fun walk', + ...specialCharacters, + ...smallSample, + // 
cspell:disable + 'ᐊᓂᔑᓈᐯᒧᐎᓐ', + 'ᓀᐦᐃᔭᐍᐏᐣ', + '😀😃😄😁😆🥹😅😂🤣🥲☺️😊😇🙂🙃😉', + // cspell:enable +]; + +export const sampleWordsExt = [...sampleWords, ...mixedLanguageWords].filter(filterUnique()); + +function applyEndings(s: string): string[] { + const endings = ['', 'ed', 'er', 'ing', 's']; + return endings.map((e) => s + e); +} + +export function filterUnique<T>(): (v: T) => boolean { + const seen = new Set<T>(); + + return (v) => { + const s = seen.size; + seen.add(v); + return seen.size !== s; + }; +}