From ac0730f8306d3c547988e4a2cb56ff1824c68113 Mon Sep 17 00:00:00 2001 From: Jason Dent Date: Sun, 27 Oct 2024 09:31:50 +0100 Subject: [PATCH] fix: tools - remove duplicates and support compounding (#6423) --- .../cspell-tools.config.schema.json | 31 +++++- .../src/__snapshots__/build.test.ts.snap | 15 +-- .../src/compiler/CompileOptions.ts | 7 ++ .../cspell-tools/src/compiler/Reader.test.ts | 6 +- packages/cspell-tools/src/compiler/Reader.ts | 4 +- .../src/compiler/SourceReader.test.ts | 1 + .../cspell-tools/src/compiler/SourceReader.ts | 6 +- .../src/compiler/WordsCollection.ts | 2 +- .../__snapshots__/compile.test.ts.snap | 72 +++++-------- packages/cspell-tools/src/compiler/compile.ts | 16 ++- .../compiler/createWordsCollection.test.ts | 4 +- .../src/compiler/createWordsCollection.ts | 75 ++++++++++--- .../src/compiler/legacyLineToWords.ts | 2 +- .../src/compiler/readers/ReaderOptions.ts | 7 +- .../src/compiler/readers/trieFileReader.ts | 5 +- .../compiler/splitCamelCaseIfAllowed.test.ts | 21 +++- .../src/compiler/splitCamelCaseIfAllowed.ts | 48 +++++++-- .../streamSourceWordsFromFile.test.ts | 2 + .../cspell-tools/src/compiler/text.test.ts | 26 ++++- packages/cspell-tools/src/compiler/text.ts | 23 +++- .../src/compiler/wordListCompiler.test.ts | 32 +++++- .../src/compiler/wordListCompiler.ts | 100 +++++++++++++++++- .../src/compiler/wordListParser.test.ts | 14 +++ .../src/compiler/wordListParser.ts | 25 ++++- packages/cspell-tools/src/config/config.ts | 23 ++++ packages/cspell-trie-lib/api/api.d.ts | 6 +- .../src/lib/SimpleDictionaryParser.test.ts | 10 ++ .../src/lib/SimpleDictionaryParser.ts | 16 ++- 28 files changed, 484 insertions(+), 115 deletions(-) diff --git a/packages/cspell-tools/cspell-tools.config.schema.json b/packages/cspell-tools/cspell-tools.config.schema.json index 2cbc10115b7..3e34f628d69 100644 --- a/packages/cspell-tools/cspell-tools.config.schema.json +++ b/packages/cspell-tools/cspell-tools.config.schema.json @@ -38,7 +38,8 @@ }, "type": "array" } - ] + ], + "description": "Words in the `allowedSplitWords` are considered correct and can be used as a basis for splitting compound words.\n\nIf entries can be split so that all the words in the entry are allowed, then only the individual words are added, otherwise the entire entry is added. This is to prevent misspellings in CamelCase words from being introduced into the dictionary." }, "keepRawCase": { "default": false, @@ -64,6 +65,11 @@ ], "default": false, "description": "Split lines into words." + }, + "storeSplitWordsAsCompounds": { + "default": false, + "description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.", + "type": "boolean" } }, "required": [ @@ -89,7 +95,8 @@ }, "type": "array" } - ] + ], + "description": "Words in the `allowedSplitWords` are considered correct and can be used as a basis for splitting compound words.\n\nIf entries can be split so that all the words in the entry are allowed, then only the individual words are added, otherwise the entire entry is added. This is to prevent misspellings in CamelCase words from being introduced into the dictionary." }, "filename": { "$ref": "#/definitions/FilePath" @@ -115,6 +122,11 @@ ], "default": false, "description": "Split lines into words." + }, + "storeSplitWordsAsCompounds": { + "default": false, + "description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.", + "type": "boolean" } }, "required": [ @@ -171,6 +183,11 @@ "description": "Name of target, used as the basis of target file name.", "type": "string" }, + "removeDuplicates": { + "default": false, + "description": "Remove duplicate words, favor lower case words over mixed case words. Combine compound prefixes where possible.", + "type": "boolean" + }, "sort": { "default": true, "description": "Sort the words in the resulting dictionary. Does not apply to `trie` based formats.", @@ -249,6 +266,11 @@ "description": "Maximum number of nested Hunspell Rules to apply. This is needed for recursive dictionaries like Hebrew.", "type": "number" }, + "removeDuplicates": { + "default": false, + "description": "Remove duplicate words, favor lower case words over mixed case words. Combine compound prefixes where possible.", + "type": "boolean" + }, "rootDir": { "description": "Specify the directory where all relative paths will resolved against. By default, all relative paths are relative to the location of the config file.", "type": "string" @@ -271,6 +293,11 @@ "default": false, "description": "Split lines into words." }, + "storeSplitWordsAsCompounds": { + "default": false, + "description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.", + "type": "boolean" + }, "targets": { "description": "Optional Target Dictionaries to create.", "items": { diff --git a/packages/cspell-tools/src/__snapshots__/build.test.ts.snap b/packages/cspell-tools/src/__snapshots__/build.test.ts.snap index babe264751f..0731ba91ba5 100644 --- a/packages/cspell-tools/src/__snapshots__/build.test.ts.snap +++ b/packages/cspell-tools/src/__snapshots__/build.test.ts.snap @@ -178,17 +178,12 @@ exports[`build action > build multi 0 2`] = ` !Errorerror !codecode !err -+code -+code+ -+error -+error+ -+msg +*msg ++code* ++error* Café -Code -Code+ -Error -Error+ -msg +Code* +Error* " `; diff --git a/packages/cspell-tools/src/compiler/CompileOptions.ts b/packages/cspell-tools/src/compiler/CompileOptions.ts index dffa49af33c..91f082b13bc 100644 --- a/packages/cspell-tools/src/compiler/CompileOptions.ts +++ b/packages/cspell-tools/src/compiler/CompileOptions.ts @@ -28,4 +28,11 @@ export interface CompileOptions { * */ dictionaryDirectives?: string[] | undefined; + + /** + * Remove duplicate words, favor lower case words over mixed case words. + * Combine compound prefixes where possible. + * @default false + */ + removeDuplicates?: boolean; } diff --git a/packages/cspell-tools/src/compiler/Reader.test.ts b/packages/cspell-tools/src/compiler/Reader.test.ts index 94506c33604..c7f2143e369 100644 --- a/packages/cspell-tools/src/compiler/Reader.test.ts +++ b/packages/cspell-tools/src/compiler/Reader.test.ts @@ -17,14 +17,14 @@ const sc = (m: string) => expect.stringContaining(m); describe('Validate the iterateWordsFromFile', () => { test('streamWordsFromFile: hunspell', async () => { const reader = await createReader(path.join(samples, 'hunspell', 'example.aff'), readerOptions); - const results = [...reader]; + const results = [...reader.lines]; // this might break if the processing order of hunspell changes. expect(results).toEqual(s('hello rework reworked tried try work worked', ' ')); }); test('stream words from trie', async () => { const reader = await createReader(path.join(samples, 'cities.trie.gz'), readerOptions); - const results = [...reader]; + const results = [...reader.lines]; expect(results.join('|')).toBe( 'amsterdam|angeles|city|delhi|francisco|london|los|los angeles' + '|mexico|mexico city|new|new amsterdam|new delhi|new york|paris|san|san francisco|york', @@ -39,7 +39,7 @@ describe('Validate the iterateWordsFromFile', () => { ${'hunspell/example.aff'} | ${{}} | ${'hello|rework|reworked|tried|try|work|worked'} `('stream words from text $file $options', async ({ file, options, expected }) => { const reader = await createReader(path.resolve(samples, file), options); - const results = [...reader]; + const results = [...reader.lines]; expect(results.join('|')).toBe(expected); }); diff --git a/packages/cspell-tools/src/compiler/Reader.ts b/packages/cspell-tools/src/compiler/Reader.ts index edce90ce27d..b790a03e286 100644 --- a/packages/cspell-tools/src/compiler/Reader.ts +++ b/packages/cspell-tools/src/compiler/Reader.ts @@ -1,4 +1,4 @@ -import type { BaseReader, Reader, ReaderOptions } from './readers/ReaderOptions.js'; +import type { BaseReader, DictionaryReader, Reader, ReaderOptions } from './readers/ReaderOptions.js'; import { readHunspellFiles } from './readers/readHunspellFiles.js'; import { regHunspellFile } from './readers/regHunspellFile.js'; import { textFileReader } from './readers/textFileReader.js'; @@ -17,7 +17,7 @@ const readers: ReaderSelector[] = [ { test: regHunspellFile, method: readHunspellFiles }, ]; -function findMatchingReader(filename: string, options: ReaderOptions): Promise { +function findMatchingReader(filename: string, options: ReaderOptions): Promise { for (const reader of readers) { if (reader.test.test(filename)) { return reader.method(filename, options); diff --git a/packages/cspell-tools/src/compiler/SourceReader.test.ts b/packages/cspell-tools/src/compiler/SourceReader.test.ts index ec0ffda37c8..cd8b5565c63 100644 --- a/packages/cspell-tools/src/compiler/SourceReader.test.ts +++ b/packages/cspell-tools/src/compiler/SourceReader.test.ts @@ -14,6 +14,7 @@ const samples = helper.resolveSample('dicts'); const readerOptions: SourceReaderOptions = { splitWords: false, allowedSplitWords: defaultAllowedSplitWords, + storeSplitWordsAsCompounds: undefined, }; describe('Validate the iterateWordsFromFile', () => { diff --git a/packages/cspell-tools/src/compiler/SourceReader.ts b/packages/cspell-tools/src/compiler/SourceReader.ts index 6c6682e67b8..882cb15f457 100644 --- a/packages/cspell-tools/src/compiler/SourceReader.ts +++ b/packages/cspell-tools/src/compiler/SourceReader.ts @@ -22,6 +22,8 @@ export interface SourceReaderOptions { keepCase?: boolean; allowedSplitWords: AllowedSplitWordsCollection; + + storeSplitWordsAsCompounds: boolean | undefined; } export type AnnotatedWord = string; @@ -62,8 +64,8 @@ function splitLines(lines: Iterable, options: SourceReaderOptions): Iter } async function textFileReader(reader: Reader, options: SourceReaderOptions): Promise { - const { legacy, splitWords: split, allowedSplitWords } = options; - const words = [...parseFileLines(reader, { legacy, split, allowedSplitWords })]; + const { legacy, splitWords: split, allowedSplitWords, storeSplitWordsAsCompounds } = options; + const words = [...parseFileLines(reader.lines, { legacy, split, allowedSplitWords, storeSplitWordsAsCompounds })]; return { size: words.length, diff --git a/packages/cspell-tools/src/compiler/WordsCollection.ts b/packages/cspell-tools/src/compiler/WordsCollection.ts index 1668d5a8c2b..4dda1300446 100644 --- a/packages/cspell-tools/src/compiler/WordsCollection.ts +++ b/packages/cspell-tools/src/compiler/WordsCollection.ts @@ -1,6 +1,6 @@ export interface WordsCollection { size: number; - has(words: string): boolean; + has(words: string, caseSensitive: boolean): boolean; type?: string; } diff --git a/packages/cspell-tools/src/compiler/__snapshots__/compile.test.ts.snap b/packages/cspell-tools/src/compiler/__snapshots__/compile.test.ts.snap index 24a292c9936..189cedb1349 100644 --- a/packages/cspell-tools/src/compiler/__snapshots__/compile.test.ts.snap +++ b/packages/cspell-tools/src/compiler/__snapshots__/compile.test.ts.snap @@ -109,25 +109,18 @@ exports[`compile > compile 'sampleCodeDic.txt' fmt: 'plaintext' gz: false alt: t !Errorerror !codecode !err -+code -+code+ -+error -+error+ -+msg +*msg ++code* ++error* Café -Code -Code+ -Error -Error+ -msg +Code* +Error* ~!codemsg ~!errorerror ~cafe ~café -~code -~code+ -~error -~error+ +~code* +~error* " `; @@ -139,17 +132,12 @@ exports[`compile > compile 'sampleCodeDic.txt' fmt: 'plaintext' gz: false alt: u !Errorerror !codecode !err -+code -+code+ -+error -+error+ -+msg +*msg ++code* ++error* Café -Code -Code+ -Error -Error+ -msg +Code* +Error* " `; @@ -262,25 +250,18 @@ exports[`compile > compile conditional 'sampleCodeDic.txt' fmt: 'plaintext' gz: !Errorerror !codecode !err -+code -+code+ -+error -+error+ -+msg +*msg ++code* ++error* Café -Code -Code+ -Error -Error+ -msg +Code* +Error* ~!codemsg ~!errorerror ~cafe ~café -~code -~code+ -~error -~error+ +~code* +~error* " `; @@ -292,17 +273,12 @@ exports[`compile > compile conditional 'sampleCodeDic.txt' fmt: 'plaintext' gz: !Errorerror !codecode !err -+code -+code+ -+error -+error+ -+msg +*msg ++code* ++error* Café -Code -Code+ -Error -Error+ -msg +Code* +Error* " `; diff --git a/packages/cspell-tools/src/compiler/compile.ts b/packages/cspell-tools/src/compiler/compile.ts index 417d626696d..2cf5a621135 100644 --- a/packages/cspell-tools/src/compiler/compile.ts +++ b/packages/cspell-tools/src/compiler/compile.ts @@ -50,6 +50,7 @@ export async function compile(request: CompileRequest, options?: CompileOptions) const targetOptions: CompileTargetConfig = { sort: request.sort, generateNonStrict: request.generateNonStrict, + removeDuplicates: request.removeDuplicates, }; const conditional = options?.conditionalBuild || false; const checksumFile = resolveChecksumFile(request.checksumFile || conditional, rootDir); @@ -108,6 +109,7 @@ export async function compileTarget( const { format, sources, trieBase, sort = true, generateNonStrict = false, excludeWordsFrom } = target; const targetDirectory = path.resolve(rootDir, target.targetDirectory ?? cwd ?? process.cwd()); const dictionaryDirectives = target.dictionaryDirectives ?? compileOptions.dictionaryDirectives; + const removeDuplicates = target.removeDuplicates ?? false; const excludeFilter = await createExcludeFilter(excludeWordsFrom); @@ -129,6 +131,7 @@ export async function compileTarget( generateNonStrict, filter: excludeFilter, dictionaryDirectives, + // removeDuplicates, // Add this in if we use it. }); const checksumRoot = (checksumFile && path.dirname(checksumFile)) || rootDir; @@ -151,10 +154,16 @@ export async function compileTarget( trie4: format === 'trie4', generateNonStrict: generateNonStrictTrie, dictionaryDirectives: undefined, + // removeDuplicates, // Add this in if we use it. }); } : async (words: Iterable, dst: string) => { - return compileWordList(pipe(words, normalizer), dst, { sort, generateNonStrict, dictionaryDirectives }); + return compileWordList(pipe(words, normalizer), dst, { + sort, + generateNonStrict, + dictionaryDirectives, + removeDuplicates, + }); }; await processFiles(action, filesToProcess, filename); @@ -265,6 +274,7 @@ async function readFileSource(fileSource: FileSource, sourceOptions: CompileSour keepRawCase = sourceOptions.keepRawCase || false, split = sourceOptions.split || false, maxDepth, + storeSplitWordsAsCompounds, } = fileSource; const legacy = split === 'legacy'; @@ -282,6 +292,7 @@ async function readFileSource(fileSource: FileSource, sourceOptions: CompileSour splitWords, keepCase: keepRawCase, allowedSplitWords, + storeSplitWordsAsCompounds, }; logWithTimestamp(`Reading ${path.basename(filename)}`); @@ -317,5 +328,6 @@ function logProgress(freq = 100_000): (iter: Iterable) => Iterable { async function createExcludeFilter(excludeWordsFrom: FilePath[] | undefined): Promise<(word: string) => boolean> { if (!excludeWordsFrom || !excludeWordsFrom.length) return () => true; const excludeWords = await createWordsCollectionFromFiles(excludeWordsFrom); - return (word: string) => !excludeWords.has(word); + + return (word: string) => !excludeWords.has(word, word.toUpperCase() !== word); } diff --git a/packages/cspell-tools/src/compiler/createWordsCollection.test.ts b/packages/cspell-tools/src/compiler/createWordsCollection.test.ts index 0edad578aa9..e44765bc31f 100644 --- a/packages/cspell-tools/src/compiler/createWordsCollection.test.ts +++ b/packages/cspell-tools/src/compiler/createWordsCollection.test.ts @@ -16,7 +16,7 @@ describe('createAllowedSplitWords', () => { const fixFiles: string[] | undefined = Array.isArray(files) ? files : !files ? undefined : [files]; const allowedFiles = fixFiles?.map((file) => resolvePathToFixture(file)); const allowed = await createAllowedSplitWordsFromFiles(allowedFiles); - expect(allowed.size).toBe(expectedSize); - expect(allowed.has(has)).toBe(expected); + expect(allowed.size).toBeGreaterThanOrEqual(expectedSize); + expect(allowed.has(has, true)).toBe(expected); }); }); diff --git a/packages/cspell-tools/src/compiler/createWordsCollection.ts b/packages/cspell-tools/src/compiler/createWordsCollection.ts index a0c7ba39e12..37a2c2caa64 100644 --- a/packages/cspell-tools/src/compiler/createWordsCollection.ts +++ b/packages/cspell-tools/src/compiler/createWordsCollection.ts @@ -1,19 +1,22 @@ +import { parseDictionary } from 'cspell-trie-lib'; + import type { FilePath } from '../config/config.js'; import { createReader } from './Reader.js'; +import { DictionaryReader, Reader } from './readers/ReaderOptions.js'; import type { AllowedSplitWordsCollection, ExcludeWordsCollection, WordsCollection } from './WordsCollection.js'; import { defaultAllowedSplitWords, defaultExcludeWordsCollection } from './WordsCollection.js'; class AllowedSplitWordsImpl implements AllowedSplitWordsCollection { - private words: WordsCollection; + private collection: WordsCollection; readonly size: number; constructor(collection: WordsCollection) { - this.words = collection; + this.collection = collection; this.size = collection.size; } - public has(word: string) { - return !this.size || this.words.has(word); + public has(word: string, caseSensitive: boolean) { + return !this.size || this.collection.has(word, caseSensitive); } } @@ -32,9 +35,33 @@ export function createAllowedSplitWords(words: Iterable | undefined): Al return new AllowedSplitWordsImpl(createWordsCollection(words)); } -async function readFile(filename: string) { - const reader = await createReader(filename, {}); - return [...reader]; +function buildHasFn(dict: { hasWord: (word: string, caseSensitive: boolean) => boolean }) { + function has(word: string, caseSensitive: boolean) { + const r = dict.hasWord(word, true); + if (r || caseSensitive) return r; + const lc = word.toLowerCase(); + if (lc == word) return false; + return dict.hasWord(lc, true); + } + + return has; +} + +async function readFile(filename: string): Promise { + return await createReader(filename, {}); +} + +function readersToCollection(readers: Reader[]): WordsCollection { + const dictReaders = readers.filter(isDictionaryReader).map(dictReaderToCollection); + const nonDictCollection = lineReadersToCollection(readers.filter((a) => !isDictionaryReader(a))); + const collections = [...dictReaders, nonDictCollection]; + + const collection = { + size: collections.reduce((s, a) => s + a.size, 0), + has: (word: string, caseSensitive: boolean) => collections.some((a) => a.has(word, caseSensitive)), + }; + + return collection; } const cache = new WeakMap(); @@ -47,7 +74,7 @@ export async function createWordsCollectionFromFiles(files: FilePath | FilePath[ const sources = await Promise.all(files.map((file) => readFile(file))); - const collection = createWordsCollection(sources.flat()); + const collection = readersToCollection(sources); cache.set(files, collection); return collection; @@ -60,20 +87,22 @@ export function createWordsCollection(words: Iterable): WordsCollection .map((a) => a.trim()) .filter((a) => !!a) .filter((a) => !a.startsWith('#')); - return new Set(arrWords); + const setOfWords = new Set(arrWords); + const has = buildHasFn({ hasWord: (word: string) => setOfWords.has(word) }); + return { size: setOfWords.size, has }; } class ExcludeWordsCollectionImpl implements ExcludeWordsCollection { - private words: WordsCollection; + private collection: WordsCollection; readonly size: number; constructor(collection: WordsCollection) { - this.words = collection; + this.collection = collection; this.size = collection.size; } - public has(word: string) { - return this.words.has(word); + public has(word: string, caseSensitive: boolean) { + return this.collection.has(word, caseSensitive); } } @@ -89,3 +118,23 @@ export async function createExcludeWordsCollectionFromFiles( export function createExcludeWordsCollection(words: Iterable | undefined): ExcludeWordsCollection { return new ExcludeWordsCollectionImpl(words ? createWordsCollection(words) : new Set()); } + +function isDictionaryReader(reader: Reader | DictionaryReader): reader is DictionaryReader { + return 'hasWord' in reader && !!reader.hasWord; +} + +function dictReaderToCollection(reader: DictionaryReader): WordsCollection { + return { size: reader.size, has: buildHasFn(reader) }; +} + +function lineReadersToCollection(readers: Reader[]): WordsCollection { + function* words() { + for (const reader of readers) { + yield* reader.lines; + } + } + + const dict = parseDictionary(words(), { stripCaseAndAccents: false }); + + return { size: dict.size, has: buildHasFn(dict) }; +} diff --git a/packages/cspell-tools/src/compiler/legacyLineToWords.ts b/packages/cspell-tools/src/compiler/legacyLineToWords.ts index 348633a8cdb..94fd646dc9a 100644 --- a/packages/cspell-tools/src/compiler/legacyLineToWords.ts +++ b/packages/cspell-tools/src/compiler/legacyLineToWords.ts @@ -18,7 +18,7 @@ export function legacyLineToWords( const words = pipe( wordGroups, opConcatMap((a) => a.split(regExpSpaceOrDash)), - opConcatMap((a) => splitCamelCaseIfAllowed(a, allowedSplitWords, keepCase)), + opConcatMap((a) => splitCamelCaseIfAllowed(a, allowedSplitWords, keepCase, '')), opMap((a) => a.trim()), opFilter((a) => !!a), opFilter((s) => !regExpRepeatChars.test(s)), diff --git a/packages/cspell-tools/src/compiler/readers/ReaderOptions.ts b/packages/cspell-tools/src/compiler/readers/ReaderOptions.ts index 2584b363493..283b5006e1a 100644 --- a/packages/cspell-tools/src/compiler/readers/ReaderOptions.ts +++ b/packages/cspell-tools/src/compiler/readers/ReaderOptions.ts @@ -11,6 +11,11 @@ export interface BaseReader { size: number; type: 'Hunspell' | 'TextFile' | 'Trie'; lines: Iterable; + readonly hasWord?: (word: string, caseSensitive: boolean) => boolean; } -export interface Reader extends BaseReader, Iterable {} +export interface Reader extends BaseReader {} + +export interface DictionaryReader extends BaseReader { + readonly hasWord: (word: string, caseSensitive: boolean) => boolean; +} diff --git a/packages/cspell-tools/src/compiler/readers/trieFileReader.ts b/packages/cspell-tools/src/compiler/readers/trieFileReader.ts index 58644b42f08..0af15c8b6dd 100644 --- a/packages/cspell-tools/src/compiler/readers/trieFileReader.ts +++ b/packages/cspell-tools/src/compiler/readers/trieFileReader.ts @@ -1,9 +1,9 @@ import { importTrie, Trie } from 'cspell-trie-lib'; -import type { BaseReader } from './ReaderOptions.js'; +import type { DictionaryReader } from './ReaderOptions.js'; import { readTextFileLines } from './readTextFile.js'; -export async function trieFileReader(filename: string): Promise { +export async function trieFileReader(filename: string): Promise { const trieRoot = importTrie(await readTextFileLines(filename)); const trie = new Trie(trieRoot); const words = trie.words(); @@ -13,5 +13,6 @@ export async function trieFileReader(filename: string): Promise { return trie.size(); }, lines: words, + hasWord: (word: string, caseSensitive: boolean) => trie.hasWord(word, caseSensitive), }; } diff --git a/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.test.ts b/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.test.ts index 190b7a701eb..76bb0aed6a0 100644 --- a/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.test.ts +++ b/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.test.ts @@ -20,6 +20,25 @@ describe('splitCamelCaseIfAllowed', () => { ${'ADP_ConnectionStateMsg_Closed'} | ${true} | ${undefined} | ${['ADP', 'connection', 'state', 'msg', 'closed']} `('splitCamelCaseIfAllowed $text $keepCase $allowed', ({ text, keepCase, allowed, expected }) => { allowed = createAllowedSplitWords(allowed); - expect(splitCamelCaseIfAllowed(text, allowed, keepCase)).toEqual(expected); + expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '')).toEqual(expected); + }); + + test.each` + text | keepCase | allowed | expected + ${''} | ${false} | ${undefined} | ${[]} + ${'hello'} | ${false} | ${undefined} | ${['hello']} + ${'helloThere'} | ${false} | ${['hello', 'there']} | ${['hello+', '+there']} + ${'helloThere'} | ${false} | ${['hello', 'There']} | ${['hello+', '+There']} + ${'helloThere'} | ${true} | ${['hello', 'There']} | ${['hello+', '+There']} + ${'ERRORCode'} | ${false} | ${['error', 'code']} | ${['error+', '+code']} + ${'ERRORCode'} | ${true} | ${['error', 'code']} | ${['error+', '+code']} + ${'ERRORCode'} | ${true} | ${['code']} | ${['ERRORCode']} + ${'ERRORCode'} | ${false} | ${['code']} | ${['ERRORCode']} + ${'ErrorCode'} | ${true} | ${['error', 'code']} | ${['error+', '+code']} + ${'xmlUCSIsCatZ'} | ${true} | ${['xml', 'UCS', 'is', 'cat', 'z']} | ${['xml+', '+UCS+', 'is', '+cat+', 'z']} + ${'ADP_ConnectionStateMsg_Closed'} | ${true} | ${undefined} | ${['ADP', 'connection+', '+state+', '+msg', 'closed']} + `('splitCamelCaseIfAllowed $text $keepCase $allowed', ({ text, keepCase, allowed, expected }) => { + allowed = createAllowedSplitWords(allowed); + expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '+')).toEqual(expected); }); }); diff --git a/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.ts b/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.ts index 96a0d1a405c..a51e2fc7b58 100644 --- a/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.ts +++ b/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.ts @@ -1,4 +1,4 @@ -import * as Text from './text.js'; +import { isSingleLetter, splitCamelCaseWord } from './text.js'; import type { AllowedSplitWordsCollection } from './WordsCollection.js'; export const regExpSpaceOrDash = /[- ]+/g; @@ -8,12 +8,25 @@ export function splitCamelCaseIfAllowed( word: string, allowedWords: AllowedSplitWordsCollection, keepCase: boolean, + compoundPrefix: string, ): string[] { const split = [...splitCamelCase(word)]; if (split.length == 1) return adjustCases(split, allowedWords, keepCase); - const missing = split.find((w) => isUnknown(w, allowedWords)); - if (missing !== undefined) return [word]; - return adjustCases(split, allowedWords, keepCase); + const missing = split.some((w) => isUnknown(w, allowedWords)); + if (missing) return [word]; + const wordIndexes = calcWordIndex(word, split); + const adjusted = adjustCases(split, allowedWords, keepCase); + return !compoundPrefix + ? adjusted + : adjusted.map((w, i) => { + const { px, sx } = wordIndexes[i]; + const canCompound = w.length > 2; + const lc = w.toLowerCase(); + const p = canCompound && isSingleLetter(px) ? compoundPrefix : ''; + const s = canCompound && isSingleLetter(sx) ? compoundPrefix : ''; + if (lc.length < 4 || allowedWords.has(w, true)) return p + w + s; + return p + lc + s; + }); } function adjustCases(words: string[], allowedWords: AllowedSplitWordsCollection, keepCase: boolean): string[] { @@ -22,7 +35,7 @@ function adjustCases(words: string[], allowedWords: AllowedSplitWordsCollection, function adjustCase(word: string, allowedWords: AllowedSplitWordsCollection, keepCase: boolean): string { const lc = word.toLowerCase(); - if (!allowedWords.has(lc)) return word; + if (!allowedWords.has(lc, true)) return word; if (lc === word) return word; if (word.slice(1).toLowerCase() === word.slice(1)) return lc; if (!keepCase && word.toUpperCase() === word) return word.toLowerCase(); @@ -30,14 +43,35 @@ function adjustCase(word: string, allowedWords: AllowedSplitWordsCollection, kee } function isUnknown(word: string, allowedWords: AllowedSplitWordsCollection): boolean { - return !allowedWords.has(word) && !allowedWords.has(word.toLowerCase()); + if (word === 'ERROR') { + return !allowedWords.has(word, false); + } + return !allowedWords.has(word, false); } function splitCamelCase(word: string): Iterable { - const splitWords = Text.splitCamelCaseWord(word, false).filter((word) => !regExpIsNumber.test(word)); + const splitWords = splitCamelCaseWord(word).filter((word) => !regExpIsNumber.test(word)); // We only want to preserve this: "New York" and not "Namespace DNSLookup" if (splitWords.length > 1 && regExpSpaceOrDash.test(word)) { return splitWords.flatMap((w) => w.split(regExpSpaceOrDash)); } return splitWords; } + +interface WordIndex { + word: string; + i: number; + px: string; + sx: string; +} + +function calcWordIndex(word: string, words: string[]): WordIndex[] { + let i = 0; + return words.map((w) => { + const j = word.indexOf(w, i); + const k = j + w.length; + const wIndex = { word: w, i: j, px: word[j - 1] || '', sx: word[k] || '' }; + i = k; + return wIndex; + }); +} diff --git a/packages/cspell-tools/src/compiler/streamSourceWordsFromFile.test.ts b/packages/cspell-tools/src/compiler/streamSourceWordsFromFile.test.ts index dbf77d1cb0b..785d2a83623 100644 --- a/packages/cspell-tools/src/compiler/streamSourceWordsFromFile.test.ts +++ b/packages/cspell-tools/src/compiler/streamSourceWordsFromFile.test.ts @@ -17,6 +17,7 @@ describe('Validate the iterateWordsFromFile', () => { const reader = await streamSourceWordsFromFile(path.join(samples, 'hunspell/example.aff'), { splitWords: false, allowedSplitWords, + storeSplitWordsAsCompounds: undefined, }); const results = [...reader]; // this might break if the processing order of hunspell changes. @@ -27,6 +28,7 @@ describe('Validate the iterateWordsFromFile', () => { const reader = await streamSourceWordsFromFile(path.join(samples, 'cities.trie.gz'), { splitWords: false, allowedSplitWords, + storeSplitWordsAsCompounds: undefined, }); const results = [...reader]; expect(results.join('|')).toBe( diff --git a/packages/cspell-tools/src/compiler/text.test.ts b/packages/cspell-tools/src/compiler/text.test.ts index 4d62b850020..085e068a3a3 100644 --- a/packages/cspell-tools/src/compiler/text.test.ts +++ b/packages/cspell-tools/src/compiler/text.test.ts @@ -1,6 +1,6 @@ import { describe, expect, test } from 'vitest'; -import { splitCamelCaseWord } from './text.js'; +import { isSingleLetter, splitCamelCaseWord, splitCamelCaseWordAutoStem } from './text.js'; describe('split', () => { test.each` @@ -10,7 +10,29 @@ describe('split', () => { ${'free2move'} | ${['free', 'move']} ${'2move'} | ${['move']} ${'PrimeNumber5'} | ${['Prime', 'Number']} - `('splitCamelCaseWord', ({ word, expected }) => { + `('splitCamelCaseWord $word', ({ word, expected }) => { expect(splitCamelCaseWord(word)).toEqual(expected); }); + + test.each` + word | expected + ${'camelCases'} | ${['camel', 'Cases']} + ${'ERRORs'} | ${['Errors']} + ${'USER_ERRORs'} | ${['USER', 'Errors']} + ${'USERs_ERRORs'} | ${['Users', 'Errors']} + ${'WORKs_ERRORs'} | ${['Works', 'Errors']} + ${'WORKas'} | ${['WOR', 'Kas']} + `('splitCamelCaseWordAutoStem $word', ({ word, expected }) => { + expect(splitCamelCaseWordAutoStem(word)).toEqual(expected); + }); + + test.each` + letter | expected + ${'a'} | ${true} + ${'é'} | ${true} + ${'é'.normalize('NFD')} | ${true} + ${'1'} | ${false} + `('isSingleLetter $letter', ({ letter, expected }) => { + expect(isSingleLetter(letter)).toBe(expected); + }); }); diff --git a/packages/cspell-tools/src/compiler/text.ts b/packages/cspell-tools/src/compiler/text.ts index e950317e717..75162966ace 100644 --- a/packages/cspell-tools/src/compiler/text.ts +++ b/packages/cspell-tools/src/compiler/text.ts @@ -4,13 +4,30 @@ const regExUpperSOrIng = /(\p{Lu}+'?(?:s|ing|ies|es|ings|ed|ning))(?!\p{Ll})/gu; const regExSplitWords = /([\p{Ll}])([\p{Lu}])/gu; const regExSplitWords2 = /(\p{Lu})(\p{Lu}\p{Ll})/gu; +const regExpIsLetter = /^\p{L}\p{M}{0,2}$/u; + /** * Split camelCase words into an array of strings. */ -export function splitCamelCaseWord(word: string, autoStem = true): string[] { - const wPrime = autoStem ? word.replaceAll(regExUpperSOrIng, (s) => s[0] + s.slice(1).toLowerCase()) : word; - const pass1 = wPrime.replaceAll(regExSplitWords, '$1|$2'); +export function splitCamelCaseWord(word: string): string[] { + const pass1 = word.replaceAll(regExSplitWords, '$1|$2'); const pass2 = pass1.replaceAll(regExSplitWords2, '$1|$2'); const pass3 = pass2.replaceAll(/[\d_]+/g, '|'); return pass3.split('|').filter((a) => !!a); } + +/** + * Split camelCase words into an array of strings, try to fix English words. + */ +export function splitCamelCaseWordAutoStem(word: string): string[] { + return splitCamelCaseWord(word.replaceAll(regExUpperSOrIng, tailToLowerCase)); +} + +function tailToLowerCase(word: string): string { + const letters = [...word]; + return letters[0] + letters.slice(1).join('').toLowerCase(); +} + +export function isSingleLetter(c: string): boolean { + return regExpIsLetter.test(c); +} diff --git a/packages/cspell-tools/src/compiler/wordListCompiler.test.ts b/packages/cspell-tools/src/compiler/wordListCompiler.test.ts index b4e5792c7f4..da181e7d00e 100644 --- a/packages/cspell-tools/src/compiler/wordListCompiler.test.ts +++ b/packages/cspell-tools/src/compiler/wordListCompiler.test.ts @@ -28,7 +28,7 @@ const samples = path.join(testHelper.packageRoot, '../Samples/dicts'); const sampleDictEnUS = path.join(samples, 'hunspell', 'en_US.dic'); const sampleDictEn = path.join(samples, 'en_US.txt'); -const wordListHeader = __testing__.wordListHeader; +const { wordListHeader, removeDuplicates } = __testing__; const consoleSpy = spyOnConsole(); const consoleOutput = consoleSpy.consoleOutput; @@ -38,6 +38,7 @@ const allowedSplitWords = defaultAllowedSplitWords; const readOptions: SourceReaderOptions = { splitWords: false, allowedSplitWords, + storeSplitWordsAsCompounds: undefined, }; describe('Validate the wordListCompiler', () => { @@ -204,6 +205,26 @@ describe('Validate Larger Dictionary', () => { }, 60_000); }); +describe('', () => { + test.each` + words | expected + ${'hello'} | ${['hello']} + ${'hello|HELLO'} | ${['hello']} + ${'hello|*hello*|*HELLO*'} | ${['*hello*']} + ${'HELLO|*hello*|*HELLO*'} | ${['*hello*']} + ${'HELLO|*HELLO*'} | ${['*HELLO*']} + ${'Hello|*Hello*'} | ${['*Hello*']} + ${'hello|+hello+'} | ${['*hello*']} + ${'hello|hello+'} | ${['hello*']} + ${'hello|+hello'} | ${['*hello']} + ${'hello|hello+|+hello'} | ${['*hello*']} + `('removeDuplicate $words', ({ words, expected }) => { + words = typeof words === 'string' ? words.split('|') : words; + const result = [...removeDuplicates(words)]; + expect(result).toEqual(expected); + }); +}); + async function compileTrie(words: Iterable, destFilename: string, options: CompileTrieOptions): Promise { const normalizer = normalizeTargetWords(options); return _compileTrie(normalizer(words), destFilename, options); @@ -229,8 +250,13 @@ function legacyNormalizeWords(lines: Iterable): Iterable { ); } -function compileOpt(sort: boolean, generateNonStrict = true, dictionaryDirectives?: string[]): CompileOptions { - return { sort, generateNonStrict, dictionaryDirectives }; +function compileOpt( + sort: boolean, + generateNonStrict = true, + dictionaryDirectives: string[] | undefined = undefined, + removeDuplicates = false, +): CompileOptions { + return { sort, generateNonStrict, dictionaryDirectives, removeDuplicates }; } // const cities = `\ diff --git a/packages/cspell-tools/src/compiler/wordListCompiler.ts b/packages/cspell-tools/src/compiler/wordListCompiler.ts index b4d6f871e80..f8645a60dd3 100644 --- a/packages/cspell-tools/src/compiler/wordListCompiler.ts +++ b/packages/cspell-tools/src/compiler/wordListCompiler.ts @@ -36,13 +36,110 @@ export async function compileWordList( function normalize(lines: Iterable, options: CompileOptions): Iterable { const filter = normalizeTargetWords(options); - const iter = pipe(lines, filter); + const cleanLines = options.removeDuplicates ? removeDuplicates(lines) : lines; + + const iter = pipe(cleanLines, filter); if (!options.sort) return iter; const result = new Set(iter); return [...result].sort(); } +function stripCompoundAFix(word: string): string { + return word.replaceAll('*', '').replaceAll('+', ''); +} + +function* removeDuplicates(words: Iterable): Iterable { + const wordSet = new Set(words); + const wordForms = new Map(); + for (const word of wordSet) { + const lc = stripCompoundAFix(word.toLowerCase()); + const forms = wordForms.get(lc) ?? []; + forms.push(word); + wordForms.set(lc, forms); + } + + for (const forms of wordForms.values()) { + if (forms.length <= 1) { + yield* forms; + continue; + } + const mForms = removeDuplicateForms(forms); + if (mForms.size <= 1) { + yield* mForms.values(); + continue; + } + // Handle upper / lower mix. + const words = [...mForms.keys()]; + const lc = words[0].toLowerCase(); + const lcForm = mForms.get(lc); + if (!lcForm) { + yield* mForms.values(); + continue; + } + mForms.delete(lc); + yield lcForm; + for (const form of mForms.values()) { + if (form.toLowerCase() === lcForm) continue; + yield form; + } + } +} + +/** + * solo + * optional_prefix* + * optional_suffix* + * required_prefix+ + * required_suffix+ + */ + +enum Flags { + base = 0, + noPfx = 1 << 0, + noSfx = 1 << 1, + pfx = 1 << 2, + sfx = 1 << 3, + noFix = noPfx | noSfx, + midFix = pfx | sfx, +} + +function applyFlags(word: string, flags: number): string { + if (flags === Flags.noFix) return word; + if (flags === (Flags.noFix | Flags.midFix)) return '*' + word + '*'; + const p = flags & Flags.pfx ? (flags & Flags.noPfx ? '*' : '+') : ''; + const s = flags & Flags.sfx ? (flags & Flags.noSfx ? '*' : '+') : ''; + return s + word + p; +} + +function removeDuplicateForms(forms: Iterable): Map { + function flags(word: string, flag: number = 0) { + let f = Flags.base; + const isOptPrefix = word.endsWith('*'); + const isPrefix = !isOptPrefix && word.endsWith('+'); + const isAnyPrefix = isPrefix || isOptPrefix; + const isOptSuffix = word.startsWith('*'); + const isSuffix = !isOptSuffix && word.startsWith('+'); + const isAnySuffix = isSuffix || isOptSuffix; + f |= isAnyPrefix ? Flags.pfx : 0; + f |= !isPrefix ? Flags.noPfx : 0; + f |= isAnySuffix ? Flags.sfx : 0; + f |= !isSuffix ? Flags.noSfx : 0; + return flag | f; + } + + const m = new Map(); + for (const form of forms) { + const k = stripCompoundAFix(form); + m.set(k, flags(form, m.get(k))); + } + return new Map( + [...m.entries()].map(([form, flag]) => { + return [form, applyFlags(form, flag)]; + }), + ); +} + function createWordListTarget(destFilename: string): (seq: Iterable) => Promise { const target = createTarget(destFilename); return (seq: Iterable) => @@ -107,4 +204,5 @@ function createTrieTarget(destFilename: string, options: TrieOptions): (words: I export const __testing__ = { wordListHeader, + removeDuplicates, }; diff --git a/packages/cspell-tools/src/compiler/wordListParser.test.ts b/packages/cspell-tools/src/compiler/wordListParser.test.ts index d9c15f04046..c7540d704a5 100644 --- a/packages/cspell-tools/src/compiler/wordListParser.test.ts +++ b/packages/cspell-tools/src/compiler/wordListParser.test.ts @@ -7,6 +7,8 @@ import type { ParseFileOptions } from './wordListParser.js'; import { normalizeTargetWords, parseFileLines } from './wordListParser.js'; import { defaultAllowedSplitWords } from './WordsCollection.js'; +const alwaysAllowSplit = { size: 10, has: () => true }; + describe('Validate the wordListCompiler', () => { beforeEach(() => { vi.resetAllMocks(); @@ -57,11 +59,23 @@ describe('Validate the wordListCompiler', () => { const r = [...parseFileLines(content, options)]; expect(r).toEqual(expectedResult); }); + + test.each` + content | options | expectedResult + ${'AppleSauce'} | ${pf({ split: true })} | ${s('AppleSauce')} + ${'AppleSauce'} | ${pf({ split: true, allowedSplitWords: alwaysAllowSplit })} | ${s('apple|sauce')} + ${'AppleSauce'} | ${pf({ split: true, allowedSplitWords: alwaysAllowSplit, legacy: true })} | ${s('apple|sauce')} + ${'AppleSauce'} | ${pf({ split: true, allowedSplitWords: alwaysAllowSplit, storeSplitWordsAsCompounds: true })} | ${s('apple+|+sauce')} + `('parseFileLines split $content $options', ({ content, options, expectedResult }) => { + const r = [...parseFileLines(content, options)]; + expect(r).toEqual(expectedResult); + }); }); function pf(...opts: Partial[]): ParseFileOptions { const opt: ParseFileOptions = { allowedSplitWords: defaultAllowedSplitWords, + storeSplitWordsAsCompounds: undefined, }; for (const op of opts) { Object.assign(opt, op); diff --git a/packages/cspell-tools/src/compiler/wordListParser.ts b/packages/cspell-tools/src/compiler/wordListParser.ts index a431cca1623..9900dd99362 100644 --- a/packages/cspell-tools/src/compiler/wordListParser.ts +++ b/packages/cspell-tools/src/compiler/wordListParser.ts @@ -11,6 +11,7 @@ export function normalizeTargetWords(options: CompileOptions): Operator const lineParser = createDictionaryLineParser({ stripCaseAndAccents: options.generateNonStrict, stripCaseAndAccentsOnForbidden: true, + keepOptionalCompoundCharacter: true, }); const operations: Operator[] = [ opFilter((a) => !!a), @@ -77,6 +78,13 @@ export interface ParseFileOptions { legacy?: boolean; allowedSplitWords: AllowedSplitWordsCollection; + + /** + * Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. + * These words are prefixed / suffixed with `*`. + * @default undefined + */ + storeSplitWordsAsCompounds: boolean | undefined; } type ParseFileOptionsRequired = Required; @@ -90,6 +98,7 @@ const _defaultOptions: ParseFileOptionsRequired = { splitKeepBoth: false, // splitSeparator: regExpSplit, allowedSplitWords: { has: () => true, size: 0 }, + storeSplitWordsAsCompounds: undefined, }; export const defaultParseDictionaryOptions: ParseFileOptionsRequired = Object.freeze(_defaultOptions); @@ -106,12 +115,16 @@ export const setOfCSpellDirectiveFlags = ['no-split', 'split', 'keep-case', 'no- */ export function createParseFileLineMapper(options?: Partial): Operator { const _options = options || _defaultOptions; - const { splitKeepBoth = _defaultOptions.splitKeepBoth, allowedSplitWords = _defaultOptions.allowedSplitWords } = - _options; + const { + splitKeepBoth = _defaultOptions.splitKeepBoth, + allowedSplitWords = _defaultOptions.allowedSplitWords, + storeSplitWordsAsCompounds, + } = _options; let { legacy = _defaultOptions.legacy } = _options; let { split = _defaultOptions.split, keepCase = legacy ? false : _defaultOptions.keepCase } = _options; + const compoundFix = storeSplitWordsAsCompounds ? '+' : ''; function isString(line: unknown | string): line is string { return typeof line === 'string'; @@ -193,6 +206,10 @@ export function createParseFileLineMapper(options?: Partial): return lines; } + function splitWordIntoWords(word: string): string[] { + return splitCamelCaseIfAllowed(word, allowedSplitWords, keepCase, compoundFix); + } + function* splitWords(lines: Iterable): Iterable { for (const line of lines) { if (legacy) { @@ -201,9 +218,7 @@ export function createParseFileLineMapper(options?: Partial): } if (split) { const words = splitLine(line); - yield* !allowedSplitWords.size - ? words - : words.flatMap((word) => splitCamelCaseIfAllowed(word, allowedSplitWords, keepCase)); + yield* !allowedSplitWords.size ? words : words.flatMap((word) => splitWordIntoWords(word)); if (!splitKeepBoth) continue; } yield line.replaceAll(/["]/g, ''); diff --git a/packages/cspell-tools/src/config/config.ts b/packages/cspell-tools/src/config/config.ts index af2e339409f..22363381ea6 100644 --- a/packages/cspell-tools/src/config/config.ts +++ b/packages/cspell-tools/src/config/config.ts @@ -85,6 +85,13 @@ export interface CompileTargetOptions { * ``` */ dictionaryDirectives?: string[] | undefined; + + /** + * Remove duplicate words, favor lower case words over mixed case words. + * Combine compound prefixes where possible. + * @default false + */ + removeDuplicates?: boolean | undefined; } export interface Target extends CompileTargetOptions { @@ -167,7 +174,23 @@ export interface CompileSourceOptions { */ keepRawCase?: boolean | undefined; + /** + * Words in the `allowedSplitWords` are considered correct and can be used + * as a basis for splitting compound words. + * + * If entries can be split so that all the words in the entry are allowed, + * then only the individual words are added, otherwise the entire entry is added. + * This is to prevent misspellings in CamelCase words from being introduced into the + * dictionary. + */ allowedSplitWords?: FilePath | FilePath[] | undefined; + + /** + * Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. + * These words are prefixed / suffixed with `*`. + * @default false + */ + storeSplitWordsAsCompounds?: boolean | undefined; } export const configFileSchemaURL = diff --git a/packages/cspell-trie-lib/api/api.d.ts b/packages/cspell-trie-lib/api/api.d.ts index 94e18917617..501b1c0fcaa 100644 --- a/packages/cspell-trie-lib/api/api.d.ts +++ b/packages/cspell-trie-lib/api/api.d.ts @@ -656,6 +656,10 @@ interface ParseDictionaryOptions { * Specify the separator for splitting words. */ splitSeparator: RegExp | string; + /** + * Do not normalize the compound character. + */ + keepOptionalCompoundCharacter: boolean; } /** * Normalizes a dictionary words based upon prefix / suffixes. @@ -673,7 +677,7 @@ declare function createDictionaryLineParserMapper(options?: Partial | string, options?: Partial): Iterable; declare function parseDictionaryLegacy(text: string | string[], options?: Partial): Trie; -declare function parseDictionary(text: string | string[], options?: Partial): ITrie; +declare function parseDictionary(text: string | Iterable, options?: Partial): ITrie; /** * Builds an optimized Trie from a Iterable. It attempts to reduce the size of the trie diff --git a/packages/cspell-trie-lib/src/lib/SimpleDictionaryParser.test.ts b/packages/cspell-trie-lib/src/lib/SimpleDictionaryParser.test.ts index b2f8cdf0e55..8e7e04e1b31 100644 --- a/packages/cspell-trie-lib/src/lib/SimpleDictionaryParser.test.ts +++ b/packages/cspell-trie-lib/src/lib/SimpleDictionaryParser.test.ts @@ -73,6 +73,16 @@ describe('Validate SimpleDictionaryParser', () => { expect(result).toEqual(expected); }); + test.each` + content | options | expected + ${'*hello*'} | ${{}} | ${['hello', 'hello+', '+hello', '+hello+']} + ${'*hello*'} | ${{ keepOptionalCompoundCharacter: true }} | ${['*hello*']} + `('compounds $content', ({ content, options, expected }) => { + const trie = parseDictionaryLegacy(content, options); + const result = [...trie.words()]; + expect(result.sort()).toEqual(expected.sort()); + }); + test('preserve cases', () => { const words = ['!forbid', '+End', '+Middle+', 'Begin', 'Begin+', 'Café', 'End']; const trie = parseDictionaryLegacy(words.join('\n'), { stripCaseAndAccents: false }); diff --git a/packages/cspell-trie-lib/src/lib/SimpleDictionaryParser.ts b/packages/cspell-trie-lib/src/lib/SimpleDictionaryParser.ts index 9e62a71874c..71f0d65a2b7 100644 --- a/packages/cspell-trie-lib/src/lib/SimpleDictionaryParser.ts +++ b/packages/cspell-trie-lib/src/lib/SimpleDictionaryParser.ts @@ -65,6 +65,11 @@ export interface ParseDictionaryOptions { * Specify the separator for splitting words. */ splitSeparator: RegExp | string; + + /** + * Do not normalize the compound character. + */ + keepOptionalCompoundCharacter: boolean; } const RegExpSplit = /[\s,;]/g; @@ -82,6 +87,7 @@ const _defaultOptions: ParseDictionaryOptions = { split: false, splitKeepBoth: false, splitSeparator: RegExpSplit, + keepOptionalCompoundCharacter: false, }; export const defaultParseDictionaryOptions: ParseDictionaryOptions = Object.freeze(_defaultOptions); @@ -109,6 +115,7 @@ export function createDictionaryLineParserMapper(options?: Partial[] = keepOptionalCompoundCharacter + ? [] + : [opConcatMap(mapOptionalPrefix), opConcatMap(mapOptionalSuffix)]; + const processLines = opPipe( opFilter(isString), splitLines, @@ -243,8 +254,7 @@ export function createDictionaryLineParserMapper(options?: Partial, options?: Partia }); } -export function parseDictionary(text: string | string[], options?: Partial): ITrie { +export function parseDictionary(text: string | Iterable, options?: Partial): ITrie { return parseLinesToDictionary(typeof text === 'string' ? text.split('\n') : text, options); }