fix: tools - remove duplicates and support compounding (#6423)

streetsidesoftware · Oct 27, 2024 · ac0730f
1 parent 1b4d774
commit ac0730f
Show file tree

Hide file tree

Showing 28 changed files with 484 additions and 115 deletions.
diff --git a/packages/cspell-tools/cspell-tools.config.schema.json b/packages/cspell-tools/cspell-tools.config.schema.json
@@ -38,7 +38,8 @@
               },
               "type": "array"
             }
-          ]
+          ],
+          "description": "Words in the `allowedSplitWords` are considered correct and can be used as a basis for splitting compound words.\n\nIf entries can be split so that all the words in the entry are allowed, then only the individual words are added, otherwise the entire entry is added. This is to prevent misspellings in CamelCase words from being introduced into the dictionary."
         },
         "keepRawCase": {
           "default": false,
@@ -64,6 +65,11 @@
           ],
           "default": false,
           "description": "Split lines into words."
+        },
+        "storeSplitWordsAsCompounds": {
+          "default": false,
+          "description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
+          "type": "boolean"
         }
       },
       "required": [
@@ -89,7 +95,8 @@
               },
               "type": "array"
             }
-          ]
+          ],
+          "description": "Words in the `allowedSplitWords` are considered correct and can be used as a basis for splitting compound words.\n\nIf entries can be split so that all the words in the entry are allowed, then only the individual words are added, otherwise the entire entry is added. This is to prevent misspellings in CamelCase words from being introduced into the dictionary."
         },
         "filename": {
           "$ref": "#/definitions/FilePath"
@@ -115,6 +122,11 @@
           ],
           "default": false,
           "description": "Split lines into words."
+        },
+        "storeSplitWordsAsCompounds": {
+          "default": false,
+          "description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
+          "type": "boolean"
         }
       },
       "required": [
@@ -171,6 +183,11 @@
           "description": "Name of target, used as the basis of target file name.",
           "type": "string"
         },
+        "removeDuplicates": {
+          "default": false,
+          "description": "Remove duplicate words, favor lower case words over mixed case words. Combine compound prefixes where possible.",
+          "type": "boolean"
+        },
         "sort": {
           "default": true,
           "description": "Sort the words in the resulting dictionary. Does not apply to `trie` based formats.",
@@ -249,6 +266,11 @@
       "description": "Maximum number of nested Hunspell Rules to apply. This is needed for recursive dictionaries like Hebrew.",
       "type": "number"
     },
+    "removeDuplicates": {
+      "default": false,
+      "description": "Remove duplicate words, favor lower case words over mixed case words. Combine compound prefixes where possible.",
+      "type": "boolean"
+    },
     "rootDir": {
       "description": "Specify the directory where all relative paths will resolved against. By default, all relative paths are relative to the location of the config file.",
       "type": "string"
@@ -271,6 +293,11 @@
       "default": false,
       "description": "Split lines into words."
     },
+    "storeSplitWordsAsCompounds": {
+      "default": false,
+      "description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
+      "type": "boolean"
+    },
     "targets": {
       "description": "Optional Target Dictionaries to create.",
       "items": {

diff --git a/packages/cspell-tools/src/__snapshots__/build.test.ts.snap b/packages/cspell-tools/src/__snapshots__/build.test.ts.snap
@@ -178,17 +178,12 @@ exports[`build action > build multi 0 2`] = `
 !Errorerror
 !codecode
 !err
-+code
-+code+
-+error
-+error+
-+msg
+*msg
++code*
++error*
 Café
-Code
-Code+
-Error
-Error+
-msg
+Code*
+Error*
 "
 `;
 

diff --git a/packages/cspell-tools/src/compiler/CompileOptions.ts b/packages/cspell-tools/src/compiler/CompileOptions.ts
@@ -28,4 +28,11 @@ export interface CompileOptions {
      *
      */
     dictionaryDirectives?: string[] | undefined;
+
+    /**
+     * Remove duplicate words, favor lower case words over mixed case words.
+     * Combine compound prefixes where possible.
+     * @default false
+     */
+    removeDuplicates?: boolean;
 }
diff --git a/packages/cspell-tools/src/compiler/Reader.test.ts b/packages/cspell-tools/src/compiler/Reader.test.ts
@@ -17,14 +17,14 @@ const sc = (m: string) => expect.stringContaining(m);
 describe('Validate the iterateWordsFromFile', () => {
     test('streamWordsFromFile: hunspell', async () => {
         const reader = await createReader(path.join(samples, 'hunspell', 'example.aff'), readerOptions);
-        const results = [...reader];
+        const results = [...reader.lines];
         // this might break if the processing order of hunspell changes.
         expect(results).toEqual(s('hello rework reworked tried try work worked', ' '));
     });
 
     test('stream words from trie', async () => {
         const reader = await createReader(path.join(samples, 'cities.trie.gz'), readerOptions);
-        const results = [...reader];
+        const results = [...reader.lines];
         expect(results.join('|')).toBe(
             'amsterdam|angeles|city|delhi|francisco|london|los|los angeles' +
                 '|mexico|mexico city|new|new amsterdam|new delhi|new york|paris|san|san francisco|york',
@@ -39,7 +39,7 @@ describe('Validate the iterateWordsFromFile', () => {
         ${'hunspell/example.aff'} | ${{}}                    | ${'hello|rework|reworked|tried|try|work|worked'}
     `('stream words from text $file $options', async ({ file, options, expected }) => {
         const reader = await createReader(path.resolve(samples, file), options);
-        const results = [...reader];
+        const results = [...reader.lines];
         expect(results.join('|')).toBe(expected);
     });
 

diff --git a/packages/cspell-tools/src/compiler/Reader.ts b/packages/cspell-tools/src/compiler/Reader.ts
@@ -1,4 +1,4 @@
-import type { BaseReader, Reader, ReaderOptions } from './readers/ReaderOptions.js';
+import type { BaseReader, DictionaryReader, Reader, ReaderOptions } from './readers/ReaderOptions.js';
 import { readHunspellFiles } from './readers/readHunspellFiles.js';
 import { regHunspellFile } from './readers/regHunspellFile.js';
 import { textFileReader } from './readers/textFileReader.js';
@@ -17,7 +17,7 @@ const readers: ReaderSelector[] = [
     { test: regHunspellFile, method: readHunspellFiles },
 ];
 
-function findMatchingReader(filename: string, options: ReaderOptions): Promise<BaseReader> {
+function findMatchingReader(filename: string, options: ReaderOptions): Promise<BaseReader | DictionaryReader> {
     for (const reader of readers) {
         if (reader.test.test(filename)) {
             return reader.method(filename, options);

diff --git a/packages/cspell-tools/src/compiler/SourceReader.test.ts b/packages/cspell-tools/src/compiler/SourceReader.test.ts
@@ -14,6 +14,7 @@ const samples = helper.resolveSample('dicts');
 const readerOptions: SourceReaderOptions = {
     splitWords: false,
     allowedSplitWords: defaultAllowedSplitWords,
+    storeSplitWordsAsCompounds: undefined,
 };
 
 describe('Validate the iterateWordsFromFile', () => {

diff --git a/packages/cspell-tools/src/compiler/SourceReader.ts b/packages/cspell-tools/src/compiler/SourceReader.ts
@@ -22,6 +22,8 @@ export interface SourceReaderOptions {
     keepCase?: boolean;
 
     allowedSplitWords: AllowedSplitWordsCollection;
+
+    storeSplitWordsAsCompounds: boolean | undefined;
 }
 
 export type AnnotatedWord = string;
@@ -62,8 +64,8 @@ function splitLines(lines: Iterable<string>, options: SourceReaderOptions): Iter
 }
 
 async function textFileReader(reader: Reader, options: SourceReaderOptions): Promise<SourceReader> {
-    const { legacy, splitWords: split, allowedSplitWords } = options;
-    const words = [...parseFileLines(reader, { legacy, split, allowedSplitWords })];
+    const { legacy, splitWords: split, allowedSplitWords, storeSplitWordsAsCompounds } = options;
+    const words = [...parseFileLines(reader.lines, { legacy, split, allowedSplitWords, storeSplitWordsAsCompounds })];
 
     return {
         size: words.length,

diff --git a/packages/cspell-tools/src/compiler/WordsCollection.ts b/packages/cspell-tools/src/compiler/WordsCollection.ts
@@ -1,6 +1,6 @@
 export interface WordsCollection {
     size: number;
-    has(words: string): boolean;
+    has(words: string, caseSensitive: boolean): boolean;
     type?: string;
 }
 

diff --git a/packages/cspell-tools/src/compiler/__snapshots__/compile.test.ts.snap b/packages/cspell-tools/src/compiler/__snapshots__/compile.test.ts.snap
@@ -109,25 +109,18 @@ exports[`compile > compile 'sampleCodeDic.txt' fmt: 'plaintext' gz: false alt: t
 !Errorerror
 !codecode
 !err
-+code
-+code+
-+error
-+error+
-+msg
+*msg
++code*
++error*
 Café
-Code
-Code+
-Error
-Error+
-msg
+Code*
+Error*
 ~!codemsg
 ~!errorerror
 ~cafe
 ~café
-~code
-~code+
-~error
-~error+
+~code*
+~error*
 "
 `;
 
@@ -139,17 +132,12 @@ exports[`compile > compile 'sampleCodeDic.txt' fmt: 'plaintext' gz: false alt: u
 !Errorerror
 !codecode
 !err
-+code
-+code+
-+error
-+error+
-+msg
+*msg
++code*
++error*
 Café
-Code
-Code+
-Error
-Error+
-msg
+Code*
+Error*
 "
 `;
 
@@ -262,25 +250,18 @@ exports[`compile > compile conditional 'sampleCodeDic.txt' fmt: 'plaintext' gz:
 !Errorerror
 !codecode
 !err
-+code
-+code+
-+error
-+error+
-+msg
+*msg
++code*
++error*
 Café
-Code
-Code+
-Error
-Error+
-msg
+Code*
+Error*
 ~!codemsg
 ~!errorerror
 ~cafe
 ~café
-~code
-~code+
-~error
-~error+
+~code*
+~error*
 "
 `;
 
@@ -292,17 +273,12 @@ exports[`compile > compile conditional 'sampleCodeDic.txt' fmt: 'plaintext' gz:
 !Errorerror
 !codecode
 !err
-+code
-+code+
-+error
-+error+
-+msg
+*msg
++code*
++error*
 Café
-Code
-Code+
-Error
-Error+
-msg
+Code*
+Error*
 "
 `;
 

diff --git a/packages/cspell-tools/src/compiler/compile.ts b/packages/cspell-tools/src/compiler/compile.ts
@@ -50,6 +50,7 @@ export async function compile(request: CompileRequest, options?: CompileOptions)
     const targetOptions: CompileTargetConfig = {
         sort: request.sort,
         generateNonStrict: request.generateNonStrict,
+        removeDuplicates: request.removeDuplicates,
     };
     const conditional = options?.conditionalBuild || false;
     const checksumFile = resolveChecksumFile(request.checksumFile || conditional, rootDir);
@@ -108,6 +109,7 @@ export async function compileTarget(
     const { format, sources, trieBase, sort = true, generateNonStrict = false, excludeWordsFrom } = target;
     const targetDirectory = path.resolve(rootDir, target.targetDirectory ?? cwd ?? process.cwd());
     const dictionaryDirectives = target.dictionaryDirectives ?? compileOptions.dictionaryDirectives;
+    const removeDuplicates = target.removeDuplicates ?? false;
 
     const excludeFilter = await createExcludeFilter(excludeWordsFrom);
 
@@ -129,6 +131,7 @@ export async function compileTarget(
         generateNonStrict,
         filter: excludeFilter,
         dictionaryDirectives,
+        // removeDuplicates, // Add this in if we use it.
     });
     const checksumRoot = (checksumFile && path.dirname(checksumFile)) || rootDir;
 
@@ -151,10 +154,16 @@ export async function compileTarget(
                   trie4: format === 'trie4',
                   generateNonStrict: generateNonStrictTrie,
                   dictionaryDirectives: undefined,
+                  //   removeDuplicates, // Add this in if we use it.
               });
           }
         : async (words: Iterable<string>, dst: string) => {
-              return compileWordList(pipe(words, normalizer), dst, { sort, generateNonStrict, dictionaryDirectives });
+              return compileWordList(pipe(words, normalizer), dst, {
+                  sort,
+                  generateNonStrict,
+                  dictionaryDirectives,
+                  removeDuplicates,
+              });
           };
 
     await processFiles(action, filesToProcess, filename);
@@ -265,6 +274,7 @@ async function readFileSource(fileSource: FileSource, sourceOptions: CompileSour
         keepRawCase = sourceOptions.keepRawCase || false,
         split = sourceOptions.split || false,
         maxDepth,
+        storeSplitWordsAsCompounds,
     } = fileSource;
 
     const legacy = split === 'legacy';
@@ -282,6 +292,7 @@ async function readFileSource(fileSource: FileSource, sourceOptions: CompileSour
         splitWords,
         keepCase: keepRawCase,
         allowedSplitWords,
+        storeSplitWordsAsCompounds,
     };
 
     logWithTimestamp(`Reading ${path.basename(filename)}`);
@@ -317,5 +328,6 @@ function logProgress<T>(freq = 100_000): (iter: Iterable<T>) => Iterable<T> {
 async function createExcludeFilter(excludeWordsFrom: FilePath[] | undefined): Promise<(word: string) => boolean> {
     if (!excludeWordsFrom || !excludeWordsFrom.length) return () => true;
     const excludeWords = await createWordsCollectionFromFiles(excludeWordsFrom);
-    return (word: string) => !excludeWords.has(word);
+
+    return (word: string) => !excludeWords.has(word, word.toUpperCase() !== word);
 }
diff --git a/packages/cspell-tools/src/compiler/createWordsCollection.test.ts b/packages/cspell-tools/src/compiler/createWordsCollection.test.ts
@@ -16,7 +16,7 @@ describe('createAllowedSplitWords', () => {
         const fixFiles: string[] | undefined = Array.isArray(files) ? files : !files ? undefined : [files];
         const allowedFiles = fixFiles?.map((file) => resolvePathToFixture(file));
         const allowed = await createAllowedSplitWordsFromFiles(allowedFiles);
-        expect(allowed.size).toBe(expectedSize);
-        expect(allowed.has(has)).toBe(expected);
+        expect(allowed.size).toBeGreaterThanOrEqual(expectedSize);
+        expect(allowed.has(has, true)).toBe(expected);
     });
 });
diff --git a/packages/cspell-tools/src/__snapshots__/build.test.ts.snap b/packages/cspell-tools/src/__snapshots__/build.test.ts.snap
@@ -178,17 +178,12 @@ exports[`build action > build multi 0 2`] = `
 !Errorerror
 !codecode
 !err
-+code
-+code+
-+error
-+error+
-+msg
+*msg
++code*
++error*
 Café
-Code
-Code+
-Error
-Error+
-msg
+Code*
+Error*
 "
 `;