Skip to content

Commit

Permalink
fix: tools - remove duplicates and support compounding (#6423)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jason3S authored Oct 27, 2024
1 parent 1b4d774 commit ac0730f
Show file tree
Hide file tree
Showing 28 changed files with 484 additions and 115 deletions.
31 changes: 29 additions & 2 deletions packages/cspell-tools/cspell-tools.config.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@
},
"type": "array"
}
]
],
"description": "Words in the `allowedSplitWords` are considered correct and can be used as a basis for splitting compound words.\n\nIf entries can be split so that all the words in the entry are allowed, then only the individual words are added, otherwise the entire entry is added. This is to prevent misspellings in CamelCase words from being introduced into the dictionary."
},
"keepRawCase": {
"default": false,
Expand All @@ -64,6 +65,11 @@
],
"default": false,
"description": "Split lines into words."
},
"storeSplitWordsAsCompounds": {
"default": false,
"description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
"type": "boolean"
}
},
"required": [
Expand All @@ -89,7 +95,8 @@
},
"type": "array"
}
]
],
"description": "Words in the `allowedSplitWords` are considered correct and can be used as a basis for splitting compound words.\n\nIf entries can be split so that all the words in the entry are allowed, then only the individual words are added, otherwise the entire entry is added. This is to prevent misspellings in CamelCase words from being introduced into the dictionary."
},
"filename": {
"$ref": "#/definitions/FilePath"
Expand All @@ -115,6 +122,11 @@
],
"default": false,
"description": "Split lines into words."
},
"storeSplitWordsAsCompounds": {
"default": false,
"description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
"type": "boolean"
}
},
"required": [
Expand Down Expand Up @@ -171,6 +183,11 @@
"description": "Name of target, used as the basis of target file name.",
"type": "string"
},
"removeDuplicates": {
"default": false,
"description": "Remove duplicate words, favor lower case words over mixed case words. Combine compound prefixes where possible.",
"type": "boolean"
},
"sort": {
"default": true,
"description": "Sort the words in the resulting dictionary. Does not apply to `trie` based formats.",
Expand Down Expand Up @@ -249,6 +266,11 @@
"description": "Maximum number of nested Hunspell Rules to apply. This is needed for recursive dictionaries like Hebrew.",
"type": "number"
},
"removeDuplicates": {
"default": false,
"description": "Remove duplicate words, favor lower case words over mixed case words. Combine compound prefixes where possible.",
"type": "boolean"
},
"rootDir": {
"description": "Specify the directory where all relative paths will be resolved against. By default, all relative paths are relative to the location of the config file.",
"type": "string"
Expand All @@ -271,6 +293,11 @@
"default": false,
"description": "Split lines into words."
},
"storeSplitWordsAsCompounds": {
"default": false,
"description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
"type": "boolean"
},
"targets": {
"description": "Optional Target Dictionaries to create.",
"items": {
Expand Down
15 changes: 5 additions & 10 deletions packages/cspell-tools/src/__snapshots__/build.test.ts.snap
Original file line number Diff line number Diff line change
Expand Up @@ -178,17 +178,12 @@ exports[`build action > build multi 0 2`] = `
!Errorerror
!codecode
!err
+code
+code+
+error
+error+
+msg
*msg
+code*
+error*
Café
Code
Code+
Error
Error+
msg
Code*
Error*
"
`;

Expand Down
7 changes: 7 additions & 0 deletions packages/cspell-tools/src/compiler/CompileOptions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,11 @@ export interface CompileOptions {
*
*/
dictionaryDirectives?: string[] | undefined;

/**
* Remove duplicate words, favor lower case words over mixed case words.
* Combine compound prefixes where possible.
* @default false
*/
removeDuplicates?: boolean;
}
6 changes: 3 additions & 3 deletions packages/cspell-tools/src/compiler/Reader.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ const sc = (m: string) => expect.stringContaining(m);
describe('Validate the iterateWordsFromFile', () => {
test('streamWordsFromFile: hunspell', async () => {
const reader = await createReader(path.join(samples, 'hunspell', 'example.aff'), readerOptions);
const results = [...reader];
const results = [...reader.lines];
// this might break if the processing order of hunspell changes.
expect(results).toEqual(s('hello rework reworked tried try work worked', ' '));
});

test('stream words from trie', async () => {
const reader = await createReader(path.join(samples, 'cities.trie.gz'), readerOptions);
const results = [...reader];
const results = [...reader.lines];
expect(results.join('|')).toBe(
'amsterdam|angeles|city|delhi|francisco|london|los|los angeles' +
'|mexico|mexico city|new|new amsterdam|new delhi|new york|paris|san|san francisco|york',
Expand All @@ -39,7 +39,7 @@ describe('Validate the iterateWordsFromFile', () => {
${'hunspell/example.aff'} | ${{}} | ${'hello|rework|reworked|tried|try|work|worked'}
`('stream words from text $file $options', async ({ file, options, expected }) => {
const reader = await createReader(path.resolve(samples, file), options);
const results = [...reader];
const results = [...reader.lines];
expect(results.join('|')).toBe(expected);
});

Expand Down
4 changes: 2 additions & 2 deletions packages/cspell-tools/src/compiler/Reader.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import type { BaseReader, Reader, ReaderOptions } from './readers/ReaderOptions.js';
import type { BaseReader, DictionaryReader, Reader, ReaderOptions } from './readers/ReaderOptions.js';
import { readHunspellFiles } from './readers/readHunspellFiles.js';
import { regHunspellFile } from './readers/regHunspellFile.js';
import { textFileReader } from './readers/textFileReader.js';
Expand All @@ -17,7 +17,7 @@ const readers: ReaderSelector[] = [
{ test: regHunspellFile, method: readHunspellFiles },
];

function findMatchingReader(filename: string, options: ReaderOptions): Promise<BaseReader> {
function findMatchingReader(filename: string, options: ReaderOptions): Promise<BaseReader | DictionaryReader> {
for (const reader of readers) {
if (reader.test.test(filename)) {
return reader.method(filename, options);
Expand Down
1 change: 1 addition & 0 deletions packages/cspell-tools/src/compiler/SourceReader.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ const samples = helper.resolveSample('dicts');
const readerOptions: SourceReaderOptions = {
splitWords: false,
allowedSplitWords: defaultAllowedSplitWords,
storeSplitWordsAsCompounds: undefined,
};

describe('Validate the iterateWordsFromFile', () => {
Expand Down
6 changes: 4 additions & 2 deletions packages/cspell-tools/src/compiler/SourceReader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ export interface SourceReaderOptions {
keepCase?: boolean;

allowedSplitWords: AllowedSplitWordsCollection;

storeSplitWordsAsCompounds: boolean | undefined;
}

export type AnnotatedWord = string;
Expand Down Expand Up @@ -62,8 +64,8 @@ function splitLines(lines: Iterable<string>, options: SourceReaderOptions): Iter
}

async function textFileReader(reader: Reader, options: SourceReaderOptions): Promise<SourceReader> {
const { legacy, splitWords: split, allowedSplitWords } = options;
const words = [...parseFileLines(reader, { legacy, split, allowedSplitWords })];
const { legacy, splitWords: split, allowedSplitWords, storeSplitWordsAsCompounds } = options;
const words = [...parseFileLines(reader.lines, { legacy, split, allowedSplitWords, storeSplitWordsAsCompounds })];

return {
size: words.length,
Expand Down
2 changes: 1 addition & 1 deletion packages/cspell-tools/src/compiler/WordsCollection.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
export interface WordsCollection {
size: number;
has(words: string): boolean;
has(words: string, caseSensitive: boolean): boolean;
type?: string;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,25 +109,18 @@ exports[`compile > compile 'sampleCodeDic.txt' fmt: 'plaintext' gz: false alt: t
!Errorerror
!codecode
!err
+code
+code+
+error
+error+
+msg
*msg
+code*
+error*
Café
Code
Code+
Error
Error+
msg
Code*
Error*
~!codemsg
~!errorerror
~cafe
~café
~code
~code+
~error
~error+
~code*
~error*
"
`;

Expand All @@ -139,17 +132,12 @@ exports[`compile > compile 'sampleCodeDic.txt' fmt: 'plaintext' gz: false alt: u
!Errorerror
!codecode
!err
+code
+code+
+error
+error+
+msg
*msg
+code*
+error*
Café
Code
Code+
Error
Error+
msg
Code*
Error*
"
`;

Expand Down Expand Up @@ -262,25 +250,18 @@ exports[`compile > compile conditional 'sampleCodeDic.txt' fmt: 'plaintext' gz:
!Errorerror
!codecode
!err
+code
+code+
+error
+error+
+msg
*msg
+code*
+error*
Café
Code
Code+
Error
Error+
msg
Code*
Error*
~!codemsg
~!errorerror
~cafe
~café
~code
~code+
~error
~error+
~code*
~error*
"
`;

Expand All @@ -292,17 +273,12 @@ exports[`compile > compile conditional 'sampleCodeDic.txt' fmt: 'plaintext' gz:
!Errorerror
!codecode
!err
+code
+code+
+error
+error+
+msg
*msg
+code*
+error*
Café
Code
Code+
Error
Error+
msg
Code*
Error*
"
`;

Expand Down
16 changes: 14 additions & 2 deletions packages/cspell-tools/src/compiler/compile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ export async function compile(request: CompileRequest, options?: CompileOptions)
const targetOptions: CompileTargetConfig = {
sort: request.sort,
generateNonStrict: request.generateNonStrict,
removeDuplicates: request.removeDuplicates,
};
const conditional = options?.conditionalBuild || false;
const checksumFile = resolveChecksumFile(request.checksumFile || conditional, rootDir);
Expand Down Expand Up @@ -108,6 +109,7 @@ export async function compileTarget(
const { format, sources, trieBase, sort = true, generateNonStrict = false, excludeWordsFrom } = target;
const targetDirectory = path.resolve(rootDir, target.targetDirectory ?? cwd ?? process.cwd());
const dictionaryDirectives = target.dictionaryDirectives ?? compileOptions.dictionaryDirectives;
const removeDuplicates = target.removeDuplicates ?? false;

const excludeFilter = await createExcludeFilter(excludeWordsFrom);

Expand All @@ -129,6 +131,7 @@ export async function compileTarget(
generateNonStrict,
filter: excludeFilter,
dictionaryDirectives,
// removeDuplicates, // Add this in if we use it.
});
const checksumRoot = (checksumFile && path.dirname(checksumFile)) || rootDir;

Expand All @@ -151,10 +154,16 @@ export async function compileTarget(
trie4: format === 'trie4',
generateNonStrict: generateNonStrictTrie,
dictionaryDirectives: undefined,
// removeDuplicates, // Add this in if we use it.
});
}
: async (words: Iterable<string>, dst: string) => {
return compileWordList(pipe(words, normalizer), dst, { sort, generateNonStrict, dictionaryDirectives });
return compileWordList(pipe(words, normalizer), dst, {
sort,
generateNonStrict,
dictionaryDirectives,
removeDuplicates,
});
};

await processFiles(action, filesToProcess, filename);
Expand Down Expand Up @@ -265,6 +274,7 @@ async function readFileSource(fileSource: FileSource, sourceOptions: CompileSour
keepRawCase = sourceOptions.keepRawCase || false,
split = sourceOptions.split || false,
maxDepth,
storeSplitWordsAsCompounds,
} = fileSource;

const legacy = split === 'legacy';
Expand All @@ -282,6 +292,7 @@ async function readFileSource(fileSource: FileSource, sourceOptions: CompileSour
splitWords,
keepCase: keepRawCase,
allowedSplitWords,
storeSplitWordsAsCompounds,
};

logWithTimestamp(`Reading ${path.basename(filename)}`);
Expand Down Expand Up @@ -317,5 +328,6 @@ function logProgress<T>(freq = 100_000): (iter: Iterable<T>) => Iterable<T> {
async function createExcludeFilter(excludeWordsFrom: FilePath[] | undefined): Promise<(word: string) => boolean> {
if (!excludeWordsFrom || !excludeWordsFrom.length) return () => true;
const excludeWords = await createWordsCollectionFromFiles(excludeWordsFrom);
return (word: string) => !excludeWords.has(word);

return (word: string) => !excludeWords.has(word, word.toUpperCase() !== word);
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ describe('createAllowedSplitWords', () => {
const fixFiles: string[] | undefined = Array.isArray(files) ? files : !files ? undefined : [files];
const allowedFiles = fixFiles?.map((file) => resolvePathToFixture(file));
const allowed = await createAllowedSplitWordsFromFiles(allowedFiles);
expect(allowed.size).toBe(expectedSize);
expect(allowed.has(has)).toBe(expected);
expect(allowed.size).toBeGreaterThanOrEqual(expectedSize);
expect(allowed.has(has, true)).toBe(expected);
});
});
Loading

0 comments on commit ac0730f

Please sign in to comment.