⚡️ Faster tokenizer of strings (#5387)
**Description**

Our `string` arbitrary starts its initialization by tokenizing known vulnerable strings into a set of units (chars). The idea behind this tokenization process is to be able to generate these vulnerable strings later on, while generating entries with this arbitrary. The process is the following:

- for each string known to be vulnerable, try to tokenize it with respect to the provided constraints on length and the unit arbitrary;
- for each tokenizable string, add it to the bucket of strings that may be generated.

The original tokenizer was able to abide by the constraints on length: the computed tokens depended on both the provided length constraints and the arbitrary being considered. But this flexibility had a runtime cost we no longer want to pay. The tokenizer now stops trying to optimize for lengths and simply tokenizes for the requested arbitrary (see the sketch after the checklist below).

**Checklist**

- [x] The name of my PR follows the [gitmoji](https://gitmoji.dev/) specification
- [x] My PR references one or several related issues (if any)
- [x] New features or breaking changes must come with an associated Issue or Discussion
- [x] My PR does not add any new dependency without an associated Issue or Discussion
- [x] My PR includes bumps details, please run `yarn bump` and flag the impacts properly
- [x] My PR adds relevant tests and they would have failed without my PR (when applicable)

**Advanced**

- [x] Category: ⚡️ Improve performance
- [x] Impacts: Slight performance uplift, but way more to come thanks to this change
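As a rough illustration of the process described above, here is a hedged sketch (not part of this commit; the unit arbitrary and the candidate strings below are assumptions made only for the example) of what it means for a known string to be tokenizable into units accepted by the arbitrary:

```ts
import fc from 'fast-check';

// Assumed stand-ins for the real unit arbitrary and the known vulnerable strings.
const unit = fc.constantFrom('_', 'a', 'b', 'c', 'p', 'r', 'o', 't');
const candidates = ['__proto__', 'naïve'];

// Keep a candidate only if every one of its characters is a unit the arbitrary accepts.
// (The real helper is smarter: it explores multi-character chunks with a DFS, see the diff below.)
const usable = candidates.filter((s) => [...s].every((ch) => unit.canShrinkWithoutContext(ch)));
console.log(usable); // [ '__proto__' ]
```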
Showing 4 changed files with 91 additions and 71 deletions.
@@ -0,0 +1,5 @@
---
"fast-check": minor
---

⚡️ Faster tokenizer of strings
packages/fast-check/src/arbitrary/_internals/helpers/TokenizeString.ts (57 additions, 0 deletions)
@@ -0,0 +1,57 @@
import type { Arbitrary } from '../../../check/arbitrary/definition/Arbitrary';
import { safePop, safePush, safeSubstring } from '../../../utils/globals';

/**
 * Split a string into valid tokens of patternsArb
 * @internal
 */
export function tokenizeString(patternsArb: Arbitrary<string>, value: string): string[] | undefined {
  // First match wins! Possibly not the best match.
  // Empty strings are not considered as valid chunks.
  if (value.length === 0) {
    return [];
  }

  // DFS analysis
  // Structure of an item within the stack:
  // - endIndexChunks: where we are in the analysis
  // - chunks: chunks computed and extracted up-to endIndexChunks
  // - nextStartIndex: where to start next time (mostly needed as we want to go deep first)
  const stack: StackItem[] = [{ endIndexChunks: 0, nextStartIndex: 1, chunks: [] }];
  while (stack.length > 0) {
    // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
    const last = safePop(stack)!;

    // Going deeper in the tree
    // TODO - Use larger chunks first instead of small ones then large ones
    for (let index = last.nextStartIndex; index <= value.length; ++index) {
      const chunk = safeSubstring(value, last.endIndexChunks, index);
      if (patternsArb.canShrinkWithoutContext(chunk)) {
        const newChunks = [...last.chunks, chunk];
        if (index === value.length) {
          // TODO - Rely on dynamic programming tricks not to retry from already investigated indices
          return newChunks; // we found a full match
        }
        // Pushed in case we need to try for next indices
        // Actually it corresponds to moving to the next index in the for-loop BUT as we want to go deep first,
        // we stop the iteration of the current for-loop via a break and delay the analysis for next index for later
        // with this push.
        safePush(stack, { endIndexChunks: last.endIndexChunks, nextStartIndex: index + 1, chunks: last.chunks });
        // Pushed to go deeper in the tree
        safePush(stack, { endIndexChunks: index, nextStartIndex: index + 1, chunks: newChunks });
        break;
      }
    }
  }
  return undefined;
}

/** @internal */
type StackItem = {
  /** Currently selected chunks */
  chunks: string[];
  /** Index corresponding to the last chunk (end + 1) */
  endIndexChunks: number;
  /** Where to start the next chunk */
  nextStartIndex: number;
};
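For instance, with a hypothetical unit arbitrary accepting the chunks `'a'` and `'bc'` (an assumption made only for this sketch), the helper above behaves as follows:

```ts
import fc from 'fast-check';
// tokenizeString refers to the internal helper defined just above.

const unit = fc.constantFrom('a', 'bc');

tokenizeString(unit, 'abca'); // -> ['a', 'bc', 'a'], the DFS found a full split
tokenizeString(unit, 'abd'); //  -> undefined, no split into accepted chunks exists
tokenizeString(unit, ''); //     -> [], the empty string needs no chunk
```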
packages/fast-check/src/arbitrary/_internals/mappers/PatternsToString.ts (12 additions, 59 deletions)
@@ -1,82 +1,35 @@
 import type { Arbitrary } from '../../../check/arbitrary/definition/Arbitrary';
 import { MaxLengthUpperBound } from '../helpers/MaxLengthFromMinLength';
 import type { StringSharedConstraints } from '../../_shared/StringSharedConstraints';
-import { safeJoin, safePop, safePush, safeSubstring, Error } from '../../../utils/globals';
+import { safeJoin, Error } from '../../../utils/globals';
+import { tokenizeString } from '../helpers/TokenizeString';
 
 /** @internal - tab is supposed to be composed of valid entries extracted from the source arbitrary */
 export function patternsToStringMapper(tab: string[]): string {
   return safeJoin(tab, '');
 }
 
 /** @internal */
+export function patternsToStringUnmapperIsValidLength(tokens: string[], constraints: StringSharedConstraints): boolean {
+  const minLength = constraints.minLength !== undefined ? constraints.minLength : 0;
+  const maxLength = constraints.maxLength !== undefined ? constraints.maxLength : MaxLengthUpperBound;
+  return minLength <= tokens.length && tokens.length <= maxLength;
+}
+
+/** @internal */
 export function patternsToStringUnmapperFor(
   patternsArb: Arbitrary<string>,
   constraints: StringSharedConstraints,
 ): (value: unknown) => string[] {
   return function patternsToStringUnmapper(value: unknown): string[] {
-    // First match wins! Possibly not the best match.
-    // Empty strings are not considered as valid chunks.
-    // Example:
-    // > Size limit (not known here) is [min: 0, max: 2], we want to revert "abc" and both ["a","b","c"] and ["ab", "c"] are possible.
-    // > Unmap to ["a", "b", "c"] while not in [min: 0, max: 2].
-
     if (typeof value !== 'string') {
       throw new Error('Unsupported value');
     }
 
-    const minLength = constraints.minLength !== undefined ? constraints.minLength : 0;
-    const maxLength = constraints.maxLength !== undefined ? constraints.maxLength : MaxLengthUpperBound;
-    if (value.length === 0) {
-      if (minLength > 0) {
-        throw new Error('Unable to unmap received string');
-      }
-      return [];
-    }
-
-    // DFS analysis
-    // Structure of an item within the stack:
-    // - endIndexChunks: where we are in the analysis
-    // - chunks: chunks computed and extracted up-to endIndexChunks
-    // - nextStartIndex: where to start next time (mostly needed as we want to go deep first)
-    const stack: StackItem[] = [{ endIndexChunks: 0, nextStartIndex: 1, chunks: [] }];
-    while (stack.length > 0) {
-      // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
-      const last = safePop(stack)!;
-
-      // Going deeper in the tree
-      // TODO - Use larger chunks first instead of small ones then large ones
-      for (let index = last.nextStartIndex; index <= value.length; ++index) {
-        const chunk = safeSubstring(value, last.endIndexChunks, index);
-        if (patternsArb.canShrinkWithoutContext(chunk)) {
-          const newChunks = [...last.chunks, chunk];
-          if (index === value.length) {
-            if (newChunks.length < minLength || newChunks.length > maxLength) {
-              break; // =continue as we already reach the last index of the for-loop
-            }
-            // TODO - Rely on dynamic programming tricks not to retry from already investigated indices
-            return newChunks; // we found a full match
-          }
-          // Pushed in case we need to try for next indices
-          // Actually it corresponds to moving to the next index in the for-loop BUT as we want to go deep first,
-          // we stop the iteration of the current for-loop via a break and delay the analysis for next index for later
-          // with this push.
-          safePush(stack, { endIndexChunks: last.endIndexChunks, nextStartIndex: index + 1, chunks: last.chunks });
-          // Pushed to go deeper in the tree
-          safePush(stack, { endIndexChunks: index, nextStartIndex: index + 1, chunks: newChunks });
-          break;
-        }
-      }
+    const tokens = tokenizeString(patternsArb, value);
+    if (tokens !== undefined && patternsToStringUnmapperIsValidLength(tokens, constraints)) {
+      return tokens;
     }
     throw new Error('Unable to unmap received string');
   };
 }
-
-/** @internal */
-type StackItem = {
-  /** Currently selected chunks */
-  chunks: string[];
-  /** Index corresponding to the last chunk (end + 1) */
-  endIndexChunks: number;
-  /** Where to start the next chunk */
-  nextStartIndex: number;
-};
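A hedged sketch of how this mapper/unmapper pair typically plugs into a string-producing arbitrary (the actual call site is not part of this diff; `unitArb` and the constraint values are assumptions for the example):

```ts
import fc from 'fast-check';
// patternsToStringMapper and patternsToStringUnmapperFor are the helpers from the file above.

const unitArb = fc.constantFrom('a', 'bc'); // assumed unit arbitrary
const constraints = { minLength: 0, maxLength: 10 }; // assumed length constraints

// Strings are arrays of units joined together; the unmapper reverts a string into units
// via tokenizeString, and only then checks the number of tokens against the constraints.
const stringArb = fc
  .array(unitArb, constraints)
  .map(patternsToStringMapper, patternsToStringUnmapperFor(unitArb, constraints));
```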