⚡️ Faster tokenizer of strings (#5387)
**Description**

Our `string` arbitrary starts its initialization by tokenizing known
vulnerable strings into sets of units (chars). The idea behind this
tokenization process is to be able to produce these vulnerable strings
later on, when generating entries with this arbitrary.

The process is the following:
- for each string known to be vulnerable, try to tokenize it with
respect to the provided constraints on length and the unit arbitrary
- for each string that could be tokenized, add it to the bucket of
strings that may be generated

The original tokenizer was able to abide by the constraints on length:
the computed tokens depended on both the provided length constraints
and the arbitrary being considered. But this flexibility had a runtime
cost we no longer want to pay. The tokenizer now stops trying to
optimize for length and just tokenizes for the requested arbitrary.
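
To make the new flow concrete, here is a rough sketch (simplified; `unmapSketch` is a made-up name, but the two helpers are the internal ones touched by this PR) of how the unmapper now combines them: the tokenizer only knows about the unit arbitrary, and the length constraints are checked afterwards on the chunks it returns.

```ts
import type { Arbitrary } from 'fast-check';

// Internal helpers from this PR, declared here only to keep the sketch self-contained:
declare function tokenizeString(patternsArb: Arbitrary<string>, value: string): string[] | undefined;
declare function patternsToStringUnmapperIsValidLength(
  tokens: string[],
  constraints: { minLength?: number; maxLength?: number },
): boolean;

function unmapSketch(
  patternsArb: Arbitrary<string>,
  constraints: { minLength?: number; maxLength?: number },
  value: string,
): string[] {
  const tokens = tokenizeString(patternsArb, value); // no knowledge of length constraints anymore
  if (tokens !== undefined && patternsToStringUnmapperIsValidLength(tokens, constraints)) {
    return tokens;
  }
  throw new Error('Unable to unmap received string');
}
```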

**Checklist** — _Don't delete this checklist and make sure you do the
following before opening the PR_

- [x] The name of my PR follows the [gitmoji](https://gitmoji.dev/)
specification
- [x] My PR references one or several related issues (if any)
- [x] New features or breaking changes must come with an associated
Issue or Discussion
- [x] My PR does not add any new dependency without an associated Issue
or Discussion
- [x] My PR includes bump details; please run `yarn bump` and flag the
impacts properly
- [x] My PR adds relevant tests and they would have failed without my PR
(when applicable)

**Advanced**

- [x] Category: ⚡️ Improve performance
- [x] Impacts: Slight performance uplift, but way more to come thanks to
this change

dubzzz authored Oct 31, 2024
1 parent 640157f commit d336e2e
Showing 4 changed files with 91 additions and 71 deletions.
5 changes: 5 additions & 0 deletions .changeset/polite-pumpkins-turn.md
@@ -0,0 +1,5 @@
---
"fast-check": minor
---

⚡️ Faster tokenizer of strings
@@ -0,0 +1,57 @@
import type { Arbitrary } from '../../../check/arbitrary/definition/Arbitrary';
import { safePop, safePush, safeSubstring } from '../../../utils/globals';

/**
* Split a string into valid tokens of patternsArb
* @internal
*/
export function tokenizeString(patternsArb: Arbitrary<string>, value: string): string[] | undefined {
  // First match wins! Possibly not the best match.
  // Empty strings are not considered as valid chunks.
  if (value.length === 0) {
    return [];
  }

  // DFS analysis
  // Structure of an item within the stack:
  // - endIndexChunks: where we are in the analysis
  // - chunks: chunks computed and extracted up-to endIndexChunks
  // - nextStartIndex: where to start next time (mostly needed as we want to go deep first)
  const stack: StackItem[] = [{ endIndexChunks: 0, nextStartIndex: 1, chunks: [] }];
  while (stack.length > 0) {
    // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
    const last = safePop(stack)!;

    // Going deeper in the tree
    // TODO - Use larger chunks first instead of small ones then large ones
    for (let index = last.nextStartIndex; index <= value.length; ++index) {
      const chunk = safeSubstring(value, last.endIndexChunks, index);
      if (patternsArb.canShrinkWithoutContext(chunk)) {
        const newChunks = [...last.chunks, chunk];
        if (index === value.length) {
          // TODO - Rely on dynamic programming tricks not to retry from already investigated indices
          return newChunks; // we found a full match
        }
        // Pushed in case we need to try for next indices
        // Actually it corresponds to moving to the next index in the for-loop BUT as we want to go deep first,
        // we stop the iteration of the current for-loop via a break and delay the analysis for next index for later
        // with this push.
        safePush(stack, { endIndexChunks: last.endIndexChunks, nextStartIndex: index + 1, chunks: last.chunks });
        // Pushed to go deeper in the tree
        safePush(stack, { endIndexChunks: index, nextStartIndex: index + 1, chunks: newChunks });
        break;
      }
    }
  }
  return undefined;
}

/** @internal */
type StackItem = {
  /** Currently selected chunks */
  chunks: string[];
  /** Index corresponding to the last chunk (end + 1) */
  endIndexChunks: number;
  /** Where to start the next chunk */
  nextStartIndex: number;
};
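
A minimal usage sketch for this helper (illustration only, not part of the commit; it assumes a stub arbitrary whose `canShrinkWithoutContext` accepts chunks from a fixed set, the way the unit tests below build theirs):

const chunkSet = new Set(['a', 'b', 'c', 'abc']);
const stubArb = { canShrinkWithoutContext: (v: string) => chunkSet.has(v) } as unknown as Arbitrary<string>;
tokenizeString(stubArb, 'abc'); // -> ['a', 'b', 'c'] (first match wins, starting from the shortest chunks)
tokenizeString(stubArb, 'abd'); // -> undefined ('d' cannot be produced by the arbitrary)
tokenizeString(stubArb, ''); // -> []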
@@ -1,82 +1,35 @@
import type { Arbitrary } from '../../../check/arbitrary/definition/Arbitrary';
import { MaxLengthUpperBound } from '../helpers/MaxLengthFromMinLength';
import type { StringSharedConstraints } from '../../_shared/StringSharedConstraints';
import { safeJoin, safePop, safePush, safeSubstring, Error } from '../../../utils/globals';
import { safeJoin, Error } from '../../../utils/globals';
import { tokenizeString } from '../helpers/TokenizeString';

/** @internal - tab is supposed to be composed of valid entries extracted from the source arbitrary */
export function patternsToStringMapper(tab: string[]): string {
return safeJoin(tab, '');
}

/** @internal */
export function patternsToStringUnmapperIsValidLength(tokens: string[], constraints: StringSharedConstraints): boolean {
const minLength = constraints.minLength !== undefined ? constraints.minLength : 0;
const maxLength = constraints.maxLength !== undefined ? constraints.maxLength : MaxLengthUpperBound;
return minLength <= tokens.length && tokens.length <= maxLength;
}

/** @internal */
export function patternsToStringUnmapperFor(
patternsArb: Arbitrary<string>,
constraints: StringSharedConstraints,
): (value: unknown) => string[] {
return function patternsToStringUnmapper(value: unknown): string[] {
// First match wins! Possibly not the best match.
// Empty strings are not considered as valid chunks.
// Example:
// > Size limit (not known here) is [min: 0, max: 2], we want to revert "abc" and both ["a","b","c"] and ["ab", "c"] are possible.
// > Unmap to ["a", "b", "c"] while not in [min: 0, max: 2].

if (typeof value !== 'string') {
throw new Error('Unsupported value');
}

const minLength = constraints.minLength !== undefined ? constraints.minLength : 0;
const maxLength = constraints.maxLength !== undefined ? constraints.maxLength : MaxLengthUpperBound;
if (value.length === 0) {
if (minLength > 0) {
throw new Error('Unable to unmap received string');
}
return [];
}

// DFS analysis
// Structure of an item within the stack:
// - endIndexChunks: where we are in the analysis
// - chunks: chunks computed and extracted up-to endIndexChunks
// - nextStartIndex: where to start next time (mostly needed as we want to go deep first)
const stack: StackItem[] = [{ endIndexChunks: 0, nextStartIndex: 1, chunks: [] }];
while (stack.length > 0) {
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
const last = safePop(stack)!;

// Going deeper in the tree
// TODO - Use larger chunks first instead of small ones then large ones
for (let index = last.nextStartIndex; index <= value.length; ++index) {
const chunk = safeSubstring(value, last.endIndexChunks, index);
if (patternsArb.canShrinkWithoutContext(chunk)) {
const newChunks = [...last.chunks, chunk];
if (index === value.length) {
if (newChunks.length < minLength || newChunks.length > maxLength) {
break; // =continue as we already reach the last index of the for-loop
}
// TODO - Rely on dynamic programming tricks not to retry from already investigated indices
return newChunks; // we found a full match
}
// Pushed in case we need to try for next indices
// Actually it corresponds to moving to the next index in the for-loop BUT as we want to go deep first,
// we stop the iteration of the current for-loop via a break and delay the analysis for next index for later
// with this push.
safePush(stack, { endIndexChunks: last.endIndexChunks, nextStartIndex: index + 1, chunks: last.chunks });
// Pushed to go deeper in the tree
safePush(stack, { endIndexChunks: index, nextStartIndex: index + 1, chunks: newChunks });
break;
}
}
const tokens = tokenizeString(patternsArb, value);
if (tokens !== undefined && patternsToStringUnmapperIsValidLength(tokens, constraints)) {
return tokens;
}
throw new Error('Unable to unmap received string');
};
}

/** @internal */
type StackItem = {
/** Currently selected chunks */
chunks: string[];
/** Index corresponding to the last chunk (end + 1) */
endIndexChunks: number;
/** Where to start the next chunk */
nextStartIndex: number;
};
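
Behaviour-wise, delegating to `tokenizeString` means the unmapper now only validates the tokenizer's first match against the length constraints instead of searching for a split that satisfies them. A sketch of the consequence, based on the updated test cases below (hypothetical `stubArb` accepting exactly 'a' and 'aaaaa'):

const unmapper = patternsToStringUnmapperFor(stubArb, { maxLength: 4 });
// Before this PR: the DFS kept exploring until a split matching the constraints was found -> ['aaaaa'] (1 chunk)
// After this PR: tokenizeString returns its first match ['a', 'a', 'a', 'a', 'a'] (5 chunks > maxLength),
// so the length check fails and the call throws 'Unable to unmap received string'.
unmapper('aaaaa');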
@@ -14,17 +14,13 @@ describe('patternsToStringUnmapperFor', () => {
${['a']} | ${'aaa'} | ${{}} | ${['a', 'a', 'a']}
${['a', 'b', 'c']} | ${'abc'} | ${{}} | ${['a', 'b', 'c']}
${['a', 'b', 'c', 'abc']} | ${'abc'} | ${{}} | ${['a', 'b', 'c'] /* starts by a: the shortest fit */}
${['ab', 'aaa', 'aba', 'a']} | ${'abaaa'} | ${{ minLength: 2, maxLength: 3 }} | ${['ab', 'aaa'] /* starts by ab: the shortest fit */}
${['ab', 'aaa', 'aba', 'a']} | ${'abaaa'} | ${{ minLength: 3 }} | ${['ab', 'a', 'a', 'a']}
${['a', 'aaaaa']} | ${'aaaaa'} | ${{ maxLength: 1 }} | ${['aaaaa']}
${['a', 'aaaaa']} | ${'aaaaa'} | ${{ maxLength: 4 }} | ${['aaaaa']}
${['a', 'aaaaa']} | ${'aaaaa'} | ${{ maxLength: 5 }} | ${['a', 'a', 'a', 'a', 'a'] /* starts by a: the shortest fit */}
${['a', 'aa']} | ${'aaaaaaaaaaa'} | ${{ minLength: 0, maxLength: 10 }} | ${['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'aa']}
${['a', 'ab']} | ${'aaaaaaaaaab'} | ${{ minLength: 0, maxLength: 10 }} | ${['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'ab']}
${['a', 'aa']} | ${'aaaaaaaaaaa'} | ${{ minLength: 0 }} | ${['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a'] /* ignore maxGeneratedLength = maxLengthFromMinLength(minLength) = 2*minLength + 10 */}
${['a', 'aa']} | ${'aaaaaaaaaaaa'} | ${{ minLength: 0, maxLength: 10 }} | ${['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'aa', 'aa']}
${['a', 'ab']} | ${'aaaaaaaaabab'} | ${{ minLength: 0, maxLength: 10 }} | ${['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'ab', 'ab']}
${['a', 'aa']} | ${'aaaaaaaaaaaa'} | ${{ minLength: 0 }} | ${['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a'] /* ignore maxGeneratedLength = maxLengthFromMinLength(minLength) = 2*minLength + 10 */}
${MorseCode} | ${'...___...'} | ${{}} | ${['.', '.', '.', '_', '_', '_', '.', '.', '.']}
${MorseCode} | ${'...___...'} | ${{ maxLength: 3 }} | ${['..', '.__', '_...']}
${['\uD83D', '\uDC34', 'a', 'b']} | ${'a\u{1f434}b'} | ${{}} | ${['a', '\uD83D', '\uDC34', 'b']}
`(
'should properly split $source into chunks ($constraints)',
@@ -44,10 +44,16 @@
);

it.each`
sourceChunks | source | constraints
${['a', 'b', 'c']} | ${'abcd'} | ${{}}
${['ab', 'aaa']} | ${'abaaa'} | ${{ minLength: 3 }}
${['a']} | ${'aaaaa'} | ${{ maxLength: 4 }}
sourceChunks | source | constraints
${['a', 'b', 'c']} | ${'abcd'} | ${{}}
${['ab', 'aaa']} | ${'abaaa'} | ${{ minLength: 3 }}
${['a']} | ${'aaaaa'} | ${{ maxLength: 4 }}
${['a', 'aa']} | ${'aaaaaaaaaaa'} | ${{ minLength: 0, maxLength: 10 /* Cannot reach ['a' x9, 'aa'] as the best match would be ['a' x11] so we discard the other */ }}
${['a', 'aa']} | ${'aaaaaaaaaaaa'} | ${{ minLength: 0, maxLength: 10 /* Cannot reach ['a' x8, 'aa' x2] as the best match would be ['a' x12] so we discard the other */ }}
${MorseCode} | ${'...___...'} | ${{ maxLength: 3 /* Cannot reach ['..', '.__', '_...'] as we have a better match (with shorter strings) discarding it */ }}
${['ab', 'aaa', 'aba', 'a']} | ${'abaaa'} | ${{ minLength: 2, maxLength: 3 /* Cannot reach ['ab', 'aaa'] as we have a better match (with shorter strings) discarding it */ }}
${['a', 'aaaaa']} | ${'aaaaa'} | ${{ maxLength: 1 /* Cannot reach ['aaaaa'] as we have a better match (with shorter strings) discarding it */ }}
${['a', 'aaaaa']} | ${'aaaaa'} | ${{ maxLength: 4 /* Cannot reach ['aaaaa'] as we have a better match (with shorter strings) discarding it */ }}
`('should throw when string cannot be split into chunks ($constraints)', ({ sourceChunks, source, constraints }) => {
// Arrange
const sourceChunksSet = new Set(sourceChunks);
@@ -74,7 +76,7 @@
const source = sourceMods.map((mod) => sourceChunks[mod % sourceChunks.length]).join('');

// Act
const unmapper = patternsToStringUnmapperFor(instance, {});
const unmapper = patternsToStringUnmapperFor(instance, {}); // no constraints on the length, as such we can accept chunks overlapping them
const chunks = unmapper(source);

// Assert
@@ -91,7 +93,10 @@
it('should be able to split strings built out of chunks into chunks while respecting constraints in size', () =>
fc.assert(
fc.property(
fc.array(fc.fullUnicodeString({ minLength: 1 }), { minLength: 1 }),
fc.uniqueArray(fc.fullUnicodeString({ minLength: 1 }), {
minLength: 1,
comparator: (sa, sb) => sa.includes(sb) || sb.includes(sa), // chunks independent from each others to avoid being discarded because of better match
}),
fc.array(fc.nat()),
fc.nat(),
fc.nat(),
