Support top-level flag W

slevithan · Nov 19, 2024 · ee1b868 · ee1b868
1 parent 890d8e1
commit ee1b868
Show file tree

Hide file tree

Showing 7 changed files with 77 additions and 31 deletions.
diff --git a/README.md b/README.md
@@ -265,7 +265,7 @@ Notice that nearly every feature below has at least subtle differences from Java
   </tr>
 
   <tr valign="top">
-    <th align="left" rowspan="3">Flags</th>
+    <th align="left" rowspan="4">Flags</th>
     <td><code>i</code></td>
     <td><code>i</code></td>
     <td align="middle">✅</td>
@@ -297,6 +297,17 @@ Notice that nearly every feature below has at least subtle differences from Java
       ✔ Whitespace and <code>#</code> not ignored in char classes<br>
     </td>
   </tr>
+  <tr valign="top">
+    <td><code>W</code></td>
+    <td><code>W</code></td>
+    <td align="middle">✅</td>
+    <td align="middle">✅</td>
+    <td>
+      ● Currently supported only as a top-level flag<br>
+      ✔ POSIX <code>[[:word:]]</code> and <code>\p{Word}</code> are ASCII based<br>
+      ✔ <code>\b</code> is ASCII based<br>
+    </td>
+  </tr>
 
   <tr valign="top">
     <th align="left" rowspan="2" valign="top">Flag modifiers</th>

diff --git a/demo/demo.js b/demo/demo.js
@@ -10,6 +10,7 @@ const state = {
     i: getValue('flag-i'),
     m: getValue('flag-m'),
     x: getValue('flag-x'),
+    W: getValue('flag-W'),
   },
   opts: {
     accuracy: getValue('option-accuracy'),
@@ -66,7 +67,15 @@ function showTranspiled() {
   ui.subclassInfo.classList.add('hidden');
   const options = {
     ...state.opts,
-    flags: `${state.flags.i ? 'i' : ''}${state.flags.m ? 'm' : ''}${state.flags.x ? 'x' : ''}`,
+    flags: `${
+      state.flags.i ? 'i' : ''
+    }${
+      state.flags.m ? 'm' : ''
+    }${
+      state.flags.x ? 'x' : ''
+    }${
+      state.flags.W ? 'W' : ''
+    }`,
     maxRecursionDepth: state.opts.maxRecursionDepth === '' ? null : +state.opts.maxRecursionDepth,
     target: state.opts.target === 'auto' ? autoTarget : state.opts.target,
   };

diff --git a/demo/index.html b/demo/index.html
@@ -35,6 +35,11 @@ <h2>Try it</h2>
         <kbd>x</kbd>
         <span class="tip tip-lg">Insignificant whitespace and comments</span>
       </label>
+      <label>
+        <input type="checkbox" id="flag-W" onchange="setFlag('W', this.checked)">
+        <kbd>W</kbd>
+        <span class="tip tip-sm">Word is ASCII</span>
+      </label>
     </p>
     <p>
       <label>

diff --git a/src/generate.js b/src/generate.js
@@ -216,8 +216,12 @@ function genAssertion(node, _, gen) {
   if (kind === AstAssertionKinds.string_start) {
     return '^';
   }
-  // Kinds `line_end`, `line_start`, `search_start`, `string_end_newline`, and `word_boundary` are
-  // never included in transformer output
+  // If a word boundary came through the transformer unaltered, that means `wordIsAscii` is enabled
+  if (kind === AstAssertionKinds.word_boundary) {
+    return negate ? r`\B` : r`\b`;
+  }
+  // Kinds `line_end`, `line_start`, `search_start`, and `string_end_newline` are never included in
+  // transformer output
   throw new Error(`Unexpected assertion kind "${kind}"`);
 }
 

diff --git a/src/parse.js b/src/parse.js
@@ -291,23 +291,15 @@ function parseCharacterSet({token, skipPropertyNameValidation}) {
       });
     }
   }
-  const node = {
-    type: AstTypes.CharacterSet,
-    kind: throwIfNot(AstCharacterSetKinds[kind], `Unexpected character set kind "${kind}"`),
-  };
-  if (
-    kind === TokenCharacterSetKinds.digit ||
-    kind === TokenCharacterSetKinds.hex ||
-    kind === TokenCharacterSetKinds.posix ||
-    kind === TokenCharacterSetKinds.space ||
-    kind === TokenCharacterSetKinds.word
-  ) {
-    node.negate = negate;
-    if (kind === TokenCharacterSetKinds.posix) {
-      node.value = value;
-    }
+  if (kind === TokenCharacterSetKinds.posix) {
+    return {
+      type: AstTypes.CharacterSet,
+      kind: AstCharacterSetKinds.posix,
+      negate,
+      value,
+    };
   }
-  return node;
+  return createCharacterSet(kind, {negate});
 }
 
 function parseGroupOpen(context, state) {
@@ -520,6 +512,22 @@ function createCharacterClassRange(min, max) {
   };
 }
 
+function createCharacterSet(kind, {negate}) { // Builds a CharacterSet AST node for `kind`
+  const node = { // NOTE(review): `throwIfNot` is defined elsewhere; presumably it throws when the lookup below is falsy
+    type: AstTypes.CharacterSet,
+    kind: throwIfNot(AstCharacterSetKinds[kind], `Unexpected character set kind "${kind}"`),
+  };
+  if ( // Only the kinds listed here get a `negate` property; posix is not handled by this function
+    kind === TokenCharacterSetKinds.digit ||
+    kind === TokenCharacterSetKinds.hex ||
+    kind === TokenCharacterSetKinds.space ||
+    kind === TokenCharacterSetKinds.word
+  ) {
+    node.negate = negate;
+  }
+  return node; // For any other kind, the node has only `type` and `kind`
+}
+
 function createDirectiveFromToken({kind, flags}) {
   const node = {
     type: AstTypes.Directive,
@@ -534,12 +542,13 @@ function createDirectiveFromToken({kind, flags}) {
   return node;
 }
 
-function createFlags({ignoreCase, dotAll, extended}) {
+function createFlags({ignoreCase, dotAll, extended, wordIsAscii}) { // Builds a Flags AST node
   return {
     type: AstTypes.Flags,
     ignoreCase,
     dotAll,
     extended,
+    wordIsAscii, // Flag W
   };
 }
 
@@ -709,6 +718,7 @@ export {
   createCharacterClass,
   createCharacterClassIntersection,
   createCharacterClassRange,
+  createCharacterSet,
   createFlags,
   createGroup,
   createLookaround,

diff --git a/src/tokenize.js b/src/tokenize.js
@@ -136,8 +136,8 @@ function tokenize(pattern, flags = '') {
   if (typeof pattern !== 'string') {
     throw new Error('String expected as pattern');
   }
-  if (!/^[imx]*$/.test(flags)) {
-    throw new Error(`Flags "${flags}" unsupported in Oniguruma`);
+  if (!/^[imxW]*$/.test(flags)) {
+    throw new Error(`Flags "${flags}" unsupported`);
   }
   const xStack = [flags.includes('x')];
   const context = {
@@ -191,10 +191,13 @@ function tokenize(pattern, flags = '') {
     tokens,
     flags: {
       ignoreCase: flags.includes('i'),
-      // Onig flag m is equivalent to JS flag s
+      // Flag m is called `multiline` in Onig, but that has a different meaning in JS. Onig flag m
+      // is equivalent to JS flag s
       dotAll: flags.includes('m'),
       // Flag x is fully handled during tokenization
       extended: flags.includes('x'),
+      // Flag W is currently only supported as a top-level flag
+      wordIsAscii: flags.includes('W'),
     },
   };
 }

diff --git a/src/transform.js b/src/transform.js
@@ -1,5 +1,5 @@
 import {Accuracy, Target} from './options.js';
-import {AstAssertionKinds, AstCharacterSetKinds, AstDirectiveKinds, AstTypes, AstVariableLengthCharacterSetKinds, createAlternative, createBackreference, createCapturingGroup, createGroup, createLookaround, createUnicodeProperty, parse} from './parse.js';
+import {AstAssertionKinds, AstCharacterSetKinds, AstDirectiveKinds, AstTypes, AstVariableLengthCharacterSetKinds, createAlternative, createBackreference, createCapturingGroup, createCharacterSet, createGroup, createLookaround, createUnicodeProperty, parse} from './parse.js';
 import {applySubclassStrategies, isLoneGLookaround} from './subclass.js';
 import {tokenize} from './tokenize.js';
 import {traverse} from './traverse.js';
@@ -57,6 +57,7 @@ function transform(ast, options) {
     // Subroutines can appear before the groups they ref, so collect reffed nodes for a second pass 
     subroutineRefMap: new Map(),
     supportedGNodes: new Set(),
+    wordIsAscii: ast.flags.wordIsAscii,
   };
   traverse({node: ast}, firstPassState, FirstPassVisitor);
   // Global flags modified by the first pass
@@ -121,7 +122,7 @@ const FirstPassVisitor = {
     },
   },
 
-  Assertion({node, ast, remove, replaceWith}, {accuracy, supportedGNodes}) {
+  Assertion({node, ast, remove, replaceWith}, {accuracy, supportedGNodes, wordIsAscii}) {
     const {kind, negate} = node;
     if (kind === AstAssertionKinds.line_end) {
       // Onig's only line break char is line feed, unlike JS
@@ -137,8 +138,8 @@ const FirstPassVisitor = {
       remove();
     } else if (kind === AstAssertionKinds.string_end_newline) {
       replaceWith(parseFragment(r`(?=\n?\z)`));
-    } else if (kind === AstAssertionKinds.word_boundary) {
-      // Onig's word char definition for `\b` is different than for `\w`
+    } else if (kind === AstAssertionKinds.word_boundary && !wordIsAscii) {
+      // Onig's `\b` is Unicode-aware by default, though `\w` is ASCII-only
       const wordChar = r`[\p{L}\p{N}\p{Pc}]`;
       const b = `(?:(?<=${wordChar})(?!${wordChar})|(?<!${wordChar})(?=${wordChar}))`;
       const B = `(?:(?<=${wordChar})(?=${wordChar})|(?<!${wordChar})(?!${wordChar}))`;
@@ -156,7 +157,7 @@ const FirstPassVisitor = {
     subroutineRefMap.set(name ?? number, node);
   },
 
-  CharacterSet({node, replaceWith}, {accuracy, minTargetEs2024}) {
+  CharacterSet({node, replaceWith}, {accuracy, minTargetEs2024, wordIsAscii}) {
     const {kind, negate, value} = node;
     if (kind === AstCharacterSetKinds.any) {
       replaceWith(createUnicodeProperty('Any'));
@@ -179,6 +180,8 @@ const FirstPassVisitor = {
           ascii = `\0-${cp(ascii.codePointAt(0) - 1)}${cp(ascii.codePointAt(2) + 1)}-\u{10FFFF}`;
         }
         replaceWith(parseFragment(`[${ascii}]`));
+      } else if (value === 'word' && wordIsAscii) {
+        replaceWith(createCharacterSet(AstCharacterSetKinds.word, {negate}));
       } else {
         const negateableNode = parseFragment(PosixClassesMap.get(value));
         negateableNode.negate = negate;
@@ -221,8 +224,9 @@ const FirstPassVisitor = {
   },
 
   Flags({node, parent}) {
-    // Onig's flag x (`extended`) isn't available in JS
-    delete node.extended;
+    // Remove Onig flags that aren't available in JS
+    delete node.extended; // Flag x
+    delete node.wordIsAscii; // Flag W
     Object.assign(node, {
       // JS flag g; no Onig equiv
       global: false,