Skip to content

Commit

Permalink
Support top-level flag W
Browse files Browse the repository at this point in the history
  • Loading branch information
slevithan committed Nov 19, 2024
1 parent 890d8e1 commit ee1b868
Show file tree
Hide file tree
Showing 7 changed files with 77 additions and 31 deletions.
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ Notice that nearly every feature below has at least subtle differences from Java
</tr>

<tr valign="top">
<th align="left" rowspan="3">Flags</th>
<th align="left" rowspan="4">Flags</th>
<td><code>i</code></td>
<td><code>i</code></td>
<td align="middle">✅</td>
Expand Down Expand Up @@ -297,6 +297,17 @@ Notice that nearly every feature below has at least subtle differences from Java
✔ Whitespace and <code>#</code> not ignored in char classes<br>
</td>
</tr>
<tr valign="top">
<td><code>W</code></td>
<td><code>W</code></td>
<td align="middle">✅</td>
<td align="middle">✅</td>
<td>
● Currently supported only as a top-level flag<br>
✔ POSIX <code>[[:word:]]</code> and <code>\p{Word}</code> are ASCII based<br>
✔ <code>\b</code> is ASCII based<br>
</td>
</tr>

<tr valign="top">
<th align="left" rowspan="2" valign="top">Flag modifiers</th>
Expand Down
11 changes: 10 additions & 1 deletion demo/demo.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ const state = {
i: getValue('flag-i'),
m: getValue('flag-m'),
x: getValue('flag-x'),
W: getValue('flag-W'),
},
opts: {
accuracy: getValue('option-accuracy'),
Expand Down Expand Up @@ -66,7 +67,15 @@ function showTranspiled() {
ui.subclassInfo.classList.add('hidden');
const options = {
...state.opts,
flags: `${state.flags.i ? 'i' : ''}${state.flags.m ? 'm' : ''}${state.flags.x ? 'x' : ''}`,
flags: `${
state.flags.i ? 'i' : ''
}${
state.flags.m ? 'm' : ''
}${
state.flags.x ? 'x' : ''
}${
state.flags.W ? 'W' : ''
}`,
maxRecursionDepth: state.opts.maxRecursionDepth === '' ? null : +state.opts.maxRecursionDepth,
target: state.opts.target === 'auto' ? autoTarget : state.opts.target,
};
Expand Down
5 changes: 5 additions & 0 deletions demo/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ <h2>Try it</h2>
<kbd>x</kbd>
<span class="tip tip-lg">Insignificant whitespace and comments</span>
</label>
<label>
<input type="checkbox" id="flag-W" onchange="setFlag('W', this.checked)">
<kbd>W</kbd>
<span class="tip tip-sm">Word is ASCII</span>
</label>
</p>
<p>
<label>
Expand Down
8 changes: 6 additions & 2 deletions src/generate.js
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,12 @@ function genAssertion(node, _, gen) {
if (kind === AstAssertionKinds.string_start) {
return '^';
}
// Kinds `line_end`, `line_start`, `search_start`, `string_end_newline`, and `word_boundary` are
// never included in transformer output
// If a word boundary came through the transformer unaltered, that means `wordIsAscii` is enabled
if (kind === AstAssertionKinds.word_boundary) {
return negate ? r`\B` : r`\b`;
}
// Kinds `line_end`, `line_start`, `search_start`, and `string_end_newline` are never included in
// transformer output
throw new Error(`Unexpected assertion kind "${kind}"`);
}

Expand Down
44 changes: 27 additions & 17 deletions src/parse.js
Original file line number Diff line number Diff line change
Expand Up @@ -291,23 +291,15 @@ function parseCharacterSet({token, skipPropertyNameValidation}) {
});
}
}
const node = {
type: AstTypes.CharacterSet,
kind: throwIfNot(AstCharacterSetKinds[kind], `Unexpected character set kind "${kind}"`),
};
if (
kind === TokenCharacterSetKinds.digit ||
kind === TokenCharacterSetKinds.hex ||
kind === TokenCharacterSetKinds.posix ||
kind === TokenCharacterSetKinds.space ||
kind === TokenCharacterSetKinds.word
) {
node.negate = negate;
if (kind === TokenCharacterSetKinds.posix) {
node.value = value;
}
if (kind === TokenCharacterSetKinds.posix) {
return {
type: AstTypes.CharacterSet,
kind: AstCharacterSetKinds.posix,
negate,
value,
};
}
return node;
return createCharacterSet(kind, {negate});
}

function parseGroupOpen(context, state) {
Expand Down Expand Up @@ -520,6 +512,22 @@ function createCharacterClassRange(min, max) {
};
}

/**
Builds a `CharacterSet` AST node for the given character set kind.

The kind is looked up in `AstCharacterSetKinds`; `throwIfNot` is used to reject kinds that have no
entry there. The `negate` property is only attached for the `digit`, `hex`, `space`, and `word`
kinds; for any other kind the node is returned without a `negate` key.

@param {string} kind Character set kind, compared against `TokenCharacterSetKinds` values.
@param {{negate: boolean}} options
@returns {{type: string, kind: string, negate?: boolean}}
*/
function createCharacterSet(kind, {negate}) {
  const astKind = throwIfNot(
    AstCharacterSetKinds[kind],
    `Unexpected character set kind "${kind}"`
  );
  const characterSet = {type: AstTypes.CharacterSet, kind: astKind};
  // Only these shorthand kinds carry their own negation flag
  switch (kind) {
    case TokenCharacterSetKinds.digit:
    case TokenCharacterSetKinds.hex:
    case TokenCharacterSetKinds.space:
    case TokenCharacterSetKinds.word:
      characterSet.negate = negate;
      break;
    default:
      break;
  }
  return characterSet;
}

function createDirectiveFromToken({kind, flags}) {
const node = {
type: AstTypes.Directive,
Expand All @@ -534,12 +542,13 @@ function createDirectiveFromToken({kind, flags}) {
return node;
}

function createFlags({ignoreCase, dotAll, extended}) {
/**
Builds the `Flags` AST node from the given flag values.

Each of the four flag values is copied onto the node as-is (including `undefined` when a flag is
not provided), alongside the node's `type`.

@param {{ignoreCase: boolean, dotAll: boolean, extended: boolean, wordIsAscii: boolean}} flags
@returns {{type: string, ignoreCase: boolean, dotAll: boolean, extended: boolean, wordIsAscii: boolean}}
*/
function createFlags(flags) {
  const {ignoreCase, dotAll, extended, wordIsAscii} = flags;
  const flagsNode = {type: AstTypes.Flags};
  flagsNode.ignoreCase = ignoreCase;
  flagsNode.dotAll = dotAll;
  flagsNode.extended = extended;
  flagsNode.wordIsAscii = wordIsAscii;
  return flagsNode;
}

Expand Down Expand Up @@ -709,6 +718,7 @@ export {
createCharacterClass,
createCharacterClassIntersection,
createCharacterClassRange,
createCharacterSet,
createFlags,
createGroup,
createLookaround,
Expand Down
9 changes: 6 additions & 3 deletions src/tokenize.js
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ function tokenize(pattern, flags = '') {
if (typeof pattern !== 'string') {
throw new Error('String expected as pattern');
}
if (!/^[imx]*$/.test(flags)) {
throw new Error(`Flags "${flags}" unsupported in Oniguruma`);
if (!/^[imxW]*$/.test(flags)) {
throw new Error(`Flags "${flags}" unsupported`);
}
const xStack = [flags.includes('x')];
const context = {
Expand Down Expand Up @@ -191,10 +191,13 @@ function tokenize(pattern, flags = '') {
tokens,
flags: {
ignoreCase: flags.includes('i'),
// Onig flag m is equivalent to JS flag s
// Flag m is called `multiline` in Onig, but that has a different meaning in JS. Onig flag m
// is equivalent to JS flag s
dotAll: flags.includes('m'),
// Flag x is fully handled during tokenization
extended: flags.includes('x'),
// Flag W is currently only supported as a top-level flag
wordIsAscii: flags.includes('W'),
},
};
}
Expand Down
18 changes: 11 additions & 7 deletions src/transform.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import {Accuracy, Target} from './options.js';
import {AstAssertionKinds, AstCharacterSetKinds, AstDirectiveKinds, AstTypes, AstVariableLengthCharacterSetKinds, createAlternative, createBackreference, createCapturingGroup, createGroup, createLookaround, createUnicodeProperty, parse} from './parse.js';
import {AstAssertionKinds, AstCharacterSetKinds, AstDirectiveKinds, AstTypes, AstVariableLengthCharacterSetKinds, createAlternative, createBackreference, createCapturingGroup, createCharacterSet, createGroup, createLookaround, createUnicodeProperty, parse} from './parse.js';
import {applySubclassStrategies, isLoneGLookaround} from './subclass.js';
import {tokenize} from './tokenize.js';
import {traverse} from './traverse.js';
Expand Down Expand Up @@ -57,6 +57,7 @@ function transform(ast, options) {
// Subroutines can appear before the groups they ref, so collect reffed nodes for a second pass
subroutineRefMap: new Map(),
supportedGNodes: new Set(),
wordIsAscii: ast.flags.wordIsAscii,
};
traverse({node: ast}, firstPassState, FirstPassVisitor);
// Global flags modified by the first pass
Expand Down Expand Up @@ -121,7 +122,7 @@ const FirstPassVisitor = {
},
},

Assertion({node, ast, remove, replaceWith}, {accuracy, supportedGNodes}) {
Assertion({node, ast, remove, replaceWith}, {accuracy, supportedGNodes, wordIsAscii}) {
const {kind, negate} = node;
if (kind === AstAssertionKinds.line_end) {
// Onig's only line break char is line feed, unlike JS
Expand All @@ -137,8 +138,8 @@ const FirstPassVisitor = {
remove();
} else if (kind === AstAssertionKinds.string_end_newline) {
replaceWith(parseFragment(r`(?=\n?\z)`));
} else if (kind === AstAssertionKinds.word_boundary) {
// Onig's word char definition for `\b` is different than for `\w`
} else if (kind === AstAssertionKinds.word_boundary && !wordIsAscii) {
// Onig's `\b` is Unicode-aware by default, though `\w` is ASCII-only
const wordChar = r`[\p{L}\p{N}\p{Pc}]`;
const b = `(?:(?<=${wordChar})(?!${wordChar})|(?<!${wordChar})(?=${wordChar}))`;
const B = `(?:(?<=${wordChar})(?=${wordChar})|(?<!${wordChar})(?!${wordChar}))`;
Expand All @@ -156,7 +157,7 @@ const FirstPassVisitor = {
subroutineRefMap.set(name ?? number, node);
},

CharacterSet({node, replaceWith}, {accuracy, minTargetEs2024}) {
CharacterSet({node, replaceWith}, {accuracy, minTargetEs2024, wordIsAscii}) {
const {kind, negate, value} = node;
if (kind === AstCharacterSetKinds.any) {
replaceWith(createUnicodeProperty('Any'));
Expand All @@ -179,6 +180,8 @@ const FirstPassVisitor = {
ascii = `\0-${cp(ascii.codePointAt(0) - 1)}${cp(ascii.codePointAt(2) + 1)}-\u{10FFFF}`;
}
replaceWith(parseFragment(`[${ascii}]`));
} else if (value === 'word' && wordIsAscii) {
replaceWith(createCharacterSet(AstCharacterSetKinds.word, {negate}));
} else {
const negateableNode = parseFragment(PosixClassesMap.get(value));
negateableNode.negate = negate;
Expand Down Expand Up @@ -221,8 +224,9 @@ const FirstPassVisitor = {
},

Flags({node, parent}) {
// Onig's flag x (`extended`) isn't available in JS
delete node.extended;
// Remove Onig flags that aren't available in JS
delete node.extended; // Flag x
delete node.wordIsAscii; // Flag W
Object.assign(node, {
// JS flag g; no Onig equiv
global: false,
Expand Down

0 comments on commit ee1b868

Please sign in to comment.