diff --git a/README.md b/README.md
index 86a4e32..f36d84c 100644
--- a/README.md
+++ b/README.md
@@ -265,7 +265,7 @@ Notice that nearly every feature below has at least subtle differences from Java
Flag modifiers
diff --git a/demo/demo.js b/demo/demo.js
index 0740091..b50bdbd 100644
--- a/demo/demo.js
+++ b/demo/demo.js
@@ -10,6 +10,7 @@ const state = {
i: getValue('flag-i'),
m: getValue('flag-m'),
x: getValue('flag-x'),
+ W: getValue('flag-W'),
},
opts: {
accuracy: getValue('option-accuracy'),
@@ -66,7 +67,15 @@ function showTranspiled() {
ui.subclassInfo.classList.add('hidden');
const options = {
...state.opts,
- flags: `${state.flags.i ? 'i' : ''}${state.flags.m ? 'm' : ''}${state.flags.x ? 'x' : ''}`,
+ flags: `${
+ state.flags.i ? 'i' : ''
+ }${
+ state.flags.m ? 'm' : ''
+ }${
+ state.flags.x ? 'x' : ''
+ }${
+ state.flags.W ? 'W' : ''
+ }`,
maxRecursionDepth: state.opts.maxRecursionDepth === '' ? null : +state.opts.maxRecursionDepth,
target: state.opts.target === 'auto' ? autoTarget : state.opts.target,
};
diff --git a/demo/index.html b/demo/index.html
index 34d656d..32a05ae 100644
--- a/demo/index.html
+++ b/demo/index.html
@@ -35,6 +35,11 @@ Try it
x
Insignificant whitespace and comments
+
+
+ W
+ Word is ASCII
+
diff --git a/src/generate.js b/src/generate.js
index 26535fc..92a9902 100644
--- a/src/generate.js
+++ b/src/generate.js
@@ -216,8 +216,12 @@ function genAssertion(node, _, gen) {
if (kind === AstAssertionKinds.string_start) {
return '^';
}
- // Kinds `line_end`, `line_start`, `search_start`, `string_end_newline`, and `word_boundary` are
- // never included in transformer output
+ // If a word boundary came through the transformer unaltered, that means `wordIsAscii` is enabled
+ if (kind === AstAssertionKinds.word_boundary) {
+ return negate ? r`\B` : r`\b`;
+ }
+ // Kinds `line_end`, `line_start`, `search_start`, and `string_end_newline` are never included in
+ // transformer output
throw new Error(`Unexpected assertion kind "${kind}"`);
}
diff --git a/src/parse.js b/src/parse.js
index 6d8fa94..8d633be 100644
--- a/src/parse.js
+++ b/src/parse.js
@@ -291,23 +291,15 @@ function parseCharacterSet({token, skipPropertyNameValidation}) {
});
}
}
- const node = {
- type: AstTypes.CharacterSet,
- kind: throwIfNot(AstCharacterSetKinds[kind], `Unexpected character set kind "${kind}"`),
- };
- if (
- kind === TokenCharacterSetKinds.digit ||
- kind === TokenCharacterSetKinds.hex ||
- kind === TokenCharacterSetKinds.posix ||
- kind === TokenCharacterSetKinds.space ||
- kind === TokenCharacterSetKinds.word
- ) {
- node.negate = negate;
- if (kind === TokenCharacterSetKinds.posix) {
- node.value = value;
- }
+ if (kind === TokenCharacterSetKinds.posix) {
+ return {
+ type: AstTypes.CharacterSet,
+ kind: AstCharacterSetKinds.posix,
+ negate,
+ value,
+ };
}
- return node;
+ return createCharacterSet(kind, {negate});
}
function parseGroupOpen(context, state) {
@@ -520,6 +512,22 @@ function createCharacterClassRange(min, max) {
};
}
+function createCharacterSet(kind, {negate}) { // Builds a CharacterSet AST node for token kind `kind`
+ const node = {
+ type: AstTypes.CharacterSet,
+ kind: throwIfNot(AstCharacterSetKinds[kind], `Unexpected character set kind "${kind}"`), // NOTE(review): presumably throws with this message when `kind` has no AST mapping; confirm `throwIfNot`
+ };
+ if ( // Only digit, hex, space, and word sets receive a `negate` property here
+ kind === TokenCharacterSetKinds.digit ||
+ kind === TokenCharacterSetKinds.hex ||
+ kind === TokenCharacterSetKinds.space ||
+ kind === TokenCharacterSetKinds.word
+ ) {
+ node.negate = negate;
+ } // Posix sets are not handled here; `parseCharacterSet` builds them directly with `negate` and `value`
+ return node; // Any other kind is returned with just `type` and `kind`
+}
+
function createDirectiveFromToken({kind, flags}) {
const node = {
type: AstTypes.Directive,
@@ -534,12 +542,13 @@ function createDirectiveFromToken({kind, flags}) {
return node;
}
-function createFlags({ignoreCase, dotAll, extended}) {
+function createFlags({ignoreCase, dotAll, extended, wordIsAscii}) { // Builds the Flags AST node from individual flag values
return {
type: AstTypes.Flags,
ignoreCase,
dotAll,
extended,
+ wordIsAscii, // NOTE(review): presumably the tokenizer's value for top-level flag W; confirm at call sites
};
}
@@ -709,6 +718,7 @@ export {
createCharacterClass,
createCharacterClassIntersection,
createCharacterClassRange,
+ createCharacterSet,
createFlags,
createGroup,
createLookaround,
diff --git a/src/tokenize.js b/src/tokenize.js
index 9f69ec6..ebac3c3 100644
--- a/src/tokenize.js
+++ b/src/tokenize.js
@@ -136,8 +136,8 @@ function tokenize(pattern, flags = '') {
if (typeof pattern !== 'string') {
throw new Error('String expected as pattern');
}
- if (!/^[imx]*$/.test(flags)) {
- throw new Error(`Flags "${flags}" unsupported in Oniguruma`);
+ if (!/^[imxW]*$/.test(flags)) {
+ throw new Error(`Flags "${flags}" unsupported`);
}
const xStack = [flags.includes('x')];
const context = {
@@ -191,10 +191,13 @@ function tokenize(pattern, flags = '') {
tokens,
flags: {
ignoreCase: flags.includes('i'),
- // Onig flag m is equivalent to JS flag s
+ // Flag m is called `multiline` in Onig, but that has a different meaning in JS. Onig flag m
+ // is equivalent to JS flag s
dotAll: flags.includes('m'),
// Flag x is fully handled during tokenization
extended: flags.includes('x'),
+ // Flag W is currently only supported as a top-level flag
+ wordIsAscii: flags.includes('W'),
},
};
}
diff --git a/src/transform.js b/src/transform.js
index 038812b..f5d1e0a 100644
--- a/src/transform.js
+++ b/src/transform.js
@@ -1,5 +1,5 @@
import {Accuracy, Target} from './options.js';
-import {AstAssertionKinds, AstCharacterSetKinds, AstDirectiveKinds, AstTypes, AstVariableLengthCharacterSetKinds, createAlternative, createBackreference, createCapturingGroup, createGroup, createLookaround, createUnicodeProperty, parse} from './parse.js';
+import {AstAssertionKinds, AstCharacterSetKinds, AstDirectiveKinds, AstTypes, AstVariableLengthCharacterSetKinds, createAlternative, createBackreference, createCapturingGroup, createCharacterSet, createGroup, createLookaround, createUnicodeProperty, parse} from './parse.js';
import {applySubclassStrategies, isLoneGLookaround} from './subclass.js';
import {tokenize} from './tokenize.js';
import {traverse} from './traverse.js';
@@ -57,6 +57,7 @@ function transform(ast, options) {
// Subroutines can appear before the groups they ref, so collect reffed nodes for a second pass
subroutineRefMap: new Map(),
supportedGNodes: new Set(),
+ wordIsAscii: ast.flags.wordIsAscii,
};
traverse({node: ast}, firstPassState, FirstPassVisitor);
// Global flags modified by the first pass
@@ -121,7 +122,7 @@ const FirstPassVisitor = {
},
},
- Assertion({node, ast, remove, replaceWith}, {accuracy, supportedGNodes}) {
+ Assertion({node, ast, remove, replaceWith}, {accuracy, supportedGNodes, wordIsAscii}) {
const {kind, negate} = node;
if (kind === AstAssertionKinds.line_end) {
// Onig's only line break char is line feed, unlike JS
@@ -137,8 +138,8 @@ const FirstPassVisitor = {
remove();
} else if (kind === AstAssertionKinds.string_end_newline) {
replaceWith(parseFragment(r`(?=\n?\z)`));
- } else if (kind === AstAssertionKinds.word_boundary) {
- // Onig's word char definition for `\b` is different than for `\w`
+ } else if (kind === AstAssertionKinds.word_boundary && !wordIsAscii) {
+ // Onig's `\b` is Unicode-aware by default, though `\w` is ASCII-only
const wordChar = r`[\p{L}\p{N}\p{Pc}]`;
const b = `(?:(?<=${wordChar})(?!${wordChar})|(?