Skip to content

Commit

Permalink
Rework character list matching in JIT (PCRE2Project#535)
Browse files: browse the repository at this point in the history
Co-authored-by: Zoltan Herczeg <[email protected]>
  • Loading branch information
zherczeg and Zoltan Herczeg authored Oct 21, 2024
1 parent ef11bee commit 96f0653
Show file tree
Hide file tree
Showing 3 changed files with 155 additions and 102 deletions.
201 changes: 99 additions & 102 deletions src/pcre2_jit_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -7929,6 +7929,7 @@ static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc,
{
DEFINE_COMPILER;
jump_list *found = NULL;
jump_list *check_result = NULL;
jump_list **list = (cc[0] & XCL_NOT) == 0 ? &found : backtracks;
sljit_uw c, charoffset;
sljit_u32 max = 256, min = READ_CHAR_MAX;
Expand Down Expand Up @@ -8507,139 +8508,135 @@ SLJIT_ASSERT(ranges.range_count > 0);
#endif /* SUPPORT_UNICODE */

SLJIT_ASSERT(compares == 1);
if (charoffset != 0)
invertcmp = (list != backtracks);

if (ranges.range_count == 2)
{
range_start = ranges.ranges[0];
range_end = ranges.ranges[1];

if (range_start < range_end)
{
SET_CHAR_OFFSET(range_start);
jump = CMP(SLJIT_LESS_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));
}
else
jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset));

add_jump(compiler, backtracks, jump);

SLJIT_ASSERT(ranges.stack == ranges.local_stack);
if (found != NULL)
set_jumps(found, LABEL());
return;
}

if (ranges.range_count > 6 && charoffset != 0)
{
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)charoffset);
charoffset = 0;
}

charoffset = 0;
depth = 0;
first_item = 0;
last_item = ranges.range_count;
compares = ranges.range_count;
last_item = ranges.range_count - 2;
has_cmov = sljit_has_cpu_feature(SLJIT_HAS_CMOV) != 0;

while (TRUE)
{
/* At least two items are present. */
SLJIT_ASSERT(first_item < last_item);

if (first_item + 6 >= last_item)
if (first_item + 6 <= last_item)
{
range_start = ranges.ranges[first_item];
range_end = ranges.ranges[first_item + 1];
first_item += 2;
compares -= 2;
invertcmp = (compares == 0 && list != backtracks);
charoffset = 0;
jump = NULL;
SLJIT_ASSERT(charoffset == 0);
mid_item = ((first_item + last_item) >> 1) & ~(sljit_u32)1;
SLJIT_ASSERT(last_item >= mid_item + 4);

range_end = ranges.ranges[mid_item + 1];
ranges.stack[depth].jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, (sljit_sw)range_end);
ranges.stack[depth].first_item = (sljit_u32)(mid_item + 2);
ranges.stack[depth].last_item = (sljit_u32)last_item;

depth++;
SLJIT_ASSERT(ranges.stack == ranges.local_stack ?
depth <= XCLASS_LOCAL_RANGES_LOG2_SIZE : (ranges.stack + depth) <= (xclass_stack_item*)ranges.ranges);

if (range_start < range_end)
last_item = mid_item;
continue;
}

range_start = ranges.ranges[first_item];
range_end = ranges.ranges[first_item + 1];

if (range_start < range_end)
{
SET_CHAR_OFFSET(range_start);
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
}
else
{
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)range_start);
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
}

if (first_item < last_item)
{
do
{
SET_CHAR_OFFSET(range_start);
first_item += 2;
range_start = ranges.ranges[first_item];
range_end = ranges.ranges[first_item + 1];

if (first_item < last_item)
if (range_start < range_end)
{
SET_CHAR_OFFSET(range_start);
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
}
else
jump = CMP(SLJIT_LESS_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));
}
else
{
if (first_item < last_item)
{
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)range_start);
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);

if (has_cmov)
SELECT(SLJIT_LESS_EQUAL, TMP2, STR_END, 0, TMP2);
else
OP_FLAGS(SLJIT_OR | ((first_item == last_item) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_LESS_EQUAL);
}
else
jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)range_start);
}

if (first_item < last_item)
{
do
{
range_start = ranges.ranges[first_item];
range_end = ranges.ranges[first_item + 1];
first_item += 2;
compares -= 2;
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset));

if (range_start < range_end)
{
SET_CHAR_OFFSET(range_start);
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));

if (has_cmov)
SELECT(SLJIT_LESS_EQUAL, TMP2, STR_END, 0, TMP2);
else
OP_FLAGS(SLJIT_OR | ((first_item == last_item) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_LESS_EQUAL);
}
if (has_cmov)
SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2);
else
{
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset));

if (has_cmov)
SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2);
else
OP_FLAGS(SLJIT_OR | ((first_item == last_item) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL);
}
OP_FLAGS(SLJIT_OR | ((first_item == last_item) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL);
}
while (first_item < last_item);

invertcmp = (compares == 0 && list != backtracks);

if (has_cmov)
jump = CMP(SLJIT_NOT_EQUAL ^ invertcmp, TMP2, 0, SLJIT_IMM, 0);
else
jump = JUMP(SLJIT_NOT_EQUAL ^ invertcmp);
}

add_jump(compiler, compares > 0 ? list : backtracks, jump);

if (depth == 0) break;

add_jump(compiler, list == backtracks ? &found : backtracks, JUMP(SLJIT_JUMP));

/* The charoffset resets after the end of a branch is reached. */
depth--;
first_item = ranges.stack[depth].first_item;
last_item = ranges.stack[depth].last_item;
JUMPHERE(ranges.stack[depth].jump);
continue;
while (first_item < last_item);
}

mid_item = ((first_item + last_item) >> 1) & ~(sljit_u32)1;
SLJIT_ASSERT(last_item > mid_item + 2);

range_start = ranges.ranges[mid_item];
range_end = ranges.ranges[mid_item + 1];

ranges.stack[depth].first_item = (sljit_u8)(mid_item + 2);
ranges.stack[depth].last_item = (sljit_u8)last_item;
if (depth == 0) break;

compares -= 2;
invertcmp = (compares == 0 && list != backtracks);
add_jump(compiler, &check_result, JUMP(SLJIT_JUMP));

if (range_start < range_end)
{
ranges.stack[depth].jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, (sljit_sw)range_end);
jump = CMP(SLJIT_GREATER_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)range_start);
}
else
{
OP2U(SLJIT_SUB | SLJIT_SET_GREATER | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)range_start);
ranges.stack[depth].jump = JUMP(SLJIT_GREATER);
jump = JUMP(SLJIT_EQUAL ^ invertcmp);
}
/* The charoffset resets after the end of a branch is reached. */
charoffset = 0;
depth--;
first_item = ranges.stack[depth].first_item;
last_item = ranges.stack[depth].last_item;
JUMPHERE(ranges.stack[depth].jump);
}

depth++;
SLJIT_ASSERT(ranges.stack == ranges.local_stack ?
depth <= XCLASS_LOCAL_RANGES_LOG2_SIZE : (ranges.stack + depth) <= (xclass_stack_item*)ranges.ranges);
if (check_result != NULL)
set_jumps(check_result, LABEL());

add_jump(compiler, compares > 0 ? list : backtracks, jump);
last_item = mid_item;
if (has_cmov)
jump = CMP(SLJIT_NOT_EQUAL ^ invertcmp, TMP2, 0, SLJIT_IMM, 0);
else
{
sljit_set_current_flags(compiler, SLJIT_SET_Z);
jump = JUMP(SLJIT_NOT_EQUAL ^ invertcmp);
}

SLJIT_ASSERT(compares == 0);
add_jump(compiler, backtracks, jump);

if (found != NULL)
set_jumps(found, LABEL());

Expand Down
24 changes: 24 additions & 0 deletions testdata/testinput5
Original file line number Diff line number Diff line change
Expand Up @@ -2863,4 +2863,28 @@
/([\x{6535}\x{6536}\x{6538}\x{6539}\x{653b}\x{653c}\x{653e}\x{653f}\x{6541}\x{6542}\x{8000}-\x{ffff}]#)+/B,utf
\x{6534}#\x{6537}#\x{653a}#\x{653d}#\x{6540}#\x{6543}#\x{7fff}#\x{6535}#\x{6536}#\x{6538}#\x{6539}#\x{653b}#\x{653c}#\x{653e}#\x{653f}#\x{6541}#\x{6542}#\x{8000}#\x{c246}#\x{ffff}

/[[:xdigit:]\x{400}-\x{600}]+/utf,ucp
!a0\x{400}\x{600}9\x{3ff}

/[^[:xdigit:]\x{400}-\x{600}]+/utf,ucp
\x{400}(\x{3ff}\x{601})\x{600}

/[[:xdigit:]\x{400}-\x{600}\x{700}]+/utf,ucp
!A0\x{700}9\x{601}

/[^[:xdigit:]\x{400}-\x{600}\x{700}]+/utf,ucp
\x{600}(\x{6ff}\x{701}\x{3ff}\x{601})\x{700}

/[[:xdigit:]\x{400}-\x{600}\x{700}-\x{800}\x{900}]+/utf,ucp
!f0\x{800}\x{600}9\x{601}

/[^[:xdigit:]\x{400}-\x{600}\x{700}-\x{800}\x{900}]+/utf,ucp
\x{700}[\x{3ff}\x{601}\x{6ff}\x{801}\x{8ff}\x{901}]\x{900}

/[[:xdigit:]\x{400}-\x{410}\x{500}\x{600}-\x{610}\x{700}\x{800}-\x{810}]+/utf,ucp
!F0\x{400}\x{410}\x{500}\x{600}\x{610}\x{700}\x{800}\x{810}9\x{7ff}

/[^[:xdigit:]\x{400}-\x{410}\x{500}\x{600}-\x{610}\x{700}\x{800}-\x{810}]+/utf,ucp
\x{800}<\x{3ff}\x{411}\x{4ff}\x{501}\x{5ff}\x{611}\x{6ff}\x{701}\x{7ff}\x{811}>\x{810}

# End of testinput5
Expand Down
32 changes: 32 additions & 0 deletions testdata/testoutput5
Original file line number Diff line number Diff line change
Expand Up @@ -6198,4 +6198,36 @@ Failed: error 115 at offset 52: reference to non-existent subpattern
0: \x{6535}#\x{6536}#\x{6538}#\x{6539}#\x{653b}#\x{653c}#\x{653e}#\x{653f}#\x{6541}#\x{6542}#\x{8000}#\x{c246}#
1: \x{c246}#

/[[:xdigit:]\x{400}-\x{600}]+/utf,ucp
!a0\x{400}\x{600}9\x{3ff}
0: a0\x{400}\x{600}9

/[^[:xdigit:]\x{400}-\x{600}]+/utf,ucp
\x{400}(\x{3ff}\x{601})\x{600}
0: (\x{3ff}\x{601})

/[[:xdigit:]\x{400}-\x{600}\x{700}]+/utf,ucp
!A0\x{700}9\x{601}
0: A0\x{700}9

/[^[:xdigit:]\x{400}-\x{600}\x{700}]+/utf,ucp
\x{600}(\x{6ff}\x{701}\x{3ff}\x{601})\x{700}
0: (\x{6ff}\x{701}\x{3ff}\x{601})

/[[:xdigit:]\x{400}-\x{600}\x{700}-\x{800}\x{900}]+/utf,ucp
!f0\x{800}\x{600}9\x{601}
0: f0\x{800}\x{600}9

/[^[:xdigit:]\x{400}-\x{600}\x{700}-\x{800}\x{900}]+/utf,ucp
\x{700}[\x{3ff}\x{601}\x{6ff}\x{801}\x{8ff}\x{901}]\x{900}
0: [\x{3ff}\x{601}\x{6ff}\x{801}\x{8ff}\x{901}]

/[[:xdigit:]\x{400}-\x{410}\x{500}\x{600}-\x{610}\x{700}\x{800}-\x{810}]+/utf,ucp
!F0\x{400}\x{410}\x{500}\x{600}\x{610}\x{700}\x{800}\x{810}9\x{7ff}
0: F0\x{400}\x{410}\x{500}\x{600}\x{610}\x{700}\x{800}\x{810}9

/[^[:xdigit:]\x{400}-\x{410}\x{500}\x{600}-\x{610}\x{700}\x{800}-\x{810}]+/utf,ucp
\x{800}<\x{3ff}\x{411}\x{4ff}\x{501}\x{5ff}\x{611}\x{6ff}\x{701}\x{7ff}\x{811}>\x{810}
0: <\x{3ff}\x{411}\x{4ff}\x{501}\x{5ff}\x{611}\x{6ff}\x{701}\x{7ff}\x{811}>

# End of testinput5
Expand Down

0 comments on commit 96f0653

Please sign in to comment.