Skip to content

Commit

Permalink
Strings: added support for UTF8 offsets in regexp
Browse files Browse the repository at this point in the history
  • Loading branch information
dg committed Jan 13, 2023
1 parent 41401a6 commit 49fc1b7
Show file tree
Hide file tree
Showing 5 changed files with 121 additions and 15 deletions.
70 changes: 63 additions & 7 deletions src/Utils/Strings.php
Original file line number Diff line number Diff line change
Expand Up @@ -495,12 +495,19 @@ public static function split(
bool|int $captureOffset = false,
bool $skipEmpty = false,
int $limit = -1,
bool $utf8 = false,
): array
{
$flags = is_int($captureOffset) // back compatibility
? $captureOffset
: ($captureOffset ? PREG_SPLIT_OFFSET_CAPTURE : 0) | ($skipEmpty ? PREG_SPLIT_NO_EMPTY : 0);
return self::pcre('preg_split', [$pattern, $subject, $limit, $flags | PREG_SPLIT_DELIM_CAPTURE]);

$pattern .= $utf8 ? 'u' : '';
$m = self::pcre('preg_split', [$pattern, $subject, $limit, $flags | PREG_SPLIT_DELIM_CAPTURE]);
return $utf8 && $captureOffset
? self::bytesToChars($subject, [$m])[0]
: $m;

}


Expand All @@ -514,19 +521,27 @@ public static function match(
bool|int $captureOffset = false,
int $offset = 0,
bool $unmatchedAsNull = false,
bool $utf8 = false,
): ?array
{
$flags = is_int($captureOffset) // back compatibility
? $captureOffset
: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);

if ($utf8) {
$offset = strlen(self::substring($subject, 0, $offset));
$pattern .= 'u';
}

if ($offset > strlen($subject)) {
return null;
} elseif (!self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])) {
return null;
} elseif ($utf8 && $captureOffset) {
return self::bytesToChars($subject, [$m])[0];
} else {
return $m;
}

return self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])
? $m
: null;
}


Expand All @@ -542,12 +557,18 @@ public static function matchAll(
int $offset = 0,
bool $unmatchedAsNull = false,
bool $patternOrder = false,
bool $utf8 = false,
): array
{
$flags = is_int($captureOffset) // back compatibility
? $captureOffset
: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0) | ($patternOrder ? PREG_PATTERN_ORDER : 0);


if ($utf8) {
$offset = strlen(self::substring($subject, 0, $offset));
$pattern .= 'u';
}

if ($offset > strlen($subject)) {
return [];
}
Expand All @@ -557,7 +578,10 @@ public static function matchAll(
($flags & PREG_PATTERN_ORDER) ? $flags : ($flags | PREG_SET_ORDER),
$offset,
]);
return $m;
return $utf8 && $captureOffset
? self::bytesToChars($subject, $m)
: $m;

}


Expand All @@ -572,6 +596,7 @@ public static function replace(
int $limit = -1,
bool $captureOffset = false,
bool $unmatchedAsNull = false,
bool $utf8 = false,
): string
{
if (is_object($replacement) || is_array($replacement)) {
Expand All @@ -580,17 +605,48 @@ public static function replace(
}

$flags = ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
if ($utf8) {
$pattern .= 'u';
if ($captureOffset) {
$replacement = fn($m) => $replacement(self::bytesToChars($subject, [$m])[0]);
}
}

return self::pcre('preg_replace_callback', [$pattern, $replacement, $subject, $limit, 0, $flags]);

} elseif (is_array($pattern) && is_string(key($pattern))) {
$replacement = array_values($pattern);
$pattern = array_keys($pattern);
}

if ($utf8) {
$pattern = array_map(fn($item) => $item . 'u', (array) $pattern);
}

return self::pcre('preg_replace', [$pattern, $replacement, $subject, $limit]);
}


/**
 * Converts the byte offsets in PCRE offset-capture results to character offsets.
 *
 * Each entry of every group is expected to be a [text, byteOffset] pair as
 * produced with PREG_OFFSET_CAPTURE / PREG_SPLIT_OFFSET_CAPTURE. The offset is
 * rewritten in place to the number of characters (as counted by self::length(),
 * presumably UTF-8 characters) that precede that byte position in $s.
 *
 * The conversion is incremental: only the slice between the previously seen
 * byte offset and the current one is measured, forwards or backwards, so the
 * subject is not rescanned from the start for every capture.
 *
 * Entries with a negative offset (PCRE reports -1 for a subpattern that did
 * not participate in the match) are left untouched.
 *
 * @param  string  $s       subject the offsets refer to
 * @param  array   $groups  list of match sets, each a list of [text, byteOffset] pairs
 * @return array   the same structure with character offsets
 */
private static function bytesToChars(string $s, array $groups): array
{
	$lastBytes = $lastChars = 0;
	foreach ($groups as &$matches) {
		foreach ($matches as &$match) {
			if ($match[1] < 0) {
				// Unmatched subpattern: there is no position to convert. Feeding -1 to
				// substr() would read from the end of the subject and corrupt the
				// running counters for every following entry.
				continue;
			}

			if ($match[1] > $lastBytes) {
				// moved forward: add the characters in the skipped slice
				$lastChars += self::length(substr($s, $lastBytes, $match[1] - $lastBytes));
			} elseif ($match[1] < $lastBytes) {
				// moved backward (e.g. a group starting before the previous entry): subtract them
				$lastChars -= self::length(substr($s, $match[1], $lastBytes - $match[1]));
			}

			$lastBytes = $match[1];
			$match[1] = $lastChars;
		}

		unset($match); // drop the dangling reference to the last entry
	}

	unset($matches);

	return $groups;
}


/** @internal */
public static function pcre(string $func, array $args)
{
Expand Down
13 changes: 11 additions & 2 deletions tests/Utils/Strings.match().phpt
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,21 @@ Assert::same(['hell', 'l'], Strings::match('hello world!', '#([e-l])+#'));

Assert::same(['hell'], Strings::match('hello world!', '#[e-l]+#'));

Assert::same([['hell', 0]], Strings::match('hello world!', '#[e-l]+#', PREG_OFFSET_CAPTURE));
Assert::same([['hell', 0]], Strings::match('hello world!', '#[e-l]+#', captureOffset: true));
Assert::same([[' ', 12]], Strings::match('россия - враг', '#\s+#u', PREG_OFFSET_CAPTURE));
Assert::same([[' ', 12]], Strings::match('россия - враг', '#\s+#u', captureOffset: true));

Assert::same([[' ', 6]], Strings::match('россия - враг', '#\s+#u', captureOffset: true, utf8: true));
Assert::same(['e', null], Strings::match('hello world!', '#e(x)*#', unmatchedAsNull: true));

Assert::same(['ll'], Strings::match('hello world!', '#[e-l]+#', offset: 2));

Assert::same(['l'], Strings::match('žluťoučký kůň', '#[e-l]+#u', offset: 2));

Assert::same(['k'], Strings::match('žluťoučký kůň', '#[e-l]+#u', utf8: true, offset: 2));

Assert::same(['žluťoučký'], Strings::match('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier

Assert::same([['k', 7]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true, utf8: true, offset: 2));

Assert::null(Strings::match('hello world!', '', offset: 50));
Assert::null(Strings::match('', '', offset: 1));
17 changes: 17 additions & 0 deletions tests/Utils/Strings.matchAll().phpt
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,31 @@ Assert::same([
[['u', 3], ['u', 7], ['', 11], ['', 15]],
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', PREG_OFFSET_CAPTURE | PREG_PATTERN_ORDER));

Assert::same([
[['lu', 1], ['l', 1], ['u', 2]],
[['ou', 4], ['o', 4], ['u', 5]],
[['k', 7], ['k', 7], ['', 8]],
[['k', 10], ['k', 10], ['', 11]],
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, utf8: true));

Assert::same([
[['lu', 2], ['ou', 6], ['k', 10], ['k', 14]],
[['l', 2], ['o', 6], ['k', 10], ['k', 14]],
[['u', 3], ['u', 7], ['', 11], ['', 15]],
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, patternOrder: true));

Assert::same([
[['lu', 1], ['ou', 4], ['k', 7], ['k', 10]],
[['l', 1], ['o', 4], ['k', 7], ['k', 10]],
[['u', 2], ['u', 5], ['', 8], ['', 11]],
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, patternOrder: true, utf8: true));

Assert::same([['l'], ['k'], ['k']], Strings::matchAll('žluťoučký kůň', '#[e-l]+#u', offset: 2));

Assert::same([['k'], ['k']], Strings::matchAll('žluťoučký kůň', '#[e-l]+#u', offset: 2, utf8: true));

Assert::same([['žluťoučký'], ['kůň']], Strings::matchAll('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier

Assert::same([['ll', 'l']], Strings::matchAll('hello world!', '#[e-l]+#', PREG_PATTERN_ORDER, 2));
Assert::same([['ll', 'l']], Strings::matchAll('hello world!', '#[e-l]+#', offset: 2, patternOrder: true));

Expand Down
6 changes: 6 additions & 0 deletions tests/Utils/Strings.replace().phpt
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,10 @@ Assert::same(' !', Strings::replace('hello world!', ['#\w#']));

// flags & callback
Assert::same('hell0o worl9d!', Strings::replace('hello world!', '#[e-l]+#', fn($m) => implode('', $m[0]), captureOffset: true));
Assert::same('žl1uťoučk7ý k10ůň!', Strings::replace('žluťoučký kůň!', '#[e-l]+#u', fn($m) => implode('', $m[0]), captureOffset: true, utf8: true));
Strings::replace('hello world!', '#e(x)*#', fn($m) => Assert::null($m[1]), unmatchedAsNull: true);

// utf-8 without modifier
Assert::same('* *', Strings::replace('Россия агрессор', '#\w+#', fn() => '*', utf8: true));
Assert::same('* *', Strings::replace('Россия агрессор', '#\w+#', '*', utf8: true));
Assert::same('* *', Strings::replace('Россия агрессор', ['#\w+#'], '*', utf8: true));
30 changes: 24 additions & 6 deletions tests/Utils/Strings.split().phpt
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,29 @@ Assert::same([
], Strings::split('a, b, c', '#(,)\s*#', PREG_SPLIT_OFFSET_CAPTURE));

Assert::same([
['a', 0],
[',', 1],
['b', 3],
[',', 4],
['c', 6],
], Strings::split('a, b, c', '#(,)\s*#', captureOffset: true));
['ž', 0],
['lu', 2],
['ť', 4],
['ou', 6],
['č', 8],
['k', 10],
['ý ', 11],
['k', 14],
['ůň', 15],
], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', captureOffset: true));

Assert::same([
['ž', 0],
['lu', 1],
['ť', 3],
['ou', 4],
['č', 6],
['k', 7],
['ý ', 8],
['k', 10],
['ůň', 11],
], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', captureOffset: true, utf8: true));

Assert::same(['', ' ', ''], Strings::split('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier

Assert::same(['a', ',', 'b, c'], Strings::split('a, b, c', '#(,)\s*#', limit: 2));

0 comments on commit 49fc1b7

Please sign in to comment.