Skip to content

Commit

Permalink
perltest: add support for locale modifier
Browse files Browse the repository at this point in the history
Use a similar syntax to pcre2test to set a per pattern locale, and
teach pcre2test to recognize the modifier as perl compatible.

While at it, update tests and fix a recent regresion that wasn't
covered by them.
  • Loading branch information
carenas committed Oct 19, 2024
1 parent 998d2e0 commit eb5c19d
Show file tree
Hide file tree
Showing 10 changed files with 172 additions and 72 deletions.
99 changes: 68 additions & 31 deletions perltest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,19 @@
# a script to Perl through a pipe. See comments below about the data for the
# Perl script. If the next argument of this script is "-utf8", a suitable
# prefix for the Perl script is set up.

# If the next argument of this script is -locale, it must be followed by the
# name of a locale, which is then set when running the tests. Setting a locale
# implies -utf8. For example:
#
# ./perltest.sh -locale tr_TR.utf8 some-file
# A similar process is used to indicate the desire to set a specific locale
# tables per pattern in a similar way to pcre2test through a locale modifier,
# by using the -locale argument. This can be optionally combined with the
# previous arguments; for example, to process an UTF-8 test file in Turkish,
# add the locale=tr_TR.utf8 modifier to the pattern and -locale to perltest,
# or invoke something like (the specific names of the locale might vary):
#
# ./perltest.sh -utf8 -locale=tr_TR.utf8 some-file
#
# If the -locale argument has no setting, a suitable default locale is used
# when possible and reported at startup, it can be always overriden using the
# locale modifier for each pattern.
#
# The remaining arguments of this script, if any, are passed to Perl. They are
# an input file and an output file. If there is one argument, the output is
Expand All @@ -33,7 +40,7 @@

perl=perl
perlarg=""
prefix=''
prefix=""
spc=""

if [ $# -gt 0 -a "$1" = "-perl" ] ; then
Expand All @@ -53,27 +60,37 @@ if [ $# -gt 0 -a "$1" = "-w" ] ; then
fi

if [ $# -gt 0 -a "$1" = "-utf8" ] ; then
prefix="use utf8; require Encode;"
default_locale="C.utf8"
prefix="\
use utf8;\
require Encode;"
perlarg="$perlarg$spc-CSD"

shift
fi

if [ $# -gt 0 -a "$1" = "-locale" ] ; then
if [ $# -lt 2 ] ; then
echo "perltest.sh: Missing locale name - abandoned"
exit 1
if [ $# -gt 0 ] ; then
case "$1" in
-locale=*)
default_locale=${1#-locale=}
;;
-locale)
default_locale=${default_locale:-C}
;;
*)
skip=1
esac
if [ -z "$skip" ] ; then
prefix="\
use POSIX qw(locale_h);\
use locale qw(:ctype);\
\
\$default_locale = setlocale(LC_CTYPE, \"$default_locale\");\
if (\"\$default_locale\" eq \"\")\
{ die \"perltest: Failed to set locale \\\"$default_locale\\\"\\n\"; }\
print \"Locale: \$default_locale\\n\";\
$prefix"
shift
fi
prefix="use utf8;\
use POSIX qw(locale_h);\
use locale;\
\$loc=setlocale(LC_ALL, \"$2\");\
if (\"\$loc\" eq \"\")\
{ die \"perltest.sh: Failed to set locale \\\"$2\\\" - abandoned\\n\";}\
print \"Locale: \$loc\\n\";\
require Encode;"
shift
shift
fi


Expand All @@ -87,6 +104,7 @@ fi
# dupnames ignored (Perl always allows)
# hex preprocess pattern with embedded octets
# jitstack ignored
# locale use a specific locale tables
# mark show mark information
# no_auto_possess ignored
# no_start_optimize insert (??{""}) at pattern start (disables optimizing)
Expand Down Expand Up @@ -146,7 +164,7 @@ else
{
foreach $c (split(//, $_[0]))
{
if (ord $c >= 32 && ord $c < 127) { $t .= $c; }
if ($c =~ /^[[:print:]]$/) { $t .= $c; }
else { $t .= sprintf("\\x%02x", ord $c); }
}
}
Expand Down Expand Up @@ -190,6 +208,12 @@ $default_show_mark = 0;
NEXT_RE:
for (;;)
{
if (defined $locale && defined $default_locale)
{
setlocale(LC_CTYPE, $default_locale);
undef $locale;
}
printf " re> " if $interact;
last if ! ($_ = <$infile>);
printf $outfile "$_" if ! $interact;
Expand Down Expand Up @@ -263,10 +287,6 @@ for (;;)
$mod =~ s/allaftertext,?//;
# Detect utf
$utf8 = $mod =~ s/utf,?//;
# Remove "dupnames".
$mod =~ s/dupnames,?//;
Expand All @@ -275,6 +295,19 @@ for (;;)
$mod =~ s/jitstack=\d+,?//;
# The "locale" modifier indicates which locale to use
if ($mod =~ /locale=([^,]+),?/)
{
die "perltest: missing -locale cmdline flag" unless defined &setlocale;
$locale = setlocale(LC_CTYPE, $1);
if (!defined $locale)
{
print "** Failed to set locale '$1'\n";
next NEXT_RE;
}
}
$mod =~ s/locale=[^,]*,?//; # Remove it; "locale=" Ignored
# The "mark" modifier requests checking of MARK data */
$show_mark = $default_show_mark | ($mod =~ s/mark,?//);
Expand All @@ -283,11 +316,16 @@ for (;;)
$mod =~ s/ucp,?/u/;
# Detect utf
$utf8 = $mod =~ s/utf,?//;
# Remove "no_auto_possess".
$mod =~ s/no_auto_possess,?//;
# The "hex" modifier instructs us to preprocess the pattern
# The "hex" modifier instructs us to preprocess a pattern with embedded
# octets formatted as two digit hexadecimals
if ($mod =~ s/hex,?//)
{
Expand Down Expand Up @@ -321,12 +359,11 @@ for (;;)
$mod =~ s/-no_start_optimize,?//;
if ($mod =~ s/no_start_optimize,?//) { $pat =~ s/$del/$del(??{""})/; }
if ($mod =~ s/no_start_optimize,?//) { $pat = '(??{""})' . $pat; }
# Add back retained modifiers and check that the pattern is valid.
$mod =~ s/,//g;
$pattern = "$del$pat$del$mod";
eval "\$_ =~ ${pattern}";
Expand Down Expand Up @@ -419,7 +456,7 @@ for (;;)
if ($@)
{
printf $outfile "Error: $@\n";
printf $outfile "Error: $@";
next NEXT_RE;
}
elsif (scalar(@subs) == 0)
Expand Down
2 changes: 1 addition & 1 deletion src/pcre2test.c
Original file line number Diff line number Diff line change
Expand Up @@ -720,7 +720,7 @@ static modstruct modlist[] = {
{ "jitstack", MOD_PNDP, MOD_INT, 0, PO(jitstack) },
{ "jitverify", MOD_PAT, MOD_CTL, CTL_JITVERIFY, PO(control) },
{ "literal", MOD_PAT, MOD_OPT, PCRE2_LITERAL, PO(options) },
{ "locale", MOD_PAT, MOD_STR, LOCALESIZE, PO(locale) },
{ "locale", MOD_PATP, MOD_STR, LOCALESIZE, PO(locale) },
{ "mark", MOD_PNDP, MOD_CTL, CTL_MARK, PO(control) },
{ "match_invalid_utf", MOD_PAT, MOD_OPT, PCRE2_MATCH_INVALID_UTF, PO(options) },
{ "match_limit", MOD_CTM, MOD_INT, 0, MO(match_limit) },
Expand Down
9 changes: 9 additions & 0 deletions testdata/testinput1
Original file line number Diff line number Diff line change
Expand Up @@ -5087,6 +5087,15 @@ name)/mark
\= Expect no match
D

/(*COMMIT)ABC/no_start_optimize
ABC
\= Expect no match
DEFABC

/(*COMMIT)ABC/
ABC
DEFABC

# This should fail, as the skip causes a bump to offset 3 (the skip).

/A(*MARK:A)A+(*SKIP)(B|Z) | AC/x,mark
Expand Down
12 changes: 8 additions & 4 deletions testdata/testinput3
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@
/^[\w]+/locale=fr_FR
�cole

/^[\w]+/
\= Expect no match
�cole

/^[\W]+/
�cole

Expand Down Expand Up @@ -80,6 +76,14 @@
\= Expect no match
\x9c

/�/i
\xff
\= Expect no match
y

/(.)\1/i
\xfe\xde

/\W+/
>>>\xaa<<<
>>>\xba<<<
Expand Down
13 changes: 13 additions & 0 deletions testdata/testoutput1
Original file line number Diff line number Diff line change
Expand Up @@ -8174,6 +8174,19 @@ MK: B
D
No match, mark = B

/(*COMMIT)ABC/no_start_optimize
ABC
0: ABC
\= Expect no match
DEFABC
No match

/(*COMMIT)ABC/
ABC
0: ABC
DEFABC
0: ABC

# This should fail, as the skip causes a bump to offset 3 (the skip).

/A(*MARK:A)A+(*SKIP)(B|Z) | AC/x,mark
Expand Down
17 changes: 12 additions & 5 deletions testdata/testoutput3
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,6 @@ No match
�cole
0: �cole

/^[\w]+/
\= Expect no match
�cole
No match

/^[\W]+/
�cole
0: \xc9
Expand Down Expand Up @@ -115,6 +110,18 @@ No match
\x9c
No match

/�/i
\xff
0: �
\= Expect no match
y
No match

/(.)\1/i
\xfe\xde
0: ��
1: �

/\W+/
>>>\xaa<<<
0: >>>
Expand Down
17 changes: 12 additions & 5 deletions testdata/testoutput3A
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,6 @@ No match
�cole
0: �cole

/^[\w]+/
\= Expect no match
�cole
No match

/^[\W]+/
�cole
0: \xc9
Expand Down Expand Up @@ -115,6 +110,18 @@ No match
\x9c
No match

/�/i
\xff
0: �
\= Expect no match
y
No match

/(.)\1/i
\xfe\xde
0: ��
1: �

/\W+/
>>>\xaa<<<
0: >>>
Expand Down
17 changes: 12 additions & 5 deletions testdata/testoutput3B
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,6 @@ No match
�cole
0: �cole

/^[\w]+/
\= Expect no match
�cole
No match

/^[\W]+/
�cole
0: \xc9
Expand Down Expand Up @@ -115,6 +110,18 @@ No match
\x9c
No match

/�/i
\xff
0: �
\= Expect no match
y
No match

/(.)\1/i
\xfe\xde
0: ��
1: �

/\W+/
>>>\xaa<<<
0: >>>
Expand Down
14 changes: 10 additions & 4 deletions testdata/wintestinput3
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@
/^[\w]+/locale=french
�cole

/^[\w]+/
*** Failers
�cole

/^[\W]+/
�cole

Expand Down Expand Up @@ -75,6 +71,16 @@
*** Failers
�cole

/\xb5/i
*** Failers
\x9c

/�/i
\xff
*** Failers
y

/\W+/
>>>\xaa<<<
>>>\xba<<<
Expand Down
Loading

0 comments on commit eb5c19d

Please sign in to comment.