diff --git a/perltest.sh b/perltest.sh index 44a29b901..9b6a52dba 100755 --- a/perltest.sh +++ b/perltest.sh @@ -15,12 +15,19 @@ # a script to Perl through a pipe. See comments below about the data for the # Perl script. If the next argument of this script is "-utf8", a suitable # prefix for the Perl script is set up. - -# If the next argument of this script is -locale, it must be followed by the -# name of a locale, which is then set when running the tests. Setting a locale -# implies -utf8. For example: # -# ./perltest.sh -locale tr_TR.utf8 some-file +# A similar process is used to indicate the desire to set specific locale +# tables per pattern, in a similar way to pcre2test through a locale modifier, +# by using the -locale argument. This can be optionally combined with the +# previous arguments; for example, to process a UTF-8 test file in Turkish, +# add the locale=tr_TR.utf8 modifier to the pattern and -locale to perltest, +# or invoke something like (the specific names of the locale might vary): +# +# ./perltest.sh -utf8 -locale=tr_TR.utf8 some-file +# +# If the -locale argument has no setting, a suitable default locale is used +# when possible and reported at startup; it can always be overridden using the +# locale modifier for each pattern. # # The remaining arguments of this script, if any, are passed to Perl. They are # an input file and an output file. 
If there is one argument, the output is @@ -33,7 +40,7 @@ perl=perl perlarg="" -prefix='' +prefix="" spc="" if [ $# -gt 0 -a "$1" = "-perl" ] ; then @@ -53,27 +60,37 @@ if [ $# -gt 0 -a "$1" = "-w" ] ; then fi if [ $# -gt 0 -a "$1" = "-utf8" ] ; then - prefix="use utf8; require Encode;" + default_locale="C.utf8" + prefix="\ + use utf8;\ + require Encode;" perlarg="$perlarg$spc-CSD" - shift fi -if [ $# -gt 0 -a "$1" = "-locale" ] ; then - if [ $# -lt 2 ] ; then - echo "perltest.sh: Missing locale name - abandoned" - exit 1 +if [ $# -gt 0 ] ; then + case "$1" in + -locale=*) + default_locale=${1#-locale=} + ;; + -locale) + default_locale=${default_locale:-C} + ;; + *) + skip=1 + esac + if [ -z "$skip" ] ; then + prefix="\ + use POSIX qw(locale_h);\ + use locale qw(:ctype);\ + \ + \$default_locale = setlocale(LC_CTYPE, \"$default_locale\");\ + if (\"\$default_locale\" eq \"\")\ + { die \"perltest: Failed to set locale \\\"$default_locale\\\"\\n\"; }\ + print \"Locale: \$default_locale\\n\";\ + $prefix" + shift fi - prefix="use utf8;\ - use POSIX qw(locale_h);\ - use locale;\ - \$loc=setlocale(LC_ALL, \"$2\");\ - if (\"\$loc\" eq \"\")\ - { die \"perltest.sh: Failed to set locale \\\"$2\\\" - abandoned\\n\";}\ - print \"Locale: \$loc\\n\";\ - require Encode;" - shift - shift fi @@ -87,6 +104,7 @@ fi # dupnames ignored (Perl always allows) # hex preprocess pattern with embedded octets # jitstack ignored +# locale use a specific locale tables # mark show mark information # no_auto_possess ignored # no_start_optimize insert (??{""}) at pattern start (disables optimizing) @@ -146,7 +164,7 @@ else { foreach $c (split(//, $_[0])) { - if (ord $c >= 32 && ord $c < 127) { $t .= $c; } + if ($c =~ /^[[:print:]]$/) { $t .= $c; } else { $t .= sprintf("\\x%02x", ord $c); } } } @@ -190,6 +208,12 @@ $default_show_mark = 0; NEXT_RE: for (;;) { + if (defined $locale && defined $default_locale) + { + setlocale(LC_CTYPE, $default_locale); + undef $locale; + } + printf " re> " if 
$interact; last if ! ($_ = <$infile>); printf $outfile "$_" if ! $interact; @@ -263,10 +287,6 @@ for (;;) $mod =~ s/allaftertext,?//; - # Detect utf - - $utf8 = $mod =~ s/utf,?//; - # Remove "dupnames". $mod =~ s/dupnames,?//; @@ -275,6 +295,19 @@ for (;;) $mod =~ s/jitstack=\d+,?//; + # The "locale" modifier indicates which locale to use + if ($mod =~ /locale=([^,]+),?/) + { + die "perltest: missing -locale cmdline flag" unless defined &setlocale; + $locale = setlocale(LC_CTYPE, $1); + if (!defined $locale) + { + print "** Failed to set locale '$1'\n"; + next NEXT_RE; + } + } + $mod =~ s/locale=[^,]*,?//; # Remove it; "locale=" Ignored + # The "mark" modifier requests checking of MARK data */ $show_mark = $default_show_mark | ($mod =~ s/mark,?//); @@ -283,11 +316,16 @@ for (;;) $mod =~ s/ucp,?/u/; + # Detect utf + + $utf8 = $mod =~ s/utf,?//; + # Remove "no_auto_possess". $mod =~ s/no_auto_possess,?//; - # The "hex" modifier instructs us to preprocess the pattern + # The "hex" modifier instructs us to preprocess a pattern with embedded + # octets formatted as two digit hexadecimals if ($mod =~ s/hex,?//) { @@ -321,12 +359,11 @@ for (;;) $mod =~ s/-no_start_optimize,?//; - if ($mod =~ s/no_start_optimize,?//) { $pat =~ s/$del/$del(??{""})/; } + if ($mod =~ s/no_start_optimize,?//) { $pat = '(??{""})' . $pat; } # Add back retained modifiers and check that the pattern is valid. 
$mod =~ s/,//g; - $pattern = "$del$pat$del$mod"; eval "\$_ =~ ${pattern}"; @@ -419,7 +456,7 @@ for (;;) if ($@) { - printf $outfile "Error: $@\n"; + printf $outfile "Error: $@"; next NEXT_RE; } elsif (scalar(@subs) == 0) diff --git a/src/pcre2test.c b/src/pcre2test.c index 8dd8d67c3..c764f1731 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -720,7 +720,7 @@ static modstruct modlist[] = { { "jitstack", MOD_PNDP, MOD_INT, 0, PO(jitstack) }, { "jitverify", MOD_PAT, MOD_CTL, CTL_JITVERIFY, PO(control) }, { "literal", MOD_PAT, MOD_OPT, PCRE2_LITERAL, PO(options) }, - { "locale", MOD_PAT, MOD_STR, LOCALESIZE, PO(locale) }, + { "locale", MOD_PATP, MOD_STR, LOCALESIZE, PO(locale) }, { "mark", MOD_PNDP, MOD_CTL, CTL_MARK, PO(control) }, { "match_invalid_utf", MOD_PAT, MOD_OPT, PCRE2_MATCH_INVALID_UTF, PO(options) }, { "match_limit", MOD_CTM, MOD_INT, 0, MO(match_limit) }, diff --git a/testdata/testinput1 b/testdata/testinput1 index 908ab5ea6..0794502e7 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -5087,6 +5087,15 @@ name)/mark \= Expect no match D +/(*COMMIT)ABC/no_start_optimize + ABC +\= Expect no match + DEFABC + +/(*COMMIT)ABC/ + ABC + DEFABC + # This should fail, as the skip causes a bump to offset 3 (the skip). 
/A(*MARK:A)A+(*SKIP)(B|Z) | AC/x,mark diff --git a/testdata/testinput3 b/testdata/testinput3 index 20f8d4c23..be3373e69 100644 --- a/testdata/testinput3 +++ b/testdata/testinput3 @@ -14,10 +14,6 @@ /^[\w]+/locale=fr_FR École -/^[\w]+/ -\= Expect no match - École - /^[\W]+/ École @@ -80,6 +76,14 @@ \= Expect no match \x9c +/ÿ/i + \xff +\= Expect no match + y + +/(.)\1/i + \xfe\xde + /\W+/ >>>\xaa<<< >>>\xba<<< diff --git a/testdata/testoutput1 b/testdata/testoutput1 index fc2753bfe..8daf83628 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -8174,6 +8174,19 @@ MK: B D No match, mark = B +/(*COMMIT)ABC/no_start_optimize + ABC + 0: ABC +\= Expect no match + DEFABC +No match + +/(*COMMIT)ABC/ + ABC + 0: ABC + DEFABC + 0: ABC + # This should fail, as the skip causes a bump to offset 3 (the skip). /A(*MARK:A)A+(*SKIP)(B|Z) | AC/x,mark diff --git a/testdata/testoutput3 b/testdata/testoutput3 index ffe1f29d8..c6b120df8 100644 --- a/testdata/testoutput3 +++ b/testdata/testoutput3 @@ -16,11 +16,6 @@ No match École 0: École -/^[\w]+/ -\= Expect no match - École -No match - /^[\W]+/ École 0: \xc9 @@ -115,6 +110,18 @@ No match \x9c No match +/ÿ/i + \xff + 0: ÿ +\= Expect no match + y +No match + +/(.)\1/i + \xfe\xde + 0: þÞ + 1: þ + /\W+/ >>>\xaa<<< 0: >>> diff --git a/testdata/testoutput3A b/testdata/testoutput3A index 8b3e06b1d..e21ff2281 100644 --- a/testdata/testoutput3A +++ b/testdata/testoutput3A @@ -16,11 +16,6 @@ No match École 0: École -/^[\w]+/ -\= Expect no match - École -No match - /^[\W]+/ École 0: \xc9 @@ -115,6 +110,18 @@ No match \x9c No match +/ÿ/i + \xff + 0: ÿ +\= Expect no match + y +No match + +/(.)\1/i + \xfe\xde + 0: þÞ + 1: þ + /\W+/ >>>\xaa<<< 0: >>> diff --git a/testdata/testoutput3B b/testdata/testoutput3B index 73120b002..79aafcee3 100644 --- a/testdata/testoutput3B +++ b/testdata/testoutput3B @@ -16,11 +16,6 @@ No match École 0: École -/^[\w]+/ -\= Expect no match - École -No match - /^[\W]+/ École 0: \xc9 @@ -115,6 +110,18 @@ No 
match \x9c No match +/ÿ/i + \xff + 0: ÿ +\= Expect no match + y +No match + +/(.)\1/i + \xfe\xde + 0: þÞ + 1: þ + /\W+/ >>>\xaa<<< 0: >>> diff --git a/testdata/wintestinput3 b/testdata/wintestinput3 index 8d8017a65..6f124e376 100644 --- a/testdata/wintestinput3 +++ b/testdata/wintestinput3 @@ -14,10 +14,6 @@ /^[\w]+/locale=french École -/^[\w]+/ - *** Failers - École - /^[\W]+/ École @@ -75,6 +71,16 @@ *** Failers école +/\xb5/i + µ + *** Failers + \x9c + +/ÿ/i + \xff + *** Failers + y + /\W+/ >>>\xaa<<< >>>\xba<<< diff --git a/testdata/wintestoutput3 b/testdata/wintestoutput3 index b1894b66d..ad6a0e1aa 100644 --- a/testdata/wintestoutput3 +++ b/testdata/wintestoutput3 @@ -17,12 +17,6 @@ No match École 0: École -/^[\w]+/ - *** Failers -No match - École -No match - /^[\W]+/ École 0: \xc9 @@ -89,17 +83,17 @@ No match /\w/I Capture group count = 0 -Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P - Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z +Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P + Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z Subject length lower bound = 1 /\w/I,locale=french Capture group count = 0 -Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P - Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z - ƒ Š Œ Ž š œ ž Ÿ ª ² ³ µ ¹ º À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö - Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø ù ú û ü ý - þ ÿ +Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P + Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z + ƒ Š Œ Ž š œ ž Ÿ ª ² ³ µ ¹ º À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö + Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø ù ú û ü ý + þ ÿ Subject length lower bound = 1 # All remaining tests are in the french locale, so set the default. 
@@ -120,6 +114,22 @@ No match école No match +/\xb5/i + µ + 0: µ + *** Failers +No match + \x9c +No match + +/ÿ/i + \xff + 0: ÿ + *** Failers +No match + y +No match + /\W+/ >>>\xaa<<< 0: >>> @@ -166,10 +176,10 @@ No match End ------------------------------------------------------------------ Capture group count = 0 -Starting code units: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z - a b c d e f g h i j k l m n o p q r s t u v w x y z ƒ Š Œ Ž š œ ž Ÿ ª µ º - À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å - æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø ù ú û ü ý þ ÿ +Starting code units: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z + a b c d e f g h i j k l m n o p q r s t u v w x y z ƒ Š Œ Ž š œ ž Ÿ ª µ º + À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å + æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø ù ú û ü ý þ ÿ Subject length lower bound = 3 # End of testinput3