From 07c5639b9af523f9672204cc048577154d5a3fc0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= <carenas@gmail.com>
Date: Sun, 29 Sep 2024 13:01:46 -0700
Subject: [PATCH] pcre2test: tighten \x{...} parsing in subject

Even though it is documented that invalid escapes will be reported,
the code would still allow a NUL to be generated when a non-terminated
\x{ escape was being used. Presumably the fact that there was a test relying
on it means that Perl might have done so at one time, but I have confirmed
that no version does since at least 5.6.1, and indeed the test was silently
failing with perltest.sh as well. Additionally, the code for overlong escapes
was returning an incorrect value.

Refactor the code to report the error in those cases, and hopefully
adjust the test to match.

While at it update related documentation in Perl's compatibility.
---
 doc/pcre2compat.3        | 12 ++++++---
 doc/pcre2test.1          |  2 +-
 perltest.sh              |  8 +++++-
 src/pcre2test.c          | 54 +++++++++++++++++++++-------------------
 testdata/testinput10     |  3 ++-
 testdata/testinput12     |  2 +-
 testdata/testoutput10    |  3 ++-
 testdata/testoutput12-16 |  2 +-
 testdata/testoutput12-32 |  2 +-
 9 files changed, 52 insertions(+), 36 deletions(-)

diff --git a/doc/pcre2compat.3 b/doc/pcre2compat.3
index 74a9ce8a8..cc8b7709f 100644
--- a/doc/pcre2compat.3
+++ b/doc/pcre2compat.3
@@ -226,9 +226,15 @@ handled by PCRE2, either by the interpreter or the JIT. An example is
 /(?:|(?0)abcd)(?(R)|\ez)/, which matches a sequence of any number of repeated
 "abcd" substrings at the end of the subject.
 .P
-23. From release 10.45, PCRE2 gives an error if \ex is not followed by a 
-hexadecimal digit or a curly bracket. It used to interpret this as the NUL 
-character. Perl still generates NUL, but warns in its warning mode.
+23. Both PCRE2 and Perl error when \ex{ escapes are invalid, but Perl tries to
+recover and prints a warning if the problem was that an invalid hexadecimal
+digit was found. Since PCRE2 doesn't have warnings, it returns an error instead.
+Additionally, Perl accepts \ex{} and generates NUL, unlike PCRE2.
+.P
+24. From release 10.45, PCRE2 gives an error if \ex is not followed by a
+hexadecimal digit or a curly bracket. It used to interpret this as the NUL
+character. Perl still generates NUL, but warns when in warning mode in most
+cases.
 .
 .
 .SH AUTHOR
diff --git a/doc/pcre2test.1 b/doc/pcre2test.1
index 1313706bf..3ea58f72b 100644
--- a/doc/pcre2test.1
+++ b/doc/pcre2test.1
@@ -516,7 +516,7 @@ this makes it possible to construct invalid UTF-8 sequences for testing
 purposes. On the other hand, \ex{hh} is interpreted as a UTF-8 character in
 UTF-8 mode, generating more than one byte if the value is greater than 127.
 When testing the 8-bit library not in UTF-8 mode, \ex{hh} generates one byte
-for values less than 256, and causes an error for greater values.
+for values that fit in it, and causes an error for greater values.
 .P
 In UTF-16 mode, all 4-digit \ex{hhhh} values are accepted. This makes it
 possible to construct invalid UTF-16 sequences for testing purposes.
diff --git a/perltest.sh b/perltest.sh
index b05500665..219fcbb4c 100755
--- a/perltest.sh
+++ b/perltest.sh
@@ -314,7 +314,13 @@ for (;;)
       }
     else
       {
-      $x = eval "\"$_\"";   # To get escapes processed
+      s/(?<!\\)\\$//;     # Remove pcre2test specific trailing backslash
+      $x = eval "\"$_\""; # To get escapes processed
+      if ($interact && $@)
+        {
+        print STDERR "$@";
+        redo;
+        }
       }
 
     # Empty array for holding results, ensure $REGERROR and $REGMARK are
diff --git a/src/pcre2test.c b/src/pcre2test.c
index 852f1b848..1b433263e 100644
--- a/src/pcre2test.c
+++ b/src/pcre2test.c
@@ -7174,10 +7174,10 @@ while ((c = *p++) != 0)
     break;
 
     case 'x':
+    c = 0;
     if (*p == '{')
       {
       uint8_t *pt = p;
-      c = 0;
 
       /* We used to have "while (isxdigit(*(++pt)))" here, but it fails
       when isxdigit() is a macro that refers to its argument more than
@@ -7187,36 +7187,41 @@ while ((c = *p++) != 0)
       for (pt++; isxdigit(*pt); pt++)
         {
         if (++i == 9)
+          {
           fprintf(outfile, "** Too many hex digits in \\x{...} item; "
                            "using only the first eight.\n");
-        else c = c * 16 + (tolower(*pt) - ((isdigit(*pt))? '0' : 'a' - 10));
+          while (isxdigit(*pt)) pt++;
+          break;
+          }
+        else c = c * 16 + (tolower(*pt) - (isdigit(*pt)? '0' : 'a' - 10));
         }
-      if (*pt == '}')
+      if (i == 0 || *pt != '}')
         {
-        p = pt + 1;
-        break;
+        fprintf(outfile, "** Malformed \\x{ escape\n");
+        return PR_OK;
         }
-      /* Not correct form for \x{...}; fall through */
+      else p = pt + 1;
       }
-
-    /* \x without {} always defines just one byte in 8-bit mode. This
-    allows UTF-8 characters to be constructed byte by byte, and also allows
-    invalid UTF-8 sequences to be made. Just copy the byte in UTF-8 mode.
-    Otherwise, pass it down as data. */
-
-    c = 0;
-    while (i++ < 2 && isxdigit(*p))
+    else
       {
-      c = c * 16 + (tolower(*p) - ((isdigit(*p))? '0' : 'a' - 10));
-      p++;
-      }
+      /* \x without {} always defines just one byte in 8-bit mode. This
+      allows UTF-8 characters to be constructed byte by byte, and also allows
+      invalid UTF-8 sequences to be made. Just copy the byte in UTF-8 mode.
+      Otherwise, pass it down as data. */
+
+      while (i++ < 2 && isxdigit(*p))
+        {
+        c = c * 16 + (tolower(*p) - (isdigit(*p)? '0' : 'a' - 10));
+        p++;
+        }
 #if defined SUPPORT_PCRE2_8
-    if (utf && (test_mode == PCRE8_MODE))
-      {
-      *q8++ = c;
-      continue;
-      }
+      if (utf && (test_mode == PCRE8_MODE))
+        {
+        *q8++ = c;
+        continue;
+        }
 #endif
+      }
     break;
 
     case 0:     /* \ followed by EOF allows for an empty line */
@@ -7309,10 +7314,7 @@ while ((c = *p++) != 0)
     }
 #endif
 #ifdef SUPPORT_PCRE2_32
-  if (test_mode == PCRE32_MODE)
-    {
-    *q32++ = c;
-    }
+  if (test_mode == PCRE32_MODE) *q32++ = c;
 #endif
   }
 
diff --git a/testdata/testinput10 b/testdata/testinput10
index c17010e68..dacbbf223 100644
--- a/testdata/testinput10
+++ b/testdata/testinput10
@@ -187,8 +187,9 @@
     \x{c0}
     \x{f0}
 
+# This used to test: \x{100}\x{100}\x{100}\x{100\x{100}
 /Ā{3,4}/IB,utf
-  \x{100}\x{100}\x{100}\x{100\x{100}
+    \x{100}\x{100}\x{100}\x00100\x{100}
 
 /(\x{100}+|x)/IB,utf
 
diff --git a/testdata/testinput12 b/testdata/testinput12
index 00ca7e64b..29699c0ab 100644
--- a/testdata/testinput12
+++ b/testdata/testinput12
@@ -57,7 +57,7 @@
     \x{f0}
 
 /Ā{3,4}/IB,utf
-  \x{100}\x{100}\x{100}\x{100\x{100}
+  \x{100}\x{100}\x{100}\0x100\x{100}
 
 /(\x{100}+|x)/IB,utf
 
diff --git a/testdata/testoutput10 b/testdata/testoutput10
index 7f2019981..08b4390f9 100644
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@@ -492,6 +492,7 @@ No match
     \x{f0}
 No match
 
+# This used to test: \x{100}\x{100}\x{100}\x{100\x{100}
 /Ā{3,4}/IB,utf
 ------------------------------------------------------------------
         Bra
@@ -505,7 +506,7 @@ Options: utf
 First code unit = \xc4
 Last code unit = \x80
 Subject length lower bound = 3
-  \x{100}\x{100}\x{100}\x{100\x{100}
+    \x{100}\x{100}\x{100}\x00100\x{100}
  0: \x{100}\x{100}\x{100}
 
 /(\x{100}+|x)/IB,utf
diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16
index a384484c6..ee36b6af2 100644
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@@ -286,7 +286,7 @@ Options: utf
 First code unit = \x{100}
 Last code unit = \x{100}
 Subject length lower bound = 3
-  \x{100}\x{100}\x{100}\x{100\x{100}
+  \x{100}\x{100}\x{100}\0x100\x{100}
  0: \x{100}\x{100}\x{100}
 
 /(\x{100}+|x)/IB,utf
diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32
index 03a0f6b93..84fd93659 100644
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@@ -281,7 +281,7 @@ Options: utf
 First code unit = \x{100}
 Last code unit = \x{100}
 Subject length lower bound = 3
-  \x{100}\x{100}\x{100}\x{100\x{100}
+  \x{100}\x{100}\x{100}\0x100\x{100}
  0: \x{100}\x{100}\x{100}
 
 /(\x{100}+|x)/IB,utf