phpstan · ondrejmirtes · Jul 19, 2024 · Jul 15, 2024 · Jul 19, 2024 · Jul 19, 2024
diff --git a/resources/RegexGrammar.pp b/resources/RegexGrammar.pp
@@ -46,13 +46,23 @@
 %skip   nl                       \n
 
 // Character classes.
-%token  negative_class_          \[\^
-%token  class_                   \[
-%token _class                    \]
-%token  range                    \-
+%token  negative_class_          \[\^               -> class
+%token  class_                   \[                 -> class
+%token class:posix_class         \[:\^?[a-z]+:\]
+%token class:class_              \[
+%token class:_class_literal      (?<=[^\\]\[|[^\\]\[\^)\]
+%token class:_class              \]                 -> default
+%token class:range               \-
+%token class:escaped_end_class   \\\]
+// Same escape tokens as the default-namespace literals; class:character additionally accepts \b (backspace inside a character class)
+%token class:character           \\([aefnrtb]|c[\x00-\x7f])
+%token class:dynamic_character   \\([0-7]{3}|x[0-9a-zA-Z]{2}|x{[0-9a-zA-Z]+})
+%token class:character_type      \\([CdDhHNRsSvVwWX]|[pP]{[^}]+})
+%token class:literal             \\.|.
 
 // Internal options.
-%token  internal_option          \(\?[\-+]?[imsx]\)
+// See https://www.regular-expressions.info/refmodifiers.html
+%token  internal_option          \(\?([imsxnJUX^]|xx)?-?([imsxnJUX^]|xx)\)
 
 // Lookahead and lookbehind assertions.
 %token  lookahead_               \(\?=
@@ -77,6 +87,7 @@
 %token  nc:_named_capturing      >                  -> default
 %token  nc:capturing_name        .+?(?=(?<!\\)>)
 %token  non_capturing_           \(\?:
+%token  non_capturing_internal_option \(\?([imsxnJUX^]|xx)?-?([imsxnJUX^]|xx):
 %token  non_capturing_reset_     \(\?\|
 %token  atomic_group_            \(\?>
 %token  capturing_               \(
@@ -168,7 +179,7 @@
         ::negative_class_:: #negativeclass
       | ::class_::
     )
-    ( <class_> | range() | literal() )+
+    ( <range> | <_class_literal> )? ( <posix_class> | <class_> | range() | literal() | <escaped_end_class> )* <range>?
     ::_class::
 
 #range:
@@ -183,15 +194,18 @@
   | (
         ::named_capturing_:: <capturing_name> ::_named_capturing:: #namedcapturing
       | ::non_capturing_:: #noncapturing
+      | non_capturing_internal_options() #noncapturing
       | ::non_capturing_reset_:: #noncapturingreset
       | ::atomic_group_:: #atomicgroup
       | ::capturing_::
     )
     alternation() ::_capturing::
 
+non_capturing_internal_options:
+    <non_capturing_internal_option>
+
 literal:
     <character>
-  | <range>
   | <dynamic_character>
   | <character_type>
   | <anchor>

diff --git a/src/Type/Php/RegexArrayShapeMatcher.php b/src/Type/Php/RegexArrayShapeMatcher.php
@@ -7,6 +7,7 @@
 use Hoa\Compiler\Llk\TreeNode;
 use Hoa\Exception\Exception;
 use Hoa\File\Read;
+use Nette\Utils\RegexpException;
 use Nette\Utils\Strings;
 use PhpParser\Node\Expr;
 use PhpParser\Node\Name;
@@ -31,7 +32,11 @@
 use function in_array;
 use function is_int;
 use function is_string;
+use function rtrim;
 use function sscanf;
+use function str_replace;
+use function strlen;
+use function substr;
 use const PREG_OFFSET_CAPTURE;
 use const PREG_UNMATCHED_AS_NULL;
 
@@ -375,6 +380,13 @@ private function parseGroups(string $regex): ?array
 			self::$parser = Llk::load(new Read(__DIR__ . '/../../../resources/RegexGrammar.pp'));
 		}
 
+		try {
+			Strings::match('', $regex);
+		} catch (RegexpException) {
+			// pattern is invalid, so let the RegularExpressionPatternRule report it
+			return null;
+		}
+
 		try {
 			$ast = self::$parser->parse($regex);
 		} catch (Exception) {
@@ -516,25 +528,37 @@ private function getQuantificationRange(TreeNode $node): array
 		$lastChild = $node->getChild($node->getChildrenNumber() - 1);
 		$value = $lastChild->getValue();
 
-		if ($value['token'] === 'n_to_m') {
-			if (sscanf($value['value'], '{%d,%d}', $n, $m) !== 2 || !is_int($n) || !is_int($m)) {
+		// normalize away possessive and lazy quantifier-modifiers
+		$token = str_replace(['_possessive', '_lazy'], '', $value['token']);
+		$value = rtrim($value['value'], '+?');
+
+		if ($token === 'n_to_m') {
+			if (sscanf($value, '{%d,%d}', $n, $m) !== 2 || !is_int($n) || !is_int($m)) {
 				throw new ShouldNotHappenException();
 			}
 
 			$min = $n;
 			$max = $m;
-		} elseif ($value['token'] === 'exactly_n') {
-			if (sscanf($value['value'], '{%d}', $n) !== 1 || !is_int($n)) {
+		} elseif ($token === 'n_or_more') {
+			if (sscanf($value, '{%d,}', $n) !== 1 || !is_int($n)) {
+				throw new ShouldNotHappenException();
+			}
+
+			$min = $n;
+		} elseif ($token === 'exactly_n') {
+			if (sscanf($value, '{%d}', $n) !== 1 || !is_int($n)) {
 				throw new ShouldNotHappenException();
 			}
 
 			$min = $n;
 			$max = $n;
-		} elseif ($value['token'] === 'zero_or_one') {
+		} elseif ($token === 'zero_or_one') {
 			$min = 0;
 			$max = 1;
-		} elseif ($value['token'] === 'zero_or_more') {
+		} elseif ($token === 'zero_or_more') {
 			$min = 0;
+		} elseif ($token === 'one_or_more') {
+			$min = 1;
 		}
 
 		return [$min, $max];
@@ -591,20 +615,8 @@ private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryL
 			if ($literalValue !== null) {
 				if (Strings::match($literalValue, '/^\d+$/') === null) {
 					$isNumeric = TrinaryLogic::createNo();
-				}
-
-				if (!$inOptionalQuantification) {
-					$isNonEmpty = TrinaryLogic::createYes();
-				}
-			}
-
-			if ($ast->getValueToken() === 'character_type') {
-				if ($ast->getValueValue() === '\d') {
-					if ($isNumeric->maybe()) {
-						$isNumeric = TrinaryLogic::createYes();
-					}
-				} else {
-					$isNumeric = TrinaryLogic::createNo();
+				} elseif ($isNumeric->maybe()) {
+					$isNumeric = TrinaryLogic::createYes();
 				}
 
 				if (!$inOptionalQuantification) {
@@ -613,32 +625,11 @@ private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryL
 			}
 		}
 
-		if ($ast->getId() === '#range' || $ast->getId() === '#class') {
-			if ($isNumeric->maybe()) {
-				$allNumeric = null;
-				foreach ($children as $child) {
-					$literalValue = $this->getLiteralValue($child);
-
-					if ($literalValue === null) {
-						break;
-					}
-
-					if (Strings::match($literalValue, '/^\d+$/') === null) {
-						$allNumeric = false;
-						break;
-					}
-
-					$allNumeric = true;
-				}
-
-				if ($allNumeric === true) {
-					$isNumeric = TrinaryLogic::createYes();
-				}
-			}
-
-			if (!$inOptionalQuantification) {
-				$isNonEmpty = TrinaryLogic::createYes();
-			}
+		// A negated class such as [^0-9] must not be inferred as numeric-string. A negated class that
+		// excludes everything except digits is technically possible, but nobody would write that instead
+		// of \d, so every negative class is safely treated as non-numeric.
+		if ($ast->getId() === '#negativeclass') {
+			$isNumeric = TrinaryLogic::createNo();
 		}
 
 		foreach ($children as $child) {
@@ -653,13 +644,65 @@ private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryL
 
 	private function getLiteralValue(TreeNode $node): ?string
 	{
-		if ($node->getId() === 'token' && $node->getValueToken() === 'literal') {
-			return $node->getValueValue();
+		if ($node->getId() !== 'token') {
+			return null;
+		}
+
+		// token is the token name from grammar without the namespace so literal and class:literal are both called literal here
+		$token = $node->getValueToken();
+		$value = $node->getValueValue();
+
+		if (in_array($token, ['literal', 'escaped_end_class'], true)) {
+			if (strlen($node->getValueValue()) > 1 && $value[0] === '\\') {
+				return substr($value, 1);
+			}
+
+			return $value;
+		}
+
+		// literal "-" in front/back of a character class like '[-a-z]' or '[abc-]', not forming a range
+		if ($token === 'range') {
+			return $value;
+		}
+
+		// literal "[" or "]" inside character classes '[[]' or '[]]'
+		if (in_array($token, ['class_', '_class_literal'], true)) {
+			return $value;
+		}
+
+		// character escape sequences, just return a fixed string
+		if (in_array($token, ['character', 'dynamic_character', 'character_type'], true)) {
+			if ($token === 'character_type' && $value === '\d') {
+				return '0';
+			}
+
+			return $value;
+		}
+
+		// [:digit:] and the like, more support coming later
+		if ($token === 'posix_class') {
+			if ($value === '[:digit:]') {
+				return '0';
+			}
+			if (in_array($value, ['[:alpha:]', '[:alnum:]', '[:upper:]', '[:lower:]', '[:word:]', '[:ascii:]', '[:print:]', '[:xdigit:]', '[:graph:]'], true)) {
+				return 'a';
+			}
+			if ($value === '[:blank:]') {
+				return " \t";
+			}
+			if ($value === '[:cntrl:]') {
+				return "\x00\x1F";
+			}
+			if ($value === '[:space:]') {
+				return " \t\r\n\v\f";
+			}
+			if ($value === '[:punct:]') {
+				return '!"#$%&\'()*+,\-./:;<=>?@[\]^_`{|}~';
+			}
 		}
 
-		// literal "-" outside of a character class like '~^((\\d{1,6})-)$~'
-		if ($node->getId() === 'token' && $node->getValueToken() === 'range') {
-			return $node->getValueValue();
+		if ($token === 'anchor' || $token === 'match_point_reset') {
+			return '';
 		}
 
 		return null;

diff --git a/tests/PHPStan/Analyser/LegacyNodeScopeResolverTest.php b/tests/PHPStan/Analyser/LegacyNodeScopeResolverTest.php
@@ -8003,7 +8003,7 @@ public function dataPassedByReference(): array
 				'$arr',
 			],
 			[
-				'array{0?: string}',
+				'array<string>',
 				'$matches',
 			],
 			[

diff --git a/tests/PHPStan/Analyser/nsrt/preg_match_shapes.php b/tests/PHPStan/Analyser/nsrt/preg_match_shapes.php
@@ -127,28 +127,18 @@ function doUnknownFlags(string $s, int $flags): void {
 	assertType('array<array{string|null, int<-1, max>}|string|null>', $matches);
 }
 
-function doNonAutoCapturingModifier(string $s): void {
-	if (preg_match('/(?n)(\d+)/', $s, $matches)) {
-		// could be assertType('array{string}', $matches);
-		assertType('array<string>', $matches);
-	}
-	assertType('array<string>', $matches);
-}
-
 function doMultipleAlternativeCaptureGroupsWithSameNameWithModifier(string $s): void {
 	if (preg_match('/(?J)(?<Foo>[a-z]+)|(?<Foo>[0-9]+)/', $s, $matches)) {
-		// could be assertType('array{0: string, Foo: string, 1: string}', $matches);
-		assertType('array<string>', $matches);
+		assertType('array{0: string, Foo: numeric-string|non-empty-string, 1: non-empty-string, 2?: numeric-string}', $matches);
 	}
-	assertType('array<string>', $matches);
+	assertType('array{}|array{0: string, Foo: numeric-string|non-empty-string, 1: non-empty-string, 2?: numeric-string}', $matches);
 }
 
 function doMultipleConsecutiveCaptureGroupsWithSameNameWithModifier(string $s): void {
 	if (preg_match('/(?J)(?<Foo>[a-z]+)|(?<Foo>[0-9]+)/', $s, $matches)) {
-		// could be assertType('array{0: string, Foo: string, 1: string}', $matches);
-		assertType('array<string>', $matches);
+		assertType('array{0: string, Foo: numeric-string|non-empty-string, 1: non-empty-string, 2?: numeric-string}', $matches);
 	}
-	assertType('array<string>', $matches);
+	assertType('array{}|array{0: string, Foo: numeric-string|non-empty-string, 1: non-empty-string, 2?: numeric-string}', $matches);
 }
 
 // https://github.com/hoaproject/Regex/issues/31
@@ -472,3 +462,60 @@ function (string $s): void {
 		assertType("array{string, non-empty-string}", $matches);
 	}
 };
+
+function bug11323(string $s): void { // array-shape inference for character classes, inline options and quantifier modifiers
+	if (preg_match('/([*|+?{}()]+)([^*|+[:digit:]?{}()]+)/', $s, $matches)) { // regex metacharacters listed inside [...] are plain class members; the negated class is not numeric
+		assertType('array{string, non-empty-string, non-empty-string}', $matches);
+	}
+	if (preg_match('/\p{L}[[\]]+([-*|+?{}(?:)]+)([^*|+[:digit:]?{a-z}(\p{L})\a-]+)/', $s, $matches)) { // literal "[" and escaped "]" in a class, plus "-" at the start/end of a class
+		assertType('array{string, non-empty-string, non-empty-string}', $matches);
+	}
+	if (preg_match('{([-\p{L}[\]*|\x03\a\b+?{}(?:)-]+[^[:digit:]?{}a-z0-9#-k]+)(a-z)}', $s, $matches)) { // character escapes (\x03, \a, \b) inside a class; "a-z" outside a class is literal text
+		assertType('array{string, non-empty-string, non-empty-string}', $matches);
+	}
+	if (preg_match('{(\d+)(?i)insensitive((?x-i)case SENSITIVE here(?i:insensitive non-capturing group))}', $s, $matches)) { // inline option settings and the option-scoped non-capturing group add no entries to the shape
+		assertType('array{string, numeric-string, non-empty-string}', $matches);
+	}
+	if (preg_match('{([]] [^]])}', $s, $matches)) { // "]" directly after "[" or "[^" is a literal member, not the end of the class
+		assertType('array{string, non-empty-string}', $matches);
+	}
+	if (preg_match('{([[:digit:]])}', $s, $matches)) { // POSIX [:digit:] class yields numeric-string
+		assertType('array{string, numeric-string}', $matches);
+	}
+	if (preg_match('{([\d])(\d)}', $s, $matches)) { // \d is numeric both inside and outside a class
+		assertType('array{string, numeric-string, numeric-string}', $matches);
+	}
+	if (preg_match('{([0-9])}', $s, $matches)) { // digit range inside a class yields numeric-string
+		assertType('array{string, numeric-string}', $matches);
+	}
+	if (preg_match('{(\p{L})(\p{P})(\p{Po})}', $s, $matches)) { // unicode property escapes: non-empty but not numeric
+		assertType('array{string, non-empty-string, non-empty-string, non-empty-string}', $matches);
+	}
+	if (preg_match('{(a)??(b)*+(c++)(d)+?}', $s, $matches)) { // lazy and possessive quantifier modifiers keep the range of the base quantifier
+		assertType('array{string, string, string, non-empty-string, non-empty-string}', $matches);
+	}
+	if (preg_match('{(.\d)}', $s, $matches)) { // a leading "." prevents the group from being numeric
+		assertType('array{string, non-empty-string}', $matches);
+	}
+	if (preg_match('{(\d.)}', $s, $matches)) { // a trailing "." prevents the group from being numeric
+		assertType('array{string, non-empty-string}', $matches);
+	}
+	if (preg_match('{(\d\d)}', $s, $matches)) { // digits only: numeric-string
+		assertType('array{string, numeric-string}', $matches);
+	}
+	if (preg_match('{(.(\d))}', $s, $matches)) { // outer group is not numeric, nested group is
+		assertType('array{string, non-empty-string, numeric-string}', $matches);
+	}
+	if (preg_match('{((\d).)}', $s, $matches)) { // same as above with the nested group first
+		assertType('array{string, non-empty-string, numeric-string}', $matches);
+	}
+	if (preg_match('{(\d([1-4])\d)}', $s, $matches)) { // outer and nested group both consist of digits only
+		assertType('array{string, numeric-string, numeric-string}', $matches);
+	}
+	if (preg_match('{(x?([1-4])\d)}', $s, $matches)) { // an optional non-digit literal makes the outer group non-numeric
+		assertType('array{string, non-empty-string, numeric-string}', $matches);
+	}
+	if (preg_match('{([^1-4])}', $s, $matches)) { // a negated class is never inferred as numeric
+		assertType('array{string, non-empty-string}', $matches);
+	}
+}
diff --git a/tests/PHPStan/Analyser/nsrt/preg_match_shapes_php80.php b/tests/PHPStan/Analyser/nsrt/preg_match_shapes_php80.php
@@ -11,3 +11,11 @@ function doOffsetCaptureWithUnmatchedNull(string $s): void {
 	}
 	assertType('array{}|array{array{string|null, int<-1, max>}, array{non-empty-string|null, int<-1, max>}, array{non-empty-string|null, int<-1, max>}, array{non-empty-string|null, int<-1, max>}}', $matches);
 }
+
+function doNonAutoCapturingModifier(string $s): void {
+	if (preg_match('/(?n)(\d+)/', $s, $matches)) {
+		// (?n) turns off capturing for plain (...) groups, so this should be assertType('array{string}', $matches); the asserted type below still counts the group
+		assertType('array{string, numeric-string}', $matches);
+	}
+	assertType('array{}|array{string, numeric-string}', $matches);
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -8003,7 +8003,7 @@ public function dataPassedByReference(): array @@
     				'$arr',
     			],
     			[
-    				'array{0?: string}',
+    				'array<string>',
     				'$matches',
     			],
     			[
@@ Expand Down @@