From fbce951ee3991c63e4a7fcc2c65a2b84097a1427 Mon Sep 17 00:00:00 2001 From: Dan Balasescu Date: Thu, 1 Aug 2024 20:17:19 +0900 Subject: [PATCH] Implement PHP string escaping semantics --- .../Php/PhpStringLiteralSyntaxNode.cs | 115 ++++++++++++++++-- .../Php/PhpTokeniser.cs | 23 ++++ 2 files changed, 129 insertions(+), 9 deletions(-) diff --git a/LocalisationAnalyser.Tools/Php/PhpStringLiteralSyntaxNode.cs b/LocalisationAnalyser.Tools/Php/PhpStringLiteralSyntaxNode.cs index 646a3d6..aaedea5 100644 --- a/LocalisationAnalyser.Tools/Php/PhpStringLiteralSyntaxNode.cs +++ b/LocalisationAnalyser.Tools/Php/PhpStringLiteralSyntaxNode.cs @@ -1,7 +1,10 @@ // Copyright (c) ppy Pty Ltd . Licensed under the MIT Licence. // See the LICENCE file in the repository root for full licence text. +using System; +using System.Globalization; using System.Text; +using System.Text.RegularExpressions; namespace LocalisationAnalyser.Tools.Php { @@ -10,6 +13,10 @@ namespace LocalisationAnalyser.Tools.Php /// public class PhpStringLiteralSyntaxNode : PhpLiteralSyntaxNode { + private static readonly Regex oct_pattern = new Regex("^([0-7]{1,3})", RegexOptions.Compiled); + private static readonly Regex hex_pattern = new Regex("^(x[0-9A-Fa-f]{1,2})", RegexOptions.Compiled); + private static readonly Regex uni_pattern = new Regex("^(u{[0-9A-Fa-f]+})", RegexOptions.Compiled); + public readonly string Text; public PhpStringLiteralSyntaxNode(string text) @@ -21,27 +28,23 @@ public static PhpStringLiteralSyntaxNode Parse(PhpTokeniser tokeniser) { tokeniser.SkipWhitespace(); - char trivia = tokeniser.GetTrivia(); - - // Skip leading trivia. + char leader = tokeniser.GetTrivia(); tokeniser.Advance(); var stringBuilder = new StringBuilder(); - bool isEscaping = false; - while (isEscaping || tokeniser.GetTrivia() != trivia) + while (tokeniser.GetTrivia() != leader) { - var token = tokeniser.GetTrivia(); + char token = tokeniser.GetTrivia(); tokeniser.Advance(); - if (token == '\\' && !isEscaping) + if (token == '\\') { - isEscaping = true; + stringBuilder.Append(processEscapeSequence(leader, tokeniser)); continue; } stringBuilder.Append(token); - isEscaping = false; } // Skip trailing trivia. @@ -50,5 +53,99 @@ public static PhpStringLiteralSyntaxNode Parse(PhpTokeniser tokeniser) return new PhpStringLiteralSyntaxNode(stringBuilder.ToString()); } + + private static string processEscapeSequence(char leader, PhpTokeniser tokeniser) + { + char trivia = tokeniser.GetTrivia(); + + // Base cases for \{leader} and \\, supported by both single- and double-quoted strings. + if (trivia == leader || trivia == '\\') + { + tokeniser.Advance(); + return trivia.ToString(); + } + + // No other escape sequences are supported for single-quoted strings. + if (leader == '\'') + return @"\"; + + // Double-quoted strings have a few more cases... + switch (trivia) + { + case 'n': + tokeniser.Advance(); + return "\n"; + + case 'r': + tokeniser.Advance(); + return "\r"; + + case 't': + tokeniser.Advance(); + return "\t"; + + case 'v': + tokeniser.Advance(); + return "\v"; + + case 'e': + tokeniser.Advance(); + return "\x1B"; + + case 'f': + tokeniser.Advance(); + return "\f"; + + case '$': + tokeniser.Advance(); + return "$"; + + case >= '0' and <= '7': + { + Match match = oct_pattern.Match($"{trivia}{tokeniser.PeekNext(2)}"); + + if (match.Success) + { + tokeniser.Advance(match.Length); + + unchecked + { + byte octValue = (byte)Convert.ToInt32(match.Value, 8); + return ((char)octValue).ToString(); + } + } + + break; + } + + case 'x': + { + Match match = hex_pattern.Match($"{trivia}{tokeniser.PeekNext(2)}"); + + if (match.Success) + { + tokeniser.Advance(match.Length); + return ((char)byte.Parse(match.Value[1..], NumberStyles.HexNumber, CultureInfo.InvariantCulture)).ToString(); + } + + break; + } + + case 'u': + { + Match match = uni_pattern.Match($"{trivia}{tokeniser.PeekNext(16)}"); + + if (match.Success) + { + tokeniser.Advance(match.Length); + return char.ConvertFromUtf32(Convert.ToInt32(match.Value[2..^1], 16)); + } + + break; + } + } + + return @"\"; + } } } diff --git a/LocalisationAnalyser.Tools/Php/PhpTokeniser.cs b/LocalisationAnalyser.Tools/Php/PhpTokeniser.cs index dc3cc71..98be66b 100644 --- a/LocalisationAnalyser.Tools/Php/PhpTokeniser.cs +++ b/LocalisationAnalyser.Tools/Php/PhpTokeniser.cs @@ -67,6 +67,16 @@ public void TryAdvance() Advance(); } + /// + /// Advances by a number of trivia. + /// + /// The number of trivias to advance by. + public void Advance(int length) + { + for (int i = 0; i < length; i++) + Advance(); + } + /// /// Advances to the next trivia. /// @@ -104,6 +114,19 @@ public bool TryPeekNext(out char trivia) return true; } + /// + /// Peeks a number of future trivia. + /// + /// The length of trivia to peek. + /// The trivia. + public string PeekNext(int length) + { + int startIndex = Math.Min(content.Length, currentIndex + 1); + int endIndex = Math.Min(content.Length, startIndex + length); + + return content.AsSpan()[startIndex..endIndex].ToString(); + } + /// /// Skips all current whitespace and comments. ///