forked from ppy/osu-localisation-analyser
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement PHP string escaping semantics
- Loading branch information
Showing
2 changed files
with
129 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,10 @@ | ||
// Copyright (c) ppy Pty Ltd <contact@ppy.sh>. Licensed under the MIT Licence. | ||
// See the LICENCE file in the repository root for full licence text. | ||
|
||
using System; | ||
using System.Globalization; | ||
using System.Text; | ||
using System.Text.RegularExpressions; | ||
|
||
namespace LocalisationAnalyser.Tools.Php | ||
{ | ||
|
@@ -10,6 +13,10 @@ namespace LocalisationAnalyser.Tools.Php | |
/// </summary> | ||
public class PhpStringLiteralSyntaxNode : PhpLiteralSyntaxNode | ||
{ | ||
// Matches one to three octal digits at the start of the input (the body of a \NNN escape). | ||
private static readonly Regex oct_pattern = new Regex("^([0-7]{1,3})", RegexOptions.Compiled); | ||
// Matches 'x' followed by one or two hexadecimal digits at the start of the input (the body of a \xHH escape). | ||
private static readonly Regex hex_pattern = new Regex("^(x[0-9A-Fa-f]{1,2})", RegexOptions.Compiled); | ||
// Matches 'u{', one or more hexadecimal digits and a closing '}' at the start of the input (the body of a \u{...} escape). | ||
private static readonly Regex uni_pattern = new Regex("^(u{[0-9A-Fa-f]+})", RegexOptions.Compiled); | ||
|
||
public readonly string Text; | ||
|
||
public PhpStringLiteralSyntaxNode(string text) | ||
|
@@ -21,27 +28,23 @@ public static PhpStringLiteralSyntaxNode Parse(PhpTokeniser tokeniser) | |
{ | ||
tokeniser.SkipWhitespace(); | ||
|
||
char trivia = tokeniser.GetTrivia(); | ||
|
||
// Skip leading trivia. | ||
char leader = tokeniser.GetTrivia(); | ||
tokeniser.Advance(); | ||
|
||
var stringBuilder = new StringBuilder(); | ||
bool isEscaping = false; | ||
|
||
while (isEscaping || tokeniser.GetTrivia() != trivia) | ||
while (tokeniser.GetTrivia() != leader) | ||
{ | ||
var token = tokeniser.GetTrivia(); | ||
char token = tokeniser.GetTrivia(); | ||
tokeniser.Advance(); | ||
|
||
if (token == '\\' && !isEscaping) | ||
if (token == '\\') | ||
{ | ||
isEscaping = true; | ||
stringBuilder.Append(processEscapeSequence(leader, tokeniser)); | ||
continue; | ||
} | ||
|
||
stringBuilder.Append(token); | ||
isEscaping = false; | ||
} | ||
|
||
// Skip trailing trivia. | ||
|
@@ -50,5 +53,99 @@ public static PhpStringLiteralSyntaxNode Parse(PhpTokeniser tokeniser) | |
|
||
return new PhpStringLiteralSyntaxNode(stringBuilder.ToString()); | ||
} | ||
|
||
/// <summary>
/// Decodes the escape sequence that follows a backslash inside a PHP string literal.
/// </summary>
/// <remarks>
/// The tokeniser is expected to be positioned on the character immediately after the backslash.
/// When the sequence is recognised, its characters are consumed from the tokeniser and the decoded text is returned.
/// When it is not recognised, nothing is consumed and a literal backslash is returned, so that the caller
/// goes on to read the following characters verbatim.
/// </remarks>
/// <param name="leader">The quote character which opened the string literal (<c>'</c> or <c>"</c>).</param>
/// <param name="tokeniser">The tokeniser to read the escape sequence from.</param>
/// <returns>The decoded text, or a single backslash if the sequence is not a supported escape.</returns>
private static string processEscapeSequence(char leader, PhpTokeniser tokeniser)
{
    char trivia = tokeniser.GetTrivia();

    // Base cases for \{leader} and \\, supported by both single- and double-quoted strings.
    if (trivia == leader || trivia == '\\')
    {
        tokeniser.Advance();
        return trivia.ToString();
    }

    // No other escape sequences are supported for single-quoted strings.
    // The backslash is kept as-is and the current character is left for the caller to consume.
    if (leader == '\'')
        return @"\";

    // Double-quoted strings have a few more cases...
    switch (trivia)
    {
        case 'n':
            tokeniser.Advance();
            return "\n";

        case 'r':
            tokeniser.Advance();
            return "\r";

        case 't':
            tokeniser.Advance();
            return "\t";

        case 'v':
            tokeniser.Advance();
            return "\v";

        case 'e':
            tokeniser.Advance();
            return "\x1B";

        case 'f':
            tokeniser.Advance();
            return "\f";

        case '$':
            tokeniser.Advance();
            return "$";

        case >= '0' and <= '7':
        {
            // Up to three octal digits: the current character plus at most two more.
            Match match = oct_pattern.Match($"{trivia}{tokeniser.PeekNext(2)}");

            if (match.Success)
            {
                tokeniser.Advance(match.Length);

                unchecked
                {
                    // Values above 0xFF are deliberately truncated to their low byte (e.g. \400 becomes \000).
                    byte octValue = (byte)Convert.ToInt32(match.Value, 8);
                    return ((char)octValue).ToString();
                }
            }

            break;
        }

        case 'x':
        {
            // 'x' followed by one or two hexadecimal digits.
            Match match = hex_pattern.Match($"{trivia}{tokeniser.PeekNext(2)}");

            if (match.Success)
            {
                tokeniser.Advance(match.Length);

                // Skip the leading 'x'; the remaining one or two digits always fit in a byte.
                return ((char)byte.Parse(match.Value[1..], NumberStyles.HexNumber, CultureInfo.InvariantCulture)).ToString();
            }

            break;
        }

        case 'u':
        {
            // 'u{' + hexadecimal digits + '}', looking at most 16 characters ahead.
            Match match = uni_pattern.Match($"{trivia}{tokeniser.PeekNext(16)}");

            if (match.Success)
            {
                tokeniser.Advance(match.Length);

                // Strip "u{" and "}", then drop leading zeros so that zero-padded code points
                // (e.g. \u{000000041}) do not overflow the 32-bit conversion below.
                string digits = match.Value[2..^1].TrimStart('0');
                int codePoint = digits.Length == 0 ? 0 : Convert.ToInt32(digits, 16);

                // NOTE(review): values which are not valid Unicode scalar values still throw here,
                // as before this method offers no lenient fallback for them.
                return char.ConvertFromUtf32(codePoint);
            }

            break;
        }
    }

    // Not a recognised escape sequence: the backslash is kept literally.
    return @"\";
}
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters