From 70c070e84c617de0700612d03c5886e548195bed Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 14 Mar 2019 16:46:48 -0400 Subject: [PATCH] Avoid three expensive allocations in UriHelper (dotnet/corefx#36056) In this repro: ```C# using System; using System.Diagnostics; using System.Text; class Program { static void Main() { string input = $"param1={GenerateUrlEncoded(40)}¶m2={GenerateUrlEncoded(220)}"; Console.WriteLine("Input length: " + input.Length); var sw = Stopwatch.StartNew(); string result = Uri.UnescapeDataString(input); Console.WriteLine("Result length: " + result.Length); Console.WriteLine(sw.Elapsed); } private static string GenerateUrlEncoded(int rowsCount) { var sb = new StringBuilder(); for (int i = 0x100; i < 0x999; i++) { sb.Append((char)i); if (i % 10 == 0) sb.Append('<'); if (i % 20 == 0) sb.Append('>'); if (i % 15 == 0) sb.Append('\"'); } string escaped = Uri.EscapeDataString(sb.ToString()); sb.Clear(); for (int i = 0; i < rowsCount; i++) { sb.AppendLine(escaped); } return sb.ToString(); } } ``` on my machine it ends up allocating ~630GB of memory and takes ~14 seconds. Almost all of that ~14 seconds is spent in gc_heap::allocate_large, and most of that inside memset_repmovs. This ends up being due to some large allocations being done in a tight loop. This PR contains three simple fixes that address the majority of the problem. There's still more that can be done here, but this is the lowest of the low-hanging fruit and makes the biggest impact: 1. In UnescapeString, the previous code was allocating a new char[bytes.Length] for each iteration. Stop doing that. Instead, just reuse the same array over and over and only grow it if it's smaller than is needed. 2. In MatchUTF8Sequence, the previous code was allocating a byte[] for each character or surrogate pair. Stop doing that. Instead, just use a reusable four-byte segment of stack. 3. In UnescapeString, the previous code was allocating a new UTF8Encoding for each iteration of the loop. Stop doing that. The object is thread-safe and can be used for all requests, so we just make it a static. These changes drop that ~630GB to ~22MB and that ~14s to ~0.05s. Subsequently, there's more memory-related changes that could be done in this code, e.g. using pooling, addressing some of the other allocation, but I've left that for the future. Commit migrated from https://github.com/dotnet/corefx/commit/5e84d5b782fd56c7c6772f199719d208b299b605 --- .../src/System/UriHelper.cs | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/libraries/System.Private.Uri/src/System/UriHelper.cs b/src/libraries/System.Private.Uri/src/System/UriHelper.cs index 0c602e26aeba3..fb4181df7d447 100644 --- a/src/libraries/System.Private.Uri/src/System/UriHelper.cs +++ b/src/libraries/System.Private.Uri/src/System/UriHelper.cs @@ -13,6 +13,9 @@ internal static class UriHelper '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; + internal static readonly Encoding s_noFallbackCharUTF8 = Encoding.GetEncoding( + Encoding.UTF8.CodePage, new EncoderReplacementFallback(""), new DecoderReplacementFallback("")); + // http://host/Path/Path/File?Query is the base of // - http://host/Path/Path/File/ ... (those "File" words may be different in semantic but anyway) // - http://host/Path/Path/#Fragment @@ -278,6 +281,7 @@ internal static unsafe char[] UnescapeString(char* pStr, int start, int end, cha int next = start; bool iriParsing = Uri.IriParsingStatic(syntax) && ((unescapeMode & UnescapeMode.EscapeUnescape) == UnescapeMode.EscapeUnescape); + char[] unescapedChars = null; while (true) { @@ -474,13 +478,12 @@ internal static unsafe char[] UnescapeString(char* pStr, int start, int end, cha } } - Encoding noFallbackCharUTF8 = Encoding.GetEncoding( - Encoding.UTF8.CodePage, - new EncoderReplacementFallback(""), - new DecoderReplacementFallback("")); + if (unescapedChars == null || unescapedChars.Length < bytes.Length) + { + unescapedChars = new char[bytes.Length]; + } - char[] unescapedChars = new char[bytes.Length]; - int charCount = noFallbackCharUTF8.GetChars(bytes, 0, byteCount, unescapedChars, 0); + int charCount = s_noFallbackCharUTF8.GetChars(bytes, 0, byteCount, unescapedChars, 0); start = next; @@ -488,7 +491,7 @@ internal static unsafe char[] UnescapeString(char* pStr, int start, int end, cha // Do not unescape chars not allowed by Iri // need to check for invalid utf sequences that may not have given any chars - MatchUTF8Sequence(pDest, dest, ref destPosition, unescapedChars, charCount, bytes, + MatchUTF8Sequence(pDest, dest, ref destPosition, unescapedChars.AsSpan(0, charCount), charCount, bytes, byteCount, isQuery, iriParsing); } @@ -507,18 +510,20 @@ internal static unsafe char[] UnescapeString(char* pStr, int start, int end, cha // We got the unescaped chars, we then re-encode them and match off the bytes // to get the invalid sequence bytes that we just copy off // - internal static unsafe void MatchUTF8Sequence(char* pDest, char[] dest, ref int destOffset, char[] unescapedChars, + internal static unsafe void MatchUTF8Sequence(char* pDest, char[] dest, ref int destOffset, Span unescapedChars, int charCount, byte[] bytes, int byteCount, bool isQuery, bool iriParsing) { + Span maxUtf8EncodedSpan = stackalloc byte[4]; + int count = 0; fixed (char* unescapedCharsPtr = unescapedChars) { for (int j = 0; j < charCount; ++j) { bool isHighSurr = char.IsHighSurrogate(unescapedCharsPtr[j]); - - byte[] encodedBytes = Encoding.UTF8.GetBytes(unescapedChars, j, isHighSurr ? 2 : 1); - int encodedBytesLength = encodedBytes.Length; + Span encodedBytes = maxUtf8EncodedSpan; + int bytesWritten = Encoding.UTF8.GetBytes(unescapedChars.Slice(j, isHighSurr ? 2 : 1), encodedBytes); + encodedBytes = encodedBytes.Slice(0, bytesWritten); // we have to keep unicode chars outside Iri range escaped bool inIriRange = false; @@ -546,7 +551,7 @@ internal static unsafe void MatchUTF8Sequence(char* pDest, char[] dest, ref int // check if all bytes match bool allBytesMatch = true; int k = 0; - for (; k < encodedBytesLength; ++k) + for (; k < encodedBytes.Length; ++k) { if (bytes[count + k] != encodedBytes[k]) { @@ -557,7 +562,7 @@ internal static unsafe void MatchUTF8Sequence(char* pDest, char[] dest, ref int if (allBytesMatch) { - count += encodedBytesLength; + count += encodedBytes.Length; if (iriParsing) { if (!inIriRange)