Avoid three expensive allocations in UriHelper (dotnet/corefx#36056)

In this repro: ```C# using System; using System.Diagnostics; using System.Text; class Program { static void Main() { string input = $"param1={GenerateUrlEncoded(40)}&param2={GenerateUrlEncoded(220)}"; Console.WriteLine("Input length: " + input.Length); var sw = Stopwatch.StartNew(); string result = Uri.UnescapeDataString(input); Console.WriteLine("Result length: " + result.Length); Console.WriteLine(sw.Elapsed); } private static string GenerateUrlEncoded(int rowsCount) { var sb = new StringBuilder(); for (int i = 0x100; i < 0x999; i++) { sb.Append((char)i); if (i % 10 == 0) sb.Append('<'); if (i % 20 == 0) sb.Append('>'); if (i % 15 == 0) sb.Append('\"'); } string escaped = Uri.EscapeDataString(sb.ToString()); sb.Clear(); for (int i = 0; i < rowsCount; i++) { sb.AppendLine(escaped); } return sb.ToString(); } } ``` on my machine it ends up allocating ~630GB of memory and takes ~14 seconds. Almost all of that ~14 seconds is spent in gc_heap::allocate_large, and most of that inside memset_repmovs. This ends up being due to some large allocations being done in a tight loop. This PR contains three simple fixes that address the majority of the problem. There's still more that can be done here, but this is the lowest of the low-hanging fruit and makes the biggest impact: 1. In UnescapeString, the previous code was allocating a new char[bytes.Length] for each iteration. Stop doing that. Instead, just reuse the same array over and over and only grow it if it's smaller than is needed. 2. In MatchUTF8Sequence, the previous code was allocating a byte[] for each character or surrogate pair. Stop doing that. Instead, just use a reusable four-byte segment of stack. 3. In UnescapeString, the previous code was allocating a new UTF8Encoding for each iteration of the loop. Stop doing that. The object is thread-safe and can be used for all requests, so we just make it a static. These changes drop that ~630GB to ~22MB and that ~14s to ~0.05s. Subsequently, there's more memory-related changes that could be done in this code, e.g. using pooling, addressing some of the other allocation, but I've left that for the future. Commit migrated from dotnet/corefx@5e84d5b
dotnet · Mar 14, 2019 · 70c070e · 70c070e
1 parent 76dd5a6
commit 70c070e
Showing 1 changed file with 18 additions and 13 deletions.
diff --git a/src/libraries/System.Private.Uri/src/System/UriHelper.cs b/src/libraries/System.Private.Uri/src/System/UriHelper.cs
@@ -13,6 +13,9 @@ internal static class UriHelper
                                    '0', '1', '2', '3', '4', '5', '6', '7',
                                    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
 
+        internal static readonly Encoding s_noFallbackCharUTF8 = Encoding.GetEncoding(
+            Encoding.UTF8.CodePage, new EncoderReplacementFallback(""), new DecoderReplacementFallback(""));
+
         // http://host/Path/Path/File?Query is the base of
         //      - http://host/Path/Path/File/ ...    (those "File" words may be different in semantic but anyway)
         //      - http://host/Path/Path/#Fragment
@@ -278,6 +281,7 @@ internal static unsafe char[] UnescapeString(char* pStr, int start, int end, cha
             int next = start;
             bool iriParsing = Uri.IriParsingStatic(syntax)
                                 && ((unescapeMode & UnescapeMode.EscapeUnescape) == UnescapeMode.EscapeUnescape);
+            char[] unescapedChars = null;
 
             while (true)
             {
@@ -474,21 +478,20 @@ internal static unsafe char[] UnescapeString(char* pStr, int start, int end, cha
                                 }
                             }
 
-                            Encoding noFallbackCharUTF8 = Encoding.GetEncoding(
-                                                                                Encoding.UTF8.CodePage,
-                                                                                new EncoderReplacementFallback(""),
-                                                                                new DecoderReplacementFallback(""));
+                            if (unescapedChars == null || unescapedChars.Length < bytes.Length)
+                            {
+                                unescapedChars = new char[bytes.Length];
+                            }
 
-                            char[] unescapedChars = new char[bytes.Length];
-                            int charCount = noFallbackCharUTF8.GetChars(bytes, 0, byteCount, unescapedChars, 0);
+                            int charCount = s_noFallbackCharUTF8.GetChars(bytes, 0, byteCount, unescapedChars, 0);
 
                             start = next;
 
                             // match exact bytes
                             // Do not unescape chars not allowed by Iri
                             // need to check for invalid utf sequences that may not have given any chars
 
-                            MatchUTF8Sequence(pDest, dest, ref destPosition, unescapedChars, charCount, bytes,
+                            MatchUTF8Sequence(pDest, dest, ref destPosition, unescapedChars.AsSpan(0, charCount), charCount, bytes,
                                 byteCount, isQuery, iriParsing);
                         }
 
@@ -507,18 +510,20 @@ internal static unsafe char[] UnescapeString(char* pStr, int start, int end, cha
         // We got the unescaped chars, we then re-encode them and match off the bytes
         // to get the invalid sequence bytes that we just copy off
         //
-        internal static unsafe void MatchUTF8Sequence(char* pDest, char[] dest, ref int destOffset, char[] unescapedChars,
+        internal static unsafe void MatchUTF8Sequence(char* pDest, char[] dest, ref int destOffset, Span<char> unescapedChars,
             int charCount, byte[] bytes, int byteCount, bool isQuery, bool iriParsing)
         {
+            Span<byte> maxUtf8EncodedSpan = stackalloc byte[4];
+
             int count = 0;
             fixed (char* unescapedCharsPtr = unescapedChars)
             {
                 for (int j = 0; j < charCount; ++j)
                 {
                     bool isHighSurr = char.IsHighSurrogate(unescapedCharsPtr[j]);
-
-                    byte[] encodedBytes = Encoding.UTF8.GetBytes(unescapedChars, j, isHighSurr ? 2 : 1);
-                    int encodedBytesLength = encodedBytes.Length;
+                    Span<byte> encodedBytes = maxUtf8EncodedSpan;
+                    int bytesWritten = Encoding.UTF8.GetBytes(unescapedChars.Slice(j, isHighSurr ? 2 : 1), encodedBytes);
+                    encodedBytes = encodedBytes.Slice(0, bytesWritten);
 
                     // we have to keep unicode chars outside Iri range escaped
                     bool inIriRange = false;
@@ -546,7 +551,7 @@ internal static unsafe void MatchUTF8Sequence(char* pDest, char[] dest, ref int
                         // check if all bytes match
                         bool allBytesMatch = true;
                         int k = 0;
-                        for (; k < encodedBytesLength; ++k)
+                        for (; k < encodedBytes.Length; ++k)
                         {
                             if (bytes[count + k] != encodedBytes[k])
                             {
@@ -557,7 +562,7 @@ internal static unsafe void MatchUTF8Sequence(char* pDest, char[] dest, ref int
 
                         if (allBytesMatch)
                         {
-                            count += encodedBytesLength;
+                            count += encodedBytes.Length;
                             if (iriParsing)
                             {
                                 if (!inIriRange)