Skip to content

Commit

Permalink
Avoid three expensive allocations in UriHelper (dotnet/corefx#36056)
Browse files Browse the repository at this point in the history
In this repro:
```C#
using System;
using System.Diagnostics;
using System.Text;

class Program
{
    static void Main()
    {
        string input = $"param1={GenerateUrlEncoded(40)}&param2={GenerateUrlEncoded(220)}";
        Console.WriteLine("Input length: " + input.Length);
        var sw = Stopwatch.StartNew();
        string result = Uri.UnescapeDataString(input);
        Console.WriteLine("Result length: " + result.Length);
        Console.WriteLine(sw.Elapsed);
    }

    private  static string GenerateUrlEncoded(int rowsCount)
    {
        var sb = new StringBuilder();
        for (int i = 0x100; i < 0x999; i++)
        {
            sb.Append((char)i);
            if (i % 10 == 0) sb.Append('<');
            if (i % 20 == 0) sb.Append('>');
            if (i % 15 == 0) sb.Append('\"');
        }

        string escaped = Uri.EscapeDataString(sb.ToString());
        sb.Clear();
        for (int i = 0; i < rowsCount; i++)
        {
            sb.AppendLine(escaped);
        }

        return sb.ToString();
    }
}
```
on my machine it ends up allocating ~630GB of memory and takes ~14 seconds.

Almost all of that ~14 seconds is spent in gc_heap::allocate_large, and most of that inside memset_repmovs.  This ends up being due to some large allocations being done in a tight loop.

This PR contains three simple fixes that address the majority of the problem.  There's still more that can be done here, but this is the lowest of the low-hanging fruit and makes the biggest impact:
1. In UnescapeString, the previous code was allocating a new char[bytes.Length] for each iteration.  Stop doing that.  Instead, just reuse the same array over and over and only grow it if it's smaller than is needed.
2. In MatchUTF8Sequence, the previous code was allocating a byte[] for each character or surrogate pair.  Stop doing that.  Instead, just use a reusable four-byte segment of stack.
3. In UnescapeString, the previous code was allocating a new UTF8Encoding for each iteration of the loop.  Stop doing that.  The object is thread-safe and can be used for all requests, so we just make it a static.

These changes drop that ~630GB to ~22MB and that ~14s to ~0.05s.

Subsequently, there's more memory-related changes that could be done in this code, e.g. using pooling, addressing some of the other allocation, but I've left that for the future.

Commit migrated from dotnet/corefx@5e84d5b
  • Loading branch information
stephentoub authored Mar 14, 2019
1 parent 76dd5a6 commit 70c070e
Showing 1 changed file with 18 additions and 13 deletions.
31 changes: 18 additions & 13 deletions src/libraries/System.Private.Uri/src/System/UriHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ internal static class UriHelper
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };

internal static readonly Encoding s_noFallbackCharUTF8 = Encoding.GetEncoding(
Encoding.UTF8.CodePage, new EncoderReplacementFallback(""), new DecoderReplacementFallback(""));

// http://host/Path/Path/File?Query is the base of
// - http://host/Path/Path/File/ ... (those "File" words may be different in semantic but anyway)
// - http://host/Path/Path/#Fragment
Expand Down Expand Up @@ -278,6 +281,7 @@ internal static unsafe char[] UnescapeString(char* pStr, int start, int end, cha
int next = start;
bool iriParsing = Uri.IriParsingStatic(syntax)
&& ((unescapeMode & UnescapeMode.EscapeUnescape) == UnescapeMode.EscapeUnescape);
char[] unescapedChars = null;

while (true)
{
Expand Down Expand Up @@ -474,21 +478,20 @@ internal static unsafe char[] UnescapeString(char* pStr, int start, int end, cha
}
}

Encoding noFallbackCharUTF8 = Encoding.GetEncoding(
Encoding.UTF8.CodePage,
new EncoderReplacementFallback(""),
new DecoderReplacementFallback(""));
if (unescapedChars == null || unescapedChars.Length < bytes.Length)
{
unescapedChars = new char[bytes.Length];
}

char[] unescapedChars = new char[bytes.Length];
int charCount = noFallbackCharUTF8.GetChars(bytes, 0, byteCount, unescapedChars, 0);
int charCount = s_noFallbackCharUTF8.GetChars(bytes, 0, byteCount, unescapedChars, 0);

start = next;

// match exact bytes
// Do not unescape chars not allowed by Iri
// need to check for invalid utf sequences that may not have given any chars

MatchUTF8Sequence(pDest, dest, ref destPosition, unescapedChars, charCount, bytes,
MatchUTF8Sequence(pDest, dest, ref destPosition, unescapedChars.AsSpan(0, charCount), charCount, bytes,
byteCount, isQuery, iriParsing);
}

Expand All @@ -507,18 +510,20 @@ internal static unsafe char[] UnescapeString(char* pStr, int start, int end, cha
// We got the unescaped chars, we then re-encode them and match off the bytes
// to get the invalid sequence bytes that we just copy off
//
internal static unsafe void MatchUTF8Sequence(char* pDest, char[] dest, ref int destOffset, char[] unescapedChars,
internal static unsafe void MatchUTF8Sequence(char* pDest, char[] dest, ref int destOffset, Span<char> unescapedChars,
int charCount, byte[] bytes, int byteCount, bool isQuery, bool iriParsing)
{
Span<byte> maxUtf8EncodedSpan = stackalloc byte[4];

int count = 0;
fixed (char* unescapedCharsPtr = unescapedChars)
{
for (int j = 0; j < charCount; ++j)
{
bool isHighSurr = char.IsHighSurrogate(unescapedCharsPtr[j]);

byte[] encodedBytes = Encoding.UTF8.GetBytes(unescapedChars, j, isHighSurr ? 2 : 1);
int encodedBytesLength = encodedBytes.Length;
Span<byte> encodedBytes = maxUtf8EncodedSpan;
int bytesWritten = Encoding.UTF8.GetBytes(unescapedChars.Slice(j, isHighSurr ? 2 : 1), encodedBytes);
encodedBytes = encodedBytes.Slice(0, bytesWritten);

// we have to keep unicode chars outside Iri range escaped
bool inIriRange = false;
Expand Down Expand Up @@ -546,7 +551,7 @@ internal static unsafe void MatchUTF8Sequence(char* pDest, char[] dest, ref int
// check if all bytes match
bool allBytesMatch = true;
int k = 0;
for (; k < encodedBytesLength; ++k)
for (; k < encodedBytes.Length; ++k)
{
if (bytes[count + k] != encodedBytes[k])
{
Expand All @@ -557,7 +562,7 @@ internal static unsafe void MatchUTF8Sequence(char* pDest, char[] dest, ref int

if (allBytesMatch)
{
count += encodedBytesLength;
count += encodedBytes.Length;
if (iriParsing)
{
if (!inIriRange)
Expand Down

0 comments on commit 70c070e

Please sign in to comment.