Skip to content

Commit

Permalink
Optimize percent-encoded UTF8 processing in Uri (#32552)
Browse files Browse the repository at this point in the history
* Optimize percent-encoded UTF8 processing in Uri

* Rename charsConsumed to bytesConsumed

* Use ValueStringBuilder Append(char*, int) instead of Append(ROS<char>)

* Add tests for PercentEncodingHelper

* Use string literals instead of char.ConvertFromUtf32

* Use sizeof(uint) instead of 4

* Add missing license headers

* Improve codegen by using temporary local copy

* Correct Debug asserts

* Add ValueStringBuilder.Append(Rune)

* Improve hex decoding throughput

* Move VSB.Append(Rune) to a Uri-specific partial VSB file

* Add missing csproj link

* Add more comments documenting PercentEncodingHelper's logic

* Fix rebase conflicts

* Address PR feedback
  • Loading branch information
MihaZupan authored Dec 14, 2020
1 parent b01c8e1 commit e53e543
Show file tree
Hide file tree
Showing 10 changed files with 489 additions and 555 deletions.
11 changes: 5 additions & 6 deletions src/libraries/System.Private.Uri/src/System.Private.Uri.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@
<Compile Include="System\DomainNameHelper.cs" />
<Compile Include="System\GenericUriParser.cs" />
<Compile Include="System\IPv4AddressHelper.cs" />
<Compile Include="$(CommonPath)System\Net\IPv4AddressHelper.Common.cs"
Link="System\IPv4AddressHelper.Common.cs" />
<Compile Include="$(CommonPath)System\Net\IPv4AddressHelper.Common.cs" Link="System\IPv4AddressHelper.Common.cs" />
<Compile Include="System\IPv6AddressHelper.cs" />
<Compile Include="$(CommonPath)System\Net\IPv6AddressHelper.Common.cs"
Link="System\IPv6AddressHelper.Common.cs" />
<Compile Include="$(CommonPath)System\Net\IPv6AddressHelper.Common.cs" Link="System\IPv6AddressHelper.Common.cs" />
<Compile Include="System\IriHelper.cs" />
<Compile Include="System\PercentEncodingHelper.cs" />
<Compile Include="System\UncNameHelper.cs" />
<Compile Include="System\Uri.cs" />
<Compile Include="System\UriBuilder.cs" />
Expand All @@ -34,8 +33,8 @@
<Compile Include="System\UriPartial.cs" />
<Compile Include="System\UriScheme.cs" />
<Compile Include="System\UriSyntax.cs" />
<Compile Include="$(CommonPath)System\Text\ValueStringBuilder.cs"
Link="Common\System\Text\ValueStringBuilder.cs" />
<Compile Include="$(CommonPath)System\Text\ValueStringBuilder.cs" Link="Common\System\Text\ValueStringBuilder.cs" />
<Compile Include="System\ValueStringBuilderExtensions.cs" />
</ItemGroup>
<ItemGroup Condition="'$(TargetsWindows)' == 'true'">
<Compile Include="System\Uri.Windows.cs" />
Expand Down
144 changes: 53 additions & 91 deletions src/libraries/System.Private.Uri/src/System/IriHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Text;

namespace System
Expand All @@ -14,10 +15,10 @@ internal static class IriHelper
//
internal static bool CheckIriUnicodeRange(char unicode, bool isQuery)
{
return ((unicode >= '\u00A0' && unicode <= '\uD7FF') ||
(unicode >= '\uF900' && unicode <= '\uFDCF') ||
(unicode >= '\uFDF0' && unicode <= '\uFFEF') ||
(isQuery && unicode >= '\uE000' && unicode <= '\uF8FF'));
return IsInInclusiveRange(unicode, '\u00A0', '\uD7FF')
|| IsInInclusiveRange(unicode, '\uF900', '\uFDCF')
|| IsInInclusiveRange(unicode, '\uFDF0', '\uFFEF')
|| (isQuery && IsInInclusiveRange(unicode, '\uE000', '\uF8FF'));
}

//
Expand Down Expand Up @@ -47,6 +48,27 @@ internal static bool CheckIriUnicodeRange(char highSurr, char lowSurr, out bool
return false;
}

//
// Checks whether a code point, supplied as a 32-bit value, falls inside the ranges
// this helper allows to appear unescaped in an IRI.
//
// value   - the code point to test
// isQuery - true when the code point belongs to the query component; this additionally
//           admits U+E000..U+F8FF and supplementary values at or above 0xF0000
//
// Returns true when the value is inside an accepted range.
//
internal static bool CheckIriUnicodeRange(uint value, bool isQuery)
{
    if (value > 0xFFFF)
    {
        // Above the BMP: the last two values of every plane (xFFFE, xFFFF) are rejected.
        if ((value & 0xFFFF) >= 0xFFFE)
        {
            return false;
        }

        // 0xE0000..0xE0FFF is excluded regardless of component.
        if (IsInInclusiveRange(value, 0xE0000, 0xE0FFF))
        {
            return false;
        }

        // Values from 0xF0000 upwards are accepted only inside a query.
        return isQuery || value < 0xF0000;
    }

    // BMP ranges accepted in every component.
    if (IsInInclusiveRange(value, '\u00A0', '\uD7FF')
        || IsInInclusiveRange(value, '\uF900', '\uFDCF')
        || IsInInclusiveRange(value, '\uFDF0', '\uFFEF'))
    {
        return true;
    }

    // U+E000..U+F8FF is accepted only inside a query.
    return isQuery && IsInInclusiveRange(value, '\uE000', '\uF8FF');
}

//
// Returns true when min <= value <= max, using a single unsigned comparison.
// Callers are expected to pass min <= max.
//
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsInInclusiveRange(uint value, uint min, uint max)
{
    // When value < min the unsigned subtraction wraps around to a very large number,
    // which then fails the comparison against the width of the range.
    uint offsetFromMin = value - min;
    uint rangeWidth = max - min;
    return offsetFromMin <= rangeWidth;
}

//
// Check reserved chars according to RFC 3987 in a specific component
//
Expand All @@ -67,114 +89,55 @@ internal static bool CheckIsReserved(char ch, UriComponents component)
internal static unsafe string EscapeUnescapeIri(char* pInput, int start, int end, UriComponents component)
{
int size = end - start;
ValueStringBuilder dest = new ValueStringBuilder(size);
byte[]? bytes = null;

int next = start;
char ch;
ValueStringBuilder dest = size <= 256
? new ValueStringBuilder(stackalloc char[256])
: new ValueStringBuilder(size);

Span<byte> maxUtf8EncodedSpan = stackalloc byte[4];

for (; next < end; ++next)
for (int i = start; i < end; ++i)
{
if ((ch = pInput[next]) == '%')
char ch = pInput[i];
if (ch == '%')
{
if (next + 2 < end)
if (end - i > 2)
{
ch = UriHelper.DecodeHexChars(pInput[next + 1], pInput[next + 2]);
ch = UriHelper.DecodeHexChars(pInput[i + 1], pInput[i + 2]);

// Do not unescape a reserved char
if (ch == Uri.c_DummyChar || ch == '%' || CheckIsReserved(ch, component) || UriHelper.IsNotSafeForUnescape(ch))
{
// keep as is
dest.Append(pInput[next++]);
dest.Append(pInput[next++]);
dest.Append(pInput[next]);
dest.Append(pInput[i++]);
dest.Append(pInput[i++]);
dest.Append(pInput[i]);
continue;
}
else if (ch <= '\x7F')
{
Debug.Assert(ch < 0xFF, "Expecting ASCII character.");
//ASCII
dest.Append(ch);
next += 2;
i += 2;
continue;
}
else
{
// possibly utf8 encoded sequence of unicode

// check if safe to unescape according to Iri rules

Debug.Assert(ch < 0xFF, "Expecting ASCII character.");

int startSeq = next;
int byteCount = 1;
// lazy initialization of max size, will reuse the array for next sequences
if (bytes is null)
bytes = new byte[end - next];

bytes[0] = (byte)ch;
next += 3;
while (next < end)
{
// Check on exit criterion
if ((ch = pInput[next]) != '%' || next + 2 >= end)
break;

// already made sure we have 3 characters in str
ch = UriHelper.DecodeHexChars(pInput[next + 1], pInput[next + 2]);

//invalid hex sequence ?
if (ch == Uri.c_DummyChar)
break;
// character is not part of a UTF-8 sequence ?
else if (ch < '\x80')
break;
else
{
//a UTF-8 sequence
bytes[byteCount++] = (byte)ch;
next += 3;
}

Debug.Assert(ch < 0xFF, "Expecting ASCII character.");
}
next--; // for loop will increment


// Using encoder with no replacement fall-back will skip all invalid UTF-8 sequences.
Encoding noFallbackCharUTF8 = Encoding.GetEncoding(
Encoding.UTF8.CodePage,
new EncoderReplacementFallback(""),
new DecoderReplacementFallback(""));

char[] unescapedChars = new char[bytes.Length];
int charCount = noFallbackCharUTF8.GetChars(bytes, 0, byteCount, unescapedChars, 0);


if (charCount != 0)
{
// If invalid sequences were present in the original escaped string, we need to
// copy the escaped versions of those sequences.
// Decoded Unicode values will be kept only when they are allowed by the URI/IRI RFC
// rules.
UriHelper.MatchUTF8Sequence(ref dest, unescapedChars, charCount, bytes,
byteCount, component == UriComponents.Query, true);
}
else
{
// copy escaped sequence as is
for (int i = startSeq; i <= next; ++i)
{
dest.Append(pInput[i]);
}
}
int charactersRead = PercentEncodingHelper.UnescapePercentEncodedUTF8Sequence(
pInput + i,
end - i,
ref dest,
component == UriComponents.Query,
iriParsing: true);

Debug.Assert(charactersRead > 0);
i += charactersRead - 1; // -1 as i will be incremented in the loop
}
}
else
{
dest.Append(pInput[next]);
dest.Append(pInput[i]);
}
}
else if (ch > '\x7f')
Expand All @@ -186,9 +149,9 @@ internal static unsafe string EscapeUnescapeIri(char* pInput, int start, int end

char ch2 = '\0';

if ((char.IsHighSurrogate(ch)) && (next + 1 < end))
if ((char.IsHighSurrogate(ch)) && (i + 1 < end))
{
ch2 = pInput[next + 1];
ch2 = pInput[i + 1];
isInIriUnicodeRange = CheckIriUnicodeRange(ch, ch2, out surrogatePair, component == UriComponents.Query);
}
else
Expand Down Expand Up @@ -227,18 +190,17 @@ internal static unsafe string EscapeUnescapeIri(char* pInput, int start, int end

if (surrogatePair)
{
next++;
i++;
}
}
else
{
// just copy the character
dest.Append(pInput[next]);
dest.Append(pInput[i]);
}
}

string result = dest.ToString();
return result;
return dest.ToString();
}
}
}
Loading

0 comments on commit e53e543

Please sign in to comment.