Skip to content

Commit

Permalink
URLParser should percent-encode non-ASCII and non-printable character…
Browse files Browse the repository at this point in the history
…s in fragment

https://bugs.webkit.org/show_bug.cgi?id=163287

Reviewed by Brady Eidson.

Source/WebCore:

Based on discussion in whatwg/url#150
If that discussion decides to keep the spec as-is (which keeps non-ASCII characters in the fragment
to match IE and Edge's behavior, which Chrome has followed for special schemes) then we can revert
this change later after enabling the URL parser.  Making this change keeps behavior matching Safari
and Firefox, as well as Chrome's handling of non-special schemes, such as data URLs.

Covered by updated API tests.

* platform/URLParser.cpp:
(WebCore::URLParser::appendToASCIIBuffer):
(WebCore::URLParser::copyURLPartsUntil):
(WebCore::URLParser::syntaxViolation):
(WebCore::URLParser::currentPosition):
(WebCore::URLParser::parse):
(WebCore::URLParser::fragmentSyntaxViolation): Deleted.
* platform/URLParser.h:
No more non-ASCII characters in canonicalized URLs.

Tools:

* TestWebKitAPI/Tests/WebCore/URLParser.cpp:
(TestWebKitAPI::TEST_F):



git-svn-id: http://svn.webkit.org/repository/webkit/trunk@207152 268f45cc-cd09-0410-ab3c-d52691b4dbfc
  • Loading branch information
[email protected] committed Oct 11, 2016
1 parent 69b7052 commit e34a7cb
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 97 deletions.
25 changes: 25 additions & 0 deletions Source/WebCore/ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,28 @@
2016-10-11 Alex Christensen <[email protected]>

URLParser should percent-encode non-ASCII and non-printable characters in fragment
https://bugs.webkit.org/show_bug.cgi?id=163287

Reviewed by Brady Eidson.

Based on discussion in https://github.com/whatwg/url/issues/150
If that discussion decides to keep the spec as-is (which keeps non-ASCII characters in the fragment
to match IE and Edge's behavior, which Chrome has followed for special schemes) then we can revert
this change later after enabling the URL parser. Making this change keeps behavior matching Safari
and Firefox, as well as Chrome's handling of non-special schemes, such as data URLs.

Covered by updated API tests.

* platform/URLParser.cpp:
(WebCore::URLParser::appendToASCIIBuffer):
(WebCore::URLParser::copyURLPartsUntil):
(WebCore::URLParser::syntaxViolation):
(WebCore::URLParser::currentPosition):
(WebCore::URLParser::parse):
(WebCore::URLParser::fragmentSyntaxViolation): Deleted.
* platform/URLParser.h:
No more non-ASCII characters in canonicalized URLs.

2016-10-11 Alex Christensen <[email protected]>

Remove dead networking code
Expand Down
85 changes: 8 additions & 77 deletions Source/WebCore/platform/URLParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -462,15 +462,13 @@ ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterTy

ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
{
ASSERT(m_unicodeFragmentBuffer.isEmpty());
ASSERT(isASCII(codePoint));
if (UNLIKELY(m_didSeeSyntaxViolation))
m_asciiBuffer.append(codePoint);
}

ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
{
ASSERT(m_unicodeFragmentBuffer.isEmpty());
if (UNLIKELY(m_didSeeSyntaxViolation))
m_asciiBuffer.append(characters, length);
}
Expand Down Expand Up @@ -829,7 +827,6 @@ void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePoint
syntaxViolation(iterator);

m_asciiBuffer.clear();
m_unicodeFragmentBuffer.clear();
copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
switch (part) {
case URLPart::FragmentEnd:
Expand Down Expand Up @@ -1018,7 +1015,6 @@ void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator
m_didSeeSyntaxViolation = true;

ASSERT(m_asciiBuffer.isEmpty());
ASSERT(m_unicodeFragmentBuffer.isEmpty());
size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
m_asciiBuffer.reserveCapacity(m_inputString.length());
Expand All @@ -1028,30 +1024,6 @@ void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator
}
}

template<typename CharacterType>
void URLParser::fragmentSyntaxViolation(const CodePointIterator<CharacterType>& iterator)
{
ASSERT(m_didSeeUnicodeFragmentCodePoint);
if (m_didSeeSyntaxViolation)
return;
m_didSeeSyntaxViolation = true;

ASSERT(m_asciiBuffer.isEmpty());
ASSERT(m_unicodeFragmentBuffer.isEmpty());
size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
size_t asciiCodeUnitsToCopy = m_url.m_queryEnd;
size_t unicodeCodeUnitsToCopy = codeUnitsToCopy - asciiCodeUnitsToCopy;
RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
m_asciiBuffer.reserveCapacity(asciiCodeUnitsToCopy);
for (size_t i = 0; i < asciiCodeUnitsToCopy; ++i) {
ASSERT(isASCII(m_inputString[i]));
m_asciiBuffer.uncheckedAppend(m_inputString[i]);
}
m_unicodeFragmentBuffer.reserveCapacity(m_inputString.length() - asciiCodeUnitsToCopy);
for (size_t i = asciiCodeUnitsToCopy; i < asciiCodeUnitsToCopy + unicodeCodeUnitsToCopy; ++i)
m_unicodeFragmentBuffer.uncheckedAppend(m_inputString[i]);
}

void URLParser::failure()
{
m_url.invalidate();
Expand Down Expand Up @@ -1111,10 +1083,8 @@ ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
template<typename CharacterType>
ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
{
if (UNLIKELY(m_didSeeSyntaxViolation)) {
ASSERT(m_unicodeFragmentBuffer.isEmpty());
if (UNLIKELY(m_didSeeSyntaxViolation))
return m_asciiBuffer.size();
}

return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
}
Expand Down Expand Up @@ -1160,7 +1130,6 @@ void URLParser::parse(const CharacterType* input, const unsigned length, const U
URL_PARSER_LOG("Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name());
m_url = { };
ASSERT(m_asciiBuffer.isEmpty());
ASSERT(m_unicodeFragmentBuffer.isEmpty());

bool isUTF8Encoding = encoding == UTF8Encoding();
Vector<UChar> queryBuffer;
Expand Down Expand Up @@ -1811,35 +1780,9 @@ void URLParser::parse(const CharacterType* input, const unsigned length, const U
} while (!c.atEnd());
break;
case State::Fragment:
do {
URL_PARSER_LOG("State Fragment");
if (!c.atEnd() && isTabOrNewline(*c)) {
if (m_didSeeUnicodeFragmentCodePoint)
fragmentSyntaxViolation(c);
else
syntaxViolation(c);
++c;
continue;
}
if (!m_didSeeUnicodeFragmentCodePoint && isASCII(*c)) {
if (m_urlIsSpecial)
appendToASCIIBuffer(*c);
else
utf8PercentEncode<isInSimpleEncodeSet>(c);
} else {
if (m_urlIsSpecial) {
m_didSeeUnicodeFragmentCodePoint = true;
if (UNLIKELY(m_didSeeSyntaxViolation))
appendCodePoint(m_unicodeFragmentBuffer, *c);
else {
ASSERT(m_asciiBuffer.isEmpty());
ASSERT(m_unicodeFragmentBuffer.isEmpty());
}
} else
utf8PercentEncode<isInSimpleEncodeSet>(c);
}
++c;
} while (!c.atEnd());
URL_PARSER_LOG("State Fragment");
utf8PercentEncode<isInSimpleEncodeSet>(c);
++c;
break;
}
}
Expand Down Expand Up @@ -2043,28 +1986,16 @@ void URLParser::parse(const CharacterType* input, const unsigned length, const U
m_url.m_fragmentEnd = m_url.m_queryEnd;
break;
case State::Fragment:
{
LOG_FINAL_STATE("Fragment");
size_t length = m_didSeeSyntaxViolation ? m_asciiBuffer.size() + m_unicodeFragmentBuffer.size() : c.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
m_url.m_fragmentEnd = length;
break;
}
LOG_FINAL_STATE("Fragment");
m_url.m_fragmentEnd = currentPosition(c);
break;
}

if (LIKELY(!m_didSeeSyntaxViolation)) {
m_url.m_string = m_inputString;
ASSERT(m_asciiBuffer.isEmpty());
ASSERT(m_unicodeFragmentBuffer.isEmpty());
} else if (!m_didSeeUnicodeFragmentCodePoint) {
ASSERT(m_unicodeFragmentBuffer.isEmpty());
} else
m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
} else {
Vector<UChar> buffer;
buffer.reserveInitialCapacity(m_asciiBuffer.size() + m_unicodeFragmentBuffer.size());
buffer.appendVector(m_asciiBuffer);
buffer.appendVector(m_unicodeFragmentBuffer);
m_url.m_string = String::adopt(WTFMove(buffer));
}
m_url.m_isValid = true;
URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
}
Expand Down
3 changes: 0 additions & 3 deletions Source/WebCore/platform/URLParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,6 @@ class URLParser {
private:
URL m_url;
Vector<LChar> m_asciiBuffer;
Vector<UChar> m_unicodeFragmentBuffer;
bool m_didSeeUnicodeFragmentCodePoint { false };
bool m_urlIsSpecial { false };
bool m_hostHasPercentOrNonASCII { false };
String m_inputString;
Expand All @@ -73,7 +71,6 @@ class URLParser {
void advance(CodePointIterator<CharacterType>&, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition);
template<typename CharacterType> bool takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>);
template<typename CharacterType> void syntaxViolation(const CodePointIterator<CharacterType>&);
template<typename CharacterType> void fragmentSyntaxViolation(const CodePointIterator<CharacterType>&);
template<typename CharacterType> bool isPercentEncodedDot(CodePointIterator<CharacterType>);
template<typename CharacterType> bool isWindowsDriveLetter(CodePointIterator<CharacterType>);
template<typename CharacterType> bool isSingleDotPathSegment(CodePointIterator<CharacterType>);
Expand Down
10 changes: 10 additions & 0 deletions Tools/ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
2016-10-11 Alex Christensen <[email protected]>

URLParser should percent-encode non-ASCII and non-printable characters in fragment
https://bugs.webkit.org/show_bug.cgi?id=163287

Reviewed by Brady Eidson.

* TestWebKitAPI/Tests/WebCore/URLParser.cpp:
(TestWebKitAPI::TEST_F):

2016-10-11 Alex Christensen <[email protected]>

Remove dead networking code
Expand Down
23 changes: 6 additions & 17 deletions Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,10 @@ TEST_F(URLParserTest, Basic)
checkURL("aA://", {"aa", "", "", "", 0, "//", "", "", "aa://"});
checkURL(utf16String(u"foo://host/#ПП\u0007 a</"), {"foo", "", "", "host", 0, "/", "", "%D0%9F%D0%9F%07 a</", "foo://host/#%D0%9F%D0%9F%07 a</"});
checkURL(utf16String(u"foo://host/#\u0007 a</"), {"foo", "", "", "host", 0, "/", "", "%07 a</", "foo://host/#%07 a</"});
checkURL(utf16String(u"http://host?ß😍#ß😍"), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", "%C3%9F%F0%9F%98%8D", "http://host/?%C3%9F%F0%9F%98%8D#%C3%9F%F0%9F%98%8D"}, testTabsValueForSurrogatePairs);
checkURL(utf16String(u"http://host/path#💩\t💩"), {"http", "", "", "host", 0, "/path", "", "%F0%9F%92%A9%F0%9F%92%A9", "http://host/path#%F0%9F%92%A9%F0%9F%92%A9"}, testTabsValueForSurrogatePairs);
checkURL(utf16String(u"http://host/#ПП\u0007 a</"), {"http", "", "", "host", 0, "/", "", "%D0%9F%D0%9F%07 a</", "http://host/#%D0%9F%D0%9F%07 a</"});
checkURL(utf16String(u"http://host/#\u0007 a</"), {"http", "", "", "host", 0, "/", "", "%07 a</", "http://host/#%07 a</"});

// This disagrees with the web platform test for http://:@www.example.com but agrees with Chrome and URL::parse,
// and Firefox fails the web platform test differently. Maybe the web platform test ought to be changed.
Expand Down Expand Up @@ -456,6 +460,7 @@ TEST_F(URLParserTest, ParseRelative)
checkRelativeURL(" ", "http://host/path?query#fra#gment", {"http", "", "", "host", 0, "/path", "query", "", "http://host/path?query"});
checkRelativeURL(" \a ", "http://host/#fragment", {"http", "", "", "host", 0, "/", "", "", "http://host/"});
checkRelativeURL("foo://", "http://example.org/foo/bar", {"foo", "", "", "", 0, "//", "", "", "foo://"});
checkRelativeURL(utf16String(u""), "http://example.org/foo/bar", {"http", "", "", "example.org", 0, "/foo/bar", "", "%CE%B2", "http://example.org/foo/bar#%CE%B2"});

// The checking of slashes in SpecialAuthoritySlashes needed to get this to pass contradicts what is in the spec,
// but it is included in the web platform tests.
Expand Down Expand Up @@ -630,9 +635,6 @@ TEST_F(URLParserTest, ParserDifferences)
checkURLDifferences("file://[0:a:0:0:b:c:0:0]/path",
{"file", "", "", "[0:a::b:c:0:0]", 0, "/path", "", "", "file://[0:a::b:c:0:0]/path"},
{"file", "", "", "[0:a:0:0:b:c:0:0]", 0, "/path", "", "", "file://[0:a:0:0:b:c:0:0]/path"});
checkRelativeURLDifferences(utf16String(u""), "http://example.org/foo/bar",
{"http", "", "", "example.org", 0, "/foo/bar", "", utf16String(u"β"), utf16String(u"http://example.org/foo/bar#β")},
{"http", "", "", "example.org", 0, "/foo/bar", "", "%CE%B2", "http://example.org/foo/bar#%CE%B2"});
checkURLDifferences("http://",
{"", "", "", "", 0, "", "", "", "http://"},
{"http", "", "", "", 0, "/", "", "", "http:/"});
Expand Down Expand Up @@ -769,12 +771,6 @@ TEST_F(URLParserTest, ParserDifferences)
checkURLDifferences("notspecial://@test@test@example:800\\path@end",
{"notspecial", "@test@test@example", "800\\path", "end", 0, "/", "", "", "notspecial://%40test%40test%40example:800%5Cpath@end/"},
{"", "", "", "", 0, "", "", "", "notspecial://@test@test@example:800\\path@end"});
checkURLDifferences(utf16String(u"http://host?ß😍#ß😍"),
{"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", utf16String(u"ß😍"), utf16String(u"http://host/?%C3%9F%F0%9F%98%8D#ß😍")},
{"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", "%C3%9F%F0%9F%98%8D", "http://host/?%C3%9F%F0%9F%98%8D#%C3%9F%F0%9F%98%8D"}, testTabsValueForSurrogatePairs);
checkURLDifferences(utf16String(u"http://host/path#💩\t💩"),
{"http", "", "", "host", 0, "/path", "", utf16String(u"💩💩"), utf16String(u"http://host/path#💩💩")},
{"http", "", "", "host", 0, "/path", "", "%F0%9F%92%A9%F0%9F%92%A9", "http://host/path#%F0%9F%92%A9%F0%9F%92%A9"});
checkURLDifferences("http://%48OsT",
{"http", "", "", "host", 0, "/", "", "", "http://host/"},
{"http", "", "", "%48ost", 0, "/", "", "", "http://%48ost/"});
Expand Down Expand Up @@ -1060,12 +1056,6 @@ TEST_F(URLParserTest, DefaultPort)
checkURLDifferences("file://:0/path",
{"", "", "", "", 0, "", "", "", "file://:0/path"},
{"file", "", "", "", 0, "/path", "", "", "file://:0/path"});
checkURLDifferences(utf16String(u"http://host/#ПП\u0007 a</"),
{"http", "", "", "host", 0, "/", "", utf16String(u"ПП\u0007 a</"), utf16String(u"http://host/#ПП\u0007 a</")},
{"http", "", "", "host", 0, "/", "", "%D0%9F%D0%9F%07 a</", "http://host/#%D0%9F%D0%9F%07 a</"});
checkURLDifferences(utf16String(u"http://host/#\u0007 a</"),
{"http", "", "", "host", 0, "/", "", "\a a</", "http://host/#\a a</"},
{"http", "", "", "host", 0, "/", "", "%07 a</", "http://host/#%07 a</"});
}

static void shouldFail(const String& urlString)
Expand Down Expand Up @@ -1224,8 +1214,7 @@ static void checkURL(const String& urlString, const String& baseURLString, const

TEST_F(URLParserTest, QueryEncoding)
{
checkURL(utf16String(u"http://host?ß😍#ß😍"), UTF8Encoding(), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", utf16String(u"ß😍"), utf16String(u"http://host/?%C3%9F%F0%9F%98%8D#ß😍")}, testTabsValueForSurrogatePairs);
checkURL(utf16String(u"http://host?ß😍#ß😍"), UTF8Encoding(), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", utf16String(u"ß😍"), utf16String(u"http://host/?%C3%9F%F0%9F%98%8D#ß😍")}, testTabsValueForSurrogatePairs);
checkURL(utf16String(u"http://host?ß😍#ß😍"), UTF8Encoding(), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", "%C3%9F%F0%9F%98%8D", utf16String(u"http://host/?%C3%9F%F0%9F%98%8D#%C3%9F%F0%9F%98%8D")}, testTabsValueForSurrogatePairs);

TextEncoding latin1(String("latin1"));
checkURL("http://host/?query with%20spaces", latin1, {"http", "", "", "host", 0, "/", "query%20with%20spaces", "", "http://host/?query%20with%20spaces"});
Expand Down

0 comments on commit e34a7cb

Please sign in to comment.