URLParser should percent-encode non-ASCII and non-printable character…

…s in fragment https://bugs.webkit.org/show_bug.cgi?id=163287 Reviewed by Brady Eidson. Source/WebCore: Based on discussion in whatwg/url#150 If that discussion decides to keep the spec as-is (which keeps non-ASCII characters in the fragment to match IE and Edge's behavior, which Chrome has followed for special schemes) then we can revert this change later after enabling the URL parser. Making this change keeps behavior matching Safari and Firefox, as well as Chrome's handling of non-special schemes, such as data URLs. Covered by updated API tests. * platform/URLParser.cpp: (WebCore::URLParser::appendToASCIIBuffer): (WebCore::URLParser::copyURLPartsUntil): (WebCore::URLParser::syntaxViolation): (WebCore::URLParser::currentPosition): (WebCore::URLParser::parse): (WebCore::URLParser::fragmentSyntaxViolation): Deleted. * platform/URLParser.h: No more non-ASCII characters in canonicalized URLs. Tools: * TestWebKitAPI/Tests/WebCore/URLParser.cpp: (TestWebKitAPI::TEST_F): git-svn-id: http://svn.webkit.org/repository/webkit/trunk@207152 268f45cc-cd09-0410-ab3c-d52691b4dbfc
paul99 · Oct 11, 2016 · e34a7cb · e34a7cb
1 parent 69b7052
commit e34a7cb
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 97 deletions.
diff --git a/Source/WebCore/ChangeLog b/Source/WebCore/ChangeLog
@@ -1,3 +1,28 @@
+2016-10-11  Alex Christensen  <[email protected]>
+
+        URLParser should percent-encode non-ASCII and non-printable characters in fragment
+        https://bugs.webkit.org/show_bug.cgi?id=163287
+
+        Reviewed by Brady Eidson.
+
+        Based on discussion in https://github.com/whatwg/url/issues/150
+        If that discussion decides to keep the spec as-is (which keeps non-ASCII characters in the fragment
+        to match IE and Edge's behavior, which Chrome has followed for special schemes) then we can revert
+        this change later after enabling the URL parser.  Making this change keeps behavior matching Safari
+        and Firefox, as well as Chrome's handling of non-special schemes, such as data URLs.
+
+        Covered by updated API tests.
+
+        * platform/URLParser.cpp:
+        (WebCore::URLParser::appendToASCIIBuffer):
+        (WebCore::URLParser::copyURLPartsUntil):
+        (WebCore::URLParser::syntaxViolation):
+        (WebCore::URLParser::currentPosition):
+        (WebCore::URLParser::parse):
+        (WebCore::URLParser::fragmentSyntaxViolation): Deleted.
+        * platform/URLParser.h:
+        No more non-ASCII characters in canonicalized URLs.
+
 2016-10-11  Alex Christensen  <[email protected]>
 
         Remove dead networking code

diff --git a/Source/WebCore/platform/URLParser.cpp b/Source/WebCore/platform/URLParser.cpp
@@ -462,15 +462,13 @@ ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterTy
 
 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
 {
-    ASSERT(m_unicodeFragmentBuffer.isEmpty());
     ASSERT(isASCII(codePoint));
     if (UNLIKELY(m_didSeeSyntaxViolation))
         m_asciiBuffer.append(codePoint);
 }
 
 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
 {
-    ASSERT(m_unicodeFragmentBuffer.isEmpty());
     if (UNLIKELY(m_didSeeSyntaxViolation))
         m_asciiBuffer.append(characters, length);
 }
@@ -829,7 +827,6 @@ void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePoint
     syntaxViolation(iterator);
 
     m_asciiBuffer.clear();
-    m_unicodeFragmentBuffer.clear();
     copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
     switch (part) {
     case URLPart::FragmentEnd:
@@ -1018,7 +1015,6 @@ void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator
     m_didSeeSyntaxViolation = true;
 
     ASSERT(m_asciiBuffer.isEmpty());
-    ASSERT(m_unicodeFragmentBuffer.isEmpty());
     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
     m_asciiBuffer.reserveCapacity(m_inputString.length());
@@ -1028,30 +1024,6 @@ void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator
     }
 }
 
-template<typename CharacterType>
-void URLParser::fragmentSyntaxViolation(const CodePointIterator<CharacterType>& iterator)
-{
-    ASSERT(m_didSeeUnicodeFragmentCodePoint);
-    if (m_didSeeSyntaxViolation)
-        return;
-    m_didSeeSyntaxViolation = true;
-
-    ASSERT(m_asciiBuffer.isEmpty());
-    ASSERT(m_unicodeFragmentBuffer.isEmpty());
-    size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
-    size_t asciiCodeUnitsToCopy = m_url.m_queryEnd;
-    size_t unicodeCodeUnitsToCopy = codeUnitsToCopy - asciiCodeUnitsToCopy;
-    RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
-    m_asciiBuffer.reserveCapacity(asciiCodeUnitsToCopy);
-    for (size_t i = 0; i < asciiCodeUnitsToCopy; ++i) {
-        ASSERT(isASCII(m_inputString[i]));
-        m_asciiBuffer.uncheckedAppend(m_inputString[i]);
-    }
-    m_unicodeFragmentBuffer.reserveCapacity(m_inputString.length() - asciiCodeUnitsToCopy);
-    for (size_t i = asciiCodeUnitsToCopy; i < asciiCodeUnitsToCopy + unicodeCodeUnitsToCopy; ++i)
-        m_unicodeFragmentBuffer.uncheckedAppend(m_inputString[i]);
-}
-
 void URLParser::failure()
 {
     m_url.invalidate();
@@ -1111,10 +1083,8 @@ ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
 template<typename CharacterType>
 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
 {
-    if (UNLIKELY(m_didSeeSyntaxViolation)) {
-        ASSERT(m_unicodeFragmentBuffer.isEmpty());
+    if (UNLIKELY(m_didSeeSyntaxViolation))
         return m_asciiBuffer.size();
-    }
 
     return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
 }
@@ -1160,7 +1130,6 @@ void URLParser::parse(const CharacterType* input, const unsigned length, const U
     URL_PARSER_LOG("Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name());
     m_url = { };
     ASSERT(m_asciiBuffer.isEmpty());
-    ASSERT(m_unicodeFragmentBuffer.isEmpty());
 
     bool isUTF8Encoding = encoding == UTF8Encoding();
     Vector<UChar> queryBuffer;
@@ -1811,35 +1780,9 @@ void URLParser::parse(const CharacterType* input, const unsigned length, const U
             } while (!c.atEnd());
             break;
         case State::Fragment:
-            do {
-                URL_PARSER_LOG("State Fragment");
-                if (!c.atEnd() && isTabOrNewline(*c)) {
-                    if (m_didSeeUnicodeFragmentCodePoint)
-                        fragmentSyntaxViolation(c);
-                    else
-                        syntaxViolation(c);
-                    ++c;
-                    continue;
-                }
-                if (!m_didSeeUnicodeFragmentCodePoint && isASCII(*c)) {
-                    if (m_urlIsSpecial)
-                        appendToASCIIBuffer(*c);
-                    else
-                        utf8PercentEncode<isInSimpleEncodeSet>(c);
-                } else {
-                    if (m_urlIsSpecial) {
-                        m_didSeeUnicodeFragmentCodePoint = true;
-                        if (UNLIKELY(m_didSeeSyntaxViolation))
-                            appendCodePoint(m_unicodeFragmentBuffer, *c);
-                        else {
-                            ASSERT(m_asciiBuffer.isEmpty());
-                            ASSERT(m_unicodeFragmentBuffer.isEmpty());
-                        }
-                    } else
-                        utf8PercentEncode<isInSimpleEncodeSet>(c);
-                }
-                ++c;
-            } while (!c.atEnd());
+            URL_PARSER_LOG("State Fragment");
+            utf8PercentEncode<isInSimpleEncodeSet>(c);
+            ++c;
             break;
         }
     }
@@ -2043,28 +1986,16 @@ void URLParser::parse(const CharacterType* input, const unsigned length, const U
         m_url.m_fragmentEnd = m_url.m_queryEnd;
         break;
     case State::Fragment:
-        {
-            LOG_FINAL_STATE("Fragment");
-            size_t length = m_didSeeSyntaxViolation ? m_asciiBuffer.size() + m_unicodeFragmentBuffer.size() : c.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
-            m_url.m_fragmentEnd = length;
-            break;
-        }
+        LOG_FINAL_STATE("Fragment");
+        m_url.m_fragmentEnd = currentPosition(c);
+        break;
     }
 
     if (LIKELY(!m_didSeeSyntaxViolation)) {
         m_url.m_string = m_inputString;
         ASSERT(m_asciiBuffer.isEmpty());
-        ASSERT(m_unicodeFragmentBuffer.isEmpty());
-    } else if (!m_didSeeUnicodeFragmentCodePoint) {
-        ASSERT(m_unicodeFragmentBuffer.isEmpty());
+    } else
         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
-    } else {
-        Vector<UChar> buffer;
-        buffer.reserveInitialCapacity(m_asciiBuffer.size() + m_unicodeFragmentBuffer.size());
-        buffer.appendVector(m_asciiBuffer);
-        buffer.appendVector(m_unicodeFragmentBuffer);
-        m_url.m_string = String::adopt(WTFMove(buffer));
-    }
     m_url.m_isValid = true;
     URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
 }

diff --git a/Source/WebCore/platform/URLParser.h b/Source/WebCore/platform/URLParser.h
@@ -51,8 +51,6 @@ class URLParser {
 private:
     URL m_url;
     Vector<LChar> m_asciiBuffer;
-    Vector<UChar> m_unicodeFragmentBuffer;
-    bool m_didSeeUnicodeFragmentCodePoint { false };
     bool m_urlIsSpecial { false };
     bool m_hostHasPercentOrNonASCII { false };
     String m_inputString;
@@ -73,7 +71,6 @@ class URLParser {
     void advance(CodePointIterator<CharacterType>&, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition);
     template<typename CharacterType> bool takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>);
     template<typename CharacterType> void syntaxViolation(const CodePointIterator<CharacterType>&);
-    template<typename CharacterType> void fragmentSyntaxViolation(const CodePointIterator<CharacterType>&);
     template<typename CharacterType> bool isPercentEncodedDot(CodePointIterator<CharacterType>);
     template<typename CharacterType> bool isWindowsDriveLetter(CodePointIterator<CharacterType>);
     template<typename CharacterType> bool isSingleDotPathSegment(CodePointIterator<CharacterType>);

diff --git a/Tools/ChangeLog b/Tools/ChangeLog
@@ -1,3 +1,13 @@
+2016-10-11  Alex Christensen  <[email protected]>
+
+        URLParser should percent-encode non-ASCII and non-printable characters in fragment
+        https://bugs.webkit.org/show_bug.cgi?id=163287
+
+        Reviewed by Brady Eidson.
+
+        * TestWebKitAPI/Tests/WebCore/URLParser.cpp:
+        (TestWebKitAPI::TEST_F):
+
 2016-10-11  Alex Christensen  <[email protected]>
 
         Remove dead networking code

diff --git a/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp b/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp
@@ -331,6 +331,10 @@ TEST_F(URLParserTest, Basic)
     checkURL("aA://", {"aa", "", "", "", 0, "//", "", "", "aa://"});
     checkURL(utf16String(u"foo://host/#ПП\u0007 a</"), {"foo", "", "", "host", 0, "/", "", "%D0%9F%D0%9F%07 a</", "foo://host/#%D0%9F%D0%9F%07 a</"});
     checkURL(utf16String(u"foo://host/#\u0007 a</"), {"foo", "", "", "host", 0, "/", "", "%07 a</", "foo://host/#%07 a</"});
+    checkURL(utf16String(u"http://host?ß😍#ß😍"), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", "%C3%9F%F0%9F%98%8D", "http://host/?%C3%9F%F0%9F%98%8D#%C3%9F%F0%9F%98%8D"}, testTabsValueForSurrogatePairs);
+    checkURL(utf16String(u"http://host/path#💩\t💩"), {"http", "", "", "host", 0, "/path", "", "%F0%9F%92%A9%F0%9F%92%A9", "http://host/path#%F0%9F%92%A9%F0%9F%92%A9"}, testTabsValueForSurrogatePairs);
+    checkURL(utf16String(u"http://host/#ПП\u0007 a</"), {"http", "", "", "host", 0, "/", "", "%D0%9F%D0%9F%07 a</", "http://host/#%D0%9F%D0%9F%07 a</"});
+    checkURL(utf16String(u"http://host/#\u0007 a</"), {"http", "", "", "host", 0, "/", "", "%07 a</", "http://host/#%07 a</"});
 
     // This disagrees with the web platform test for http://:@www.example.com but agrees with Chrome and URL::parse,
     // and Firefox fails the web platform test differently. Maybe the web platform test ought to be changed.
@@ -456,6 +460,7 @@ TEST_F(URLParserTest, ParseRelative)
     checkRelativeURL("  ", "http://host/path?query#fra#gment", {"http", "", "", "host", 0, "/path", "query", "", "http://host/path?query"});
     checkRelativeURL(" \a ", "http://host/#fragment", {"http", "", "", "host", 0, "/", "", "", "http://host/"});
     checkRelativeURL("foo://", "http://example.org/foo/bar", {"foo", "", "", "", 0, "//", "", "", "foo://"});
+    checkRelativeURL(utf16String(u"#β"), "http://example.org/foo/bar", {"http", "", "", "example.org", 0, "/foo/bar", "", "%CE%B2", "http://example.org/foo/bar#%CE%B2"});
 
     // The checking of slashes in SpecialAuthoritySlashes needed to get this to pass contradicts what is in the spec,
     // but it is included in the web platform tests.
@@ -630,9 +635,6 @@ TEST_F(URLParserTest, ParserDifferences)
     checkURLDifferences("file://[0:a:0:0:b:c:0:0]/path",
         {"file", "", "", "[0:a::b:c:0:0]", 0, "/path", "", "", "file://[0:a::b:c:0:0]/path"},
         {"file", "", "", "[0:a:0:0:b:c:0:0]", 0, "/path", "", "", "file://[0:a:0:0:b:c:0:0]/path"});
-    checkRelativeURLDifferences(utf16String(u"#β"), "http://example.org/foo/bar",
-        {"http", "", "", "example.org", 0, "/foo/bar", "", utf16String(u"β"), utf16String(u"http://example.org/foo/bar#β")},
-        {"http", "", "", "example.org", 0, "/foo/bar", "", "%CE%B2", "http://example.org/foo/bar#%CE%B2"});
     checkURLDifferences("http://",
         {"", "", "", "", 0, "", "", "", "http://"},
         {"http", "", "", "", 0, "/", "", "", "http:/"});
@@ -769,12 +771,6 @@ TEST_F(URLParserTest, ParserDifferences)
     checkURLDifferences("notspecial://@test@test@example:800\\path@end",
         {"notspecial", "@test@test@example", "800\\path", "end", 0, "/", "", "", "notspecial://%40test%40test%40example:800%5Cpath@end/"},
         {"", "", "", "", 0, "", "", "", "notspecial://@test@test@example:800\\path@end"});
-    checkURLDifferences(utf16String(u"http://host?ß😍#ß😍"),
-        {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", utf16String(u"ß😍"), utf16String(u"http://host/?%C3%9F%F0%9F%98%8D#ß😍")},
-        {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", "%C3%9F%F0%9F%98%8D", "http://host/?%C3%9F%F0%9F%98%8D#%C3%9F%F0%9F%98%8D"}, testTabsValueForSurrogatePairs);
-    checkURLDifferences(utf16String(u"http://host/path#💩\t💩"),
-        {"http", "", "", "host", 0, "/path", "", utf16String(u"💩💩"), utf16String(u"http://host/path#💩💩")},
-        {"http", "", "", "host", 0, "/path", "", "%F0%9F%92%A9%F0%9F%92%A9", "http://host/path#%F0%9F%92%A9%F0%9F%92%A9"});
     checkURLDifferences("http://%48OsT",
         {"http", "", "", "host", 0, "/", "", "", "http://host/"},
         {"http", "", "", "%48ost", 0, "/", "", "", "http://%48ost/"});
@@ -1060,12 +1056,6 @@ TEST_F(URLParserTest, DefaultPort)
     checkURLDifferences("file://:0/path",
         {"", "", "", "", 0, "", "", "", "file://:0/path"},
         {"file", "", "", "", 0, "/path", "", "", "file://:0/path"});
-    checkURLDifferences(utf16String(u"http://host/#ПП\u0007 a</"),
-        {"http", "", "", "host", 0, "/", "", utf16String(u"ПП\u0007 a</"), utf16String(u"http://host/#ПП\u0007 a</")},
-        {"http", "", "", "host", 0, "/", "", "%D0%9F%D0%9F%07 a</", "http://host/#%D0%9F%D0%9F%07 a</"});
-    checkURLDifferences(utf16String(u"http://host/#\u0007 a</"),
-        {"http", "", "", "host", 0, "/", "", "\a a</", "http://host/#\a a</"},
-        {"http", "", "", "host", 0, "/", "", "%07 a</", "http://host/#%07 a</"});
 }
 
 static void shouldFail(const String& urlString)
@@ -1224,8 +1214,7 @@ static void checkURL(const String& urlString, const String& baseURLString, const
 
 TEST_F(URLParserTest, QueryEncoding)
 {
-    checkURL(utf16String(u"http://host?ß😍#ß😍"), UTF8Encoding(), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", utf16String(u"ß😍"), utf16String(u"http://host/?%C3%9F%F0%9F%98%8D#ß😍")}, testTabsValueForSurrogatePairs);
-    checkURL(utf16String(u"http://host?ß😍#ß😍"), UTF8Encoding(), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", utf16String(u"ß😍"), utf16String(u"http://host/?%C3%9F%F0%9F%98%8D#ß😍")}, testTabsValueForSurrogatePairs);
+    checkURL(utf16String(u"http://host?ß😍#ß😍"), UTF8Encoding(), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", "%C3%9F%F0%9F%98%8D", utf16String(u"http://host/?%C3%9F%F0%9F%98%8D#%C3%9F%F0%9F%98%8D")}, testTabsValueForSurrogatePairs);
 
     TextEncoding latin1(String("latin1"));
     checkURL("http://host/?query with%20spaces", latin1, {"http", "", "", "host", 0, "/", "query%20with%20spaces", "", "http://host/?query%20with%20spaces"});