Merge pull request #25073 from JuliaLang/sk/invalid-utf8

allow invalid UTF-8 string literals, deprecate b"..."
JuliaLang · Dec 15, 2017 · 4b5d067 · 4b5d067
2 parents f06f33c + 80548a6
commit 4b5d067
Show file tree

Hide file tree

Showing 10 changed files with 109 additions and 73 deletions.
diff --git a/base/char.jl b/base/char.jl
@@ -17,6 +17,11 @@ function ismalformed(c::Char)
     (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0)
 end
 
+function isoverlong(c::Char) # overlong lead patterns: C0|C1 xx, E0 80-9F xx, F0 80-8F xx xx
+    u = reinterpret(UInt32, c) # raw encoded bytes; the lead byte sits in the top 8 bits
+    (u >> 24 == 0xc0) | (u >> 24 == 0xc1) | (u >> 21 == 0x0704) | (u >> 20 == 0x0f08)
+end
+
 function convert(::Type{UInt32}, c::Char)
     # TODO: use optimized inline LLVM
     u = reinterpret(UInt32, c)
@@ -111,17 +116,7 @@ function show(io::IO, c::Char)
             return
         end
     end
-    if Unicode.isprint(c)
-        write(io, 0x27, c, 0x27)
-    elseif !ismalformed(c)
-        u = UInt32(c)
-        write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
-        d = max(2, 8 - (leading_zeros(u) >> 2))
-        while 0 < d
-            write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])
-        end
-        write(io, 0x27)
-    else # malformed
+    if isoverlong(c) || ismalformed(c)
         write(io, 0x27)
         u = reinterpret(UInt32, c)
         while true
@@ -131,15 +126,28 @@ function show(io::IO, c::Char)
             (u <<= 8) == 0 && break
         end
         write(io, 0x27)
+    elseif Unicode.isprint(c)
+        write(io, 0x27, c, 0x27)
+    else # unprintable, well-formed, non-overlong Unicode
+        u = UInt32(c)
+        write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
+        d = max(2, 8 - (leading_zeros(u) >> 2))
+        while 0 < d
+            write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])
+        end
+        write(io, 0x27)
     end
     return
 end
 
 function show(io::IO, ::MIME"text/plain", c::Char)
     show(io, c)
     if !ismalformed(c)
+        print(io, ": ")
+        isoverlong(c) && print(io, "[overlong] ")
         u = UInt32(c)
-        print(io, ": ", Unicode.isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4))
+        h = hex(u, u ≤ 0xffff ? 4 : 6)
+        print(io, (Unicode.isascii(c) ? "ASCII/" : ""), "Unicode U+", h)
     else
         print(io, ": Malformed UTF-8")
     end

diff --git a/base/replutil.jl b/base/replutil.jl
@@ -139,17 +139,6 @@ end
 show(io::IO, ::MIME"text/plain", X::AbstractArray) = _display(io, X)
 show(io::IO, ::MIME"text/plain", r::AbstractRange) = show(io, r) # always use the compact form for printing ranges
 
-# display something useful even for strings containing arbitrary
-# (non-UTF8) binary data:
-function show(io::IO, ::MIME"text/plain", s::String)
-    if isvalid(s)
-        show(io, s)
-    else
-        println(io, sizeof(s), "-byte String of invalid UTF-8 data:")
-        print_array(io, Vector{UInt8}(s))
-    end
-end
-
 function show(io::IO, ::MIME"text/plain", opt::JLOptions)
     println(io, "JLOptions(")
     fields = fieldnames(JLOptions)

diff --git a/base/serialize.jl b/base/serialize.jl
@@ -674,7 +674,7 @@ function writeheader(s::AbstractSerializer)
                sizeof(Int) == 8 ? 1 :
                error("unsupported word size in serializer"))
     write(io, UInt8(endianness) | (UInt8(machine) << 2))
-    write(io, b"\x00\x00\x00")  # 3 reserved bytes
+    write(io, [0x00,0x00,0x00]) # 3 reserved bytes
     nothing
 end
 

diff --git a/base/strings/io.jl b/base/strings/io.jl
@@ -271,17 +271,22 @@ function escape_string(io, s::AbstractString, esc::AbstractString="")
     i = start(s)
     while !done(s,i)
         c, j = next(s,i)
-        if !ismalformed(c)
+        if c in esc
+            print(io, '\\', c)
+        elseif Unicode.isascii(c)
             c == '\0'          ? print(io, escape_nul(s,j)) :
             c == '\e'          ? print(io, "\\e") :
             c == '\\'          ? print(io, "\\\\") :
             c in esc           ? print(io, '\\', c) :
             '\a' <= c <= '\r'  ? print(io, '\\', "abtnvfr"[Int(c)-6]) :
+            Unicode.isprint(c) ? print(io, c) :
+                                 print(io, "\\x", hex(c, 2))
+        elseif !isoverlong(c) && !ismalformed(c)
             Unicode.isprint(c) ? print(io, c) :
             c <= '\x7f'        ? print(io, "\\x", hex(c, 2)) :
-            c <= '\uffff'      ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 4 : 2)) :
-                                 print(io, "\\U", hex(c, need_full_hex(s,j) ? 8 : 4))
-        else # malformed
+            c <= '\uffff'      ? print(io, "\\u", hex(c, need_full_hex(s, j) ? 4 : 2)) :
+                                 print(io, "\\U", hex(c, need_full_hex(s, j) ? 8 : 4))
+        else # malformed or overlong
             u = bswap(reinterpret(UInt32, c))
             while true
                 print(io, "\\x", hex(u % UInt8, 2))
@@ -332,9 +337,10 @@ function unescape_string(io, s::AbstractString)
                         'A' <= c <= 'F' ? n<<4 + (c-'A'+10) : break
                     i = j
                 end
-                if k == 1
+                if k == 1 || n > 0x10ffff
+                    u = m == 4 ? 'u' : 'U'
                     throw(ArgumentError("invalid $(m == 2 ? "hex (\\x)" :
-                                        "unicode (\\u)") escape sequence used in $(repr(s))"))
+                                        "unicode (\\$u)") escape sequence"))
                 end
                 if m == 2 # \x escape sequence
                     write(io, UInt8(n))

diff --git a/src/julia-parser.scm b/src/julia-parser.scm
@@ -2090,10 +2090,7 @@
 (define (tostr raw io)
   (if raw
       (io.tostring! io)
-      (let ((str (unescape-string (io.tostring! io))))
-        (if (not (string.isutf8 str))
-            (error "invalid UTF-8 sequence")
-            str))))
+      (unescape-string (io.tostring! io)))) ; returned as-is: no UTF-8 validity check
 
 ;; raw = raw string literal
 ;; when raw is #t, unescape only \\ and delimiter

diff --git a/test/char.jl b/test/char.jl
@@ -217,3 +217,22 @@ end
         rm(file, force=true)
     end
 end
+
+function test_overlong(c::Char, n::Integer, rep::String) # check code point and `show` text
+    @test n == Int(c) # `c` must convert to the integer code point `n`
+    @test rep == sprint(io -> show(io, c)) # `show` must render `c` exactly as `rep`
+end
+
+# TODO: use char syntax once #25072 is fixed
+test_overlong('\0', 0, "'\\0'")
+test_overlong("\xc0\x80"[1], 0, "'\\xc0\\x80'")
+test_overlong("\xe0\x80\x80"[1], 0, "'\\xe0\\x80\\x80'")
+test_overlong("\xf0\x80\x80\x80"[1], 0, "'\\xf0\\x80\\x80\\x80'")
+
+test_overlong('\x30', 0x30, "'0'")
+test_overlong("\xc0\xb0"[1], 0x30, "'\\xc0\\xb0'")
+test_overlong("\xe0\x80\xb0"[1], 0x30, "'\\xe0\\x80\\xb0'")
+test_overlong("\xf0\x80\x80\xb0"[1], 0x30, "'\\xf0\\x80\\x80\\xb0'")
+
+test_overlong('\u8430', 0x8430, "'萰'")
+test_overlong("\xf0\x88\x90\xb0"[1], 0x8430, "'\\xf0\\x88\\x90\\xb0'")
diff --git a/test/show.jl b/test/show.jl
@@ -105,6 +105,13 @@ end
 @test_repr "(!x).a"
 @test_repr "(!x)::a"
 
+# invalid UTF-8 strings: lone surrogates, then 2-, 3- and 4-byte overlong encodings of '0'
+@test_repr "\"\\ud800\""
+@test_repr "\"\\udfff\""
+@test_repr "\"\\xc0\\xb0\""
+@test_repr "\"\\xe0\\x80\\xb0\""
+@test_repr "\"\\xf0\\x80\\x80\\xb0\""
+
 # Complex
 
 # Meta.parse(repr(:(...))) returns a double-quoted block, so we need to eval twice to unquote it

diff --git a/test/strings/basic.jl b/test/strings/basic.jl
@@ -316,31 +316,31 @@ end
         @test isvalid(Char, val) == pass
     end
     for (val, pass) in (
-            (b"\x00", true),
-            (b"\x7f", true),
-            (b"\x80", false),
-            (b"\xbf", false),
-            (b"\xc0", false),
-            (b"\xff", false),
-            (b"\xc0\x80", false),
-            (b"\xc1\x80", false),
-            (b"\xc2\x80", true),
-            (b"\xc2\xc0", false),
-            (b"\xed\x9f\xbf", true),
-            (b"\xed\xa0\x80", false),
-            (b"\xed\xbf\xbf", false),
-            (b"\xee\x80\x80", true),
-            (b"\xef\xbf\xbf", true),
-            (b"\xf0\x90\x80\x80", true),
-            (b"\xf4\x8f\xbf\xbf", true),
-            (b"\xf4\x90\x80\x80", false),
-            (b"\xf5\x80\x80\x80", false),
-            (b"\ud800\udc00", false),
-            (b"\udbff\udfff", false),
-            (b"\ud800\u0100", false),
-            (b"\udc00\u0100", false),
-            (b"\udc00\ud800", false)
-            )
+            ("\x00", true),
+            ("\x7f", true),
+            ("\x80", false),
+            ("\xbf", false),
+            ("\xc0", false),
+            ("\xff", false),
+            ("\xc0\x80", false),
+            ("\xc1\x80", false),
+            ("\xc2\x80", true),
+            ("\xc2\xc0", false),
+            ("\xed\x9f\xbf", true),
+            ("\xed\xa0\x80", false),
+            ("\xed\xbf\xbf", false),
+            ("\xee\x80\x80", true),
+            ("\xef\xbf\xbf", true),
+            ("\xf0\x90\x80\x80", true),
+            ("\xf4\x8f\xbf\xbf", true),
+            ("\xf4\x90\x80\x80", false),
+            ("\xf5\x80\x80\x80", false),
+            ("\ud800\udc00", false),
+            ("\udbff\udfff", false),
+            ("\ud800\u0100", false),
+            ("\udc00\u0100", false),
+            ("\udc00\ud800", false),
+        )
         @test isvalid(String, val) == pass == isvalid(String(val))
     end
 
@@ -430,8 +430,8 @@ end
     @test_throws ArgumentError ascii(GenericString("Hello, ∀"))
 end
 @testset "issue #17271: endof() doesn't throw an error even with invalid strings" begin
-    @test endof(String(b"\x90")) == 1
-    @test endof(String(b"\xce")) == 1
+    @test endof("\x90") == 1
+    @test endof("\xce") == 1
 end
 # issue #17624, missing getindex method for String
 @test "abc"[:] == "abc"
@@ -652,3 +652,5 @@ end
         @test ncodeunits(GenericString(s)) == n
     end
 end
+
+@test Vector{UInt8}("\xcc\xdd\xee\xff\x80") == [0xcc,0xdd,0xee,0xff,0x80]
diff --git a/test/syntax.jl b/test/syntax.jl
@@ -232,6 +232,12 @@ end
     Meta.parse("\"foo\r\nbar\"") == Meta.parse("\"foo\rbar\"") == Meta.parse("\"foo\nbar\"")
 @test '\r' == first("\r") == first("\r\n") # still allow explicit \r
 
+# allow invalid UTF-8 in string literals
+@test "\ud800"[1] == Char(0xd800)
+@test "\udfff"[1] == Char(0xdfff)
+@test length("\xc0\xb0") == 1
+@test "\xc0\xb0"[1] == reinterpret(Char, 0xc0b00000)
+
 # issue #14561 - generating 0-method generic function def
 let fname = :f
     @test :(function $fname end) == Expr(:function, :f)

diff --git a/test/unicode/utf8.jl b/test/unicode/utf8.jl
@@ -1,13 +1,13 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
 @testset "string indexing" begin
-    let str = String(b"this is a test\xed\x80")
+    let str = "this is a test\xed\x80"
         @test next(str, 15) == (reinterpret(Char, 0xed800000), 17)
         @test_throws BoundsError getindex(str, 0:3)
         @test_throws BoundsError getindex(str, 17:18)
         @test_throws BoundsError getindex(str, 2:17)
         @test_throws BoundsError getindex(str, 16:17)
-        @test string(Char(0x110000)) == String(b"\xf4\x90\x80\x80")
+        @test string(Char(0x110000)) == "\xf4\x90\x80\x80"
     end
 end
 
@@ -17,20 +17,22 @@ end
     @test reverse("abc") == "cba"
     @test reverse("xyz\uff\u800\uffff\U10ffff") == "\U10ffff\uffff\u800\uffzyx"
     for (s, r) in [
-        b"xyz\xc1"          => b"\xc1zyx",
-        b"xyz\xd0"          => b"\xd0zyx",
-        b"xyz\xe0"          => b"\xe0zyx",
-        b"xyz\xed\x80"      => b"\xed\x80zyx",
-        b"xyz\xf0"          => b"\xf0zyx",
-        b"xyz\xf0\x80"      => b"\xf0\x80zyx",
-        b"xyz\xf0\x80\x80"  => b"\xf0\x80\x80zyx",
+        "xyz\xc1"         => "\xc1zyx",
+        "xyz\xd0"         => "\xd0zyx",
+        "xyz\xe0"         => "\xe0zyx",
+        "xyz\xed\x80"     => "\xed\x80zyx",
+        "xyz\xf0"         => "\xf0zyx",
+        "xyz\xf0\x80"     => "\xf0\x80zyx",
+        "xyz\xf0\x80\x80" => "\xf0\x80\x80zyx",
     ]
-        @test reverse(String(s)) == String(r)
+        @test reverse(s) == r
     end
 end
 
 @testset "string convert" begin
-    @test String(b"this is a test\xed\x80\x80") == "this is a test\ud000"
+    @test String(b"this is a test\xed\x80\x80") ==
+                  "this is a test\xed\x80\x80"  ==
+                  "this is a test\ud000"
     # Specifically check UTF-8 string whose lead byte is same as a surrogate
-    @test String(b"\xed\x9f\xbf") == "\ud7ff"
+    @test String(b"\xed\x9f\xbf") == "\xed\x9f\xbf" == "\ud7ff"
 end