From 0ab2c1915df126fba52000f17669ca03dff8513b Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Wed, 13 Dec 2017 21:23:52 -0500 Subject: [PATCH] invalid UTF-8 in string literals: allow it and print it also print invalid UTF-8 characters correctly related: #25072 --- base/char.jl | 32 ++++++++++++++++++++------------ base/replutil.jl | 11 ----------- base/strings/io.jl | 18 ++++++++++++------ src/julia-parser.scm | 5 +---- test/char.jl | 19 +++++++++++++++++++ test/show.jl | 7 +++++++ test/syntax.jl | 6 ++++++ 7 files changed, 65 insertions(+), 33 deletions(-) diff --git a/base/char.jl b/base/char.jl index 76f87409a656d..44c230bb4041b 100644 --- a/base/char.jl +++ b/base/char.jl @@ -17,6 +17,11 @@ function ismalformed(c::Char) (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) end +function isoverlong(c::Char) + u = reinterpret(UInt32, c) + (u >> 24 == 0xc0) | (u >> 21 == 0x0704) | (u >> 20 == 0x0f08) +end + function convert(::Type{UInt32}, c::Char) # TODO: use optimized inline LLVM u = reinterpret(UInt32, c) @@ -111,17 +116,7 @@ function show(io::IO, c::Char) return end end - if Unicode.isprint(c) - write(io, 0x27, c, 0x27) - elseif !ismalformed(c) - u = UInt32(c) - write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55) - d = max(2, 8 - (leading_zeros(u) >> 2)) - while 0 < d - write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1]) - end - write(io, 0x27) - else # malformed + if isoverlong(c) || ismalformed(c) write(io, 0x27) u = reinterpret(UInt32, c) while true @@ -131,6 +126,16 @@ function show(io::IO, c::Char) (u <<= 8) == 0 && break end write(io, 0x27) + elseif Unicode.isprint(c) + write(io, 0x27, c, 0x27) + else # unprintable, well-formed, non-overlong Unicode + u = UInt32(c) + write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55) + d = max(2, 8 - (leading_zeros(u) >> 2)) + while 0 < d + write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1]) + end + write(io, 0x27) end return end @@ -138,8 +143,11 @@ end function show(io::IO, ::MIME"text/plain", c::Char) show(io, c) if !ismalformed(c) + print(io, ": ") + isoverlong(c) && print(io, "[overlong] ") u = UInt32(c) - print(io, ": ", Unicode.isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4)) + h = hex(u, u ≤ 0xffff ? 4 : 6) + print(io, (Unicode.isascii(c) ? "ASCII/" : ""), "Unicode U+", h) else print(io, ": Malformed UTF-8") end diff --git a/base/replutil.jl b/base/replutil.jl index afaef1312c5df..f0cda76db7771 100644 --- a/base/replutil.jl +++ b/base/replutil.jl @@ -139,17 +139,6 @@ end show(io::IO, ::MIME"text/plain", X::AbstractArray) = _display(io, X) show(io::IO, ::MIME"text/plain", r::AbstractRange) = show(io, r) # always use the compact form for printing ranges -# display something useful even for strings containing arbitrary -# (non-UTF8) binary data: -function show(io::IO, ::MIME"text/plain", s::String) - if isvalid(s) - show(io, s) - else - println(io, sizeof(s), "-byte String of invalid UTF-8 data:") - print_array(io, Vector{UInt8}(s)) - end -end - function show(io::IO, ::MIME"text/plain", opt::JLOptions) println(io, "JLOptions(") fields = fieldnames(JLOptions) diff --git a/base/strings/io.jl b/base/strings/io.jl index 98648bac824a6..e53a62ced49a1 100644 --- a/base/strings/io.jl +++ b/base/strings/io.jl @@ -271,17 +271,22 @@ function escape_string(io, s::AbstractString, esc::AbstractString="") i = start(s) while !done(s,i) c, j = next(s,i) - if !ismalformed(c) + if c in esc + print(io, '\\', c) + elseif Unicode.isascii(c) c == '\0' ? print(io, escape_nul(s,j)) : c == '\e' ? print(io, "\\e") : c == '\\' ? print(io, "\\\\") : c in esc ? print(io, '\\', c) : '\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) : + Unicode.isprint(c) ? print(io, c) : + print(io, "\\x", hex(c, 2)) + elseif !isoverlong(c) && !ismalformed(c) Unicode.isprint(c) ? print(io, c) : c <= '\x7f' ? print(io, "\\x", hex(c, 2)) : - c <= '\uffff' ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 4 : 2)) : - print(io, "\\U", hex(c, need_full_hex(s,j) ? 8 : 4)) - else # malformed + c <= '\uffff' ? print(io, "\\u", hex(c, need_full_hex(s, j) ? 4 : 2)) : + print(io, "\\U", hex(c, need_full_hex(s, j) ? 8 : 4)) + else # malformed or overlong u = bswap(reinterpret(UInt32, c)) while true print(io, "\\x", hex(u % UInt8, 2)) @@ -332,9 +337,10 @@ function unescape_string(io, s::AbstractString) 'A' <= c <= 'F' ? n<<4 + (c-'A'+10) : break i = j end - if k == 1 + if k == 1 || n > 0x10ffff + u = m == 4 ? 'u' : 'U' throw(ArgumentError("invalid $(m == 2 ? "hex (\\x)" : - "unicode (\\u)") escape sequence used in $(repr(s))")) + "unicode (\\$u)") escape sequence")) end if m == 2 # \x escape sequence write(io, UInt8(n)) diff --git a/src/julia-parser.scm b/src/julia-parser.scm index e4a80a023196f..da41ca2402c23 100644 --- a/src/julia-parser.scm +++ b/src/julia-parser.scm @@ -2090,10 +2090,7 @@ (define (tostr raw io) (if raw (io.tostring! io) - (let ((str (unescape-string (io.tostring! io)))) - (if (not (string.isutf8 str)) - (error "invalid UTF-8 sequence") - str)))) + (let ((str (unescape-string (io.tostring! io)))) str))) ;; raw = raw string literal ;; when raw is #t, unescape only \\ and delimiter diff --git a/test/char.jl b/test/char.jl index b6548183891ff..17b2c8eabf04b 100644 --- a/test/char.jl +++ b/test/char.jl @@ -217,3 +217,22 @@ end rm(file, force=true) end end + +function test_overlong(c::Char, n::Integer, rep::String) + @test Int(c) == n + @test sprint(show, c) == rep +end + +# TODO: use char syntax once #25072 is fixed +test_overlong('\0', 0, "'\\0'") +test_overlong("\xc0\x80"[1], 0, "'\\xc0\\x80'") +test_overlong("\xe0\x80\x80"[1], 0, "'\\xe0\\x80\\x80'") +test_overlong("\xf0\x80\x80\x80"[1], 0, "'\\xf0\\x80\\x80\\x80'") + +test_overlong('\x30', 0x30, "'0'") +test_overlong("\xc0\xb0"[1], 0x30, "'\\xc0\\xb0'") +test_overlong("\xe0\x80\xb0"[1], 0x30, "'\\xe0\\x80\\xb0'") +test_overlong("\xf0\x80\x80\xb0"[1], 0x30, "'\\xf0\\x80\\x80\\xb0'") + +test_overlong('\u8430', 0x8430, "'萰'") +test_overlong("\xf0\x88\x90\xb0"[1], 0x8430, "'\\xf0\\x88\\x90\\xb0'") diff --git a/test/show.jl b/test/show.jl index 00434d2d085ab..48e2015aeb393 100644 --- a/test/show.jl +++ b/test/show.jl @@ -105,6 +105,13 @@ end @test_repr "(!x).a" @test_repr "(!x)::a" +# invalid UTF-8 strings +@test_repr "\"\\ud800\"" +@test_repr "\"\\udfff\"" +@test_repr "\"\\xc0\\xb0\"" +@test_repr "\"\\xe0\\xb0\\xb0\"" +@test_repr "\"\\xf0\\xb0\\xb0\\xb0\"" + # Complex # Meta.parse(repr(:(...))) returns a double-quoted block, so we need to eval twice to unquote it diff --git a/test/syntax.jl b/test/syntax.jl index 1fb6a1d41cdef..365fbe7e4682b 100644 --- a/test/syntax.jl +++ b/test/syntax.jl @@ -232,6 +232,12 @@ end Meta.parse("\"foo\r\nbar\"") == Meta.parse("\"foo\rbar\"") == Meta.parse("\"foo\nbar\"") @test '\r' == first("\r") == first("\r\n") # still allow explicit \r +# allow invalid UTF-8 in string literals +@test "\ud800"[1] == Char(0xd800) +@test "\udfff"[1] == Char(0xdfff) +@test length("\xc0\xb0") == 1 +@test "\xc0\xb0"[1] == reinterpret(Char, 0xc0b00000) + # issue #14561 - generating 0-method generic function def let fname = :f @test :(function $fname end) == Expr(:function, :f)