Skip to content

Commit

Permalink
invalid UTF-8 in string literals: allow it and print it
Browse files Browse the repository at this point in the history
also print invalid UTF-8 characters correctly

related: #25072
  • Loading branch information
StefanKarpinski committed Dec 14, 2017
1 parent d192302 commit a632530
Show file tree
Hide file tree
Showing 7 changed files with 65 additions and 33 deletions.
32 changes: 20 additions & 12 deletions base/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ function ismalformed(c::Char)
(((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0)
end

"""
    isoverlong(c::Char) -> Bool

Return `true` if the raw bytes stored in `c` begin an overlong UTF-8 encoding,
i.e. they use more bytes than the shortest form for the code point.

The check looks only at the leading bits of the 32-bit representation of `c`
(first encoded byte in the most significant position):

- `0xc0` or `0xc1` lead byte: two-byte encoding of a code point below U+0080;
- `0xe0` followed by `0x80`-`0x9f`: three-byte encoding of a code point below U+0800;
- `0xf0` followed by `0x80`-`0x8f`: four-byte encoding of a code point below U+10000.
"""
function isoverlong(c::Char)
    u = reinterpret(UInt32, c)
    # Non-short-circuiting `|` keeps this branch-free.
    # Both 0xc0 and 0xc1 can only start overlong two-byte sequences; checking
    # 0xc0 alone would miss e.g. "\xc1\xbf" (an overlong encoding of U+007F).
    return (u >> 24 == 0xc0) | (u >> 24 == 0xc1) |
           (u >> 21 == 0x0704) | (u >> 20 == 0x0f08)
end

function convert(::Type{UInt32}, c::Char)
# TODO: use optimized inline LLVM
u = reinterpret(UInt32, c)
Expand Down Expand Up @@ -111,17 +116,7 @@ function show(io::IO, c::Char)
return
end
end
if Unicode.isprint(c)
write(io, 0x27, c, 0x27)
elseif !ismalformed(c)
u = UInt32(c)
write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
d = max(2, 8 - (leading_zeros(u) >> 2))
while 0 < d
write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])
end
write(io, 0x27)
else # malformed
if isoverlong(c) || ismalformed(c)
write(io, 0x27)
u = reinterpret(UInt32, c)
while true
Expand All @@ -131,15 +126,28 @@ function show(io::IO, c::Char)
(u <<= 8) == 0 && break
end
write(io, 0x27)
elseif Unicode.isprint(c)
write(io, 0x27, c, 0x27)
else # unprintable, well-formed, non-overlong Unicode
u = UInt32(c)
write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
d = max(2, 8 - (leading_zeros(u) >> 2))
while 0 < d
write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])
end
write(io, 0x27)
end
return
end

function show(io::IO, ::MIME"text/plain", c::Char)
show(io, c)
if !ismalformed(c)
print(io, ": ")
isoverlong(c) && print(io, "[overlong] ")
u = UInt32(c)
print(io, ": ", Unicode.isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4))
h = hex(u, u ≤ 0xffff ? 4 : 6)
print(io, (Unicode.isascii(c) ? "ASCII/" : ""), "Unicode U+", h)
else
print(io, ": Malformed UTF-8")
end
Expand Down
11 changes: 0 additions & 11 deletions base/replutil.jl
Original file line number Diff line number Diff line change
Expand Up @@ -139,17 +139,6 @@ end
show(io::IO, ::MIME"text/plain", X::AbstractArray) = _display(io, X)
show(io::IO, ::MIME"text/plain", r::AbstractRange) = show(io, r) # always use the compact form for printing ranges

# display something useful even for strings containing arbitrary
# (non-UTF8) binary data:
function show(io::IO, ::MIME"text/plain", s::String)
if isvalid(s)
show(io, s)
else
println(io, sizeof(s), "-byte String of invalid UTF-8 data:")
print_array(io, Vector{UInt8}(s))
end
end

function show(io::IO, ::MIME"text/plain", opt::JLOptions)
println(io, "JLOptions(")
fields = fieldnames(JLOptions)
Expand Down
18 changes: 12 additions & 6 deletions base/strings/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -271,17 +271,22 @@ function escape_string(io, s::AbstractString, esc::AbstractString="")
i = start(s)
while !done(s,i)
c, j = next(s,i)
if !ismalformed(c)
if c in esc
print(io, '\\', c)
elseif Unicode.isascii(c)
c == '\0' ? print(io, escape_nul(s,j)) :
c == '\e' ? print(io, "\\e") :
c == '\\' ? print(io, "\\\\") :
c in esc ? print(io, '\\', c) :
'\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) :
Unicode.isprint(c) ? print(io, c) :
print(io, "\\x", hex(c, 2))
elseif !isoverlong(c) && !ismalformed(c)
Unicode.isprint(c) ? print(io, c) :
c <= '\x7f' ? print(io, "\\x", hex(c, 2)) :
c <= '\uffff' ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 4 : 2)) :
print(io, "\\U", hex(c, need_full_hex(s,j) ? 8 : 4))
else # malformed
c <= '\uffff' ? print(io, "\\u", hex(c, need_full_hex(s, j) ? 4 : 2)) :
print(io, "\\U", hex(c, need_full_hex(s, j) ? 8 : 4))
else # malformed or overlong
u = bswap(reinterpret(UInt32, c))
while true
print(io, "\\x", hex(u % UInt8, 2))
Expand Down Expand Up @@ -332,9 +337,10 @@ function unescape_string(io, s::AbstractString)
'A' <= c <= 'F' ? n<<4 + (c-'A'+10) : break
i = j
end
if k == 1
if k == 1 || n > 0x10ffff
u = m == 4 ? 'u' : 'U'
throw(ArgumentError("invalid $(m == 2 ? "hex (\\x)" :
"unicode (\\u)") escape sequence used in $(repr(s))"))
"unicode (\\$u)") escape sequence"))
end
if m == 2 # \x escape sequence
write(io, UInt8(n))
Expand Down
5 changes: 1 addition & 4 deletions src/julia-parser.scm
Original file line number Diff line number Diff line change
Expand Up @@ -2090,10 +2090,7 @@
(define (tostr raw io)
(if raw
(io.tostring! io)
(let ((str (unescape-string (io.tostring! io))))
(if (not (string.isutf8 str))
(error "invalid UTF-8 sequence")
str))))
(let ((str (unescape-string (io.tostring! io)))) str)))

;; raw = raw string literal
;; when raw is #t, unescape only \\ and delimiter
Expand Down
19 changes: 19 additions & 0 deletions test/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -217,3 +217,22 @@ end
rm(file, force=true)
end
end

# Test helper: check that `c` converts to the integer `n` and that
# `show` renders it exactly as the string `rep`.
function test_overlong(c::Char, n::Integer, rep::String)
    @test(Int(c) == n)
    shown = sprint(show, c)
    @test(shown == rep)
end

# TODO: use char syntax once #25072 is fixed
test_overlong('\0', 0, "'\\0'")
test_overlong("\xc0\x80"[1], 0, "'\\xc0\\x80'")
test_overlong("\xe0\x80\x80"[1], 0, "'\\xe0\\x80\\x80'")
test_overlong("\xf0\x80\x80\x80"[1], 0, "'\\xf0\\x80\\x80\\x80'")

test_overlong('\x30', 0x30, "'0'")
test_overlong("\xc0\xb0"[1], 0x30, "'\\xc0\\xb0'")
test_overlong("\xe0\x80\xb0"[1], 0x30, "'\\xe0\\x80\\xb0'")
test_overlong("\xf0\x80\x80\xb0"[1], 0x30, "'\\xf0\\x80\\x80\\xb0'")

test_overlong('\u8430', 0x8430, "'萰'")
test_overlong("\xf0\x88\x90\xb0"[1], 0x8430, "'\\xf0\\x88\\x90\\xb0'")
7 changes: 7 additions & 0 deletions test/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,13 @@ end
@test_repr "(!x).a"
@test_repr "(!x)::a"

# invalid UTF-8 strings
@test_repr "\"\\ud800\""
@test_repr "\"\\udfff\""
@test_repr "\"\\xc0\\xb0\""
@test_repr "\"\\xe0\\xb0\\xb0\""
@test_repr "\"\\xf0\\xb0\\xb0\\xb0\""

# Complex

# Meta.parse(repr(:(...))) returns a double-quoted block, so we need to eval twice to unquote it
Expand Down
6 changes: 6 additions & 0 deletions test/syntax.jl
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,12 @@ end
Meta.parse("\"foo\r\nbar\"") == Meta.parse("\"foo\rbar\"") == Meta.parse("\"foo\nbar\"")
@test '\r' == first("\r") == first("\r\n") # still allow explicit \r

# allow invalid UTF-8 in string literals
@test "\ud800"[1] == Char(0xd800)
@test "\udfff"[1] == Char(0xdfff)
@test length("\xc0\xb0") == 1
@test "\xc0\xb0"[1] == reinterpret(Char, 0xc0b00000)

# issue #14561 - generating 0-method generic function def
let fname = :f
@test :(function $fname end) == Expr(:function, :f)
Expand Down

0 comments on commit a632530

Please sign in to comment.