Skip to content

Commit

Permalink
Merge pull request #25073 from JuliaLang/sk/invalid-utf8
Browse files Browse the repository at this point in the history
allow invalid UTF-8 string literals, deprecate b"..."
  • Loading branch information
StefanKarpinski authored Dec 15, 2017
2 parents f06f33c + 80548a6 commit 4b5d067
Show file tree
Hide file tree
Showing 10 changed files with 109 additions and 73 deletions.
32 changes: 20 additions & 12 deletions base/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ function ismalformed(c::Char)
(((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0)
end

# Return `true` if `c` is an overlong UTF-8 encoding, i.e. its bytes use a longer
# form than the shortest one able to represent the same code point.
#
# The raw bits of a `Char` hold its UTF-8 bytes starting at the most significant
# byte, so each check below shifts away the low bits and compares the leading
# byte(s) against the patterns that can only start an overlong sequence:
#   * 2-byte form: lead byte 0xc0 or 0xc1 (would encode U+0000..U+007F)
#   * 3-byte form: lead byte 0xe0 followed by 0x80..0x9f
#   * 4-byte form: lead byte 0xf0 followed by 0x80..0x8f
function isoverlong(c::Char)
    u = reinterpret(UInt32, c)
    # Non-short-circuit `|` is kept on purpose: all operands are cheap and pure.
    return (u >> 24 == 0xc0) | (u >> 24 == 0xc1) |
           (u >> 21 == 0x0704) | (u >> 20 == 0x0f08)
end

function convert(::Type{UInt32}, c::Char)
# TODO: use optimized inline LLVM
u = reinterpret(UInt32, c)
Expand Down Expand Up @@ -111,17 +116,7 @@ function show(io::IO, c::Char)
return
end
end
if Unicode.isprint(c)
write(io, 0x27, c, 0x27)
elseif !ismalformed(c)
u = UInt32(c)
write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
d = max(2, 8 - (leading_zeros(u) >> 2))
while 0 < d
write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])
end
write(io, 0x27)
else # malformed
if isoverlong(c) || ismalformed(c)
write(io, 0x27)
u = reinterpret(UInt32, c)
while true
Expand All @@ -131,15 +126,28 @@ function show(io::IO, c::Char)
(u <<= 8) == 0 && break
end
write(io, 0x27)
elseif Unicode.isprint(c)
write(io, 0x27, c, 0x27)
else # unprintable, well-formed, non-overlong Unicode
u = UInt32(c)
write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
d = max(2, 8 - (leading_zeros(u) >> 2))
while 0 < d
write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])
end
write(io, 0x27)
end
return
end

function show(io::IO, ::MIME"text/plain", c::Char)
show(io, c)
if !ismalformed(c)
print(io, ": ")
isoverlong(c) && print(io, "[overlong] ")
u = UInt32(c)
print(io, ": ", Unicode.isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4))
h = hex(u, u ≤ 0xffff ? 4 : 6)
print(io, (Unicode.isascii(c) ? "ASCII/" : ""), "Unicode U+", h)
else
print(io, ": Malformed UTF-8")
end
Expand Down
11 changes: 0 additions & 11 deletions base/replutil.jl
Original file line number Diff line number Diff line change
Expand Up @@ -139,17 +139,6 @@ end
show(io::IO, ::MIME"text/plain", X::AbstractArray) = _display(io, X)
show(io::IO, ::MIME"text/plain", r::AbstractRange) = show(io, r) # always use the compact form for printing ranges

# Plain-text display of a `String`. Valid strings are shown via the regular
# two-argument `show`; strings for which `isvalid(s)` is false are instead
# rendered as a one-line header followed by their raw bytes, so that arbitrary
# (non-UTF8) binary data still produces something useful.
function show(io::IO, ::MIME"text/plain", s::String)
    # Common case first: nothing special to do for valid data.
    isvalid(s) && return show(io, s)
    # Invalid data: announce the byte count, then dump the bytes as an array.
    println(io, sizeof(s), "-byte String of invalid UTF-8 data:")
    return print_array(io, Vector{UInt8}(s))
end

function show(io::IO, ::MIME"text/plain", opt::JLOptions)
println(io, "JLOptions(")
fields = fieldnames(JLOptions)
Expand Down
2 changes: 1 addition & 1 deletion base/serialize.jl
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,7 @@ function writeheader(s::AbstractSerializer)
sizeof(Int) == 8 ? 1 :
error("unsupported word size in serializer"))
write(io, UInt8(endianness) | (UInt8(machine) << 2))
write(io, b"\x00\x00\x00") # 3 reserved bytes
write(io, [0x00,0x00,0x00]) # 3 reserved bytes
nothing
end

Expand Down
18 changes: 12 additions & 6 deletions base/strings/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -271,17 +271,22 @@ function escape_string(io, s::AbstractString, esc::AbstractString="")
i = start(s)
while !done(s,i)
c, j = next(s,i)
if !ismalformed(c)
if c in esc
print(io, '\\', c)
elseif Unicode.isascii(c)
c == '\0' ? print(io, escape_nul(s,j)) :
c == '\e' ? print(io, "\\e") :
c == '\\' ? print(io, "\\\\") :
c in esc ? print(io, '\\', c) :
'\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) :
Unicode.isprint(c) ? print(io, c) :
print(io, "\\x", hex(c, 2))
elseif !isoverlong(c) && !ismalformed(c)
Unicode.isprint(c) ? print(io, c) :
c <= '\x7f' ? print(io, "\\x", hex(c, 2)) :
c <= '\uffff' ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 4 : 2)) :
print(io, "\\U", hex(c, need_full_hex(s,j) ? 8 : 4))
else # malformed
c <= '\uffff' ? print(io, "\\u", hex(c, need_full_hex(s, j) ? 4 : 2)) :
print(io, "\\U", hex(c, need_full_hex(s, j) ? 8 : 4))
else # malformed or overlong
u = bswap(reinterpret(UInt32, c))
while true
print(io, "\\x", hex(u % UInt8, 2))
Expand Down Expand Up @@ -332,9 +337,10 @@ function unescape_string(io, s::AbstractString)
'A' <= c <= 'F' ? n<<4 + (c-'A'+10) : break
i = j
end
if k == 1
if k == 1 || n > 0x10ffff
u = m == 4 ? 'u' : 'U'
throw(ArgumentError("invalid $(m == 2 ? "hex (\\x)" :
"unicode (\\u)") escape sequence used in $(repr(s))"))
"unicode (\\$u)") escape sequence"))
end
if m == 2 # \x escape sequence
write(io, UInt8(n))
Expand Down
5 changes: 1 addition & 4 deletions src/julia-parser.scm
Original file line number Diff line number Diff line change
Expand Up @@ -2090,10 +2090,7 @@
(define (tostr raw io)
(if raw
(io.tostring! io)
(let ((str (unescape-string (io.tostring! io))))
(if (not (string.isutf8 str))
(error "invalid UTF-8 sequence")
str))))
(let ((str (unescape-string (io.tostring! io)))) str)))

;; raw = raw string literal
;; when raw is #t, unescape only \\ and delimiter
Expand Down
19 changes: 19 additions & 0 deletions test/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -217,3 +217,22 @@ end
rm(file, force=true)
end
end

# Test helper: check a single `Char` in two ways.
#   c   - the character under test
#   n   - the integer value that `Int(c)` is expected to produce
#   rep - the exact text that `show` is expected to print for `c`
function test_overlong(c::Char, n::Integer, rep::String)
# The character must convert to the expected integer value.
@test Int(c) == n
# The printed form must match exactly (`sprint` captures what `show` writes).
@test sprint(show, c) == rep
end

# TODO: use char syntax once #25072 is fixed
# Each group starts with the ordinary character, followed by longer byte
# sequences that are expected to convert to the same integer but to be shown
# as escaped raw bytes rather than as the character itself.

# Value 0: plain NUL, then 2-, 3- and 4-byte sequences.
test_overlong('\0', 0, "'\\0'")
test_overlong("\xc0\x80"[1], 0, "'\\xc0\\x80'")
test_overlong("\xe0\x80\x80"[1], 0, "'\\xe0\\x80\\x80'")
test_overlong("\xf0\x80\x80\x80"[1], 0, "'\\xf0\\x80\\x80\\x80'")

# Value 0x30 ('0'): plain ASCII digit, then 2-, 3- and 4-byte sequences.
test_overlong('\x30', 0x30, "'0'")
test_overlong("\xc0\xb0"[1], 0x30, "'\\xc0\\xb0'")
test_overlong("\xe0\x80\xb0"[1], 0x30, "'\\xe0\\x80\\xb0'")
test_overlong("\xf0\x80\x80\xb0"[1], 0x30, "'\\xf0\\x80\\x80\\xb0'")

# Value 0x8430: the `\u8430` character, then a 4-byte sequence.
test_overlong('\u8430', 0x8430, "'萰'")
test_overlong("\xf0\x88\x90\xb0"[1], 0x8430, "'\\xf0\\x88\\x90\\xb0'")
7 changes: 7 additions & 0 deletions test/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,13 @@ end
@test_repr "(!x).a"
@test_repr "(!x)::a"

# invalid UTF-8 strings
@test_repr "\"\\ud800\""
@test_repr "\"\\udfff\""
@test_repr "\"\\xc0\\xb0\""
@test_repr "\"\\xe0\\xb0\\xb0\""
@test_repr "\"\\xf0\\xb0\\xb0\\xb0\""

# Complex

# Meta.parse(repr(:(...))) returns a double-quoted block, so we need to eval twice to unquote it
Expand Down
56 changes: 29 additions & 27 deletions test/strings/basic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -316,31 +316,31 @@ end
@test isvalid(Char, val) == pass
end
for (val, pass) in (
(b"\x00", true),
(b"\x7f", true),
(b"\x80", false),
(b"\xbf", false),
(b"\xc0", false),
(b"\xff", false),
(b"\xc0\x80", false),
(b"\xc1\x80", false),
(b"\xc2\x80", true),
(b"\xc2\xc0", false),
(b"\xed\x9f\xbf", true),
(b"\xed\xa0\x80", false),
(b"\xed\xbf\xbf", false),
(b"\xee\x80\x80", true),
(b"\xef\xbf\xbf", true),
(b"\xf0\x90\x80\x80", true),
(b"\xf4\x8f\xbf\xbf", true),
(b"\xf4\x90\x80\x80", false),
(b"\xf5\x80\x80\x80", false),
(b"\ud800\udc00", false),
(b"\udbff\udfff", false),
(b"\ud800\u0100", false),
(b"\udc00\u0100", false),
(b"\udc00\ud800", false)
)
("\x00", true),
("\x7f", true),
("\x80", false),
("\xbf", false),
("\xc0", false),
("\xff", false),
("\xc0\x80", false),
("\xc1\x80", false),
("\xc2\x80", true),
("\xc2\xc0", false),
("\xed\x9f\xbf", true),
("\xed\xa0\x80", false),
("\xed\xbf\xbf", false),
("\xee\x80\x80", true),
("\xef\xbf\xbf", true),
("\xf0\x90\x80\x80", true),
("\xf4\x8f\xbf\xbf", true),
("\xf4\x90\x80\x80", false),
("\xf5\x80\x80\x80", false),
("\ud800\udc00", false),
("\udbff\udfff", false),
("\ud800\u0100", false),
("\udc00\u0100", false),
("\udc00\ud800", false),
)
@test isvalid(String, val) == pass == isvalid(String(val))
end

Expand Down Expand Up @@ -430,8 +430,8 @@ end
@test_throws ArgumentError ascii(GenericString("Hello, ∀"))
end
@testset "issue #17271: endof() doesn't throw an error even with invalid strings" begin
@test endof(String(b"\x90")) == 1
@test endof(String(b"\xce")) == 1
@test endof("\x90") == 1
@test endof("\xce") == 1
end
# issue #17624, missing getindex method for String
@test "abc"[:] == "abc"
Expand Down Expand Up @@ -652,3 +652,5 @@ end
@test ncodeunits(GenericString(s)) == n
end
end

@test Vector{UInt8}("\xcc\xdd\xee\xff\x80") == [0xcc,0xdd,0xee,0xff,0x80]
6 changes: 6 additions & 0 deletions test/syntax.jl
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,12 @@ end
Meta.parse("\"foo\r\nbar\"") == Meta.parse("\"foo\rbar\"") == Meta.parse("\"foo\nbar\"")
@test '\r' == first("\r") == first("\r\n") # still allow explicit \r

# allow invalid UTF-8 in string literals
@test "\ud800"[1] == Char(0xd800)
@test "\udfff"[1] == Char(0xdfff)
@test length("\xc0\xb0") == 1
@test "\xc0\xb0"[1] == reinterpret(Char, 0xc0b00000)

# issue #14561 - generating 0-method generic function def
let fname = :f
@test :(function $fname end) == Expr(:function, :f)
Expand Down
26 changes: 14 additions & 12 deletions test/unicode/utf8.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license

@testset "string indexing" begin
let str = String(b"this is a test\xed\x80")
let str = "this is a test\xed\x80"
@test next(str, 15) == (reinterpret(Char, 0xed800000), 17)
@test_throws BoundsError getindex(str, 0:3)
@test_throws BoundsError getindex(str, 17:18)
@test_throws BoundsError getindex(str, 2:17)
@test_throws BoundsError getindex(str, 16:17)
@test string(Char(0x110000)) == String(b"\xf4\x90\x80\x80")
@test string(Char(0x110000)) == "\xf4\x90\x80\x80"
end
end

Expand All @@ -17,20 +17,22 @@ end
@test reverse("abc") == "cba"
@test reverse("xyz\uff\u800\uffff\U10ffff") == "\U10ffff\uffff\u800\uffzyx"
for (s, r) in [
b"xyz\xc1" => b"\xc1zyx",
b"xyz\xd0" => b"\xd0zyx",
b"xyz\xe0" => b"\xe0zyx",
b"xyz\xed\x80" => b"\xed\x80zyx",
b"xyz\xf0" => b"\xf0zyx",
b"xyz\xf0\x80" => b"\xf0\x80zyx",
b"xyz\xf0\x80\x80" => b"\xf0\x80\x80zyx",
"xyz\xc1" => "\xc1zyx",
"xyz\xd0" => "\xd0zyx",
"xyz\xe0" => "\xe0zyx",
"xyz\xed\x80" => "\xed\x80zyx",
"xyz\xf0" => "\xf0zyx",
"xyz\xf0\x80" => "\xf0\x80zyx",
"xyz\xf0\x80\x80" => "\xf0\x80\x80zyx",
]
@test reverse(String(s)) == String(r)
@test reverse(s) == r
end
end

@testset "string convert" begin
@test String(b"this is a test\xed\x80\x80") == "this is a test\ud000"
@test String(b"this is a test\xed\x80\x80") ==
"this is a test\xed\x80\x80" ==
"this is a test\ud000"
# Specifically check UTF-8 string whose lead byte is same as a surrogate
@test String(b"\xed\x9f\xbf") == "\ud7ff"
@test String(b"\xed\x9f\xbf") == "\xed\x9f\xbf" == "\ud7ff"
end

0 comments on commit 4b5d067

Please sign in to comment.