Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow invalid UTF-8 string literals, deprecate b"..." #25073

Merged
merged 3 commits into from
Dec 15, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 20 additions & 12 deletions base/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ function ismalformed(c::Char)
(((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0)
end

# Return `true` if `c` holds an overlong UTF-8 encoding, i.e. a byte sequence
# that is longer than the shortest encoding of the same code point.
#
# The encoded bytes of a `Char` are stored left-aligned in its 32-bit
# representation (first byte in the most significant position), so each
# overlong form can be recognised from a fixed prefix of leading bits:
#   * 2-byte: lead byte 0xc0 or 0xc1 (payload fits in 7 bits)
#   * 3-byte: lead byte 0xe0 followed by a continuation byte 0x80-0x9f
#   * 4-byte: lead byte 0xf0 followed by a continuation byte 0x80-0x8f
function isoverlong(c::Char)
    u = reinterpret(UInt32, c)
    # Non-short-circuiting `|` keeps the check branch-free.
    return (u >> 24 == 0xc0) | (u >> 24 == 0xc1) |
           (u >> 21 == 0x0704) | (u >> 20 == 0x0f08)
end

function convert(::Type{UInt32}, c::Char)
# TODO: use optimized inline LLVM
u = reinterpret(UInt32, c)
Expand Down Expand Up @@ -111,17 +116,7 @@ function show(io::IO, c::Char)
return
end
end
if Unicode.isprint(c)
write(io, 0x27, c, 0x27)
elseif !ismalformed(c)
u = UInt32(c)
write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
d = max(2, 8 - (leading_zeros(u) >> 2))
while 0 < d
write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])
end
write(io, 0x27)
else # malformed
if isoverlong(c) || ismalformed(c)
write(io, 0x27)
u = reinterpret(UInt32, c)
while true
Expand All @@ -131,15 +126,28 @@ function show(io::IO, c::Char)
(u <<= 8) == 0 && break
end
write(io, 0x27)
elseif Unicode.isprint(c)
write(io, 0x27, c, 0x27)
else # unprintable, well-formed, non-overlong Unicode
u = UInt32(c)
write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
d = max(2, 8 - (leading_zeros(u) >> 2))
while 0 < d
write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])
end
write(io, 0x27)
end
return
end

function show(io::IO, ::MIME"text/plain", c::Char)
show(io, c)
if !ismalformed(c)
print(io, ": ")
isoverlong(c) && print(io, "[overlong] ")
u = UInt32(c)
print(io, ": ", Unicode.isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4))
h = hex(u, u ≤ 0xffff ? 4 : 6)
print(io, (Unicode.isascii(c) ? "ASCII/" : ""), "Unicode U+", h)
else
print(io, ": Malformed UTF-8")
end
Expand Down
11 changes: 0 additions & 11 deletions base/replutil.jl
Original file line number Diff line number Diff line change
Expand Up @@ -139,17 +139,6 @@ end
show(io::IO, ::MIME"text/plain", X::AbstractArray) = _display(io, X)
show(io::IO, ::MIME"text/plain", r::AbstractRange) = show(io, r) # always use the compact form for printing ranges

# Display something useful even for strings containing arbitrary
# (non-UTF8) binary data: a valid string is shown with the ordinary
# `show(io, s)` form; otherwise a header line giving the size in bytes is
# printed, followed by the string's bytes passed to `print_array`.
function show(io::IO, ::MIME"text/plain", s::String)
if isvalid(s)
show(io, s)
else
# The header reports `sizeof(s)` (bytes), not the number of characters.
println(io, sizeof(s), "-byte String of invalid UTF-8 data:")
print_array(io, Vector{UInt8}(s))
end
end

function show(io::IO, ::MIME"text/plain", opt::JLOptions)
println(io, "JLOptions(")
fields = fieldnames(JLOptions)
Expand Down
2 changes: 1 addition & 1 deletion base/serialize.jl
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,7 @@ function writeheader(s::AbstractSerializer)
sizeof(Int) == 8 ? 1 :
error("unsupported word size in serializer"))
write(io, UInt8(endianness) | (UInt8(machine) << 2))
write(io, b"\x00\x00\x00") # 3 reserved bytes
write(io, [0x00,0x00,0x00]) # 3 reserved bytes
nothing
end

Expand Down
18 changes: 12 additions & 6 deletions base/strings/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -271,17 +271,22 @@ function escape_string(io, s::AbstractString, esc::AbstractString="")
i = start(s)
while !done(s,i)
c, j = next(s,i)
if !ismalformed(c)
if c in esc
print(io, '\\', c)
elseif Unicode.isascii(c)
c == '\0' ? print(io, escape_nul(s,j)) :
c == '\e' ? print(io, "\\e") :
c == '\\' ? print(io, "\\\\") :
c in esc ? print(io, '\\', c) :
'\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) :
Unicode.isprint(c) ? print(io, c) :
print(io, "\\x", hex(c, 2))
elseif !isoverlong(c) && !ismalformed(c)
Unicode.isprint(c) ? print(io, c) :
c <= '\x7f' ? print(io, "\\x", hex(c, 2)) :
c <= '\uffff' ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 4 : 2)) :
print(io, "\\U", hex(c, need_full_hex(s,j) ? 8 : 4))
else # malformed
c <= '\uffff' ? print(io, "\\u", hex(c, need_full_hex(s, j) ? 4 : 2)) :
print(io, "\\U", hex(c, need_full_hex(s, j) ? 8 : 4))
else # malformed or overlong
u = bswap(reinterpret(UInt32, c))
while true
print(io, "\\x", hex(u % UInt8, 2))
Expand Down Expand Up @@ -332,9 +337,10 @@ function unescape_string(io, s::AbstractString)
'A' <= c <= 'F' ? n<<4 + (c-'A'+10) : break
i = j
end
if k == 1
if k == 1 || n > 0x10ffff
u = m == 4 ? 'u' : 'U'
throw(ArgumentError("invalid $(m == 2 ? "hex (\\x)" :
"unicode (\\u)") escape sequence used in $(repr(s))"))
"unicode (\\$u)") escape sequence"))
end
if m == 2 # \x escape sequence
write(io, UInt8(n))
Expand Down
5 changes: 1 addition & 4 deletions src/julia-parser.scm
Original file line number Diff line number Diff line change
Expand Up @@ -2090,10 +2090,7 @@
(define (tostr raw io)
(if raw
(io.tostring! io)
(let ((str (unescape-string (io.tostring! io))))
(if (not (string.isutf8 str))
(error "invalid UTF-8 sequence")
str))))
(let ((str (unescape-string (io.tostring! io)))) str)))

;; raw = raw string literal
;; when raw is #t, unescape only \\ and delimiter
Expand Down
19 changes: 19 additions & 0 deletions test/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -217,3 +217,22 @@ end
rm(file, force=true)
end
end

# Test helper: check that the character `c` converts to the integer `n`
# and that `show` renders it exactly as the string `rep`.
function test_overlong(c::Char, n::Integer, rep::String)
# Integer value obtained from the character must equal `n`.
@test Int(c) == n
# Textual form produced by `show` must match `rep` exactly.
@test sprint(show, c) == rep
end

# TODO: use char syntax once #25072 is fixed
test_overlong('\0', 0, "'\\0'")
test_overlong("\xc0\x80"[1], 0, "'\\xc0\\x80'")
test_overlong("\xe0\x80\x80"[1], 0, "'\\xe0\\x80\\x80'")
test_overlong("\xf0\x80\x80\x80"[1], 0, "'\\xf0\\x80\\x80\\x80'")

test_overlong('\x30', 0x30, "'0'")
test_overlong("\xc0\xb0"[1], 0x30, "'\\xc0\\xb0'")
test_overlong("\xe0\x80\xb0"[1], 0x30, "'\\xe0\\x80\\xb0'")
test_overlong("\xf0\x80\x80\xb0"[1], 0x30, "'\\xf0\\x80\\x80\\xb0'")

test_overlong('\u8430', 0x8430, "'萰'")
test_overlong("\xf0\x88\x90\xb0"[1], 0x8430, "'\\xf0\\x88\\x90\\xb0'")
7 changes: 7 additions & 0 deletions test/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,13 @@ end
@test_repr "(!x).a"
@test_repr "(!x)::a"

# invalid UTF-8 strings
@test_repr "\"\\ud800\""
@test_repr "\"\\udfff\""
@test_repr "\"\\xc0\\xb0\""
@test_repr "\"\\xe0\\xb0\\xb0\""
@test_repr "\"\\xf0\\xb0\\xb0\\xb0\""

# Complex

# Meta.parse(repr(:(...))) returns a double-quoted block, so we need to eval twice to unquote it
Expand Down
56 changes: 29 additions & 27 deletions test/strings/basic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -316,31 +316,31 @@ end
@test isvalid(Char, val) == pass
end
for (val, pass) in (
(b"\x00", true),
(b"\x7f", true),
(b"\x80", false),
(b"\xbf", false),
(b"\xc0", false),
(b"\xff", false),
(b"\xc0\x80", false),
(b"\xc1\x80", false),
(b"\xc2\x80", true),
(b"\xc2\xc0", false),
(b"\xed\x9f\xbf", true),
(b"\xed\xa0\x80", false),
(b"\xed\xbf\xbf", false),
(b"\xee\x80\x80", true),
(b"\xef\xbf\xbf", true),
(b"\xf0\x90\x80\x80", true),
(b"\xf4\x8f\xbf\xbf", true),
(b"\xf4\x90\x80\x80", false),
(b"\xf5\x80\x80\x80", false),
(b"\ud800\udc00", false),
(b"\udbff\udfff", false),
(b"\ud800\u0100", false),
(b"\udc00\u0100", false),
(b"\udc00\ud800", false)
)
("\x00", true),
("\x7f", true),
("\x80", false),
("\xbf", false),
("\xc0", false),
("\xff", false),
("\xc0\x80", false),
("\xc1\x80", false),
("\xc2\x80", true),
("\xc2\xc0", false),
("\xed\x9f\xbf", true),
("\xed\xa0\x80", false),
("\xed\xbf\xbf", false),
("\xee\x80\x80", true),
("\xef\xbf\xbf", true),
("\xf0\x90\x80\x80", true),
("\xf4\x8f\xbf\xbf", true),
("\xf4\x90\x80\x80", false),
("\xf5\x80\x80\x80", false),
("\ud800\udc00", false),
("\udbff\udfff", false),
("\ud800\u0100", false),
("\udc00\u0100", false),
("\udc00\ud800", false),
)
@test isvalid(String, val) == pass == isvalid(String(val))
end

Expand Down Expand Up @@ -430,8 +430,8 @@ end
@test_throws ArgumentError ascii(GenericString("Hello, ∀"))
end
@testset "issue #17271: endof() doesn't throw an error even with invalid strings" begin
@test endof(String(b"\x90")) == 1
@test endof(String(b"\xce")) == 1
@test endof("\x90") == 1
@test endof("\xce") == 1
end
# issue #17624, missing getindex method for String
@test "abc"[:] == "abc"
Expand Down Expand Up @@ -652,3 +652,5 @@ end
@test ncodeunits(GenericString(s)) == n
end
end

@test Vector{UInt8}("\xcc\xdd\xee\xff\x80") == [0xcc,0xdd,0xee,0xff,0x80]
6 changes: 6 additions & 0 deletions test/syntax.jl
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,12 @@ end
Meta.parse("\"foo\r\nbar\"") == Meta.parse("\"foo\rbar\"") == Meta.parse("\"foo\nbar\"")
@test '\r' == first("\r") == first("\r\n") # still allow explicit \r

# allow invalid UTF-8 in string literals
@test "\ud800"[1] == Char(0xd800)
@test "\udfff"[1] == Char(0xdfff)
@test length("\xc0\xb0") == 1
@test "\xc0\xb0"[1] == reinterpret(Char, 0xc0b00000)

# issue #14561 - generating 0-method generic function def
let fname = :f
@test :(function $fname end) == Expr(:function, :f)
Expand Down
26 changes: 14 additions & 12 deletions test/unicode/utf8.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license

@testset "string indexing" begin
let str = String(b"this is a test\xed\x80")
let str = "this is a test\xed\x80"
@test next(str, 15) == (reinterpret(Char, 0xed800000), 17)
@test_throws BoundsError getindex(str, 0:3)
@test_throws BoundsError getindex(str, 17:18)
@test_throws BoundsError getindex(str, 2:17)
@test_throws BoundsError getindex(str, 16:17)
@test string(Char(0x110000)) == String(b"\xf4\x90\x80\x80")
@test string(Char(0x110000)) == "\xf4\x90\x80\x80"
end
end

Expand All @@ -17,20 +17,22 @@ end
@test reverse("abc") == "cba"
@test reverse("xyz\uff\u800\uffff\U10ffff") == "\U10ffff\uffff\u800\uffzyx"
for (s, r) in [
b"xyz\xc1" => b"\xc1zyx",
b"xyz\xd0" => b"\xd0zyx",
b"xyz\xe0" => b"\xe0zyx",
b"xyz\xed\x80" => b"\xed\x80zyx",
b"xyz\xf0" => b"\xf0zyx",
b"xyz\xf0\x80" => b"\xf0\x80zyx",
b"xyz\xf0\x80\x80" => b"\xf0\x80\x80zyx",
"xyz\xc1" => "\xc1zyx",
"xyz\xd0" => "\xd0zyx",
"xyz\xe0" => "\xe0zyx",
"xyz\xed\x80" => "\xed\x80zyx",
"xyz\xf0" => "\xf0zyx",
"xyz\xf0\x80" => "\xf0\x80zyx",
"xyz\xf0\x80\x80" => "\xf0\x80\x80zyx",
]
@test reverse(String(s)) == String(r)
@test reverse(s) == r
end
end

@testset "string convert" begin
@test String(b"this is a test\xed\x80\x80") == "this is a test\ud000"
@test String(b"this is a test\xed\x80\x80") ==
"this is a test\xed\x80\x80" ==
"this is a test\ud000"
# Specifically check UTF-8 string whose lead byte is same as a surrogate
@test String(b"\xed\x9f\xbf") == "\ud7ff"
@test String(b"\xed\x9f\xbf") == "\xed\x9f\xbf" == "\ud7ff"
end