Skip to content

Commit

Permalink
Fix JuliaLang#10959 problems with UTF-8 conversions
Browse files Browse the repository at this point in the history
  • Loading branch information
ScottPJones committed Jun 22, 2015
1 parent 1f1a6fd commit 9175a02
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 48 deletions.
108 changes: 88 additions & 20 deletions base/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## from base/boot.jl:
#
# immutable UTF8String <: AbstractString
# data::Array{UInt8,1}
# data::Vector{UInt8}
# end
#

Expand All @@ -26,27 +26,23 @@ const utf8_trailing = [
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
]

# A byte can start a UTF-8 character unless it is a continuation byte,
# i.e. unless its top two bits are `10` (values 0x80 through 0xbf).
is_utf8_start(byte::UInt8) = !(0x80 <= byte <= 0xbf)

## required core functionality ##

function endof(s::UTF8String)
d = s.data
i = length(d)
i == 0 && return i
while !is_utf8_start(d[i])
while is_valid_continuation(d[i])
i -= 1
end
i
end

# A UTF-8 continuation byte has its top two bits set to `10`,
# which is exactly the range 0x80 through 0xbf.
is_utf8_continuation(byte::UInt8) = 0x80 <= byte <= 0xbf

function length(s::UTF8String)
d = s.data
cnum = 0
for i = 1:length(d)
@inbounds cnum += !is_utf8_continuation(d[i])
@inbounds cnum += !is_valid_continuation(d[i])
end
cnum
end
Expand All @@ -65,8 +61,17 @@ function next(s::UTF8String, i::Int)

d = s.data
b = d[i]
if !is_utf8_start(b)
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
if is_valid_continuation(b)
j = i-1
while 0 < j && is_valid_continuation(d[j])
j -= 1
end
if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
# b is a continuation byte of a valid UTF-8 character
throw(UnicodeError(UTF_ERR_CONT, i, d[j]))
end
# move past 1 byte in case the data is actually Latin-1
return '\ufffd', i+1
end
trailing = utf8_trailing[b+1]
if length(d) < i + trailing
Expand All @@ -93,7 +98,7 @@ end
function reverseind(s::UTF8String, i::Integer)
j = lastidx(s) + 1 - i
d = s.data
while !is_utf8_start(d[j])
while is_valid_continuation(d[j])
j -= 1
end
return j
Expand All @@ -106,19 +111,16 @@ sizeof(s::UTF8String) = sizeof(s.data)
# Number of bytes in the string's `data` vector, which is also the index
# of its last byte.
function lastidx(s::UTF8String)
    return length(s.data)
end

isvalid(s::UTF8String, i::Integer) =
(1 <= i <= endof(s.data)) && is_utf8_start(s.data[i])
(1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])

const empty_utf8 = UTF8String(UInt8[])

function getindex(s::UTF8String, r::UnitRange{Int})
isempty(r) && return empty_utf8
i, j = first(r), last(r)
d = s.data
if i < 1 || i > length(s.data)
throw(BoundsError(s, i))
end
if !is_utf8_start(d[i])
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
if is_valid_continuation(d[i])
i = nextind(s,i)
end
if j > length(d)
throw(BoundsError())
Expand Down Expand Up @@ -214,20 +216,86 @@ write(io::IO, s::UTF8String) = write(io, s.data)
# Convenience wrapper: forward to `convert(UTF8String, x)`.
function utf8(x)
    return convert(UTF8String, x)
end

# A UTF8String converts to itself; the same object is returned.
function convert(::Type{UTF8String}, s::UTF8String)
    return s
end

# An ASCIIString's byte vector is passed straight to the UTF8String constructor.
function convert(::Type{UTF8String}, s::ASCIIString)
    return UTF8String(s.data)
end
convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(UnicodeError(UTF_ERR_INVALID_8))
function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)

"""
    convert(::Type{UTF8String}, dat::Vector{UInt8})

Convert a vector of UTF-8 encoded bytes to a `UTF8String`.

The input is first examined by `check_string`. When it reports neither
over-long encodings nor surrogates, the bytes are copied unchanged.
Otherwise the data is re-encoded: an over-long 2-byte sequence is collapsed
to a single byte, and a surrogate pair stored as two 3-byte sequences
(CESU-8) is merged into one 4-byte sequence.

### Input Arguments:
* `::Type{UTF8String}`
* `dat::Vector{UInt8}` the encoded bytes; `dat` itself is never modified

### Returns:
* `::UTF8String` backed by a newly allocated byte vector
  (or the shared `empty_utf8` constant when `dat` is empty)

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF8String}, dat::Vector{UInt8})
    # handle zero length string quickly
    isempty(dat) && return empty_utf8
    # get number of bytes to allocate
    # (check_string is defined elsewhere; presumably it validates `dat` and is
    # the source of the documented UnicodeError -- confirm against its definition)
    len, flags, num4byte, num3byte, num2byte = check_string(dat)
    if (flags & (UTF_LONG | UTF_SURROGATE)) == 0
        # Nothing has to be rewritten: return a byte-for-byte copy
        len = sizeof(dat)
        @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
    end
    # Copy, but eliminate over-long encodings and surrogate pairs
    # (one extra byte per 2-byte, two per 3-byte, three per 4-byte character)
    len += num2byte + num3byte*2 + num4byte*3
    buf = Vector{UInt8}(len)
    out = 0
    pos = 0
    @inbounds while out < len
        ch::UInt32 = dat[pos += 1]
        # Handle ASCII characters
        if ch <= 0x7f
            buf[out += 1] = ch
        # Handle over-long 2-byte encoding (lead byte below 0xc2):
        # the decoded value always fits in a single byte < 0x80
        elseif ch < 0xc2
            buf[out += 1] = ((ch & 3) << 6) | (dat[pos += 1] & 0x3f)
        # Handle 0x80-0x7ff (regular 2-byte sequence, copied as is)
        elseif ch < 0xe0
            buf[out += 1] = ch
            buf[out += 1] = dat[pos += 1]
        # Copy 3-byte encoded value (any lead byte except 0xed)
        elseif ch != 0xed
            buf[out += 1] = ch
            buf[out += 1] = dat[pos += 1]
            buf[out += 1] = dat[pos += 1]
            # Copy 4-byte encoded value
            ch >= 0xf0 && (buf[out += 1] = dat[pos += 1])
        # Lead byte 0xed: either a regular character or a surrogate
        else
            ch = dat[pos += 1]
            if ch < 0xa0 # not surrogate pairs
                buf[out += 1] = 0xed
                buf[out += 1] = ch
                buf[out += 1] = dat[pos += 1]
            else
                # Pick up surrogate pairs (CESU-8 format).
                # Each 3-byte half yields the low 12 bits of its UTF-16 code unit:
                #   lead  = 0x800 + (high 10 bits of the character offset)
                #   trail = 0xc00 + (low 10 bits of the character offset)
                # so the code point is (lead << 10) + trail - 0x1f0c00, where
                #   0x1f0c00 == (0x800 << 10) + 0xc00 - 0x10000.
                # The trail byte is widened to UInt32 before shifting, because
                # shifting a UInt8 left discards every bit moved past bit 7.
                lead = ((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)
                trail = (UInt32(dat[pos + 3] & 0x3f) << 6) | (dat[pos + 4] & 0x3f)
                ch = (lead << 10) + trail - 0x1f0c00
                pos += 4
                # NOTE(review): output_utf8_4byte! is defined elsewhere; this
                # assumes it takes the Unicode code point and writes its 4-byte
                # encoding at buf[out+1:out+4] -- confirm.
                output_utf8_4byte!(buf, out, ch)
                out += 4
            end
        end
    end
    return UTF8String(buf)
end

function convert(::Type{UTF8String}, a::Vector{UInt8}, invalids_as::AbstractString)
l = length(a)
idx = 1
iscopy = false
while idx <= l
if is_utf8_start(a[idx])
if !is_valid_continuation(a[idx])
nextidx = idx+1+utf8_trailing[a[idx]+1]
(nextidx <= (l+1)) && (idx = nextidx; continue)
end
!iscopy && (a = copy(a); iscopy = true)
endn = idx
while endn <= l
is_utf8_start(a[endn]) && break
!is_valid_continuation(a[endn]) && break
endn += 1
end
(endn > idx) && (endn -= 1)
Expand Down
59 changes: 31 additions & 28 deletions test/strings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1917,8 +1917,8 @@ str3_UTF8 = "abcd\uff\uff\u7fff\u7fff"
str4_UTF8 = "abcd\uff\u7ff\u7fff\U7ffff"
strS_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80")
strC_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\U10000")
strZ_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80")
strz_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\0")
strZ = b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80"

strA_UTF16 = utf16(strA_UTF8)
strL_UTF16 = utf16(strL_UTF8)
Expand Down Expand Up @@ -1953,97 +1953,100 @@ tstcvt(str4_UTF8,str4_UTF16,str4_UTF32)
@test utf16(strS_UTF32) == strC_UTF8

# Test converting overlong \0
# @test utf8(strZ_UTF8) == strz_UTF8 # currently broken! (in utf8.jl)
@test utf16(strZ_UTF8) == strz_UTF8
@test utf32(strZ_UTF8) == strz_UTF8
@test utf8(strZ) == strz_UTF8
@test utf16(UTF8String(strZ)) == strz_UTF8
@test utf32(UTF8String(strZ)) == strz_UTF8

# Test invalid sequences

# Prepare raw test bytes for `convert(T, ...)`: for UTF8String the bytes are
# passed through unchanged, while for UTF16String/UTF32String they are first
# wrapped in a UTF8String.
@inline function strval(::Type{UTF8String}, dat)
    return dat
end
@inline function strval(::Union(Type{UTF16String},Type{UTF32String}), dat)
    return UTF8String(dat)
end

byt = 0x0
for T in (UTF16String, UTF32String)
for T in (UTF8String, UTF16String, UTF32String)
try
# Continuation byte not after lead
for byt in 0x80:0xbf
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt]))
@test_throws UnicodeError convert(T, strval(T,UInt8[byt]))
end

# Test lead bytes
for byt in 0xc0:0xff
# Single lead byte at end of string
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt]))
@test_throws UnicodeError convert(T, strval(T,UInt8[byt]))
# Lead followed by non-continuation character < 0x80
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0]))
@test_throws UnicodeError convert(T, strval(T,UInt8[byt,0]))
# Lead followed by non-continuation character > 0xbf
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0xc0]))
@test_throws UnicodeError convert(T, strval(T,UInt8[byt,0xc0]))
end

# Test overlong 2-byte
for byt in 0x81:0xbf
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xc0,byt]))
@test_throws UnicodeError convert(T, strval(T,UInt8[0xc0,byt]))
end
for byt in 0x80:0xbf
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xc1,byt]))
@test_throws UnicodeError convert(T, strval(T,UInt8[0xc1,byt]))
end

# Test overlong 3-byte
for byt in 0x80:0x9f
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xe0,byt,0x80]))
@test_throws UnicodeError convert(T, strval(T,UInt8[0xe0,byt,0x80]))
end

# Test overlong 4-byte
for byt in 0x80:0x8f
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xef,byt,0x80,0x80]))
@test_throws UnicodeError convert(T, strval(T,UInt8[0xef,byt,0x80,0x80]))
end

# Test 4-byte > 0x10ffff
for byt in 0x90:0xbf
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xf4,byt,0x80,0x80]))
@test_throws UnicodeError convert(T, strval(T,UInt8[0xf4,byt,0x80,0x80]))
end
for byt in 0xf5:0xf7
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80]))
@test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0x80]))
end

# Test 5-byte
for byt in 0xf8:0xfb
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80]))
@test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0x80,0x80]))
end

# Test 6-byte
for byt in 0xfc:0xfd
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80,0x80]))
@test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0x80,0x80,0x80]))
end

# Test 7-byte
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80]))
@test_throws UnicodeError convert(T, strval(T,UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80]))

# Three and above byte sequences
for byt in 0xe0:0xef
# Lead followed by only 1 continuation byte
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80]))
@test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80]))
# Lead ended by non-continuation character < 0x80
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0]))
@test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0]))
# Lead ended by non-continuation character > 0xbf
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0xc0]))
@test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0xc0]))
end

# 3-byte encoded surrogate character(s)
# Single surrogate
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80]))
@test_throws UnicodeError convert(T, strval(T,UInt8[0xed,0xa0,0x80]))
# Not followed by surrogate
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80]))
@test_throws UnicodeError convert(T, strval(T,UInt8[0xed,0xa0,0x80,0xed,0x80,0x80]))
# Trailing surrogate first
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80]))
@test_throws UnicodeError convert(T, strval(T,UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80]))
# Followed by lead surrogate
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80]))
@test_throws UnicodeError convert(T, strval(T,UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80]))

# Four byte sequences
for byt in 0xf0:0xf4
# Lead followed by only 2 continuation bytes
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80]))
@test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80]))
# Lead followed by non-continuation character < 0x80
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0]))
@test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0]))
# Lead followed by non-continuation character > 0xbf
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0xc0]))
@test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0xc0]))
end
catch exp ;
println("Error checking $T: $byt")
Expand Down

0 comments on commit 9175a02

Please sign in to comment.