Skip to content

Commit

Permalink
Fix JuliaLang#10959 problems with UTF-8 conversions
Browse files Browse the repository at this point in the history
Update for search change

Updated to use unsafe_checkstring, fix comments
  • Loading branch information
ScottPJones committed Jul 10, 2015
1 parent 4424f42 commit e17ecf4
Showing 1 changed file with 85 additions and 21 deletions.
106 changes: 85 additions & 21 deletions base/unicode/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## from base/boot.jl:
#
# immutable UTF8String <: AbstractString
# data::Array{UInt8,1}
# data::Vector{UInt8}
# end
#

Expand All @@ -26,27 +26,23 @@ const utf8_trailing = [
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
]

is_utf8_start(byte::UInt8) = ((byte&0xc0)!=0x80)

## required core functionality ##

function endof(s::UTF8String)
d = s.data
i = length(d)
i == 0 && return i
while !is_utf8_start(d[i])
while is_valid_continuation(d[i])
i -= 1
end
i
end

is_utf8_continuation(byte::UInt8) = ((byte&0xc0) == 0x80)

function length(s::UTF8String)
d = s.data
cnum = 0
for i = 1:length(d)
@inbounds cnum += !is_utf8_continuation(d[i])
@inbounds cnum += !is_valid_continuation(d[i])
end
cnum
end
Expand All @@ -65,8 +61,17 @@ function next(s::UTF8String, i::Int)

d = s.data
b = d[i]
if !is_utf8_start(b)
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
if is_valid_continuation(b)
j = i-1
while 0 < j && is_valid_continuation(d[j])
j -= 1
end
if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
# b is a continuation byte of a valid UTF-8 character
throw(UnicodeError(UTF_ERR_CONT, i, d[j]))
end
# move past 1 byte in case the data is actually Latin-1
return '\ufffd', i+1
end
trailing = utf8_trailing[b+1]
if length(d) < i + trailing
Expand All @@ -93,7 +98,7 @@ end
function reverseind(s::UTF8String, i::Integer)
j = lastidx(s) + 1 - i
d = s.data
while !is_utf8_start(d[j])
while is_valid_continuation(d[j])
j -= 1
end
return j
Expand All @@ -106,19 +111,16 @@ sizeof(s::UTF8String) = sizeof(s.data)
lastidx(s::UTF8String) = length(s.data)

isvalid(s::UTF8String, i::Integer) =
(1 <= i <= endof(s.data)) && is_utf8_start(s.data[i])
(1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])

const empty_utf8 = UTF8String(UInt8[])

function getindex(s::UTF8String, r::UnitRange{Int})
isempty(r) && return empty_utf8
i, j = first(r), last(r)
d = s.data
if i < 1 || i > length(s.data)
throw(BoundsError(s, i))
end
if !is_utf8_start(d[i])
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
if is_valid_continuation(d[i])
i = nextind(s,i)
end
if j > length(d)
throw(BoundsError())
Expand All @@ -133,7 +135,7 @@ function search(s::UTF8String, c::Char, i::Integer)
throw(BoundsError(s, i))
end
d = s.data
if !is_utf8_start(d[i])
if is_valid_continuation(d[i])
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
end
c < Char(0x80) && return search(d, c%UInt8, i)
Expand Down Expand Up @@ -214,20 +216,82 @@ write(io::IO, s::UTF8String) = write(io, s.data)
utf8(x) = convert(UTF8String, x)
convert(::Type{UTF8String}, s::UTF8String) = s
convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(UnicodeError(UTF_ERR_INVALID_8))
function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)

"
Converts a UTF-8 encoded vector of `UInt8` to a `UTF8String`
### Returns:
* `UTF8String`
### Throws:
* `UnicodeError`
"
function convert(::Type{UTF8String}, dat::Vector{UInt8})
# handle zero length string quickly
isempty(dat) && return empty_utf8
# get number of bytes to allocate
len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat)
if (flags & (UTF_LONG | UTF_SURROGATE)) == 0
len = sizeof(dat)
@inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
end
# Copy, but eliminate over-long encodings and surrogate pairs
len += num2byte + num3byte*2 + num4byte*3
buf = Vector{UInt8}(len)
out = 0
pos = 0
@inbounds while out < len
ch::UInt32 = dat[pos += 1]
# Handle ASCII characters
if ch <= 0x7f
buf[out += 1] = ch
# Handle overlong < 0x100
elseif ch < 0xc2
buf[out += 1] = ((ch & 3) << 6) | (dat[pos += 1] & 0x3f)
# Handle 0x100-0x7ff
elseif ch < 0xe0
buf[out += 1] = ch
buf[out += 1] = dat[pos += 1]
elseif ch != 0xed
buf[out += 1] = ch
buf[out += 1] = dat[pos += 1]
buf[out += 1] = dat[pos += 1]
# Copy 4-byte encoded value
ch >= 0xf0 && (buf[out += 1] = dat[pos += 1])
# Handle surrogate pairs
else
ch = dat[pos += 1]
if ch < 0xa0 # not surrogate pairs
buf[out += 1] = 0xed
buf[out += 1] = ch
buf[out += 1] = dat[pos += 1]
else
# Pick up surrogate pairs (CESU-8 format)
ch = (((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10)
+ (((dat[pos + 3] & 0x3f) << 6) | (dat[pos + 4] & 0x3f))
- 0xc00)
pos += 4
output_utf8_4byte!(buf, out, ch)
out += 4
end
end
end
UTF8String(buf)
end

function convert(::Type{UTF8String}, a::Vector{UInt8}, invalids_as::AbstractString)
l = length(a)
idx = 1
iscopy = false
while idx <= l
if is_utf8_start(a[idx])
if !is_valid_continuation(a[idx])
nextidx = idx+1+utf8_trailing[a[idx]+1]
(nextidx <= (l+1)) && (idx = nextidx; continue)
end
!iscopy && (a = copy(a); iscopy = true)
endn = idx
while endn <= l
is_utf8_start(a[endn]) && break
!is_valid_continuation(a[endn]) && break
endn += 1
end
(endn > idx) && (endn -= 1)
Expand Down

0 comments on commit e17ecf4

Please sign in to comment.