diff --git a/base/unicode/utf8.jl b/base/unicode/utf8.jl index f38c2149179d5e..db246148c517f7 100644 --- a/base/unicode/utf8.jl +++ b/base/unicode/utf8.jl @@ -3,7 +3,7 @@ ## from base/boot.jl: # # immutable UTF8String <: AbstractString -# data::Array{UInt8,1} +# data::Vector{UInt8} # end # @@ -26,27 +26,23 @@ const utf8_trailing = [ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5, ] -is_utf8_start(byte::UInt8) = ((byte&0xc0)!=0x80) - ## required core functionality ## function endof(s::UTF8String) d = s.data i = length(d) i == 0 && return i - while !is_utf8_start(d[i]) + while is_valid_continuation(d[i]) i -= 1 end i end -is_utf8_continuation(byte::UInt8) = ((byte&0xc0) == 0x80) - function length(s::UTF8String) d = s.data cnum = 0 for i = 1:length(d) - @inbounds cnum += !is_utf8_continuation(d[i]) + @inbounds cnum += !is_valid_continuation(d[i]) end cnum end @@ -65,8 +61,17 @@ function next(s::UTF8String, i::Int) d = s.data b = d[i] - if !is_utf8_start(b) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i])) + if is_valid_continuation(b) + j = i-1 + while 0 < j && is_valid_continuation(d[j]) + j -= 1 + end + if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d) + # b is a continuation byte of a valid UTF-8 character + throw(UnicodeError(UTF_ERR_CONT, i, d[j])) + end + # move past 1 byte in case the data is actually Latin-1 + return '\ufffd', i+1 end trailing = utf8_trailing[b+1] if length(d) < i + trailing @@ -93,7 +98,7 @@ end function reverseind(s::UTF8String, i::Integer) j = lastidx(s) + 1 - i d = s.data - while !is_utf8_start(d[j]) + while is_valid_continuation(d[j]) j -= 1 end return j @@ -106,7 +111,7 @@ sizeof(s::UTF8String) = sizeof(s.data) lastidx(s::UTF8String) = length(s.data) isvalid(s::UTF8String, i::Integer) = - (1 <= i <= endof(s.data)) && is_utf8_start(s.data[i]) + (1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i]) const empty_utf8 = UTF8String(UInt8[]) @@ -114,11 +119,8 @@ function getindex(s::UTF8String, r::UnitRange{Int}) isempty(r) && return empty_utf8 i, j = first(r), last(r) d = s.data - if i < 1 || i > length(s.data) - throw(BoundsError(s, i)) - end - if !is_utf8_start(d[i]) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i])) + if is_valid_continuation(d[i]) + i = nextind(s,i) end if j > length(d) throw(BoundsError()) @@ -133,7 +135,7 @@ function search(s::UTF8String, c::Char, i::Integer) throw(BoundsError(s, i)) end d = s.data - if !is_utf8_start(d[i]) + if is_valid_continuation(d[i]) throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i])) end c < Char(0x80) && return search(d, c%UInt8, i) @@ -214,20 +216,82 @@ write(io::IO, s::UTF8String) = write(io, s.data) utf8(x) = convert(UTF8String, x) convert(::Type{UTF8String}, s::UTF8String) = s convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data) -convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(UnicodeError(UTF_ERR_INVALID_8)) -function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString) + +" +Converts a UTF-8 encoded vector of `UInt8` to a `UTF8String` + +### Returns: +* `UTF8String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF8String}, dat::Vector{UInt8}) + # handle zero length string quickly + isempty(dat) && return empty_utf8 + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat) + if (flags & (UTF_LONG | UTF_SURROGATE)) == 0 + len = sizeof(dat) + @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + end + # Copy, but eliminate over-long encodings and surrogate pairs + len += num2byte + num3byte*2 + num4byte*3 + buf = Vector{UInt8}(len) + out = 0 + pos = 0 + @inbounds while out < len + ch::UInt32 = dat[pos += 1] + # Handle ASCII characters + if ch <= 0x7f + buf[out += 1] = ch + # Handle overlong < 0x100 + elseif ch < 0xc2 + buf[out += 1] = ((ch & 3) << 6) | (dat[pos += 1] & 0x3f) + # Handle 0x100-0x7ff + elseif ch < 0xe0 + buf[out += 1] = ch + buf[out += 1] = dat[pos += 1] + elseif ch != 0xed + buf[out += 1] = ch + buf[out += 1] = dat[pos += 1] + buf[out += 1] = dat[pos += 1] + # Copy 4-byte encoded value + ch >= 0xf0 && (buf[out += 1] = dat[pos += 1]) + # Handle surrogate pairs + else + ch = dat[pos += 1] + if ch < 0xa0 # not surrogate pairs + buf[out += 1] = 0xed + buf[out += 1] = ch + buf[out += 1] = dat[pos += 1] + else + # Pick up surrogate pairs (CESU-8 format) + ch = (((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10) + + (((dat[pos + 3] & 0x3f) << 6) | (dat[pos + 4] & 0x3f)) + - 0xc00) + pos += 4 + output_utf8_4byte!(buf, out, ch) + out += 4 + end + end + end + UTF8String(buf) +end + +function convert(::Type{UTF8String}, a::Vector{UInt8}, invalids_as::AbstractString) l = length(a) idx = 1 iscopy = false while idx <= l - if is_utf8_start(a[idx]) + if !is_valid_continuation(a[idx]) nextidx = idx+1+utf8_trailing[a[idx]+1] (nextidx <= (l+1)) && (idx = nextidx; continue) end !iscopy && (a = copy(a); iscopy = true) endn = idx while endn <= l - is_utf8_start(a[endn]) && break + !is_valid_continuation(a[endn]) && break endn += 1 end (endn > idx) && (endn -= 1)