Skip to content

Commit

Permalink
Fix JuliaLang#10959 problems with UTF-8 conversions
Browse files Browse the repository at this point in the history
Update for search change

Updated to use unsafe_checkstring, fix comments

Update comments

Remove @inline from test function

Removed conversions with Vector{Char}

Ensure all changes included
  • Loading branch information
ScottPJones committed Jul 19, 2015
1 parent 5cac28a commit e71378c
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 65 deletions.
8 changes: 4 additions & 4 deletions base/unicode/checkstring.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ const UTF_SURROGATE = 32 ##< surrogate pairs present
(ch << 6) | (byt & 0x3f)
end

"
"""
Validates and calculates the number of characters in a UTF-8, UTF-16 or UTF-32 encoded vector/string
Warning: this function does not check the bounds of the start or end positions
Expand All @@ -46,7 +46,7 @@ Use `checkstring` to make sure the bounds are checked
### Throws:
* `UnicodeError`
"
"""
function unsafe_checkstring end

function unsafe_checkstring(dat::Vector{UInt8},
Expand Down Expand Up @@ -191,7 +191,7 @@ function unsafe_checkstring{T <: Union{Vector{UInt16}, Vector{UInt32}, AbstractS
return totalchar, flags, num4byte, num3byte, num2byte
end

"
"""
Validates and calculates the number of characters in a UTF-8, UTF-16 or UTF-32 encoded vector/string
This function checks the bounds of the start and end positions
Expand All @@ -214,7 +214,7 @@ Use `unsafe_checkstring` to avoid that overhead if the bounds have already been
### Throws:
* `UnicodeError`
"
"""
function checkstring end

# No need to check bounds if using defaults
Expand Down
16 changes: 8 additions & 8 deletions base/unicode/utf16.jl
Original file line number Diff line number Diff line change
Expand Up @@ -101,15 +101,15 @@ function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
return i > n || !is_surrogate_codeunit(data[i])
end

"
"""
Converts an `AbstractString` to a `UTF16String`
### Returns:
* `UTF16String`
### Throws:
* `UnicodeError`
"
"""
function convert(::Type{UTF16String}, str::AbstractString)
len, flags, num4byte = unsafe_checkstring(str)
buf = Vector{UInt16}(len+num4byte+1)
Expand All @@ -128,15 +128,15 @@ function convert(::Type{UTF16String}, str::AbstractString)
UTF16String(buf)
end

"
"""
Converts a `UTF8String` to a `UTF16String`
### Returns:
* `UTF16String`
### Throws:
* `UnicodeError`
"
"""
function convert(::Type{UTF16String}, str::UTF8String)
dat = str.data
# handle zero length string quickly
Expand Down Expand Up @@ -174,15 +174,15 @@ function convert(::Type{UTF16String}, str::UTF8String)
UTF16String(buf)
end

"
"""
Converts a `UTF16String` to a `UTF8String`
### Returns:
* `UTF8String`
### Throws:
* `UnicodeError`
"
"""
function convert(::Type{UTF8String}, str::UTF16String)
dat = str.data
len = sizeof(dat) >>> 1
Expand All @@ -194,7 +194,7 @@ function convert(::Type{UTF8String}, str::UTF16String)
return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
end

"
"""
Converts an already validated UTF-32 encoded vector of `UInt32` to a `UTF16String`
### Input Arguments:
Expand All @@ -203,7 +203,7 @@ Converts an already validated UTF-32 encoded vector of `UInt32` to a `UTF16Strin
### Returns:
* `::UTF16String`
"
"""
function encode_to_utf16(dat, len)
buf = Vector{UInt16}(len)
@inbounds buf[len] = 0 # NULL termination
Expand Down
20 changes: 10 additions & 10 deletions base/unicode/utf32.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@ utf32(x) = convert(UTF32String, x)
# Build a `UTF32String` from a single `Char`: the backing vector holds `c` followed by
# a trailing `Char(0)`.
convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)])
# Already a `UTF32String`: return the argument itself, unchanged.
convert(::Type{UTF32String}, s::UTF32String) = s

"
"""
Converts an `AbstractString` to a `UTF32String`
### Returns:
* `UTF32String`
### Throws:
* `UnicodeError`
"
"""
function convert(::Type{UTF32String}, str::AbstractString)
len, flags = unsafe_checkstring(str)
buf = Vector{Char}(len+1)
Expand All @@ -33,15 +33,15 @@ function convert(::Type{UTF32String}, str::AbstractString)
UTF32String(buf)
end

"
"""
Converts a `UTF32String` to a `UTF8String`
### Returns:
* `UTF8String`
### Throws:
* `UnicodeError`
"
"""
function convert(::Type{UTF8String}, str::UTF32String)
dat = reinterpret(UInt32, str.data)
len = sizeof(dat) >>> 2
Expand All @@ -53,15 +53,15 @@ function convert(::Type{UTF8String}, str::UTF32String)
return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
end

"
"""
Converts a `UTF8String` to a `UTF32String`
### Returns:
* `::UTF32String`
### Throws:
* `UnicodeError`
"
"""
function convert(::Type{UTF32String}, str::UTF8String)
dat = str.data
# handle zero length string quickly
Expand Down Expand Up @@ -107,15 +107,15 @@ function convert(::Type{UTF32String}, str::UTF8String)
UTF32String(buf)
end

"
"""
Converts a `UTF16String` to `UTF32String`
### Returns:
* `::UTF32String`
### Throws:
* `UnicodeError`
"
"""
function convert(::Type{UTF32String}, str::UTF16String)
dat = str.data
len = sizeof(dat)
Expand All @@ -138,15 +138,15 @@ function convert(::Type{UTF32String}, str::UTF16String)
UTF32String(buf)
end

"
"""
Converts a `UTF32String` to `UTF16String`
### Returns:
* `::UTF16String`
### Throws:
* `UnicodeError`
"
"""
function convert(::Type{UTF16String}, str::UTF32String)
dat = reinterpret(UInt32, str.data)
len = sizeof(dat)
Expand Down
92 changes: 77 additions & 15 deletions base/unicode/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## from base/boot.jl:
#
# immutable UTF8String <: AbstractString
# data::Array{UInt8,1}
# data::Vector{UInt8}
# end
#

Expand All @@ -26,6 +26,8 @@ const utf8_trailing = [
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
]

# Returns `true` when `byte` is NOT a UTF-8 continuation byte, i.e. when its top two
# bits are anything other than `10`.
# Undocumented and unexported, but retained because an external package
# (MutableStrings) uses it; it should be deprecated.
is_utf8_start(byte::UInt8) = ((byte&0xc0)!=0x80)

## required core functionality ##
Expand All @@ -34,19 +36,17 @@ function endof(s::UTF8String)
d = s.data
i = length(d)
i == 0 && return i
while !is_utf8_start(d[i])
while is_valid_continuation(d[i])
i -= 1
end
i
end

is_utf8_continuation(byte::UInt8) = ((byte&0xc0) == 0x80)

function length(s::UTF8String)
d = s.data
cnum = 0
for i = 1:length(d)
@inbounds cnum += !is_utf8_continuation(d[i])
@inbounds cnum += !is_valid_continuation(d[i])
end
cnum
end
Expand All @@ -65,7 +65,7 @@ function next(s::UTF8String, i::Int)

d = s.data
b = d[i]
if !is_utf8_start(b)
if is_valid_continuation(b)
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
end
trailing = utf8_trailing[b+1]
Expand Down Expand Up @@ -93,7 +93,7 @@ end
function reverseind(s::UTF8String, i::Integer)
j = lastidx(s) + 1 - i
d = s.data
while !is_utf8_start(d[j])
while is_valid_continuation(d[j])
j -= 1
end
return j
Expand All @@ -106,7 +106,7 @@ sizeof(s::UTF8String) = sizeof(s.data)
# Index of the last byte of `s`: the length of its underlying byte vector `s.data`.
lastidx(s::UTF8String) = length(s.data)

isvalid(s::UTF8String, i::Integer) =
(1 <= i <= endof(s.data)) && is_utf8_start(s.data[i])
(1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])

# Shared `UTF8String` built from an empty `UInt8` vector.
const empty_utf8 = UTF8String(UInt8[])

Expand All @@ -133,7 +133,7 @@ function search(s::UTF8String, c::Char, i::Integer)
throw(BoundsError(s, i))
end
d = s.data
if !is_utf8_start(d[i])
if is_valid_continuation(d[i])
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
end
c < Char(0x80) && return search(d, c%UInt8, i)
Expand Down Expand Up @@ -214,20 +214,82 @@ write(io::IO, s::UTF8String) = write(io, s.data)
# `utf8(x)` is shorthand for `convert(UTF8String, x)`.
utf8(x) = convert(UTF8String, x)
# Already a `UTF8String`: return the argument itself, unchanged.
convert(::Type{UTF8String}, s::UTF8String) = s
# Construct the `UTF8String` directly from the `ASCIIString`'s `data` field; no
# validation step is performed here.
convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(UnicodeError(UTF_ERR_INVALID_8))
function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)

"""
Converts a UTF-8 encoded vector of `UInt8` to a `UTF8String`

The input is first validated with `unsafe_checkstring`.  If it reports neither
over-long encodings (`UTF_LONG`) nor surrogates (`UTF_SURROGATE`), the bytes are
copied unchanged.  Otherwise the data is re-encoded: over-long 2-byte sequences
are collapsed to a single byte, and CESU-8 surrogate pairs (two 3-byte sequences)
are merged into one 4-byte UTF-8 sequence.

### Input Arguments:
* `dat` UTF-8 encoded `Vector{UInt8}` (not modified; the result never aliases it)

### Returns:
* `UTF8String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF8String}, dat::Vector{UInt8})
    # handle zero length string quickly
    isempty(dat) && return empty_utf8
    # get number of bytes to allocate
    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat)
    if (flags & (UTF_LONG | UTF_SURROGATE)) == 0
        # Nothing to normalize: return a plain copy of the input bytes
        len = sizeof(dat)
        @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
    end
    # Copy, but eliminate over-long encodings and surrogate pairs
    len += num2byte + num3byte*2 + num4byte*3
    buf = Vector{UInt8}(len)
    out = 0
    pos = 0
    @inbounds while out < len
        ch::UInt32 = dat[pos += 1]
        # Handle ASCII characters
        if ch <= 0x7f
            buf[out += 1] = ch
        # Handle over-long 2-byte encodings (lead byte 0xc0/0xc1) of values < 0x80
        elseif ch < 0xc2
            buf[out += 1] = ((ch & 3) << 6) | (dat[pos += 1] & 0x3f)
        # Handle 0x80-0x7ff (2-byte sequences, copied as-is)
        elseif ch < 0xe0
            buf[out += 1] = ch
            buf[out += 1] = dat[pos += 1]
        # Handle 3-byte and 4-byte sequences that cannot be surrogates
        elseif ch != 0xed
            buf[out += 1] = ch
            buf[out += 1] = dat[pos += 1]
            buf[out += 1] = dat[pos += 1]
            # Copy 4-byte encoded value
            ch >= 0xf0 && (buf[out += 1] = dat[pos += 1])
        # Lead byte 0xed: either 0xd000-0xd7ff or a surrogate (0xd800-0xdfff)
        else
            ch = dat[pos += 1]
            if ch < 0xa0 # not surrogate pairs
                buf[out += 1] = 0xed
                buf[out += 1] = ch
                buf[out += 1] = dat[pos += 1]
            else
                # Pick up surrogate pairs (CESU-8 format)
                # `pos` is at the 2nd byte of the lead surrogate; the trail surrogate's
                # 2nd and 3rd bytes are at pos+3 and pos+4 (pos+2 is its 0xed lead byte).
                # The lead term yields (0x800 + hi10) << 10 and the trail term
                # 0xc00 + lo10, so subtracting 0x01f0c00 leaves
                # 0x10000 + (hi10 << 10) + lo10, the actual code point.
                # The trail byte must be widened before shifting, otherwise the
                # shift is performed in UInt8 and the high bits are lost.
                ch = (((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10)
                      + ((((dat[pos + 3] & 0x3f) % UInt32) << 6) | (dat[pos + 4] & 0x3f))
                      - 0x01f0c00)
                pos += 4
                output_utf8_4byte!(buf, out, ch)
                out += 4
            end
        end
    end
    return UTF8String(buf)
end

function convert(::Type{UTF8String}, a::Vector{UInt8}, invalids_as::AbstractString)
l = length(a)
idx = 1
iscopy = false
while idx <= l
if is_utf8_start(a[idx])
if !is_valid_continuation(a[idx])
nextidx = idx+1+utf8_trailing[a[idx]+1]
(nextidx <= (l+1)) && (idx = nextidx; continue)
end
!iscopy && (a = copy(a); iscopy = true)
endn = idx
while endn <= l
is_utf8_start(a[endn]) && break
!is_valid_continuation(a[endn]) && break
endn += 1
end
(endn > idx) && (endn -= 1)
Expand All @@ -238,7 +300,7 @@ function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractStr
end
# Generic fallback for any other `AbstractString`: obtain a byte string with
# `bytestring(s)` and pass the result through `utf8`.
convert(::Type{UTF8String}, s::AbstractString) = utf8(bytestring(s))

"
"""
Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
### Input Arguments:
Expand All @@ -247,7 +309,7 @@ Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
### Returns:
* `UTF8String`
"
"""
function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
buf = Vector{UInt8}(len)
out = 0
Expand Down
Loading

0 comments on commit e71378c

Please sign in to comment.