Skip to content

Commit

Permalink
Merge pull request #11573 from ScottPJones/spj/utferror
Browse files Browse the repository at this point in the history
Improve Unicode related error messages
  • Loading branch information
tkelman committed Jun 5, 2015
2 parents e9fa25b + 0b158a6 commit 47a0eb1
Show file tree
Hide file tree
Showing 8 changed files with 80 additions and 37 deletions.
1 change: 1 addition & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ export
SystemError,
TypeError,
AssertionError,
UnicodeError,

# Global constants and variables
ARGS,
Expand Down
2 changes: 2 additions & 0 deletions base/sysimg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ include("iterator.jl")
include("osutils.jl")

# strings & printing
include("utferror.jl")
include("utftypes.jl")
include("char.jl")
include("ascii.jl")
include("utf8.jl")
Expand Down
20 changes: 5 additions & 15 deletions base/utf16.jl
Original file line number Diff line number Diff line change
@@ -1,15 +1,5 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

immutable UTF16String <: AbstractString
data::Array{UInt16,1} # includes 16-bit NULL termination after string chars
function UTF16String(data::Vector{UInt16})
if length(data) < 1 || data[end] != 0
throw(ArgumentError("UTF16String data must be NULL-terminated"))
end
new(data)
end
end

utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800
utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00
utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800
Expand Down Expand Up @@ -39,7 +29,7 @@ function next(s::UTF16String, i::Int)
elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1])
return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2
end
throw(ArgumentError("invalid UTF-16 character index"))
throw(UnicodeError(UTF_ERR_INVALID_INDEX,0,0))
end

function reverseind(s::UTF16String, i::Integer)
Expand Down Expand Up @@ -74,7 +64,7 @@ function encode16(s::AbstractString)
push!(buf, UInt16(0xd7c0 + (c>>10)))
push!(buf, UInt16(0xdc00 + (c & 0x3ff)))
else
throw(ArgumentError("invalid Unicode character (0x$(hex(c)) > 0x10ffff)"))
throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, ch))
end
end
push!(buf, 0) # NULL termination
Expand Down Expand Up @@ -111,7 +101,7 @@ function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
end

function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
!isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
!isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
len = length(data)
d = Array(UInt16, len + 1)
d[end] = 0 # NULL terminate
Expand All @@ -126,7 +116,7 @@ convert(T::Type{UTF16String}, data::AbstractArray{Int16}) =

function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
isempty(bytes) && return UTF16String(UInt16[0])
isodd(length(bytes)) && throw(ArgumentError("odd number of bytes"))
isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0))
data = reinterpret(UInt16, bytes)
# check for byte-order mark (BOM):
if data[1] == 0xfeff # native byte order
Expand All @@ -142,7 +132,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
copy!(d,1, data,1, length(data)) # assume native byte order
end
d[end] = 0 # NULL terminate
!isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data"))
!isvalid(UTF16String, d) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
UTF16String(d)
end

Expand Down
20 changes: 2 additions & 18 deletions base/utf32.jl
Original file line number Diff line number Diff line change
@@ -1,19 +1,5 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

## UTF-32 in the native byte order, i.e. plain old character arrays ##

immutable UTF32String <: DirectIndexString
data::Vector{Char} # includes 32-bit NULL termination after string chars

function UTF32String(a::Vector{Char})
if length(a) < 1 || a[end] != Char(0)
throw(ArgumentError("UTF32String data must be NULL-terminated"))
end
new(a)
end
end
UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data))

next(s::UTF32String, i::Int) = (s.data[i], i+1)
endof(s::UTF32String) = length(s.data) - 1
length(s::UTF32String) = length(s.data) - 1
Expand Down Expand Up @@ -65,7 +51,7 @@ unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =

function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
isempty(bytes) && return UTF32String(Char[0])
length(bytes) & 3 != 0 && throw(ArgumentError("need multiple of 4 bytes"))
length(bytes) & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32,0,0))
data = reinterpret(Char, bytes)
# check for byte-order mark (BOM):
if data[1] == Char(0x0000feff) # native byte order
Expand All @@ -91,8 +77,6 @@ function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
return true
end
isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data)

utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len)
Expand All @@ -110,7 +94,7 @@ function map(f, s::UTF32String)
for i = 1:(length(d)-1)
c2 = f(d[i])
if !isa(c2, Char)
throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0))
end
out[i] = (c2::Char)
end
Expand Down
6 changes: 3 additions & 3 deletions base/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ function next(s::UTF8String, i::Int)
end
if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
# b is a continuation byte of a valid UTF-8 character
throw(ArgumentError("invalid UTF-8 character index"))
throw(UnicodeError(UTF_ERR_CONT, i, d[j]))
end
# move past 1 byte in case the data is actually Latin-1
return '\ufffd', i+1
Expand Down Expand Up @@ -198,7 +198,7 @@ function reverse(s::UTF8String)
out = similar(s.data)
if ccall(:u8_reverse, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t),
out, s.data, length(out)) == 1
throw(ArgumentError("invalid UTF-8 data"))
throw(UnicodeError(UTF_ERR_INVALID_8,0,0))
end
UTF8String(out)
end
Expand All @@ -212,7 +212,7 @@ write(io::IO, s::UTF8String) = write(io, s.data)
utf8(x) = convert(UTF8String, x)
convert(::Type{UTF8String}, s::UTF8String) = s
convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(ArgumentError("invalid UTF-8 sequence"))
convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(UnicodeError(UTF_ERR_INVALID_8))
function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)
l = length(a)
idx = 1
Expand Down
30 changes: 30 additions & 0 deletions base/utferror.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

##\brief Error messages for Unicode / UTF support

const UTF_ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)"
const UTF_ERR_CONT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
const UTF_ERR_LONG = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
const UTF_ERR_NOT_LEAD = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
const UTF_ERR_NOT_TRAIL = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
const UTF_ERR_NOT_SURROGATE = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>"
const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
const UTF_ERR_INVALID = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
const UTF_ERR_SURROGATE = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
const UTF_ERR_ODD_BYTES_16 = "UTF16String can't have odd number of bytes <<1>>"
const UTF_ERR_ODD_BYTES_32 = "UTF32String must have multiple of 4 bytes <<1>>"
const UTF_ERR_INVALID_CHAR = "invalid Unicode character (0x<<2>> > 0x10ffff)"
const UTF_ERR_INVALID_8 = "invalid UTF-8 data"
const UTF_ERR_INVALID_16 = "invalid UTF-16 data"
const UTF_ERR_INVALID_INDEX = "invalid character index"
const UTF_ERR_MAP_CHAR = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"

type UnicodeError <: Exception
errmsg::AbstractString ##< A UTF_ERR_ message
errpos::Int32 ##< Position of invalid character
errchr::UInt32 ##< Invalid character
end

show(io::IO, exc::UnicodeError) = print(io, replace(replace(exc.errmsg,"<<1>>",string(exc.errpos)),"<<2>>",hex(exc.errchr)))
34 changes: 34 additions & 0 deletions base/utftypes.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

##\brief Base UTF16String type, has 16-bit NULL termination word after data, native byte order
#
# \throws UnicodeError

immutable UTF16String <: AbstractString
data::Vector{UInt16} # includes 16-bit NULL termination after string chars
function UTF16String(data::Vector{UInt16})
if length(data) < 1 || data[end] != 0
throw(UnicodeError(UTF_ERR_NULL_16_TERMINATE, 0, 0))
end
new(data)
end
end

##\brief Base UTF32String type, has 32-bit NULL termination word after data, native byte order
#
# \throws UnicodeError

immutable UTF32String <: DirectIndexString
data::Vector{Char} # includes 32-bit NULL termination after string chars

function UTF32String(data::Vector{Char})
if length(data) < 1 || data[end] != Char(0)
throw(UnicodeError(UTF_ERR_NULL_32_TERMINATE, 0, 0))
end
new(data)
end
end
UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data))

isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data)
4 changes: 3 additions & 1 deletion test/unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ u16 = utf16(u8)
@test collect(u8) == collect(u16)
@test u8 == utf16(u16.data[1:end-1]) == utf16(copy!(Array(UInt8, 18), 1, reinterpret(UInt8, u16.data), 1, 18))
@test u8 == utf16(pointer(u16)) == utf16(convert(Ptr{Int16}, pointer(u16)))
@test_throws ArgumentError utf16(utf32(Char(0x120000)))
@test_throws UnicodeError utf16(utf32(Char(0x120000)))
@test_throws UnicodeError utf16(UInt8[1,2,3])

# UTF32
u32 = utf32(u8)
Expand All @@ -21,6 +22,7 @@ u32 = utf32(u8)
@test collect(u8) == collect(u32)
@test u8 == utf32(u32.data[1:end-1]) == utf32(copy!(Array(UInt8, 20), 1, reinterpret(UInt8, u32.data), 1, 20))
@test u8 == utf32(pointer(u32)) == utf32(convert(Ptr{Int32}, pointer(u32)))
@test_throws UnicodeError utf32(UInt8[1,2,3])

# Wstring
w = wstring(u8)
Expand Down

0 comments on commit 47a0eb1

Please sign in to comment.