Merge pull request #11573 from ScottPJones/spj/utferror

Improve Unicode related error messages
JuliaLang · Jun 5, 2015 · 47a0eb1 · 47a0eb1
2 parents e9fa25b + 0b158a6
commit 47a0eb1
Show file tree

Hide file tree

Showing 8 changed files with 80 additions and 37 deletions.
diff --git a/base/exports.jl b/base/exports.jl
@@ -165,6 +165,7 @@ export
     SystemError,
     TypeError,
     AssertionError,
+    UnicodeError,
 
 # Global constants and variables
     ARGS,

diff --git a/base/sysimg.jl b/base/sysimg.jl
@@ -84,6 +84,8 @@ include("iterator.jl")
 include("osutils.jl")
 
 # strings & printing
+include("utferror.jl")
+include("utftypes.jl")
 include("char.jl")
 include("ascii.jl")
 include("utf8.jl")

diff --git a/base/utf16.jl b/base/utf16.jl
@@ -1,15 +1,5 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
-immutable UTF16String <: AbstractString
-    data::Array{UInt16,1} # includes 16-bit NULL termination after string chars
-    function UTF16String(data::Vector{UInt16})
-        if length(data) < 1 || data[end] != 0
-            throw(ArgumentError("UTF16String data must be NULL-terminated"))
-        end
-        new(data)
-    end
-end
-
 utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800
 utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00
 utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800
@@ -39,7 +29,7 @@ function next(s::UTF16String, i::Int)
     elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1])
         return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2
     end
-    throw(ArgumentError("invalid UTF-16 character index"))
+    throw(UnicodeError(UTF_ERR_INVALID_INDEX,0,0))
 end
 
 function reverseind(s::UTF16String, i::Integer)
@@ -74,7 +64,7 @@ function encode16(s::AbstractString)
             push!(buf, UInt16(0xd7c0 + (c>>10)))
             push!(buf, UInt16(0xdc00 + (c & 0x3ff)))
         else
-            throw(ArgumentError("invalid Unicode character (0x$(hex(c)) > 0x10ffff)"))
+            throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, ch))
         end
     end
     push!(buf, 0) # NULL termination
@@ -111,7 +101,7 @@ function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
 end
 
 function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
-    !isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
+    !isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
     len = length(data)
     d = Array(UInt16, len + 1)
     d[end] = 0 # NULL terminate
@@ -126,7 +116,7 @@ convert(T::Type{UTF16String}, data::AbstractArray{Int16}) =
 
 function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
     isempty(bytes) && return UTF16String(UInt16[0])
-    isodd(length(bytes)) && throw(ArgumentError("odd number of bytes"))
+    isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0))
     data = reinterpret(UInt16, bytes)
     # check for byte-order mark (BOM):
     if data[1] == 0xfeff        # native byte order
@@ -142,7 +132,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
         copy!(d,1, data,1, length(data)) # assume native byte order
     end
     d[end] = 0 # NULL terminate
-    !isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data"))
+    !isvalid(UTF16String, d) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
     UTF16String(d)
 end
 

diff --git a/base/utf32.jl b/base/utf32.jl
@@ -1,19 +1,5 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
-## UTF-32 in the native byte order, i.e. plain old character arrays ##
-
-immutable UTF32String <: DirectIndexString
-    data::Vector{Char} # includes 32-bit NULL termination after string chars
-
-    function UTF32String(a::Vector{Char})
-        if length(a) < 1 || a[end] != Char(0)
-            throw(ArgumentError("UTF32String data must be NULL-terminated"))
-        end
-        new(a)
-    end
-end
-UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data))
-
 next(s::UTF32String, i::Int) = (s.data[i], i+1)
 endof(s::UTF32String) = length(s.data) - 1
 length(s::UTF32String) = length(s.data) - 1
@@ -65,7 +51,7 @@ unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =
 
 function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
     isempty(bytes) && return UTF32String(Char[0])
-    length(bytes) & 3 != 0 && throw(ArgumentError("need multiple of 4 bytes"))
+    # Pass the offending byte count so the <<1>> placeholder in UTF_ERR_ODD_BYTES_32 is filled meaningfully
+    length(bytes) & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32, length(bytes), 0))
     data = reinterpret(Char, bytes)
     # check for byte-order mark (BOM):
     if data[1] == Char(0x0000feff) # native byte order
@@ -91,8 +77,6 @@ function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
     return true
 end
 isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
-isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
-isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data)
 
 utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
 utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len)
@@ -110,7 +94,7 @@ function map(f, s::UTF32String)
     for i = 1:(length(d)-1)
         c2 = f(d[i])
         if !isa(c2, Char)
-            throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
+            throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0))
         end
         out[i] = (c2::Char)
     end

diff --git a/base/utf8.jl b/base/utf8.jl
@@ -72,7 +72,7 @@ function next(s::UTF8String, i::Int)
         end
         if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
             # b is a continuation byte of a valid UTF-8 character
-            throw(ArgumentError("invalid UTF-8 character index"))
+            throw(UnicodeError(UTF_ERR_CONT, i, d[j]))
         end
         # move past 1 byte in case the data is actually Latin-1
         return '\ufffd', i+1
@@ -198,7 +198,7 @@ function reverse(s::UTF8String)
     out = similar(s.data)
     if ccall(:u8_reverse, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t),
              out, s.data, length(out)) == 1
-        throw(ArgumentError("invalid UTF-8 data"))
+        throw(UnicodeError(UTF_ERR_INVALID_8,0,0))
     end
     UTF8String(out)
 end
@@ -212,7 +212,7 @@ write(io::IO, s::UTF8String) = write(io, s.data)
 utf8(x) = convert(UTF8String, x)
 convert(::Type{UTF8String}, s::UTF8String) = s
 convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
-convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(ArgumentError("invalid UTF-8 sequence"))
+# Wrap a byte vector as a UTF8String, throwing UnicodeError when the bytes are not valid UTF-8.
+# UnicodeError only has the three-field constructor, so position and character are given as 0.
+convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(UnicodeError(UTF_ERR_INVALID_8, 0, 0))
 function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)
     l = length(a)
     idx = 1

diff --git a/base/utferror.jl b/base/utferror.jl
@@ -0,0 +1,30 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+##\brief      Error messages for Unicode / UTF support
+#             <<1>> and <<2>> are placeholders that show() fills with errpos and hex(errchr) of a UnicodeError
+
+const UTF_ERR_SHORT             = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)"
+const UTF_ERR_CONT              = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
+const UTF_ERR_LONG              = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
+const UTF_ERR_NOT_LEAD          = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
+const UTF_ERR_NOT_TRAIL         = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
+const UTF_ERR_NOT_SURROGATE     = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>)"
+const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
+const UTF_ERR_INVALID           = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
+const UTF_ERR_SURROGATE         = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
+const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
+const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
+const UTF_ERR_ODD_BYTES_16      = "UTF16String can't have odd number of bytes <<1>>"
+const UTF_ERR_ODD_BYTES_32      = "UTF32String must have multiple of 4 bytes <<1>>"
+const UTF_ERR_INVALID_CHAR      = "invalid Unicode character (0x<<2>> > 0x10ffff)"
+const UTF_ERR_INVALID_8         = "invalid UTF-8 data"
+const UTF_ERR_INVALID_16        = "invalid UTF-16 data"
+const UTF_ERR_INVALID_INDEX     = "invalid character index"
+const UTF_ERR_MAP_CHAR          = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"
+
+##\brief      Exception carrying a UTF_ERR_ message together with the position and value it refers to
+#
+# show() substitutes errpos for <<1>> and hex(errchr) for <<2>> in errmsg.
+
+type UnicodeError <: Exception
+    errmsg::AbstractString      ##< A UTF_ERR_ message (may contain <<1>> / <<2>> placeholders)
+    errpos::Int32               ##< Position of invalid character (callers pass 0 when not applicable)
+    errchr::UInt32              ##< Invalid character (callers pass 0 when not applicable)
+end
+
+# Print the error message with its placeholders filled in:
+# <<1>> becomes the decimal position, <<2>> the offending value in hexadecimal.
+function show(io::IO, exc::UnicodeError)
+    withpos = replace(exc.errmsg, "<<1>>", string(exc.errpos))
+    print(io, replace(withpos, "<<2>>", hex(exc.errchr)))
+end
diff --git a/base/utftypes.jl b/base/utftypes.jl
@@ -0,0 +1,34 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+##\brief      UTF-16 string in native byte order; the data vector ends with a 16-bit NULL word
+#
+# \throws     UnicodeError when the data vector is empty or does not end in NULL
+
+immutable UTF16String <: AbstractString
+    data::Vector{UInt16} # includes 16-bit NULL termination after string chars
+    function UTF16String(data::Vector{UInt16})
+        # Guard: the vector must be non-empty and its last code unit must be the terminator
+        (isempty(data) || data[end] != 0) &&
+            throw(UnicodeError(UTF_ERR_NULL_16_TERMINATE, 0, 0))
+        new(data)
+    end
+end
+
+##\brief      UTF-32 string in native byte order; the data vector ends with a 32-bit NULL word
+#
+# \throws     UnicodeError when the data vector is empty or does not end in NULL
+
+immutable UTF32String <: DirectIndexString
+    data::Vector{Char} # includes 32-bit NULL termination after string chars
+
+    function UTF32String(data::Vector{Char})
+        # Guard: the vector must be non-empty and its last element must be the terminator
+        (isempty(data) || data[end] != Char(0)) &&
+            throw(UnicodeError(UTF_ERR_NULL_32_TERMINATE, 0, 0))
+        new(data)
+    end
+end
+
+# Accept raw 32-bit words by reinterpreting them as Char and delegating to the Vector{Char} constructor
+function UTF32String(data::Vector{UInt32})
+    UTF32String(reinterpret(Char, data))
+end
+
+# Validate an already-constructed string by forwarding its `data` field to the
+# isvalid(T, data) method for the string's own type. The one-argument and the
+# (Type, string) forms perform the same forwarding.
+isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
+isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data)
diff --git a/test/unicode.jl b/test/unicode.jl
@@ -10,7 +10,8 @@ u16 = utf16(u8)
 @test collect(u8) == collect(u16)
 @test u8 == utf16(u16.data[1:end-1]) == utf16(copy!(Array(UInt8, 18), 1, reinterpret(UInt8, u16.data), 1, 18))
 @test u8 == utf16(pointer(u16)) == utf16(convert(Ptr{Int16}, pointer(u16)))
-@test_throws ArgumentError utf16(utf32(Char(0x120000)))
+@test_throws UnicodeError utf16(utf32(Char(0x120000)))
+@test_throws UnicodeError utf16(UInt8[1,2,3])
 
 # UTF32
 u32 = utf32(u8)
@@ -21,6 +22,7 @@ u32 = utf32(u8)
 @test collect(u8) == collect(u32)
 @test u8 == utf32(u32.data[1:end-1]) == utf32(copy!(Array(UInt8, 20), 1, reinterpret(UInt8, u32.data), 1, 20))
 @test u8 == utf32(pointer(u32)) == utf32(convert(Ptr{Int32}, pointer(u32)))
+@test_throws UnicodeError utf32(UInt8[1,2,3])
 
 # Wstring
 w = wstring(u8)