From 93376fbec82c27ce60555266694a1e1a3f062cb1 Mon Sep 17 00:00:00 2001
From: ScottPJones <scottjones@alum.mit.edu>
Date: Wed, 1 Jul 2015 01:05:17 -0400
Subject: [PATCH] Fix #10959 problems with UTF-8 conversions

---
 base/utf8.jl       | 108 +++++++++--
 base/utfconvert.jl | 444 +++++++++++++++++++++++++++++++++++++++++++++
 test/strings.jl    |  68 ++++---
 3 files changed, 563 insertions(+), 57 deletions(-)
 create mode 100644 base/utfconvert.jl

diff --git a/base/utf8.jl b/base/utf8.jl
index e94a988777521d..587f8e008bfdec 100644
--- a/base/utf8.jl
+++ b/base/utf8.jl
@@ -3,7 +3,7 @@
 ## from base/boot.jl:
 #
 # immutable UTF8String <: AbstractString
-#     data::Array{UInt8,1}
+#     data::Vector{UInt8}
 # end
 #
 
@@ -26,27 +26,23 @@ const utf8_trailing = [
     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
 ]
 
-is_utf8_start(byte::UInt8) = ((byte&0xc0)!=0x80)
-
 ## required core functionality ##
 
 function endof(s::UTF8String)
     d = s.data
     i = length(d)
     i == 0 && return i
-    while !is_utf8_start(d[i])
+    while is_valid_continuation(d[i])
         i -= 1
     end
     i
 end
 
-is_utf8_continuation(byte::UInt8) = ((byte&0xc0) == 0x80)
-
 function length(s::UTF8String)
     d = s.data
     cnum = 0
     for i = 1:length(d)
-        @inbounds cnum += !is_utf8_continuation(d[i])
+        @inbounds cnum += !is_valid_continuation(d[i])
     end
     cnum
 end
@@ -65,8 +61,17 @@ function next(s::UTF8String, i::Int)
 
     d = s.data
     b = d[i]
-    if !is_utf8_start(b)
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
+    if is_valid_continuation(b)
+        j = i-1
+        while 0 < j && is_valid_continuation(d[j])
+            j -= 1
+        end
+        if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
+            # b is a continuation byte of a valid UTF-8 character
+            throw(UnicodeError(UTF_ERR_CONT, i, d[j]))
+        end
+        # move past 1 byte in case the data is actually Latin-1
+        return '\ufffd', i+1
     end
     trailing = utf8_trailing[b+1]
     if length(d) < i + trailing
@@ -93,7 +98,7 @@ end
 function reverseind(s::UTF8String, i::Integer)
     j = lastidx(s) + 1 - i
     d = s.data
-    while !is_utf8_start(d[j])
+    while is_valid_continuation(d[j])
         j -= 1
     end
     return j
@@ -106,7 +111,7 @@ sizeof(s::UTF8String) = sizeof(s.data)
 lastidx(s::UTF8String) = length(s.data)
 
 isvalid(s::UTF8String, i::Integer) =
-    (1 <= i <= endof(s.data)) && is_utf8_start(s.data[i])
+    (1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])
 
 const empty_utf8 = UTF8String(UInt8[])
 
@@ -114,11 +119,8 @@ function getindex(s::UTF8String, r::UnitRange{Int})
     isempty(r) && return empty_utf8
     i, j = first(r), last(r)
     d = s.data
-    if i < 1 || i > length(s.data)
-        throw(BoundsError(s, i))
-    end
-    if !is_utf8_start(d[i])
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
+    if is_valid_continuation(d[i])
+        i = nextind(s,i)
     end
     if j > length(d)
         throw(BoundsError())
@@ -214,20 +216,86 @@ write(io::IO, s::UTF8String) = write(io, s.data)
 utf8(x) = convert(UTF8String, x)
 convert(::Type{UTF8String}, s::UTF8String) = s
 convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
-convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(UnicodeError(UTF_ERR_INVALID_8))
-function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)
+
+"
+Converts a UTF-8 encoded vector of `UInt8` to a `UTF8String`, normalizing overlong 2-byte and CESU-8 surrogate-pair sequences
+
+### Input Arguments:
+*   `::Type{UTF8String}`
+*   `dat::Vector{UInt8}`
+
+### Returns:
+*   `::UTF8String` (built on a new buffer, or the shared `empty_utf8` for empty input)
+
+### Throws:
+*   `UnicodeError` if `dat` is not valid UTF-8 (presumably raised by `check_string`)
+"
+function convert(::Type{UTF8String}, dat::Vector{UInt8})
+    # handle zero length string quickly
+    isempty(dat) && return empty_utf8
+    # get number of bytes to allocate
+    len, flags, num4byte, num3byte, num2byte = check_string(dat)
+    if (flags & (UTF_LONG | UTF_SURROGATE)) == 0
+        len = sizeof(dat)
+        @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
+    end
+    # Copy, but eliminate over-long encodings and surrogate pairs
+    len += num2byte + num3byte*2 + num4byte*3
+    buf = Vector{UInt8}(len)
+    out = 0
+    pos = 0
+    @inbounds while out < len
+        ch::UInt32 = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle overlong < 0x80 (lead byte 0xc0 or 0xc1), emitted as a single byte
+        elseif ch < 0xc2
+            buf[out += 1] = ((ch & 3) << 6) | (dat[pos += 1] & 0x3f)
+        # Handle 0x80-0x7ff
+        elseif ch < 0xe0
+            buf[out += 1] = ch
+            buf[out += 1] = dat[pos += 1]
+        elseif ch != 0xed
+            buf[out += 1] = ch
+            buf[out += 1] = dat[pos += 1]
+            buf[out += 1] = dat[pos += 1]
+            # Copy 4-byte encoded value
+            ch >= 0xf0 && (buf[out += 1] = dat[pos += 1])
+        # Handle surrogate pairs
+        else
+            ch = dat[pos += 1]
+            if ch < 0xa0 # not surrogate pairs
+                buf[out += 1] = 0xed
+                buf[out += 1] = ch
+                buf[out += 1] = dat[pos += 1]
+            else
+                # Pick up surrogate pairs (CESU-8 format); widen to UInt32 before shifting
+                ch = (((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10)
+                        + ((UInt32(dat[pos + 3] & 0x3f) << 6) | (dat[pos + 4] & 0x3f))
+                        - 0x01f0c00) # drop 0x200000 + 0xc00 marker bits, add 0x10000
+                pos += 4
+                output_utf8_4byte!(buf, out, ch) # NOTE(review): not defined in this patch - confirm
+                out += 4
+            end
+        end
+    end
+    UTF8String(buf)
+end
+
+function convert(::Type{UTF8String}, a::Vector{UInt8}, invalids_as::AbstractString)
     l = length(a)
     idx = 1
     iscopy = false
     while idx <= l
-        if is_utf8_start(a[idx])
+        if !is_valid_continuation(a[idx])
             nextidx = idx+1+utf8_trailing[a[idx]+1]
             (nextidx <= (l+1)) && (idx = nextidx; continue)
         end
         !iscopy && (a = copy(a); iscopy = true)
         endn = idx
         while endn <= l
-            is_utf8_start(a[endn]) && break
+            !is_valid_continuation(a[endn]) && break
             endn += 1
         end
         (endn > idx) && (endn -= 1)
diff --git a/base/utfconvert.jl b/base/utfconvert.jl
new file mode 100644
index 00000000000000..cd5b12cb8b068b
--- /dev/null
+++ b/base/utfconvert.jl
@@ -0,0 +1,444 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+# Functions to convert to different UTF encodings
+
+# Copy dat into a new len-element buffer, then store 0 (the trailing \0) at index len
+@inline function fast_utf_copy(T::Type{UInt16}, len, dat) # UInt16 buffer -> UTF16String
+    @inbounds return UTF16String(setindex!(copy!(Vector{T}(len), dat), 0, len))
+end
+@inline function fast_utf_copy(T::Type{Char}, len, dat) # Char buffer -> UTF32String
+    @inbounds return UTF32String(setindex!(copy!(Vector{T}(len), dat), 0, len))
+end
+
+# Decode a 3-byte UTF-8 sequence: ch is the lead byte, dat[pos-1] and dat[pos] the continuations
+@inline function get_utf8_3(dat, pos, ch)
+    @inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f)
+end
+
+# Decode a 4-byte UTF-8 sequence: ch is the lead byte, dat[pos-2], dat[pos-1], dat[pos] the continuations
+@inline function get_utf8_4(dat, pos, ch)
+    @inbounds return (((ch & 0x7) << 18)
+                        | (UInt32(dat[pos-2] & 0x3f) << 12)
+                        | (UInt32(dat[pos-1] & 0x3f) << 6)
+                        | (dat[pos] & 0x3f))
+end
+
+# Write ch as a 4-byte UTF-8 sequence into buf[out+1] .. buf[out+4]; the caller advances out
+@inline function output_utf8_4(buf, out, ch)
+    @inbounds begin
+        buf[out + 1] = 0xf0 | (ch >>> 18)
+        buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f)
+        buf[out + 3] = 0x80 | ((ch >>> 6) & 0x3f)
+        buf[out + 4] = 0x80 | (ch & 0x3f)
+    end
+end
+
+#=
+"""
+@brief      Converts an AbstractString to a NUL-terminated UTF16String (surrogate pairs above 0xffff)
+
+@param[in]  ::Type{UTF16String}
+@param[in]  str::AbstractString
+
+@return     ::UTF16String
+@throws     ArgumentError (NOTE(review): presumably raised by check_string_abs - confirm the type)
+"""
+=#
+function convert(::Type{UTF16String}, str::AbstractString)
+    len, flags, num4byte = check_string_abs(str)
+    buf = Vector{UInt16}(len+num4byte+1) # one extra unit per surrogate pair, plus the trailing \0
+    out = 0
+    @inbounds for ch in str
+        c = UInt32(ch)
+        if c < 0x10000
+            buf[out += 1] = UInt16(c)
+        else
+            # output surrogate pair, computed from the UInt32 value c (not the Char ch)
+            buf[out += 1] = UInt16(0xd7c0 + (c >>> 10)) # 0xd7c0 == 0xd800 - (0x10000 >>> 10)
+            buf[out += 1] = UInt16(0xdc00 + (c & 0x3ff))
+        end
+    end
+    @inbounds buf[out + 1] = 0 # NULL termination
+    UTF16String(buf)
+end
+
+#=
+"""
+@brief      Converts an AbstractString to a NUL-terminated UTF32String, one Char per element
+
+@param[in]  ::Type{UTF32String}
+@param[in]  str::AbstractString
+
+@return     ::UTF32String
+@throws     ArgumentError (NOTE(review): presumably raised by check_string_abs - confirm the type)
+"""
+=#
+function convert(::Type{UTF32String}, str::AbstractString)
+    len, flags = check_string_abs(str) # only len is used below
+    buf = Vector{Char}(len+1) # +1 for the trailing \0
+    out = 0
+    @inbounds for ch in str ; buf[out += 1] = ch ; end
+    @inbounds buf[out + 1] = 0 # NULL termination
+    UTF32String(buf)
+end
+
+#=
+@doc """
+@brief      Converts a UTF8String to a NUL-terminated UTF16String
+
+@param[in]  ::Type{UTF16String}
+@param[in]  str::UTF8String
+
+@return     ::UTF16String (the shared empty_utf16 when str has no bytes)
+@throws     UnicodeError on invalid UTF-8, per test/strings.jl (presumably raised by check_string_utf8)
+""" ->
+=#
+function convert(::Type{UTF16String}, str::UTF8String)
+    dat = str.data
+    # handle zero length string quickly
+    sizeof(dat) == 0 && return empty_utf16
+    # Check that is correct UTF-8 encoding and get number of words needed
+    len, flags, num4byte = check_string_utf8(dat)
+    len += num4byte # each 4-byte sequence becomes two UTF-16 code units
+    buf = Vector{UInt16}(len+1)
+    @inbounds buf[len+1] = 0
+    # Optimize case where no characters > 0x7f
+    flags == 0 && @inbounds return UTF16String(copy!(buf, dat))
+    out = 0
+    pos = 0
+    @inbounds while out < len
+        ch::UInt32 = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle range 0x80-0x7ff
+        elseif ch < 0xe0
+            buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
+        # Handle range 0x800-0xffff
+        elseif ch < 0xf0
+            pos += 2 # get_utf8_3 reads dat[pos-1] and dat[pos]
+            buf[out += 1] = get_utf8_3(dat, pos, ch)
+        # Handle range 0x10000-0x10ffff
+        else
+            pos += 3 # get_utf8_4 reads dat[pos-2] through dat[pos]
+            ch = get_utf8_4(dat, pos, ch)
+            # output surrogate pair
+            buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10)) # 0xd7c0 == 0xd800 - (0x10000 >>> 10)
+            buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff))
+        end
+    end
+    UTF16String(buf)
+end
+
+#=
+@doc """
+@brief      Converts a UTF-16 encoded vector of UInt16 (no trailing \0 expected) to a UTF8String
+
+@param[in]  ::Type{UTF8String}
+@param[in]  dat::Vector{UInt16}
+
+@return     ::UTF8String (the shared empty_utf8 when dat is empty)
+@throws     ArgumentError (NOTE(review): presumably raised by check_string_utf16 - confirm the type)
+""" ->
+=#
+function convert(::Type{UTF8String}, dat::Vector{UInt16})
+    len = sizeof(dat)
+    # handle zero length string quickly, reusing the shared empty string
+    len == 0 && return empty_utf8
+    # get number of bytes to allocate
+    len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len>>>1)
+    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat))
+    return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
+end
+
+#=
+@doc """
+@brief      Converts a UTF16String to a UTF8String (the trailing \0 code unit is not converted)
+
+@param[in]  ::Type{UTF8String}
+@param[in]  str::UTF16String
+
+@return     ::UTF8String (the shared empty_utf8 when str holds at most the terminator)
+@throws     ArgumentError (NOTE(review): presumably raised by check_string_utf16 - confirm the type)
+""" ->
+=#
+function convert(::Type{UTF8String}, str::UTF16String)
+    dat = str.data
+    len = sizeof(dat) >>> 1
+    # handle zero length string quickly, reusing the shared empty string
+    len <= 1 && return empty_utf8
+    # get number of bytes to allocate
+    len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len-1)
+    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
+    return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
+end
+
+#=
+@doc """
+@brief      Encodes a UTF-32 encoded vector of UInt32 (no trailing \0 expected) to a UTF8String
+
+@param[in]  ::Type{UTF8String}
+@param[in]  dat::Vector{UInt32}
+
+@return     ::UTF8String (the shared empty_utf8 when dat is empty)
+@throws     ArgumentError (NOTE(review): presumably raised by check_string_utf32 - confirm the type)
+""" ->
+=#
+function convert(::Type{UTF8String}, dat::Vector{UInt32})
+    len = sizeof(dat)
+    # handle zero length string quickly, reusing the shared empty string
+    len == 0 && return empty_utf8
+    # get number of bytes to allocate
+    len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len>>>2)
+    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
+    return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
+end
+
+#=
+@doc """
+@brief      Converts a UTF32String to a UTF8String (the trailing \0 element is not converted)
+
+@param[in]  ::Type{UTF8String}
+@param[in]  str::UTF32String
+
+@return     ::UTF8String (the shared empty_utf8 when str holds at most the terminator)
+@throws     ArgumentError (NOTE(review): presumably raised by check_string_utf32 - confirm the type)
+""" ->
+=#
+function convert(::Type{UTF8String},  str::UTF32String)
+    dat = reinterpret(UInt32, str.data)
+    len = sizeof(dat) >>> 2
+    # handle zero length string quickly, reusing the shared empty string
+    len <= 1 && return empty_utf8
+    # get number of bytes to allocate
+    len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len-1)
+    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
+    return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
+end
+
+#=
+@doc """
+@brief      Converts an already validated vector of UInt16 or UInt32 to a UTF8String
+
+@param[in]  T           type (UInt16 or UInt32)
+@param[in]  dat         Vector{T}; read with @inbounds, so it must supply enough units for len bytes
+@param[in]  len         length of output in bytes; encoding stops once len bytes are written
+
+@return     ::UTF8String
+""" ->
+=#
+function encode_to_utf8{T<:Union(UInt16, UInt32)}(::Type{T}, dat, len)
+    buf = Vector{UInt8}(len)
+    out = 0
+    pos = 0
+    @inbounds while out < len
+        ch::UInt32 = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle 0x80-0x7ff
+        elseif ch < 0x800
+            buf[out += 1] = 0xc0 | (ch >>> 6)
+            buf[out += 1] = 0x80 | (ch & 0x3f)
+        # Handle 0x10000-0x10ffff (if input is UInt32)
+        elseif T == UInt32 && ch > 0xffff
+            output_utf8_4(buf, out, ch)
+            out += 4
+        # Handle surrogate pairs
+        elseif is_surrogate_codeunit(ch)
+            output_utf8_4(buf, out, get_supplementary(ch, dat[pos += 1])) # consumes the next unit too
+            out += 4
+        # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
+        else
+            buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f) # ch <= 0xffff here, so ch >>> 12 <= 0xf
+            buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f)
+            buf[out += 1] = 0x80 | (ch & 0x3f)
+        end
+    end
+    UTF8String(buf)
+end
+
+#=
+"""
+@brief      Converts a UTF8String to a NUL-terminated UTF32String
+
+@param[in]  ::Type{UTF32String}
+@param[in]  str::UTF8String
+
+@return     ::UTF32String (the shared empty_utf32 when str has no bytes)
+@throws     UnicodeError on invalid UTF-8, per test/strings.jl (presumably raised by check_string_utf8)
+"""
+=#
+function convert(::Type{UTF32String}, str::UTF8String)
+    dat = str.data
+    # handle zero length string quickly
+    sizeof(dat) == 0 && return empty_utf32
+    # Validate UTF-8 encoding, and get number of words to create
+    len, flags = check_string_utf8(dat)
+    # Optimize case where no characters > 0x7f
+    totlen = len+1
+    flags == 0 && return fast_utf_copy(Char, totlen, dat)
+    # has multi-byte UTF-8 sequences
+    buf = Vector{Char}(totlen)
+    @inbounds buf[totlen] = 0 # NULL termination
+    local ch::UInt32, surr::UInt32
+    out = 0
+    pos = 0
+    @inbounds while out < len
+        ch = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle range 0x80-0x7ff
+        elseif ch < 0xe0
+            buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
+        # Handle range 0x800-0xffff
+        elseif ch < 0xf0
+            pos += 2 # get_utf8_3 reads dat[pos-1] and dat[pos]
+            ch = get_utf8_3(dat, pos, ch)
+            # Handle surrogate pairs (should have been encoded in 4 bytes)
+            if is_surrogate_lead(ch)
+                # Build up 32-bit character from ch and trailing surrogate in next 3 bytes
+                pos += 3
+                surr = ((UInt32(dat[pos-2] & 0xf) << 12) # dat[pos-2] is the second sequence's lead byte
+                        | (UInt32(dat[pos-1] & 0x3f) << 6)
+                        | (dat[pos] & 0x3f))
+                ch = get_supplementary(ch, surr)
+            end
+            buf[out += 1] = ch
+        # Handle range 0x10000-0x10ffff
+        else
+            pos += 3 # get_utf8_4 reads dat[pos-2] through dat[pos]
+            buf[out += 1] = get_utf8_4(dat, pos, ch)
+        end
+    end
+    UTF32String(buf)
+end
+
+#=
+"""
+@brief      Converts a UTF16String to UTF32String, merging each surrogate pair into one Char
+
+@param[in]  ::Type{UTF32String}
+@param[in]  str::UTF16String
+
+@return     ::UTF32String (the shared empty_utf32 when str.data holds at most one code unit)
+@throws     ArgumentError (NOTE(review): presumably raised by check_string_utf16 - confirm the type)
+"""
+=#
+function convert(::Type{UTF32String}, str::UTF16String)
+    dat = str.data
+    len = sizeof(dat)
+    # handle zero length string quickly (account for trailing \0)
+    len <= 2 && return empty_utf32
+    # get number of words to create
+    len, flags, num4byte = check_string_utf16(dat, len>>>1) # every unit of dat is passed, \0 included
+    # No surrogate pairs, do optimized copy
+    (flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat))
+    local ch::UInt32
+    buf = Vector{Char}(len)
+    out = 0
+    pos = 0
+    @inbounds while out < len
+        ch = dat[pos += 1]
+        # check for surrogate pair
+        if is_surrogate_lead(ch) ; ch = get_supplementary(ch, dat[pos += 1]) ; end
+        buf[out += 1] = ch
+    end
+    UTF32String(buf)
+end
+
+#=
+"""
+@brief      Converts a UTF-32 encoded vector of UInt32 (no trailing \0 expected) to a UTF16String
+
+@param[in]  ::Type{UTF16String}
+@param[in]  dat::Vector{UInt32}
+
+@return     ::UTF16String (NUL-terminated; the shared empty_utf16 when dat is empty)
+@throws     ArgumentError (NOTE(review): presumably raised by check_string_utf32 - confirm the type)
+"""
+=#
+function convert(::Type{UTF16String}, dat::Vector{UInt32})
+    len = sizeof(dat)
+    # handle zero length string quickly (a raw vector has no trailing \0 to discount)
+    len == 0 && return empty_utf16
+    # get number of words to allocate
+    len, flags, num4byte = check_string_utf32(dat, len>>>2)
+    len += num4byte + 1
+    # optimized path, no surrogates
+    num4byte == 0 && return fast_utf_copy(UInt16, len, dat)
+    # encode_to_utf16 fills all len units from its input, so supply the terminator explicitly
+    return encode_to_utf16(vcat(dat, UInt32(0)), len)
+end
+
+#=
+"""
+@brief      Converts a UTF32String to UTF16String, emitting surrogate pairs above 0xffff
+
+@param[in]  ::Type{UTF16String}
+@param[in]  str::UTF32String
+
+@return     ::UTF16String (the shared empty_utf16 when str.data holds at most one element)
+@throws     ArgumentError (NOTE(review): presumably raised by check_string_utf32 - confirm the type)
+"""
+=#
+function convert(::Type{UTF16String}, str::UTF32String)
+    dat = reinterpret(UInt32, str.data)
+    len = sizeof(dat)
+    # handle zero length string quickly
+    len <= 4 && return empty_utf16
+    # get number of words to allocate
+    len, flags, num4byte = check_string_utf32(dat, len>>>2) # every element of dat is passed, \0 included
+    # optimized path, no surrogates
+    num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat))
+    return encode_to_utf16(dat, len + num4byte)
+end
+
+#=
+@doc """
+@brief      Converts an already validated UTF-32 encoded vector of UInt32 to a UTF16String
+
+@param[in]  dat::Vector{UInt32} UTF-32 encoded data; read with @inbounds until len units are written
+@param[in]  len                 length of output in 16-bit words (every slot is filled from dat)
+
+@return     ::UTF16String
+""" ->
+=#
+function encode_to_utf16(dat, len)
+    buf = Vector{UInt16}(len)
+    @inbounds buf[len] = 0 # NULL termination
+    out = 0
+    pos = 0
+    @inbounds while out < len
+        ch = UInt32(dat[pos += 1])
+        if ch > 0xffff
+            # Output surrogate pair for 0x10000-0x10ffff
+            buf[out += 1] = 0xd7c0 + (ch >>> 10) # 0xd7c0 == 0xd800 - (0x10000 >>> 10)
+            ch = 0xdc00 + (ch & 0x3ff) # trailing surrogate, stored by the shared write below
+        end
+        buf[out += 1] = ch
+    end
+    UTF16String(buf)
+end
+
+convert(::Type{UTF8String},  dat::Vector{Char})   = convert(UTF8String, reinterpret(UInt32, dat)) # reuse the Vector{UInt32} method
+
+function convert(::Type{UTF16String}, str::ASCIIString) # widen each byte to UInt16, no validation step
+    dat = str.data
+    fast_utf_copy(UInt16, length(dat)+1, dat) # +1 leaves room for the trailing \0
+end
+
+function convert(::Type{UTF32String}, str::ASCIIString) # widen each byte to Char, no validation step
+    dat = str.data
+    fast_utf_copy(Char, length(dat)+1, dat) # +1 leaves room for the trailing \0
+end
+
+convert(::Type{UTF16String}, str::UTF16String)    = str # identity, no copy
+convert(::Type{UTF16String}, dat::Vector{Char})   = convert(UTF16String, reinterpret(UInt32, dat)) # reuse the Vector{UInt32} method
+
+convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data # returns the string's own data vector, not a copy
+convert(::Type{Array{UInt16}},  str::UTF16String) = str.data # returns the string's own data vector, not a copy
+
+convert(::Type{UTF32String}, str::UTF32String)    = str # identity, no copy
+
+convert(::Type{UTF32String}, c::Char)             = UTF32String(Char[c, Char(0)]) # single character plus trailing \0
diff --git a/test/strings.jl b/test/strings.jl
index 200c5f780c6401..f7424c8a5b0607 100644
--- a/test/strings.jl
+++ b/test/strings.jl
@@ -1948,14 +1948,6 @@ foobaz(ch) = Char(0x200000)
 @test_throws UnicodeError map(foobar, utf16(str))
 @test_throws UnicodeError map(foobaz, utf16(str))
 
-# issue #11551 (#11004,#10959)
-function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String)
-    @test utf16(strUTF8) == strUTF16
-    @test utf8(strUTF16) == strUTF8
-end
-
-# Create some ASCII, UTF8 and UTF16
-
 # issue #11551 (#11004,#10959)
 function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String)
     @test utf16(strUTF8) == strUTF16
@@ -1967,7 +1959,6 @@ function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32Strin
 end
 
 # Create some ASCII, UTF8, UTF16, and UTF32 strings
-
 strAscii = "abcdefgh"
 strA_UTF8 = ("abcdefgh\uff")[1:8]
 strL_UTF8 = "abcdef\uff\uff"
@@ -1976,8 +1967,8 @@ str3_UTF8 = "abcd\uff\uff\u7fff\u7fff"
 str4_UTF8 = "abcd\uff\u7ff\u7fff\U7ffff"
 strS_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80")
 strC_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\U10000")
-strZ_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80")
 strz_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\0")
+strZ      = b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80"
 
 strA_UTF16 = utf16(strA_UTF8)
 strL_UTF16 = utf16(strL_UTF8)
@@ -2012,97 +2003,100 @@ tstcvt(str4_UTF8,str4_UTF16,str4_UTF32)
 @test utf16(strS_UTF32) == strC_UTF8
 
 # Test converting overlong \0
-# @test utf8(strZ_UTF8)  == strz_UTF8   # currently broken! (in utf8.jl)
-@test utf16(strZ_UTF8) == strz_UTF8
-@test utf32(strZ_UTF8) == strz_UTF8
+@test utf8(strZ)  == strz_UTF8
+@test utf16(UTF8String(strZ)) == strz_UTF8
+@test utf32(UTF8String(strZ)) == strz_UTF8
 
 # Test invalid sequences
 
+@inline strval(::Type{UTF8String}, dat) = dat # UTF8String target: hand convert the raw bytes
+@inline strval(::Union(Type{UTF16String},Type{UTF32String}), dat) = UTF8String(dat) # UTF-16/32 targets: wrap the bytes unvalidated first
+
 byt = 0x0
-for T in (UTF16String, UTF32String)
+for T in (UTF8String, UTF16String, UTF32String)
     try
     # Continuation byte not after lead
     for byt in 0x80:0xbf
-        @test_throws UnicodeError convert(T,  UTF8String(UInt8[byt]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt]))
     end
 
     # Test lead bytes
     for byt in 0xc0:0xff
         # Single lead byte at end of string
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt]))
         # Lead followed by non-continuation character < 0x80
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0]))
         # Lead followed by non-continuation character > 0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0xc0]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0xc0]))
     end
 
     # Test overlong 2-byte
     for byt in 0x81:0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc0,byt]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[0xc0,byt]))
     end
     for byt in 0x80:0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc1,byt]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[0xc1,byt]))
     end
 
     # Test overlong 3-byte
     for byt in 0x80:0x9f
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xe0,byt,0x80]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[0xe0,byt,0x80]))
     end
 
     # Test overlong 4-byte
     for byt in 0x80:0x8f
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xef,byt,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[0xef,byt,0x80,0x80]))
     end
 
     # Test 4-byte > 0x10ffff
     for byt in 0x90:0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xf4,byt,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[0xf4,byt,0x80,0x80]))
     end
     for byt in 0xf5:0xf7
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0x80]))
     end
 
     # Test 5-byte
     for byt in 0xf8:0xfb
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0x80,0x80]))
     end
 
     # Test 6-byte
     for byt in 0xfc:0xfd
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0x80,0x80,0x80]))
     end
 
     # Test 7-byte
-    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80]))
+    @test_throws UnicodeError convert(T, strval(T,UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80]))
 
     # Three and above byte sequences
     for byt in 0xe0:0xef
         # Lead followed by only 1 continuation byte
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80]))
         # Lead ended by non-continuation character < 0x80
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0]))
         # Lead ended by non-continuation character > 0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0xc0]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0xc0]))
     end
 
     # 3-byte encoded surrogate character(s)
     # Single surrogate
-    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80]))
+    @test_throws UnicodeError convert(T, strval(T,UInt8[0xed,0xa0,0x80]))
     # Not followed by surrogate
-    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80]))
+    @test_throws UnicodeError convert(T, strval(T,UInt8[0xed,0xa0,0x80,0xed,0x80,0x80]))
     # Trailing surrogate first
-    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80]))
+    @test_throws UnicodeError convert(T, strval(T,UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80]))
     # Followed by lead surrogate
-    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80]))
+    @test_throws UnicodeError convert(T, strval(T,UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80]))
 
     # Four byte sequences
     for byt in 0xf0:0xf4
         # Lead followed by only 2 continuation bytes
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80]))
         # Lead followed by non-continuation character < 0x80
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0]))
         # Lead followed by non-continuation character > 0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0xc0]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0xc0]))
     end
     catch exp ;
         println("Error checking $T: $byt")