From ca87916acc14fbc03bc16fae7baf622d156e4d6b Mon Sep 17 00:00:00 2001
From: ScottPJones <scottjones@alum.mit.edu>
Date: Wed, 1 Jul 2015 01:05:17 -0400
Subject: [PATCH] Fix #10959 problems with UTF-8 conversions

Update for search change

Updated to use unsafe_checkstring, fix comments
---
 base/unicode/utf8.jl | 106 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 85 insertions(+), 21 deletions(-)

diff --git a/base/unicode/utf8.jl b/base/unicode/utf8.jl
index f38c2149179d5e..db246148c517f7 100644
--- a/base/unicode/utf8.jl
+++ b/base/unicode/utf8.jl
@@ -3,7 +3,7 @@
 ## from base/boot.jl:
 #
 # immutable UTF8String <: AbstractString
-#     data::Array{UInt8,1}
+#     data::Vector{UInt8}
 # end
 #
 
@@ -26,27 +26,23 @@ const utf8_trailing = [
     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
 ]
 
-is_utf8_start(byte::UInt8) = ((byte&0xc0)!=0x80)
-
 ## required core functionality ##
 
 function endof(s::UTF8String)
     d = s.data
     i = length(d)
     i == 0 && return i
-    while !is_utf8_start(d[i])
+    while is_valid_continuation(d[i])
         i -= 1
     end
     i
 end
 
-is_utf8_continuation(byte::UInt8) = ((byte&0xc0) == 0x80)
-
 function length(s::UTF8String)
     d = s.data
     cnum = 0
     for i = 1:length(d)
-        @inbounds cnum += !is_utf8_continuation(d[i])
+        @inbounds cnum += !is_valid_continuation(d[i])
     end
     cnum
 end
@@ -65,8 +61,17 @@ function next(s::UTF8String, i::Int)
 
     d = s.data
     b = d[i]
-    if !is_utf8_start(b)
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
+    if is_valid_continuation(b)
+        j = i-1
+        while 0 < j && is_valid_continuation(d[j])
+            j -= 1
+        end
+        if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
+            # b is a continuation byte of a valid UTF-8 character
+            throw(UnicodeError(UTF_ERR_CONT, i, d[j]))
+        end
+        # move past 1 byte in case the data is actually Latin-1
+        return '\ufffd', i+1
     end
     trailing = utf8_trailing[b+1]
     if length(d) < i + trailing
@@ -93,7 +98,7 @@ end
 function reverseind(s::UTF8String, i::Integer)
     j = lastidx(s) + 1 - i
     d = s.data
-    while !is_utf8_start(d[j])
+    while is_valid_continuation(d[j])
         j -= 1
     end
     return j
@@ -106,7 +111,7 @@ sizeof(s::UTF8String) = sizeof(s.data)
 lastidx(s::UTF8String) = length(s.data)
 
 isvalid(s::UTF8String, i::Integer) =
-    (1 <= i <= endof(s.data)) && is_utf8_start(s.data[i])
+    (1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])
 
 const empty_utf8 = UTF8String(UInt8[])
 
@@ -114,11 +119,8 @@ function getindex(s::UTF8String, r::UnitRange{Int})
     isempty(r) && return empty_utf8
     i, j = first(r), last(r)
     d = s.data
-    if i < 1 || i > length(s.data)
-        throw(BoundsError(s, i))
-    end
-    if !is_utf8_start(d[i])
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
+    if is_valid_continuation(d[i])
+        i = nextind(s,i)
     end
     if j > length(d)
         throw(BoundsError())
@@ -133,7 +135,7 @@ function search(s::UTF8String, c::Char, i::Integer)
         throw(BoundsError(s, i))
     end
     d = s.data
-    if !is_utf8_start(d[i])
+    if is_valid_continuation(d[i])
         throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
     end
     c < Char(0x80) && return search(d, c%UInt8, i)
@@ -214,20 +216,82 @@ write(io::IO, s::UTF8String) = write(io, s.data)
 utf8(x) = convert(UTF8String, x)
 convert(::Type{UTF8String}, s::UTF8String) = s
 convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
-convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(UnicodeError(UTF_ERR_INVALID_8))
-function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)
+
+"
+Converts a UTF-8 encoded vector of `UInt8` to a `UTF8String`
+
+### Returns:
+*   `UTF8String`
+
+### Throws:
+*   `UnicodeError`
+"
+function convert(::Type{UTF8String}, dat::Vector{UInt8})
+    # an empty vector maps to the shared empty string constant
+    isempty(dat) && return empty_utf8
+    # scan the data: flags report over-long/surrogate forms, counts size the output
+    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat)
+    if (flags & (UTF_LONG | UTF_SURROGATE)) == 0 # nothing to re-encode, plain copy
+        len = sizeof(dat)
+        @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
+    end
+    # Copy, but eliminate over-long encodings and surrogate pairs
+    len += num2byte + num3byte*2 + num4byte*3
+    buf = Vector{UInt8}(len)
+    out = 0 # index of the last byte written to buf
+    pos = 0 # index of the last byte read from dat
+    @inbounds while out < len
+        ch::UInt32 = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle over-long 2-byte form (lead 0xc0/0xc1) of a value < 0x80
+        elseif ch < 0xc2
+            buf[out += 1] = ((ch & 3) << 6) | (dat[pos += 1] & 0x3f)
+        # Handle 0x80-0x7ff
+        elseif ch < 0xe0
+            buf[out += 1] = ch
+            buf[out += 1] = dat[pos += 1]
+        elseif ch != 0xed # 3- or 4-byte sequence that cannot be a surrogate
+            buf[out += 1] = ch
+            buf[out += 1] = dat[pos += 1]
+            buf[out += 1] = dat[pos += 1]
+            # Copy 4-byte encoded value
+            ch >= 0xf0 && (buf[out += 1] = dat[pos += 1])
+        # Handle lead byte 0xed, which may start a surrogate
+        else
+            ch = dat[pos += 1]
+            if ch < 0xa0 # ordinary 3-byte character below 0xd800
+                buf[out += 1] = 0xed
+                buf[out += 1] = ch
+                buf[out += 1] = dat[pos += 1]
+            else
+                # CESU-8 surrogate pair: merge both halves, then rebase to the code point
+                ch = (((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10)
+                        + ((UInt32(dat[pos + 3] & 0x3f) << 6) | (dat[pos + 4] & 0x3f))
+                        - 0x1f0c00) # (0x800 << 10) + 0xc00 - 0x10000
+                pos += 4 # skip the remainder of both 3-byte sequences
+                output_utf8_4byte!(buf, out, ch)
+                out += 4
+            end
+        end
+    end
+    UTF8String(buf)
+end
+
+function convert(::Type{UTF8String}, a::Vector{UInt8}, invalids_as::AbstractString)
     l = length(a)
     idx = 1
     iscopy = false
     while idx <= l
-        if is_utf8_start(a[idx])
+        if !is_valid_continuation(a[idx])
             nextidx = idx+1+utf8_trailing[a[idx]+1]
             (nextidx <= (l+1)) && (idx = nextidx; continue)
         end
         !iscopy && (a = copy(a); iscopy = true)
         endn = idx
         while endn <= l
-            is_utf8_start(a[endn]) && break
+            !is_valid_continuation(a[endn]) && break
             endn += 1
         end
         (endn > idx) && (endn -= 1)