From e596847b8648529e29a95fe38b4866eff08f41c2 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Fri, 8 Dec 2017 17:23:17 -0500
Subject: [PATCH 01/22] string overhaul: new Char representation, revamped core
 string API

---
 base/char.jl                    |  86 ++++-
 base/filesystem.jl              |  20 ++
 base/intfuncs.jl                |   4 +-
 base/io.jl                      |  66 ++--
 base/iostream.jl                |  32 +-
 base/parse.jl                   |   4 +-
 base/regex.jl                   |   8 +-
 base/repl/REPLCompletions.jl    |   2 +-
 base/stream.jl                  |   8 +
 base/strings/basic.jl           | 580 +++++++++++++++++---------------
 base/strings/io.jl              |  74 ++--
 base/strings/string.jl          | 433 ++++++++++--------------
 base/strings/substring.jl       | 144 ++------
 base/strings/unicode.jl         | 135 +++++---
 base/strings/util.jl            |  35 +-
 src/ast.c                       |  12 +-
 src/datatype.c                  |  18 +-
 src/jl_uv.c                     |  15 +-
 stdlib/Test/src/Test.jl         |   7 +-
 stdlib/Unicode/test/runtests.jl |   4 +-
 test/char.jl                    |  22 ++
 test/intfuncs.jl                |   2 +-
 test/lineedit.jl                |   4 +-
 test/strings/basic.jl           |  94 +++---
 test/strings/io.jl              |   3 +-
 test/strings/types.jl           | 119 ++++---
 test/unicode/utf8.jl            |  21 +-
 27 files changed, 1001 insertions(+), 951 deletions(-)

diff --git a/base/char.jl b/base/char.jl
index 9b99bb50b086f..6d21af949ebe8 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -1,8 +1,58 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
-convert(::Type{Char}, x::UInt32) = reinterpret(Char, x)
+struct MalformedCharError <: Exception
+    char::Char
+end
+struct CodePointError <: Exception
+    code::Integer
+end
+@noinline malformed_char(c::Char) = throw(MalformedCharError(c))
+@noinline code_point_err(u::UInt32) = throw(CodePointError(u))
+
+function ismalformed(c::Char) # true when the bits of `c` fail any of the checks below
+    u = reinterpret(UInt32, c)    # raw 32-bit pattern of the character
+    l1 = leading_ones(u) << 3     # number of leading 1 bits, times 8
+    t0 = trailing_zeros(u) & 56   # trailing zero bits, rounded down to a multiple of 8
+    (l1 == 8) | (l1 + t0 > 32) |  # exactly one leading 1 bit, or l1 and t0 overrun 32 bits
+    (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) # a low byte above the zero tail is not 10xxxxxx
+end
+
+function convert(::Type{UInt32}, c::Char)
+    # TODO: use optimized inline LLVM
+    u = reinterpret(UInt32, c)
+    u < 0x80000000 && return u >> 24  # top bit clear: the value is just the first byte
+    l1 = leading_ones(u)              # number of leading 1 bits
+    t0 = trailing_zeros(u) & 56       # trailing zero bits, rounded down to a multiple of 8
+    (l1 == 1) | (8l1 + t0 > 32) |
+    (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) &&
+        malformed_char(c)::Union{}    # throws MalformedCharError for ill-formed bit patterns
+    u &= 0xffffffff >> l1             # clear the leading 1 bits
+    u >>= t0                          # drop the trailing zero bytes
+    ((u & 0x0000007f) >> 0) | ((u & 0x00007f00) >> 2) |  # mask, then shift: grouping is
+    ((u & 0x007f0000) >> 4) | ((u & 0x7f000000) >> 6)    # explicit, not precedence-dependent
+end
+
+function convert(::Type{Char}, u::UInt32)
+    u < 0x80 && return reinterpret(Char, u << 24)  # one byte: value goes in the top 8 bits
+    u < 0x00200000 || code_point_err(u)::Union{}   # larger values throw CodePointError
+    c = ((u << 0) & 0x0000003f) | ((u << 2) & 0x00003f00) |  # spread the value into 6-bit
+        ((u << 4) & 0x003f0000) | ((u << 6) & 0x3f000000)    # groups, one group per byte
+    c = u < 0x00000800 ? (c << 16) | 0xc0800000 :  # 2 bytes: 110xxxxx 10xxxxxx
+        u < 0x00010000 ? (c << 08) | 0xe0808000 :  # 3 bytes: 1110xxxx, then 10xxxxxx twice
+                         (c << 00) | 0xf0808080    # 4 bytes: 11110xxx, then 10xxxxxx thrice
+    reinterpret(Char, c)
+end
+
+function convert(::Type{T}, c::Char) where T <: Union{Int8,UInt8}
+    i = reinterpret(Int32, c)
+    i ≥ 0 ? ((i >>> 24) % T) : T(UInt32(c))
+end
+
+function convert(::Type{Char}, b::Union{Int8,UInt8})
+    0 ≤ b ≤ 0x7f ? reinterpret(Char, (b % UInt32) << 24) : Char(UInt32(b))
+end
+
 convert(::Type{Char}, x::Number) = Char(UInt32(x))
-convert(::Type{UInt32}, x::Char) = reinterpret(UInt32, x)
 convert(::Type{T}, x::Char) where {T<:Number} = convert(T, UInt32(x))
 
 rem(x::Char, ::Type{T}) where {T<:Number} = rem(UInt32(x), T)
@@ -29,11 +79,9 @@ done(c::Char, state) = state
 isempty(c::Char) = false
 in(x::Char, y::Char) = x == y
 
-==(x::Char, y::Char) = UInt32(x) == UInt32(y)
-isless(x::Char, y::Char) = UInt32(x) < UInt32(y)
-
-const hashchar_seed = 0xd4d64234
-hash(x::Char, h::UInt) = hash_uint64(((UInt64(x)+hashchar_seed)<<32) ⊻ UInt64(h))
+==(x::Char, y::Char) = reinterpret(UInt32, x) == reinterpret(UInt32, y)
+isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y)
+hash(x::Char, h::UInt) = hash(reinterpret(UInt32, x), hash(Char, h))
 
 -(x::Char, y::Char) = Int(x) - Int(y)
 -(x::Char, y::Integer) = Char(Int32(x) - Int32(y))
@@ -66,7 +114,7 @@ function show(io::IO, c::Char)
     end
     if Unicode.isprint(c)
         write(io, 0x27, c, 0x27)
-    else
+    elseif !ismalformed(c)
         u = UInt32(c)
         write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
         d = max(2, 8 - (leading_zeros(u) >> 2))
@@ -74,13 +122,29 @@ function show(io::IO, c::Char)
             write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])
         end
         write(io, 0x27)
+    else # malformed
+        write(io, 0x27)
+        u = reinterpret(UInt32, c)
+        while true
+            a = hex_chars[((u >> 28) & 0xf) + 1]
+            b = hex_chars[((u >> 24) & 0xf) + 1]
+            write(io, 0x5c, 'x', a, b)
+            (u <<= 8) == 0 && break
+        end
+        write(io, 0x27)
     end
     return
 end
 
 function show(io::IO, ::MIME"text/plain", c::Char)
     show(io, c)
-    u = UInt32(c)
-    print(io, ": ", Unicode.isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4))
-    print(io, " (category ", Unicode.category_abbrev(c), ": ", Unicode.category_string(c), ")")
+    if !ismalformed(c)
+        u = UInt32(c)
+        print(io, ": ", Unicode.isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4))
+    else
+        print(io, ": Malformed UTF-8")
+    end
+    abr = Unicode.category_abbrev(c)
+    str = Unicode.category_string(c)
+    print(io, " (category ", abr, ": ", str, ")")
 end
diff --git a/base/filesystem.jl b/base/filesystem.jl
index c5f8e4b10854d..6268d1d420752 100644
--- a/base/filesystem.jl
+++ b/base/filesystem.jl
@@ -149,6 +149,26 @@ function read(f::File, ::Type{UInt8})
     return ret % UInt8
 end
 
+function read(f::File, ::Type{Char})     # read the bytes of one character from `f`
+    b0 = read(f, UInt8)
+    l = max(0, 8(4-leading_ones(b0)))    # lowest bit offset to fill; clamped so no shift is < 0
+    c = UInt32(b0) << 24                 # first byte occupies the top 8 bits
+    if l < 24                            # b0 has at least two leading 1 bits
+        s = 16                           # bit offset for the next byte
+        while s ≥ l && !eof(f)
+            p = position(f)              # remember where this byte starts
+            b = read(f, UInt8)
+            if b & 0xc0 != 0x80          # not 10xxxxxx: un-read it and stop
+                seek(f, p)
+                break
+            end
+            c |= UInt32(b) << s
+            s -= 8
+        end
+    end
+    return reinterpret(Char, c)
+end
+
 function unsafe_read(f::File, p::Ptr{UInt8}, nel::UInt)
     check_open(f)
     ret = ccall(:jl_fs_read, Int32, (Int32, Ptr{Void}, Csize_t),
diff --git a/base/intfuncs.jl b/base/intfuncs.jl
index abc1fd95b3e6a..76b45f90cf4e8 100644
--- a/base/intfuncs.jl
+++ b/base/intfuncs.jl
@@ -654,8 +654,8 @@ for sym in (:bin, :oct, :dec, :hex)
     @eval begin
         ($sym)(x::Unsigned, p::Int) = ($sym)(x,p,false)
         ($sym)(x::Unsigned)         = ($sym)(x,1,false)
-        ($sym)(x::Char, p::Int)     = ($sym)(unsigned(x),p,false)
-        ($sym)(x::Char)             = ($sym)(unsigned(x),1,false)
+        ($sym)(x::Char, p::Int)     = ($sym)(UInt32(x),p,false)
+        ($sym)(x::Char)             = ($sym)(UInt32(x),1,false)
         ($sym)(x::Integer, p::Int)  = ($sym)(unsigned(abs(x)),p,x<0)
         ($sym)(x::Integer)          = ($sym)(unsigned(abs(x)),1,x<0)
     end
diff --git a/base/io.jl b/base/io.jl
index 4f88d24cd7a0a..0cd51fd955bdc 100644
--- a/base/io.jl
+++ b/base/io.jl
@@ -535,25 +535,13 @@ function write(s::IO, a::SubArray{T,N,<:Array}) where {T,N}
     end
 end
 
-
-function write(s::IO, ch::Char)
-    c = reinterpret(UInt32, ch)
-    if c < 0x80
-        return write(s, c%UInt8)
-    elseif c < 0x800
-        return (write(s, (( c >> 6          ) | 0xC0)%UInt8)) +
-               (write(s, (( c        & 0x3F ) | 0x80)%UInt8))
-    elseif c < 0x10000
-        return (write(s, (( c >> 12         ) | 0xE0)%UInt8)) +
-               (write(s, (((c >> 6)  & 0x3F ) | 0x80)%UInt8)) +
-               (write(s, (( c        & 0x3F ) | 0x80)%UInt8))
-    elseif c < 0x110000
-        return (write(s, (( c >> 18         ) | 0xF0)%UInt8)) +
-               (write(s, (((c >> 12) & 0x3F ) | 0x80)%UInt8)) +
-               (write(s, (((c >> 6)  & 0x3F ) | 0x80)%UInt8)) +
-               (write(s, (( c        & 0x3F ) | 0x80)%UInt8))
-    else
-        return write(s, '\ufffd')
+function write(io::IO, c::Char)         # write the bytes of `c`; returns how many were written
+    u = bswap(reinterpret(UInt32, c))   # reverse byte order: the top byte of `c` is now lowest
+    n = 1                               # count, including the byte about to be written
+    while true
+        write(io, u % UInt8)            # emit the lowest remaining byte
+        (u >>= 8) == 0 && return n      # stop once only zero bytes remain
+        n += 1
     end
 end
 
@@ -596,23 +584,20 @@ function read!(s::IO, a::Array{T}) where T
     return a
 end
 
-function read(s::IO, ::Type{Char})
-    ch = read(s, UInt8)
-    if ch < 0x80
-        return Char(ch)
-    end
-
-    # mimic utf8.next function
-    trailing = Base.utf8_trailing[ch+1]
-    c::UInt32 = 0
-    for j = 1:trailing
-        c += ch
-        c <<= 6
-        ch = read(s, UInt8)
+function read(io::IO, ::Type{Char})      # read the bytes of one character from `io`
+    b0 = read(io, UInt8)
+    l = max(0, 8(4-leading_ones(b0)))    # lowest bit offset to fill; clamped so no shift is < 0
+    c = UInt32(b0) << 24                 # first byte occupies the top 8 bits
+    if l < 24                            # b0 has at least two leading 1 bits
+        s = 16                           # bit offset for the next byte
+        while s ≥ l && !eof(io)
+            peek(io) & 0xc0 == 0x80 || break  # next byte is not 10xxxxxx: leave it unread
+            b = read(io, UInt8)
+            c |= UInt32(b) << s
+            s -= 8
+        end
     end
-    c += ch
-    c -= Base.utf8_offset[trailing+1]
-    return Char(c)
+    return reinterpret(Char, c)
 end
 
 # readuntil_string is useful below since it has
@@ -620,7 +605,7 @@ end
 readuntil_string(s::IO, delim::UInt8) = String(readuntil(s, delim))
 
 function readuntil(s::IO, delim::Char)
-    if delim < Char(0x80)
+    if delim ≤ '\x7f'
         return readuntil_string(s, delim % UInt8)
     end
     out = IOBuffer()
@@ -701,7 +686,7 @@ function readuntil(io::IO, target::AbstractString)
     i = start(target)
     done(target, i) && return ""
     c, i = next(target, start(target))
-    if done(target, i) && c < Char(0x80)
+    if done(target, i) && c <= '\x7f'
         return readuntil_string(io, c % UInt8)
     end
     # decide how we can index target
@@ -728,12 +713,11 @@ function readuntil(io::IO, target::AbstractVector{T}) where T
     return out
 end
 
-
 """
     readchomp(x)
 
-Read the entirety of `x` as a string and remove a single trailing newline.
-Equivalent to `chomp!(read(x, String))`.
+Read the entirety of `x` as a string and remove a single trailing newline
+if there is one. Equivalent to `chomp(read(x, String))`.
 
 # Examples
 ```jldoctest
@@ -747,7 +731,7 @@ julia> readchomp("my_file.txt")
 julia> rm("my_file.txt");
 ```
 """
-readchomp(x) = chomp!(read(x, String))
+readchomp(x) = chomp(read(x, String))
 
 # read up to nb bytes into nb, returning # bytes read
 
diff --git a/base/iostream.jl b/base/iostream.jl
index 117bf77e7f8a6..347b86ca10f34 100644
--- a/base/iostream.jl
+++ b/base/iostream.jl
@@ -315,12 +315,13 @@ end
 
 ## low-level calls ##
 
-write(s::IOStream, b::UInt8) = Int(ccall(:ios_putc, Cint, (Cint, Ptr{Void}), b, s.ios))
+function write(s::IOStream, b::UInt8)
+    iswritable(s) || throw(ArgumentError("write failed, IOStream is not writeable"))
+    Int(ccall(:ios_putc, Cint, (Cint, Ptr{Void}), b, s.ios))
+end
 
 function unsafe_write(s::IOStream, p::Ptr{UInt8}, nb::UInt)
-    if !iswritable(s)
-        throw(ArgumentError("write failed, IOStream is not writeable"))
-    end
+    iswritable(s) || throw(ArgumentError("write failed, IOStream is not writeable"))
     return Int(ccall(:ios_write, Csize_t, (Ptr{Void}, Ptr{Void}, Csize_t), s.ios, p, nb))
 end
 
@@ -353,14 +354,6 @@ end
 
 ## text I/O ##
 
-function write(s::IOStream, c::Char)
-    if !iswritable(s)
-        throw(ArgumentError("write failed, IOStream is not writeable"))
-    end
-    Int(ccall(:ios_pututf8, Cint, (Ptr{Void}, UInt32), s.ios, c))
-end
-read(s::IOStream, ::Type{Char}) = Char(ccall(:jl_getutf8, UInt32, (Ptr{Void},), s.ios))
-
 take!(s::IOStream) =
     ccall(:jl_take_buffer, Vector{UInt8}, (Ptr{Void},), s.ios)
 
@@ -452,14 +445,23 @@ function read(s::IOStream, nb::Integer; all::Bool=true)
 end
 
 ## Character streams ##
-const _chtmp = Ref{Char}()
+
 function peekchar(s::IOStream)
-    if ccall(:ios_peekutf8, Cint, (Ptr{Void}, Ptr{Char}), s, _chtmp) < 0
+    chref = Ref{UInt32}()
+    if ccall(:ios_peekutf8, Cint, (Ptr{Void}, Ptr{UInt32}), s, chref) < 0
         return typemax(Char)
     end
-    return _chtmp[]
+    return Char(chref[])
 end
 
 function peek(s::IOStream)
     ccall(:ios_peekc, Cint, (Ptr{Void},), s)
 end
+
+function peek(s::IO)
+    mark(s)
+    try read(s, UInt8)
+    finally
+        reset(s)
+    end
+end
diff --git a/base/parse.jl b/base/parse.jl
index ddbf833cb162f..7181b3538c457 100644
--- a/base/parse.jl
+++ b/base/parse.jl
@@ -224,12 +224,12 @@ end
 ## string to float functions ##
 
 tryparse(::Type{Float64}, s::String) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s))
-tryparse(::Type{Float64}, s::SubString{String}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof)
+tryparse(::Type{Float64}, s::SubString{String}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.ncodeunits)
 tryparse_internal(::Type{Float64}, s::String, startpos::Int, endpos::Int) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, startpos-1, endpos-startpos+1)
 tryparse_internal(::Type{Float64}, s::SubString{String}, startpos::Int, endpos::Int) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset+startpos-1, endpos-startpos+1)
 
 tryparse(::Type{Float32}, s::String) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s))
-tryparse(::Type{Float32}, s::SubString{String}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof)
+tryparse(::Type{Float32}, s::SubString{String}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.ncodeunits)
 tryparse_internal(::Type{Float32}, s::String, startpos::Int, endpos::Int) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, startpos-1, endpos-startpos+1)
 tryparse_internal(::Type{Float32}, s::SubString{String}, startpos::Int, endpos::Int) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset+startpos-1, endpos-startpos+1)
 
diff --git a/base/regex.jl b/base/regex.jl
index 555032ef30d23..2fd5a804bf1af 100644
--- a/base/regex.jl
+++ b/base/regex.jl
@@ -303,8 +303,12 @@ struct SubstitutionString{T<:AbstractString} <: AbstractString
     string::T
 end
 
-endof(s::SubstitutionString) = endof(s.string)
-next(s::SubstitutionString, idx::Int) = next(s.string, idx)
+ncodeunits(s::SubstitutionString) = ncodeunits(s.string)
+codeunit(s::SubstitutionString) = codeunit(s.string)
+codeunit(s::SubstitutionString, i::Integer) = codeunit(s.string, i)
+isvalid(s::SubstitutionString, i::Integer) = isvalid(s.string, i)
+next(s::SubstitutionString, i::Integer) = next(s.string, i)
+
 function show(io::IO, s::SubstitutionString)
     print(io, "s")
     show(io, s.string)
diff --git a/base/repl/REPLCompletions.jl b/base/repl/REPLCompletions.jl
index 3e5056d613f26..2c4ba328093fa 100644
--- a/base/repl/REPLCompletions.jl
+++ b/base/repl/REPLCompletions.jl
@@ -106,7 +106,7 @@ const sorted_keywords = [
     "primitive type", "quote", "return", "struct",
     "true", "try", "using", "while"]
 
-function complete_keyword(s::String)
+function complete_keyword(s::Union{String,SubString{String}})
     r = searchsorted(sorted_keywords, s)
     i = first(r)
     n = length(sorted_keywords)
diff --git a/base/stream.jl b/base/stream.jl
index 24831e3ce83da..e60d5386ce7a5 100644
--- a/base/stream.jl
+++ b/base/stream.jl
@@ -1148,6 +1148,14 @@ unmark(x::LibuvStream)   = unmark(x.buffer)
 reset(x::LibuvStream)    = reset(x.buffer)
 ismarked(x::LibuvStream) = ismarked(x.buffer)
 
+function peek(s::LibuvStream)
+    mark(s)
+    try read(s, UInt8)
+    finally
+        reset(s)
+    end
+end
+
 # BufferStream's are non-OS streams, backed by a regular IOBuffer
 mutable struct BufferStream <: LibuvStream
     buffer::IOBuffer
diff --git a/base/strings/basic.jl b/base/strings/basic.jl
index 0c3044573a215..7ff30d09bc027 100644
--- a/base/strings/basic.jl
+++ b/base/strings/basic.jl
@@ -1,57 +1,188 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
-## core string functions ##
+"""
+The `AbstractString` type is the supertype of all string implementations in
+Julia. Strings are encodings of sequences of [Unicode](https://unicode.org/)
+code points as represented by the `Char` type. Julia makes a few assumptions
+about strings:
+
+* Strings are encoded in terms of fixed-size "code units"
+  * Code units can be extracted with `codeunit(s, i)`
+  * The first code unit has index `1`
+  * The last code unit has index `ncodeunits(s)`
+  * Any index `i` such that `1 ≤ i ≤ ncodeunits(s)` is in bounds
+* String indexing is done in terms of these code units:
+  * Characters are extracted by `s[i]` with a valid string index `i`
+  * Each `Char` in a string is encoded by one or more code units
+  * Only the index of the first code unit of a `Char` is a valid index
+  * The encoding of a `Char` is independent of what precedes or follows it
+  * String encodings are "self-synchronizing" – i.e. `isvalid(s,i)` is O(1)
+
+Some string functions error if you use an out-of-bounds or invalid string index,
+including code unit extraction `codeunit(s,i)`, string indexing `s[i]`, and
+string iteration `next(s,i)`. Other string functions take a more relaxed
+approach to indexing and give you the closest valid string index when in-bounds,
+or when out-of-bounds, behave as if there were an infinite number of characters
+padding each side of the string. Usually these imaginary padding characters have
+code unit length `1`, but string types may choose different sizes. Relaxed
+indexing functions include those intended for index arithmetic: `thisind`,
+`nextind` and `prevind`. This model allows index arithmetic to work with
+out-of-bounds indices as intermediate values so long as one never uses them to retrieve
+a character, which often helps avoid needing to code around edge cases.
+
+See also: `codeunit`, `ncodeunits`, `thisind`, `nextind`, `prevind`
+"""
+AbstractString
 
-endof(s::AbstractString) = error("you must implement endof(", typeof(s), ")")
-next(s::AbstractString, i::Int) = error("you must implement next(", typeof(s), ",Int)")
-next(s::AbstractString, i::Integer) = next(s,Int(i))
+## required string functions ##
 
-string() = ""
-string(s::AbstractString) = s
+"""
+    ncodeunits(s::AbstractString) -> Int
 
-(::Type{Vector{UInt8}})(s::AbstractString) = Vector{UInt8}(String(s))
-(::Type{Array{UInt8}})(s::AbstractString) = Vector{UInt8}(s)
-(::Type{Vector{Char}})(s::AbstractString) = collect(s)
+Return the number of code units in a string. Indices that are in bounds to
+access this string must satisfy `1 ≤ i ≤ ncodeunits(s)`. Not all such indices
+are valid – they may not be the start of a character, but they will return a
+code unit value when calling `codeunit(s,i)`.
 
-Symbol(s::AbstractString) = Symbol(String(s))
+See also: `codeunit`, `checkbounds`, `sizeof`, `length`, `endof`
+"""
+ncodeunits(s::AbstractString)
 
-# string types are convertible
-convert(::Type{T}, s::T) where {T<:AbstractString} = s
-convert(::Type{T}, s::AbstractString) where {T<:AbstractString} = T(s)
+"""
+    codeunit(s::AbstractString) -> Type{<:Union{UInt8, UInt16, UInt32}}
 
-## generic supplied functions ##
+Return the code unit type of the given string object. For ASCII, Latin-1, or
+UTF-8 encoded strings, this would be `UInt8`; for UCS-2 and UTF-16 it would be
+`UInt16`; for UTF-32 it would be `UInt32`. The code unit type need not be
+limited to these three types, but it's hard to think of widely used string
+encodings that don't use one of these units. `codeunit(s)` is the same as
+`typeof(codeunit(s,1))` when `s` is a non-empty string.
 
-start(s::AbstractString) = 1
-done(s::AbstractString,i) = (i > endof(s))
-getindex(s::AbstractString, i::Int) = next(s,i)[1]
-getindex(s::AbstractString, i::Integer) = s[Int(i)]
-getindex(s::AbstractString, i::Colon) = s
-getindex(s::AbstractString, r::UnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]
-# TODO: handle other ranges with stride ±1 specially?
-getindex(s::AbstractString, v::AbstractVector{<:Integer}) =
-    sprint(length(v), io->(for i in v; write(io,s[i]) end))
-getindex(s::AbstractString, v::AbstractVector{Bool}) =
-    throw(ArgumentError("logical indexing not supported for strings"))
+See also: `ncodeunits`
+"""
+codeunit(s::AbstractString)
 
-get(s::AbstractString, i::Integer, default) = isvalid(s,i) ? s[i] : default
+"""
+    codeunit(s::AbstractString, i::Integer) -> Union{UInt8, UInt16, UInt32}
+
+Return the code unit value in the string `s` at index `i`. Note that
+
+    codeunit(s, i) :: codeunit(s)
+
+I.e. the value returned by `codeunit(s, i)` is of the type returned by
+`codeunit(s)`.
+
+See also: `ncodeunits`, `checkbounds`
+"""
+codeunit(s::AbstractString, i::Integer) = typeof(i) === Int ?
+    throw(MethodError(codeunit, Tuple{typeof(s),Int})) :
+        codeunit(s, Int(i))
 
 """
-    sizeof(s::AbstractString)
+    isvalid(s::AbstractString, i::Integer) -> Bool
 
-The number of bytes in string `s`.
+Predicate indicating whether the given index is the start of the encoding of
+a character in `s` or not. If `isvalid(s, i)` is true then `s[i]` will return
+the character whose encoding starts at that index, if it's false, then `s[i]`
+will raise an invalid index error. Behavior of `next(s, i)` is similar except
+that the character is returned along with the index of the following character.
+In order for `isvalid(s, i)` to be an O(1) function, the encoding of `s` must
+be [self-synchronizing](https://en.wikipedia.org/wiki/Self-synchronizing_code);
+this is a basic assumption of Julia's generic string support.
+
+See also: `getindex`, `next`, `thisind`, `nextind`, `prevind`, `length`
 
 # Examples
+
 ```jldoctest
-julia> sizeof("❤")
-3
+julia> str = "αβγdef";
+
+julia> isvalid(str, 1)
+true
+
+julia> str[1]
+'α': Unicode U+03b1 (category Ll: Letter, lowercase)
+
+julia> isvalid(str, 2)
+false
+
+julia> str[2]
+ERROR: UnicodeError: invalid character index
+Stacktrace:
+[...]
 ```
 """
-sizeof(s::AbstractString) = error("type $(typeof(s)) has no canonical binary representation")
+isvalid(s::AbstractString, i::Integer) = typeof(i) === Int ?
+    throw(MethodError(isvalid, Tuple{typeof(s),Int})) :
+        isvalid(s, Int(i))
+
+"""
+    next(s::AbstractString, i::Integer) -> Tuple{Char, Int}
 
+Return a tuple of the character in `s` at index `i` with the index of the start
+of the following character in `s`. This is the key method that allows strings to
+be iterated, yielding a sequence of characters. If `i` is out of bounds in `s`
+then a bounds error is raised; if `i` is not a valid character index in `s` then
+a Unicode index error is raised.
+
+See also: `getindex`, `start`, `done`, `checkbounds`
+"""
+next(s::AbstractString, i::Integer) = typeof(i) === Int ?
+    throw(MethodError(next, Tuple{typeof(s),Int})) :
+        next(s, Int(i))
+
+## basic generic definitions ##
+
+start(s::AbstractString) = 1
+done(s::AbstractString, i::Integer) = i > ncodeunits(s)
 eltype(::Type{<:AbstractString}) = Char
+sizeof(s::AbstractString) = ncodeunits(s) * sizeof(codeunit(s))
+endof(s::AbstractString) = thisind(s, ncodeunits(s))
+
+getindex(s::AbstractString, i::Integer) = next(s, i)[1]
+getindex(s::AbstractString, i::Colon) = s
+# TODO: handle other ranges with stride ±1 specially?
+getindex(s::AbstractString, r::UnitRange{<:Integer}) = SubString(s, r)
+getindex(s::AbstractString, v::AbstractVector{<:Integer}) =
+    sprint(length(v), io->(for i in v; write(io, s[i]) end))
+getindex(s::AbstractString, v::AbstractVector{Bool}) =
+    throw(ArgumentError("logical indexing not supported for strings"))
+
+get(s::AbstractString, i::Integer, default) = checkbounds(Bool, s, i) ? s[i] : default
+
+## bounds checking ##
+
+checkbounds(::Type{Bool}, s::AbstractString, i::Integer) =
+    1 ≤ i ≤ ncodeunits(s)
+checkbounds(::Type{Bool}, s::AbstractString, r::AbstractRange{<:Integer}) =
+    isempty(r) || (1 ≤ minimum(r) && maximum(r) ≤ ncodeunits(s))
+checkbounds(::Type{Bool}, s::AbstractString, I::AbstractArray{<:Real}) =
+    all(i -> checkbounds(Bool, s, i), I) # query form: report false rather than throw
+checkbounds(::Type{Bool}, s::AbstractString, I::AbstractArray{<:Integer}) =
+    all(i -> checkbounds(Bool, s, i), I) # query form: report false rather than throw
+checkbounds(s::AbstractString, I::Union{Integer,AbstractArray}) =
+    checkbounds(Bool, s, I) || throw(BoundsError(s, I)) # the only method that throws
+
+## construction, conversion, promotion ##
+
+string() = ""
+string(s::AbstractString) = s
+
+(::Type{Vector{UInt8}})(s::AbstractString) = Vector{UInt8}(String(s))
+(::Type{Array{UInt8}})(s::AbstractString) = Vector{UInt8}(s)
+(::Type{Vector{Char}})(s::AbstractString) = collect(s)
+
+Symbol(s::AbstractString) = Symbol(String(s))
+
+convert(::Type{T}, s::T) where {T<:AbstractString} = s
+convert(::Type{T}, s::AbstractString) where {T<:AbstractString} = T(s)
+
+promote_rule(::Type{<:AbstractString}, ::Type{<:AbstractString}) = String
+
+## string & character concatenation ##
 
 """
-    *(s::Union{AbstractString, Char}, t::Union{AbstractString, Char}...)
+    *(s::Union{AbstractString, Char}, t::Union{AbstractString, Char}...) -> String
 
 Concatenate strings and/or characters, producing a [`String`](@ref). This is equivalent
 to calling the [`string`](@ref) function on the arguments.
@@ -69,49 +200,16 @@ julia> 'j' * "ulia"
 
 one(::Union{T,Type{T}}) where {T<:AbstractString} = convert(T, "")
 
-# generic number of code units; implementations generally know how long a string
-# is though and should override this with a more efficient method
-ncodeunits(s::AbstractString) = nextind(s, endof(s)) - 1
-
-"""
-    length(s::AbstractString)
-
-The number of characters in string `s`.
-
-# Examples
-```jldoctest
-julia> length("jμΛIα")
-5
-```
-"""
-function length(s::AbstractString)
-    i = start(s)
-    if done(s,i)
-        return 0
-    end
-    n = 1
-    while true
-        c, j = next(s,i)
-        if done(s,j)
-            return n
-        end
-        n += 1
-        i = j
-    end
-end
+## generic string comparison ##
 
-## string comparison functions ##
 """
-    cmp(a::AbstractString, b::AbstractString)
-
-Compare two strings for equality.
+    cmp(a::AbstractString, b::AbstractString) -> Int
 
-Return `0` if both strings have the same length and the character
-at each index is the same in both strings.
-Return `-1` if `a` is a substring of `b`, or if `a` comes before
-`b` in alphabetical order.
-Return `1` if `b` is a substring of `a`, or if `b` comes before
-`a` in alphabetical order.
+Compare two strings for equality. Return `0` if both strings have the same
+length and the character at each index is the same in both strings. Return `-1`
+if `a` is a substring of `b`, or if `a` comes before `b` in alphabetical order.
+Return `1` if `b` is a substring of `a`, or if `b` comes before `a` in
+alphabetical order (technically, lexicographical order by Unicode code points).
 
 # Examples
 ```jldoctest
@@ -138,28 +236,23 @@ julia> cmp("b", "β")
 ```
 """
 function cmp(a::AbstractString, b::AbstractString)
-    if a === b
-        return 0
-    end
+    a === b && return 0
     i = start(a)
     j = start(b)
-    while !done(a,i)
-        if done(b,j)
-            return +1
-        end
-        c, i = next(a,i)
-        d, j = next(b,j)
-        if c != d
-            return c < d ? -1 : +1
-        end
+    while !done(a, i)
+        done(b, j) && return 1
+        c, i = next(a, i)
+        d, j = next(b, j)
+        c ≠ d && return ifelse(c < d, -1, 1)
     end
-    done(b,j) ? 0 : -1
+    return ifelse(done(b, j), 0, -1)
 end
 
 """
-    ==(a::AbstractString, b::AbstractString)
+    ==(a::AbstractString, b::AbstractString) -> Bool
 
-Test whether two strings are equal character by character.
+Test whether two strings are equal character by character (technically, Unicode
+code point by code point).
 
 # Examples
 ```jldoctest
@@ -170,12 +263,13 @@ julia> "abc" == "αβγ"
 false
 ```
 """
-==(a::AbstractString, b::AbstractString) = cmp(a,b) == 0
+==(a::AbstractString, b::AbstractString) = cmp(a, b) == 0
 
 """
-    isless(a::AbstractString, b::AbstractString)
+    isless(a::AbstractString, b::AbstractString) -> Bool
 
-Test whether string `a` comes before string `b` in alphabetical order.
+Test whether string `a` comes before string `b` in alphabetical order
+(technically, in lexicographical order by Unicode code points).
 
 # Examples
 ```jldoctest
@@ -189,64 +283,58 @@ julia> isless("a", "a")
 false
 ```
 """
-isless(a::AbstractString, b::AbstractString) = cmp(a,b) < 0
+isless(a::AbstractString, b::AbstractString) = cmp(a, b) < 0
 
 # faster comparisons for symbols
 
 cmp(a::Symbol, b::Symbol) = Int(sign(ccall(:strcmp, Int32, (Cstring, Cstring), a, b)))
 
-isless(a::Symbol, b::Symbol) = cmp(a,b) < 0
+isless(a::Symbol, b::Symbol) = cmp(a, b) < 0
 
-## Generic validation functions ##
+## character index arithmetic ##
 
 """
-    isvalid(str::AbstractString, i::Integer)
+    length(s::AbstractString, lo::Integer=1, hi::Integer=ncodeunits(s)) -> Integer
+
+The number of characters in string `s` from indices `lo` through `hi`. This is
+computed as the number of code unit indices from `lo` to `hi` which are valid
+character indices. With only a single string argument, this computes the
+number of characters in the entire string. If `lo` or `hi` is out of range,
+each out of range code unit is considered to be one character. This matches the
+"loose" indexing model of `thisind`, `nextind` and `prevind`.
 
-Tell whether index `i` is valid for the given string.
+See also: `isvalid`, `ncodeunits`, `endof`, `thisind`, `nextind`, `prevind`
 
 # Examples
 ```jldoctest
-julia> str = "αβγdef";
-
-julia> isvalid(str, 1)
-true
-
-julia> str[1]
-'α': Unicode U+03b1 (category Ll: Letter, lowercase)
-
-julia> isvalid(str, 2)
-false
-
-julia> str[2]
-ERROR: UnicodeError: invalid character index
-Stacktrace:
-[...]
+julia> length("jμΛIα")
+5
 ```
 """
-function isvalid(s::AbstractString, i::Integer)
-    i < 1 && return false
-    done(s,i) && return false
-    try
-        next(s,i)
-        true
-    catch
-        false
+function length(s::AbstractString, lo::Integer=1, hi::Integer=ncodeunits(s))
+    z = ncodeunits(s)
+    a = Int(max(1, min(z, lo)))
+    b = Int(min(z, max(1, hi)))
+    n = a - b
+    for i = a:b
+        n += isvalid(s, i)
     end
+    return n + hi - lo
 end
 
-## Generic indexing functions ##
-
 """
-    thisind(s::AbstractString, i::Integer)
+    thisind(s::AbstractString, i::Integer) -> Int
 
-If `i` is the index into a character in `s` then `thisind` returns the index of the
-start of that character. If `i < start(s)` then it returns `start(s) - 1`.
-If `i > ncodeunits(s)` then it returns `ncodeunits(s) + 1`.
+If `i` is in bounds in `s` return the index of the start of the character whose
+encoding code unit `i` is part of. In other words, if `i` is the start of a
+character, return `i`; if `i` is not the start of a character, rewind until the
+start of a character and return that index. If `i` is out of bounds in `s`
+return `i`.
 
 # Examples
 ```jldoctest
 julia> thisind("αβγdef", -5)
-0
+-5
 
 julia> thisind("αβγdef", 1)
 1
@@ -264,23 +352,24 @@ julia> thisind("αβγdef", 10)
 10
 
 julia> thisind("αβγdef", 20)
-10
+20
 """
 function thisind(s::AbstractString, i::Integer)
-    j = Int(i)
-    isvalid(s, j) && return j
-    j < start(s) && return 0
-    n = ncodeunits(s)
-    j > n && return n + 1
-    prevind(s, j)
+    i ≤ ncodeunits(s) || return i
+    @inbounds while 1 < i && !isvalid(s, i)
+        i -= 1
+    end
+    return i
 end
 
 """
-    prevind(str::AbstractString, i::Integer, nchar::Integer=1)
+    prevind(s::AbstractString, i::Integer, n::Integer=1) -> Int
 
-Get the previous valid string index before `i`.
-Returns a value less than `1` at the beginning of the string.
-If the `nchar` argument is given the function goes back `nchar` characters.
+If `i` is in bounds in `s` return the index of the start of the character whose
+encoding starts before index `i`. In other words, if `i` is the start of a
+character, return the start of the previous character; if `i` is not the start
+of a character, rewind until the start of a character and return that index.
+If `i` is out of bounds in `s` return `i - 1`. If `n == 0` return `i`.
 
 # Examples
 ```jldoctest
@@ -290,51 +379,32 @@ julia> prevind("αβγdef", 3)
 julia> prevind("αβγdef", 1)
 0
 
+julia> prevind("αβγdef", 0)
+-1
+
 julia> prevind("αβγdef", 3, 2)
 0
 ```
 """
-function prevind(s::AbstractString, i::Integer)
-    e = endof(s)
-    if i > e
-        return e
+function prevind(s::AbstractString, i::Integer, n::Integer=1)
+    n < 0 && throw(ArgumentError("n cannot be negative: $n"))
+    z = ncodeunits(s) + 1
+    if i > z
+        n -= i - z
+        i = z
     end
-    j = Int(i)-1
-    while j >= 1
-        if isvalid(s,j)
-            return j
-        end
-        j -= 1
-    end
-    return 0 # out of range
-end
-
-function prevind(s::AbstractString, i::Integer, nchar::Integer)
-    nchar > 0 || throw(ArgumentError("nchar must be greater than 0"))
-    e = endof(s)
-    j = Int(i)
-    j < 1 && return 0
-    while nchar > 0
-        if j > e
-            j = e
-        else
-            j -= 1
-            while j >= 1 && !isvalid(s,j)
-                j -= 1
-            end
-        end
-        j < 1 && return 0
-        nchar -= 1
+    while n > 0 && 1 < i
+        @inbounds n -= isvalid(s, i -= 1)
     end
-    j
+    return i - n
 end
 
 """
-    nextind(str::AbstractString, i::Integer, nchar::Integer=1)
+    nextind(s::AbstractString, i::Integer, n::Integer=1) -> Int
 
-Get the next valid string index after `i`.
-Returns a value greater than `endof(str)` at or after the end of the string.
-If the `nchar` argument is given the function goes forward `nchar` characters.
+If `i` is in bounds in `s` return the index of the start of the character whose
+encoding starts after index `i`. If `i` is out of bounds in `s` return `i + 1`.
+If `n == 0` return `i`.
 
 # Examples
 ```jldoctest
@@ -353,48 +423,19 @@ julia> nextind(str, 9)
 10
 ```
 """
-function nextind(s::AbstractString, i::Integer)
-    e = endof(s)
+function nextind(s::AbstractString, i::Integer, n::Integer=1)
+    n < 0 && throw(ArgumentError("n cannot be negative: $n"))
     if i < 1
-        return 1
-    end
-    if i > e
-        return Int(i)+1
-    end
-    for j = Int(i)+1:e
-        if isvalid(s,j)
-            return j
-        end
+        n += i - 1
+        i = 1
     end
-    next(s,e)[2] # out of range
-end
-
-function nextind(s::AbstractString, i::Integer, nchar::Integer)
-    nchar > 0 || throw(ArgumentError("nchar must be greater than 0"))
-    e = endof(s)
-    j = Int(i)
-    while nchar > 0
-        if j < 1
-            j = 1
-        else
-            j > e && return j + nchar
-            j == e && return next(s,e)[2] + nchar - 1
-            for outer j = j+1:e
-                isvalid(s,j) && break
-            end
-        end
-        nchar -= 1
+    z = ncodeunits(s)
+    while n > 0 && i < z
+        @inbounds n -= isvalid(s, i += 1)
     end
-    j
+    return i + n
 end
 
-checkbounds(s::AbstractString, i::Integer) = start(s) <= i <= endof(s) || throw(BoundsError(s, i))
-checkbounds(s::AbstractString, r::AbstractRange{<:Integer}) = isempty(r) || (minimum(r) >= start(s) && maximum(r) <= endof(s)) || throw(BoundsError(s, r))
-# The following will end up using a deprecated checkbounds, when the covariant parameter is not Integer
-checkbounds(s::AbstractString, I::AbstractArray{<:Real}) = all(i -> checkbounds(s, i), I)
-checkbounds(s::AbstractString, I::AbstractArray{<:Integer}) = all(i -> checkbounds(s, i), I)
-
-
 """
     ind2chr(s::AbstractString, i::Integer)
 
@@ -414,10 +455,7 @@ julia> chr2ind(str, 2)
 3
 ```
 """
-function ind2chr(s::AbstractString, i::Integer)
-    s[i] # throws error if invalid
-    unsafe_ind2chr(s, i)
-end
+ind2chr(s::AbstractString, i::Integer) = length(s, 1, i)
 
 """
     chr2ind(s::AbstractString, i::Integer)
@@ -437,26 +475,10 @@ julia> ind2chr(str, 3)
 2
 ```
 """
-function chr2ind(s::AbstractString, i::Integer)
-    i < start(s) && throw(BoundsError(s, i))
-    k = unsafe_chr2ind(s, i)
-    s[k] # throws error if invalid
-    k
-end
-
-function map_chr_ind(s::AbstractString, i::Integer, stop, ret)
-    j = 1
-    k = start(s)
-    while true
-        i == stop((j, k)) && return ret((j, k)) # k could point after the last character
-        _, k = next(s, k)
-        j += 1
-    end
-end
-
-unsafe_ind2chr(s::AbstractString, i::Integer) = map_chr_ind(s, i, last, first)
-unsafe_chr2ind(s::AbstractString, i::Integer) = map_chr_ind(s, i, first, last)
+chr2ind(s::AbstractString, n::Integer) =
+    n < 0 ? prevind(s, 0, -n) : nextind(s, 0, n)
 
+## string index iteration type ##
 
 struct EachStringIndex{T<:AbstractString}
     s::T
@@ -469,32 +491,26 @@ next(e::EachStringIndex, state) = (state, nextind(e.s, state))
 done(e::EachStringIndex, state) = done(e.s, state)
 eltype(::Type{EachStringIndex}) = Int
 
-## string promotion rules ##
-
-promote_rule(::Type{<:AbstractString}, ::Type{<:AbstractString}) = String
-
 ## string map, filter, has ##
 
 function map(f, s::AbstractString)
-    out = IOBuffer(StringVector(endof(s)),true,true)
-    truncate(out,0)
+    out = IOBuffer(StringVector(endof(s)), true, true)
+    truncate(out, 0)
     for c in s
-        c2 = f(c)
-        if !isa(c2,Char)
-            throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
-        end
-        write(out, c2::Char)
+        c′ = f(c)
+        isa(c′, Char) || throw(ArgumentError(
+            "map(f, s::AbstractString) requires f to return Char; " *
+            "try map(f, collect(s)) or a comprehension instead"))
+        write(out, c′::Char)
     end
     String(take!(out))
 end
 
 function filter(f, s::AbstractString)
-    out = IOBuffer(StringVector(endof(s)),true,true)
-    truncate(out,0)
+    out = IOBuffer(StringVector(endof(s)), true, true)
+    truncate(out, 0)
     for c in s
-        if f(c)
-            write(out, c)
-        end
+        f(c) && write(out, c)
     end
     String(take!(out))
 end
@@ -502,9 +518,9 @@ end
 ## string first and last ##
 
 """
-    first(str::AbstractString, nchar::Integer)
+    first(s::AbstractString, n::Integer)
 
-Get a string consisting of the first `nchar` characters of `str`.
+Get a string consisting of the first `n` characters of `s`.
 
 ```jldoctest
 julia> first("∀ϵ≠0: ϵ²>0", 0)
@@ -517,17 +533,12 @@ julia> first("∀ϵ≠0: ϵ²>0", 3)
 "∀ϵ≠"
 ```
 """
-function first(str::AbstractString, nchar::Integer)
-    if 0 <= nchar <= 1
-        return str[1:nchar]
-    end
-    str[1:nextind(str, 1, nchar-1)]
-end
+first(s::AbstractString, n::Integer) = s[1:min(end, nextind(s, 0, n))]
 
 """
-    last(str::AbstractString, nchar::Integer)
+    last(s::AbstractString, n::Integer)
 
-Get a string consisting of the last `nchar` characters of `str`.
+Get a string consisting of the last `n` characters of `s`.
 
 ```jldoctest
 julia> last("∀ϵ≠0: ϵ²>0", 0)
@@ -540,13 +551,54 @@ julia> last("∀ϵ≠0: ϵ²>0", 3)
 "²>0"
 ```
 """
-function last(str::AbstractString, nchar::Integer)
-    e = endof(str)
-    if 0 <= nchar <= 1
-        return str[(e-nchar+1):e]
-    end
-    str[prevind(str, e, nchar-1):e]
-end
+last(s::AbstractString, n::Integer) = s[max(1, prevind(s, ncodeunits(s)+1, n)):end]
+
+"""
+    reverseind(v, i)
+
+Given an index `i` in [`reverse(v)`](@ref), return the corresponding index in `v` so that
+`v[reverseind(v,i)] == reverse(v)[i]`. (This can be nontrivial in cases where `v` contains
+non-ASCII characters.)
+
+# Examples
+```jldoctest
+julia> r = reverse("Julia")
+"ailuJ"
+
+julia> for i in 1:length(r)
+           print(r[reverseind("Julia", i)])
+       end
+Julia
+```
+"""
+reverseind(s::AbstractString, i::Integer) = thisind(s, ncodeunits(s)-i+1)
+
+"""
+    repeat(s::AbstractString, r::Integer)
+
+Repeat a string `r` times. This can equivalently be accomplished by calling [`s^r`](@ref ^).
+
+# Examples
+```jldoctest
+julia> repeat("ha", 3)
+"hahaha"
+```
+"""
+repeat(s::AbstractString, r::Integer) = repeat(convert(String, s), r)
+
+"""
+    ^(s::Union{AbstractString,Char}, n::Integer)
+
+Repeat a string or character `n` times.
+The [`repeat`](@ref) function is an alias to this operator.
+
+# Examples
+```jldoctest
+julia> "Test "^3
+"Test Test Test "
+```
+"""
+(^)(s::Union{AbstractString,Char}, r::Integer) = repeat(s, r)
 
 # reverse-order iteration for strings and indices thereof
 start(r::Iterators.Reverse{<:AbstractString}) = endof(r.itr)
diff --git a/base/strings/io.jl b/base/strings/io.jl
index 49d223111041b..98648bac824a6 100644
--- a/base/strings/io.jl
+++ b/base/strings/io.jl
@@ -139,7 +139,7 @@ write(io::IO, s::AbstractString) = (len = 0; for c in s; len += write(io, c); en
 show(io::IO, s::AbstractString) = print_quoted(io, s)
 
 write(to::GenericIOBuffer, s::SubString{String}) =
-    s.endof==0 ? 0 : unsafe_write(to, pointer(s.string, s.offset + 1), UInt(nextind(s, s.endof) - 1))
+    s.ncodeunits ≤ 0 ? 0 : unsafe_write(to, pointer(s.string, s.offset+1), UInt(s.ncodeunits))
 
 ## printing literal quoted string data ##
 
@@ -271,15 +271,23 @@ function escape_string(io, s::AbstractString, esc::AbstractString="")
     i = start(s)
     while !done(s,i)
         c, j = next(s,i)
-        c == '\0'           ? print(io, escape_nul(s,j)) :
-        c == '\e'           ? print(io, "\\e") :
-        c == '\\'           ? print(io, "\\\\") :
-        c in esc            ? print(io, '\\', c) :
-        '\a' <= c <= '\r'   ? print(io, '\\', "abtnvfr"[Int(c)-6]) :
-        Unicode.isprint(c)  ? print(io, c) :
-        c <= '\x7f'         ? print(io, "\\x", hex(c, 2)) :
-        c <= '\uffff'       ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 4 : 2)) :
-                              print(io, "\\U", hex(c, need_full_hex(s,j) ? 8 : 4))
+        if !ismalformed(c)
+            c == '\0'          ? print(io, escape_nul(s,j)) :
+            c == '\e'          ? print(io, "\\e") :
+            c == '\\'          ? print(io, "\\\\") :
+            c in esc           ? print(io, '\\', c) :
+            '\a' <= c <= '\r'  ? print(io, '\\', "abtnvfr"[Int(c)-6]) :
+            Unicode.isprint(c) ? print(io, c) :
+            c <= '\x7f'        ? print(io, "\\x", hex(c, 2)) :
+            c <= '\uffff'      ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 4 : 2)) :
+                                 print(io, "\\U", hex(c, need_full_hex(s,j) ? 8 : 4))
+        else # malformed
+            u = bswap(reinterpret(UInt32, c))
+            while true
+                print(io, "\\x", hex(u % UInt8, 2))
+                (u >>= 8) == 0 && break
+            end
+        end
         i = j
     end
 end
@@ -290,27 +298,10 @@ function print_quoted(io, s::AbstractString)
     print(io, '"')
 end
 
-# bare minimum unescaping function unescapes only given characters
-
-function print_unescaped_chars(io, s::AbstractString, esc::AbstractString)
-    if !('\\' in esc)
-        esc = string("\\", esc)
-    end
-    i = start(s)
-    while !done(s,i)
-        c, i = next(s,i)
-        if c == '\\' && !done(s,i) && s[i] in esc
-            c, i = next(s,i)
-        end
-        print(io, c)
-    end
-end
-
-unescape_chars(s::AbstractString, esc::AbstractString) =
-    sprint(endof(s), print_unescaped_chars, s, esc)
-
 # general unescaping of traditional C and Unicode escape sequences
 
+# TODO: handle unescaping invalid UTF-8 sequences
+
 """
     unescape_string(str::AbstractString) -> AbstractString
 
@@ -334,16 +325,16 @@ function unescape_string(io, s::AbstractString)
                 n = k = 0
                 m = c == 'x' ? 2 :
                     c == 'u' ? 4 : 8
-                while (k+=1) <= m && !done(s,i)
+                while (k += 1) <= m && !done(s,i)
                     c, j = next(s,i)
-                    n = '0' <= c <= '9' ? n<<4 + c-'0' :
-                        'a' <= c <= 'f' ? n<<4 + c-'a'+10 :
-                        'A' <= c <= 'F' ? n<<4 + c-'A'+10 : break
+                    n = '0' <= c <= '9' ? n<<4 + (c-'0') :
+                        'a' <= c <= 'f' ? n<<4 + (c-'a'+10) :
+                        'A' <= c <= 'F' ? n<<4 + (c-'A'+10) : break
                     i = j
                 end
                 if k == 1
                     throw(ArgumentError("invalid $(m == 2 ? "hex (\\x)" :
-                                            "unicode (\\u)") escape sequence used in $(repr(s))"))
+                                        "unicode (\\u)") escape sequence used in $(repr(s))"))
                 end
                 if m == 2 # \x escape sequence
                     write(io, UInt8(n))
@@ -353,7 +344,7 @@ function unescape_string(io, s::AbstractString)
             elseif '0' <= c <= '7'
                 k = 1
                 n = c-'0'
-                while (k+=1) <= 3 && !done(s,i)
+                while (k += 1) <= 3 && !done(s,i)
                     c, j = next(s,i)
                     n = ('0' <= c <= '7') ? n<<3 + c-'0' : break
                     i = j
@@ -503,18 +494,7 @@ end
 
 function convert(::Type{String}, chars::AbstractVector{Char})
     sprint(length(chars), io->begin
-        state = start(chars)
-        while !done(chars, state)
-            c, state = next(chars, state)
-            if '\ud7ff' < c && c + 1024 < '\ue000'
-                d, state = next(chars, state)
-                if '\ud7ff' < d - 1024 && d < '\ue000'
-                    c = Char(0x10000 + ((UInt32(c) & 0x03ff) << 10) | (UInt32(d) & 0x03ff))
-                else
-                    write(io, c)
-                    c = d
-                end
-            end
+        for c in chars
             write(io, c)
         end
     end)
diff --git a/base/strings/string.jl b/base/strings/string.jl
index 67c238358486f..0e1ef86e6a759 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -2,6 +2,8 @@
 
 const ByteArray = Union{Vector{UInt8},Vector{Int8}}
 
+@inline between(b::T, lo::T, hi::T) where {T<:Integer} = (lo ≤ b) & (b ≤ hi)
+
 ## constructors and conversions ##
 
 # String constructor docstring from boot.jl, workaround for #16730
@@ -49,7 +51,6 @@ Convert a string to a contiguous byte array representation encoded as UTF-8 byte
 This representation is often appropriate for passing strings to C.
 """
 String(s::AbstractString) = print_to_string(s)
-
 String(s::Symbol) = unsafe_string(Cstring(s))
 
 (::Type{Vector{UInt8}})(s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s)
@@ -59,48 +60,14 @@ String(s::Symbol) = unsafe_string(Cstring(s))
 pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
 pointer(s::String, i::Integer) = pointer(s)+(i-1)
 
-sizeof(s::String) = Core.sizeof(s)
-
-"""
-    codeunit(s::AbstractString, i::Integer)
-
-Get the `i`th code unit of an encoded string. For example,
-returns the `i`th byte of the representation of a UTF-8 string.
-
-# Examples
-```jldoctest
-julia> s = "δ=γ"; [codeunit(s, i) for i in 1:sizeof(s)]
-5-element Array{UInt8,1}:
- 0xce
- 0xb4
- 0x3d
- 0xce
- 0xb3
-```
-"""
-codeunit(s::AbstractString, i::Integer)
+ncodeunits(s::String) = Core.sizeof(s)
+codeunit(s::String) = UInt8
 
 @inline function codeunit(s::String, i::Integer)
-    @boundscheck if (i < 1) | (i > sizeof(s))
-        throw(BoundsError(s,i))
-    end
+    @boundscheck between(i, 1, ncodeunits(s)) || throw(BoundsError(s, i))
     @gc_preserve s unsafe_load(pointer(s, i))
 end
 
-"""
-    ncodeunits(s::AbstractString)
-
-The number of code units in a string. For example, for UTF-8-like data such as
-the default `String` type, the number of code units is the number of bytes in
-the string, a.k.a. `sizeof(s)`. For a UTF-16 encoded string type, however, the
-code unit is `UInt16` so the number of code units is the number of `UInt16`
-words in the representation of the string. The expression `codeunit(s, i)` is
-valid and safe for precisely the range of `i` values `1:ncodeunits(s)`.
-
-See also: [`codeunit`](@ref).
-"""
-ncodeunits(s::String) = sizeof(s)
-
 write(io::IO, s::String) =
     @gc_preserve s unsafe_write(io, pointer(s), reinterpret(UInt, sizeof(s)))
 
@@ -118,81 +85,45 @@ function ==(a::String, b::String)
     al == sizeof(b) && 0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, al)
 end
 
-## thisind, prevind and nextind ##
+## thisind, nextind, prevind ##
 
-function thisind(s::String, i::Integer)
-    j = Int(i)
-    j < 1 && return 0
-    n = ncodeunits(s)
-    j > n && return n + 1
-    @inbounds while j > 0 && is_valid_continuation(codeunit(s,j))
-        j -= 1
-    end
-    j
-end
+thisind(s::String, i::Integer) = oftype(i, thisind(s, Int(i)))
+nextind(s::String, i::Integer) = oftype(i, nextind(s, Int(i)))
 
-function prevind(s::String, i::Integer)
-    j = Int(i)
-    e = sizeof(s)
-    if j > e
-        return endof(s)
-    end
-    j -= 1
-    @inbounds while j > 0 && is_valid_continuation(codeunit(s,j))
-        j -= 1
-    end
-    j
-end
-
-function prevind(s::String, i::Integer, nchar::Integer)
-    nchar > 0 || throw(ArgumentError("nchar must be greater than 0"))
-    j = Int(i)
-    e = sizeof(s)
-    while nchar > 0
-        if j > e
-            j = endof(s)
-        else
-            j -= 1
-            @inbounds while j > 0 && is_valid_continuation(codeunit(s,j))
-                j -= 1
-            end
-        end
-        nchar -= 1
-        j <= 0 && return j - nchar
-    end
-    j
-end
-
-function nextind(s::String, i::Integer)
-    j = Int(i)
-    if j < 1
-        return 1
-    end
-    e = sizeof(s)
-    j += 1
-    @inbounds while j <= e && is_valid_continuation(codeunit(s,j))
-        j += 1
-    end
-    j
+function thisind(s::String, i::Int)
+    n = ncodeunits(s)
+    between(i, 2, n) || return i
+    @inbounds b = codeunit(s, i)
+    b & 0xc0 == 0x80 || return i
+    @inbounds b = codeunit(s, i-1)
+    between(b, 0b11000000, 0b11110111) && return i-1
+    (b & 0xc0 == 0x80) & (i-2 > 0) || return i
+    @inbounds b = codeunit(s, i-2)
+    between(b, 0b11100000, 0b11110111) && return i-2
+    (b & 0xc0 == 0x80) & (i-3 > 0) || return i
+    @inbounds b = codeunit(s, i-3)
+    between(b, 0b11110000, 0b11110111) && return i-3
+    return i
 end
 
-function nextind(s::String, i::Integer, nchar::Integer)
-    nchar > 0 || throw(ArgumentError("nchar must be greater than 0"))
-    j = Int(i)
-    e = sizeof(s)
-    while nchar > 0
-        if j < 1
-            j = 1
-        else
-            j += 1
-            @inbounds while j <= e && is_valid_continuation(codeunit(s,j))
-                j += 1
-            end
-        end
-        nchar -= 1
-        j > e && return j + nchar
-    end
-    j
+function nextind(s::String, i::Int)
+    n = ncodeunits(s)
+    between(i, 1, n-1) || return i+1
+    @inbounds l = codeunit(s, i)
+    (l < 0x80) | (0xf8 ≤ l) && return i+1
+    if l < 0xc0
+        i′ = thisind(s, i)
+        return i′ < i ? nextind(s, i′) : i+1
+    end
+    # first continuation byte
+    @inbounds b = codeunit(s, i += 1)
+    (b & 0xc0 != 0x80) | ((i += 1) > n) | (l < 0xe0) && return i
+    # second continuation byte
+    @inbounds b = codeunit(s, i)
+    (b & 0xc0 != 0x80) | ((i += 1) > n) | (l < 0xf0) && return i
+    # third continuation byte
+    @inbounds b = codeunit(s, i)
+    ifelse(b & 0xc0 != 0x80, i, i+1)
 end
 
 ## checking UTF-8 & ACSII validity ##
@@ -208,121 +139,146 @@ byte_string_classify(s::String) =
 isvalid(::Type{String}, s::Union{Vector{UInt8},String}) = byte_string_classify(s) != 0
 isvalid(s::String) = isvalid(String, s)
 
-## basic UTF-8 decoding & iteration ##
-
-is_surrogate_lead(c::Unsigned) = ((c & ~0x003ff) == 0xd800)
-is_surrogate_trail(c::Unsigned) = ((c & ~0x003ff) == 0xdc00)
-is_surrogate_codeunit(c::Unsigned) = ((c & ~0x007ff) == 0xd800)
-is_valid_continuation(c) = ((c & 0xc0) == 0x80)
-
-const utf8_offset = [
-    0x00000000, 0x00003080,
-    0x000e2080, 0x03c82080,
-    0xfa082080, 0x82082080,
-]
-
-const utf8_trailing = [
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
-]
+is_valid_continuation(c) = c & 0xc0 == 0x80
 
 ## required core functionality ##
 
-function endof(s::String)
-    i = sizeof(s)
-    @inbounds while i > 0 && is_valid_continuation(codeunit(s, i))
-        i -= 1
-    end
-    i
+function next(s::String, i::Int)
+    @boundscheck 1 ≤ i ≤ sizeof(s) || throw(BoundsError(s, i))
+    @inbounds b = codeunit(s, i)
+    # TODO: check index validity
+    u = UInt32(b) << 24
+    (b < 0x80) | (0xf8 ≤ b) && return reinterpret(Char, u), i+1
+    return next_continued(s, i, u)
 end
 
-function length(s::String)
-    cnum = 0
-    @inbounds for i = 1:sizeof(s)
-        cnum += !is_valid_continuation(codeunit(s, i))
+@noinline function next_continued(s::String, i::Int, u::UInt32)
+    if u < 0xc0000000
+        isvalid(s, i) && (i += 1; @goto ret)
+        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, (u >> 24) % UInt8))
     end
-    cnum
+    n = ncodeunits(s)
+    # first continuation byte
+    (i += 1) > n && @goto ret
+    @inbounds b = codeunit(s, i)
+    b & 0xc0 == 0x80 || @goto ret
+    u |= UInt32(b) << 16
+    # second continuation byte
+    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
+    @inbounds b = codeunit(s, i)
+    b & 0xc0 == 0x80 || @goto ret
+    u |= UInt32(b) << 8
+    # third continuation byte
+    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
+    @inbounds b = codeunit(s, i)
+    b & 0xc0 == 0x80 || @goto ret
+    u |= UInt32(b); i += 1
+@label ret
+    return reinterpret(Char, u), i
 end
 
-@noinline function slow_utf8_next(s::String, b::UInt8, i::Int, l::Int)
-    @inbounds if is_valid_continuation(b)
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s, i)))
-    end
-    trailing = utf8_trailing[b + 1]
-    if l < i + trailing
-        return '\ufffd', i+1
-    end
-    c::UInt32 = 0
-    @inbounds for j = 1:(trailing + 1)
-        c <<= 6
-        c += codeunit(s, i)
-        i += 1
-    end
-    c -= utf8_offset[trailing + 1]
-    return Char(c), i
+function getindex(s::String, i::Int)
+    @boundscheck 1 ≤ i ≤ ncodeunits(s) || throw(BoundsError(s, i))
+    @inbounds b = codeunit(s, i)
+    # TODO: check index validity
+    u = UInt32(b) << 24
+    (b < 0x80) | (0xf8 ≤ b) && return reinterpret(Char, u)
+    return getindex_continued(s, i, u)
 end
 
-# This implementation relies on `next` returning a value past the end of the
-# String's underlying data, which is true for valid Strings
-done(s::String, state) = state > sizeof(s)
-
-@inline function next(s::String, i::Int)
-    # function is split into this critical fast-path
-    # for pure ascii data, such as parsing numbers,
-    # and a longer function that can handle any utf8 data
-    @boundscheck if (i < 1) | (i > sizeof(s))
-        throw(BoundsError(s,i))
+@noinline function getindex_continued(s::String, i::Int, u::UInt32)
+    if u < 0xc0000000
+        isvalid(s, i) && @goto ret
+        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, (u >> 24) % UInt8))
     end
+    n = ncodeunits(s)
+    # first continuation byte
+    (i += 1) > n && @goto ret
     @inbounds b = codeunit(s, i)
-    if b < 0x80
-        return Char(b), i + 1
-    end
-    return slow_utf8_next(s, b, i, sizeof(s))
+    b & 0xc0 == 0x80 || @goto ret
+    u |= UInt32(b) << 16
+    # second continuation byte
+    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
+    @inbounds b = codeunit(s, i)
+    b & 0xc0 == 0x80 || @goto ret
+    u |= UInt32(b) << 8
+    # third continuation byte
+    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
+    @inbounds b = codeunit(s, i)
+    b & 0xc0 == 0x80 || @goto ret
+    u |= UInt32(b)
+@label ret
+    return reinterpret(Char, u)
 end
 
-function first_utf8_byte(ch::Char)
-    c = UInt32(ch)
-    b = c < 0x80    ? c%UInt8 :
-        c < 0x800   ? ((c>>6)  | 0xc0)%UInt8 :
-        c < 0x10000 ? ((c>>12) | 0xe0)%UInt8 :
-                      ((c>>18) | 0xf0)%UInt8
-    return b
-end
-
-## overload methods for efficiency ##
-
-isvalid(s::String, i::Integer) =
-    (1 <= i <= sizeof(s)) && ((@inbounds b = codeunit(s, i)); !is_valid_continuation(b))
+getindex(s::String, r::UnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]
 
 function getindex(s::String, r::UnitRange{Int})
     isempty(r) && return ""
-    l = sizeof(s)
-    i = first(r)
-    if i < 1 || i > l
-        throw(BoundsError(s, i))
-    end
-    @inbounds si = codeunit(s, i)
-    if is_valid_continuation(si)
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, si))
-    end
-    j = last(r)
-    if j > l
-        throw(BoundsError(s, j))
-    end
-    @inbounds sj = codeunit(s, j)
-    if is_valid_continuation(sj)
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, sj))
+    i, j = first(r), last(r)
+    @boundscheck begin
+        checkbounds(s, r)
+        @inbounds isvalid(s, i) ||
+            throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s, i)))
+        @inbounds isvalid(s, j) ||
+            throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, codeunit(s, j)))
+    end
+    j = nextind(s, j) - 1
+    n = j - i + 1
+    ss = _string_n(n)
+    p = pointer(ss)
+    for k = 1:n
+        unsafe_store!(p, codeunit(s, i + k - 1), k)
+    end
+    return ss
+end
+
+function length(s::String, lo::Int, hi::Int)
+    z = ncodeunits(s)
+    i = Int(max(1, min(z, lo)))
+    n = Int(min(z, max(1, hi)))
+    c = i - n
+    if i ≤ n
+        i, j = thisind(s, i), i
+        c -= i < j
+        i -= 1
+        while true
+            (i += 1) ≤ n || break
+            @inbounds b = codeunit(s, i) # lead byte
+        @label L
+            c += 1
+            (0xc0 ≤ b) & (b < 0xf8) || continue
+            l = b
+
+            (i += 1) ≤ n || break
+            @inbounds b = codeunit(s, i) # cont byte 1
+            b & 0xc0 == 0x80 || @goto L
+            l ≥ 0xe0 || continue
+
+            (i += 1) ≤ n || break
+            @inbounds b = codeunit(s, i) # cont byte 2
+            b & 0xc0 == 0x80 || @goto L
+            l ≥ 0xf0 || continue
+
+            (i += 1) ≤ n || break
+            @inbounds b = codeunit(s, i) # cont byte 3
+            b & 0xc0 == 0x80 || @goto L
+        end
     end
-    j = nextind(s,j)
-    unsafe_string(pointer(s,i), j-i)
+    return c + hi - lo
 end
 
+# TODO: delete or move to char.jl
+first_utf8_byte(c::Char) = (reinterpret(UInt32, c) >> 24) % UInt8
+
+## overload methods for efficiency ##
+
+function isvalid(s::String, i::Int)
+    @boundscheck checkbounds(s, i)
+    return thisind(s, i) == i
+end
+isvalid(s::String, i::Integer) = isvalid(s, Int(i))
+
 function search(s::String, c::Char, i::Integer = 1)
     if i < 1 || i > sizeof(s)
         i == sizeof(s) + 1 && return 0
@@ -331,11 +287,11 @@ function search(s::String, c::Char, i::Integer = 1)
     @inbounds if is_valid_continuation(codeunit(s,i))
         throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s,i)))
     end
-    c < Char(0x80) && return search(s, c%UInt8, i)
+    c ≤ '\x7f' && return search(s, c % UInt8, i)
     while true
         i = search(s, first_utf8_byte(c), i)
-        (i==0 || s[i] == c) && return i
-        i = next(s,i)[2]
+        (i == 0 || s[i] == c) && return i
+        i = next(s, i)[2]
     end
 end
 
@@ -361,12 +317,12 @@ function search(a::ByteArray, b::Char, i::Integer = 1)
 end
 
 function rsearch(s::String, c::Char, i::Integer = sizeof(s))
-    c < Char(0x80) && return rsearch(s, c%UInt8, i)
+    c ≤ '\x7f' && return rsearch(s, c % UInt8, i)
     b = first_utf8_byte(c)
     while true
         i = rsearch(s, b, i)
-        (i==0 || s[i] == c) && return i
-        i = prevind(s,i)
+        (i == 0 || s[i] == c) && return i
+        i = prevind(s, i)
     end
 end
 
@@ -411,62 +367,15 @@ function string(a::String...)
 end
 
 # UTF-8 encoding length of a character
-function codelen(d::Char)
-    c = UInt32(d)
-    if c < 0x80
-        return 1
-    elseif c < 0x800
-        return 2
-    elseif c < 0x10000
-        return 3
-    elseif c < 0x110000
-        return 4
-    end
-    return 3  # '\ufffd'
-end
+# TODO: delete or move to char.jl
+codelen(c::Char) = 4 - (trailing_zeros(0xff000000 | reinterpret(UInt32, c)) >> 3)
 
 function string(a::Union{String,Char}...)
-    n = 0
-    for d in a
-        if isa(d,Char)
-            n += codelen(d::Char)
-        else
-            n += sizeof(d::String)
+    sprint() do io
+        for x in a
+            write(io, x)
         end
     end
-    out = _string_n(n)
-    offs = 1
-    p = pointer(out)
-    for d in a
-        if isa(d,Char)
-            c = UInt32(d::Char)
-            if c < 0x80
-                unsafe_store!(p, c%UInt8, offs); offs += 1
-            elseif c < 0x800
-                unsafe_store!(p, (( c >> 6          ) | 0xC0)%UInt8, offs); offs += 1
-                unsafe_store!(p, (( c        & 0x3F ) | 0x80)%UInt8, offs); offs += 1
-            elseif c < 0x10000
-                unsafe_store!(p, (( c >> 12         ) | 0xE0)%UInt8, offs); offs += 1
-                unsafe_store!(p, (((c >> 6)  & 0x3F ) | 0x80)%UInt8, offs); offs += 1
-                unsafe_store!(p, (( c        & 0x3F ) | 0x80)%UInt8, offs); offs += 1
-            elseif c < 0x110000
-                unsafe_store!(p, (( c >> 18         ) | 0xF0)%UInt8, offs); offs += 1
-                unsafe_store!(p, (((c >> 12) & 0x3F ) | 0x80)%UInt8, offs); offs += 1
-                unsafe_store!(p, (((c >> 6)  & 0x3F ) | 0x80)%UInt8, offs); offs += 1
-                unsafe_store!(p, (( c        & 0x3F ) | 0x80)%UInt8, offs); offs += 1
-            else
-                # '\ufffd'
-                unsafe_store!(p, 0xef, offs); offs += 1
-                unsafe_store!(p, 0xbf, offs); offs += 1
-                unsafe_store!(p, 0xbd, offs); offs += 1
-            end
-        else
-            l = sizeof(d::String)
-            unsafe_copy!(pointer(out,offs), pointer(d::String), l)
-            offs += l
-        end
-    end
-    return out
 end
 
 function repeat(s::String, r::Integer)
diff --git a/base/strings/substring.jl b/base/strings/substring.jl
index b5fabef1788dc..75dc64e8b01d7 100644
--- a/base/strings/substring.jl
+++ b/base/strings/substring.jl
@@ -22,13 +22,18 @@ julia> SubString("abc", 2)
 struct SubString{T<:AbstractString} <: AbstractString
     string::T
     offset::Int
-    endof::Int
+    ncodeunits::Int
 
     function SubString{T}(s::T, i::Int, j::Int) where T<:AbstractString
-        i > j && return new(s, i - 1, 0) # always allow i > j as it is consistent with getindex
-        isvalid(s, i) || throw(BoundsError(s, i))
-        isvalid(s, j) || throw(BoundsError(s, j))
-        new(s, i-1, j-i+1)
+        i ≤ j || return new(s, i-1, 0)
+        @boundscheck begin
+            checkbounds(s, i:j)
+            @inbounds isvalid(s, i) ||
+                throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s, i)))
+            @inbounds isvalid(s, j) ||
+                throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, codeunit(s, j)))
+        end
+        return new(s, i-1, nextind(s,j)-i)
     end
 end
 
@@ -37,11 +42,8 @@ SubString(s::AbstractString, i::Integer, j::Integer=endof(s)) = SubString(s, Int
 SubString(s::AbstractString, r::UnitRange{<:Integer}) = SubString(s, first(r), last(r))
 
 function SubString(s::SubString, i::Int, j::Int)
-    # always allow i > j as it is consistent with getindex
-    i > j && return SubString(s.string, s.offset + i, s.offset + j)
-    i >= 1 || throw(BoundsError(s, i))
-    j <= endof(s) || throw(BoundsError(s, j))
-    SubString(s.string, s.offset + i, s.offset + j)
+    @boundscheck i ≤ j && checkbounds(s, i:j)
+    SubString(s.string, s.offset+i, s.offset+j)
 end
 
 SubString(s::AbstractString) = SubString(s, 1, endof(s))
@@ -50,78 +52,56 @@ SubString{T}(s::T) where {T<:AbstractString} = SubString{T}(s, 1, endof(s))
 convert(::Type{SubString{S}}, s::AbstractString) where {S<:AbstractString} =
     SubString(convert(S, s))
 
-String(p::SubString{String}) =
-    unsafe_string(pointer(p.string, p.offset+1), nextind(p, p.endof)-1)
+String(s::SubString{String}) = unsafe_string(pointer(s.string, s.offset+1), s.ncodeunits)
 
-sizeof(s::SubString{String}) = s.endof == 0 ? 0 : nextind(s, s.endof) - 1
+ncodeunits(s::SubString) = s.ncodeunits
+codeunit(s::SubString) = codeunit(s.string)
+length(s::SubString) = length(s.string, s.offset+1, s.offset+s.ncodeunits)
 
-# TODO: length(s::SubString) = ??
-# default implementation will work but it's slow
-# can this be delegated efficiently somehow?
-# that may require additional string interfaces
-function length(s::SubString{String})
-    return s.endof==0 ? 0 : Int(ccall(:u8_charnum, Csize_t, (Ptr{UInt8}, Csize_t),
-                                      pointer(s), nextind(s, s.endof) - 1))
+function codeunit(s::SubString, i::Integer)
+    @boundscheck checkbounds(s, i)
+    @inbounds return codeunit(s.string, s.offset + i)
 end
 
-function next(s::SubString, i::Int)
-    if i < 1 || i > s.endof
-        throw(BoundsError(s, i))
-    end
-    c, i = next(s.string, i+s.offset)
-    c, i-s.offset
+function next(s::SubString, i::Integer)
+    @boundscheck checkbounds(s, i)
+    @inbounds c, i = next(s.string, s.offset + i)
+    return c, i - s.offset
 end
 
-function getindex(s::SubString, i::Int)
-    if i < 1 || i > s.endof
-        throw(BoundsError(s, i))
-    end
-    getindex(s.string, i+s.offset)
+function getindex(s::SubString, i::Integer)
+    @boundscheck checkbounds(s, i)
+    @inbounds return getindex(s.string, s.offset + i)
 end
 
-endof(s::SubString) = s.endof
-
 function isvalid(s::SubString, i::Integer)
-    return (start(s) <= i <= endof(s)) && isvalid(s.string, s.offset+i)
+    @boundscheck checkbounds(s, i)
+    @inbounds return isvalid(s.string, s.offset + i)
 end
 
-function thisind(s::SubString{String}, i::Integer)
-    j = Int(i)
-    j < start(s) && return 0
-    n = ncodeunits(s)
-    j > n && return n + 1
-    offset = s.offset
-    str = s.string
-    j += offset
-    @inbounds while j > offset && is_valid_continuation(codeunit(str, j))
-        j -= 1
-    end
-    j - offset
-end
-
-nextind(s::SubString, i::Integer) = nextind(s.string, i+s.offset)-s.offset
-prevind(s::SubString, i::Integer) = prevind(s.string, i+s.offset)-s.offset
-
-function getindex(s::AbstractString, r::UnitRange{Int})
-    checkbounds(s, r) || throw(BoundsError(s, r))
-    SubString(s, first(r), last(r))
-end
+thisind(s::SubString, i::Integer) = 1 ≤ i ≤ s.ncodeunits ? thisind(s.string, s.offset+i) - s.offset : Int(i) # out of range: i
+nextind(s::SubString, i::Integer) = 0 ≤ i < s.ncodeunits ? nextind(s.string, s.offset+i) - s.offset : Int(i) + 1 # out of range: i+1
+prevind(s::SubString, i::Integer) = 1 < i ≤ s.ncodeunits + 1 ? prevind(s.string, s.offset+i) - s.offset : Int(i) - 1 # out of range: i-1
 
 function cmp(a::SubString{String}, b::SubString{String})
     na = sizeof(a)
     nb = sizeof(b)
     c = ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt),
-              pointer(a), pointer(b), min(na,nb))
-    c < 0 ? -1 : c > 0 ? +1 : cmp(na,nb)
+              pointer(a), pointer(b), min(na, nb))
+    return c < 0 ? -1 : c > 0 ? +1 : cmp(na, nb)
 end
 
 # don't make unnecessary copies when passing substrings to C functions
 cconvert(::Type{Ptr{UInt8}}, s::SubString{String}) = s
 cconvert(::Type{Ptr{Int8}}, s::SubString{String}) = s
+
 function unsafe_convert(::Type{Ptr{R}}, s::SubString{String}) where R<:Union{Int8, UInt8}
     convert(Ptr{R}, pointer(s.string)) + s.offset
 end
 
+pointer(x::SubString{String}) = pointer(x.string) + x.offset
+pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1)
+
 """
     reverse(s::AbstractString) -> AbstractString
 
@@ -159,53 +139,3 @@ function reverse(s::Union{String,SubString{String}})::String
         end
     end
 end
-
-"""
-    reverseind(v, i)
-
-Given an index `i` in [`reverse(v)`](@ref), return the corresponding index in `v` so that
-`v[reverseind(v,i)] == reverse(v)[i]`. (This can be nontrivial in cases where `v` contains
-non-ASCII characters.)
-
-# Examples
-```jldoctest
-julia> r = reverse("Julia")
-"ailuJ"
-
-julia> for i in 1:length(r)
-           print(r[reverseind("Julia", i)])
-       end
-Julia
-```
-"""
-reverseind(s::AbstractString, i::Integer) = thisind(s, ncodeunits(s)-i+1)
-
-"""
-    repeat(s::AbstractString, r::Integer)
-
-Repeat a string `r` times. This can equivalently be accomplished by calling [`s^r`](@ref ^).
-
-# Examples
-```jldoctest
-julia> repeat("ha", 3)
-"hahaha"
-```
-"""
-repeat(s::AbstractString, r::Integer) = repeat(convert(String, s), r)
-
-"""
-    ^(s::Union{AbstractString,Char}, n::Integer)
-
-Repeat a string or character `n` times.
-The [`repeat`](@ref) function is an alias to this operator.
-
-# Examples
-```jldoctest
-julia> "Test "^3
-"Test Test Test "
-```
-"""
-(^)(s::Union{AbstractString,Char}, r::Integer) = repeat(s,r)
-
-pointer(x::SubString{String}) = pointer(x.string) + x.offset
-pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1)
diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl
index 67859b41b54c9..202b481896a7e 100644
--- a/base/strings/unicode.jl
+++ b/base/strings/unicode.jl
@@ -3,7 +3,8 @@
 # Various Unicode functionality from the utf8proc library
 module Unicode
 
-import Base: show, ==, hash, string, Symbol, isless, length, eltype, start, next, done, convert, isvalid
+import Base: show, ==, hash, string, Symbol, isless, length, eltype, start,
+             next, done, convert, isvalid, MalformedCharError, ismalformed
 
 # whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff
 
@@ -111,7 +112,9 @@ const category_strings = [
     "Other, control",
     "Other, format",
     "Other, surrogate",
-    "Other, private use"
+    "Other, private use",
+    "Invalid, too high",
+    "Malformed, bad data",
 ]
 
 const UTF8PROC_STABLE    = (1<<1)
@@ -148,10 +151,26 @@ end
 
 utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(String(s), flags)
 
-function normalize(s::AbstractString; stable::Bool=false, compat::Bool=false, compose::Bool=true, decompose::Bool=false, stripignore::Bool=false, rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, lump::Bool=false, stripmark::Bool=false)
+function normalize(
+    s::AbstractString;
+    stable::Bool=false,
+    compat::Bool=false,
+    compose::Bool=true,
+    decompose::Bool=false,
+    stripignore::Bool=false,
+    rejectna::Bool=false,
+    newline2ls::Bool=false,
+    newline2ps::Bool=false,
+    newline2lf::Bool=false,
+    stripcc::Bool=false,
+    casefold::Bool=false,
+    lump::Bool=false,
+    stripmark::Bool=false,
+)
     flags = 0
     stable && (flags = flags | UTF8PROC_STABLE)
     compat && (flags = flags | UTF8PROC_COMPAT)
+    # TODO: error if compose & decompose?
     if decompose
         flags = flags | UTF8PROC_DECOMPOSE
     elseif compose
@@ -250,7 +269,10 @@ julia> textwidth('❤')
 2
 ```
 """
-textwidth(c::Char) = Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))
+function textwidth(c::Char)
+    ismalformed(c) && (c = '\ufffd')
+    Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))
+end
 
 """
     textwidth(s::AbstractString)
@@ -267,17 +289,29 @@ julia> textwidth("March")
 """
 textwidth(s::AbstractString) = mapreduce(textwidth, +, 0, s)
 
-lowercase(c::Char) = isascii(c) ? ('A' <= c <= 'Z' ? c + 0x20 : c) : Char(ccall(:utf8proc_tolower, UInt32, (UInt32,), c))
-uppercase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) : Char(ccall(:utf8proc_toupper, UInt32, (UInt32,), c))
-titlecase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) : Char(ccall(:utf8proc_totitle, UInt32, (UInt32,), c))
+lowercase(c::Char) = isascii(c) ? ('A' <= c <= 'Z' ? c + 0x20 : c) :
+    Char(ccall(:utf8proc_tolower, UInt32, (UInt32,), c))
+uppercase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) :
+    Char(ccall(:utf8proc_toupper, UInt32, (UInt32,), c))
+titlecase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) :
+    Char(ccall(:utf8proc_totitle, UInt32, (UInt32,), c))
 
 ############################################################################
 
 # returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
-category_code(c) = ccall(:utf8proc_category, Cint, (UInt32,), c)
+function category_code(c::Char)
+    ismalformed(c) && return Cint(31)  # malformed data gets its own code and never reaches utf8proc
+    (u = UInt32(c)) ≤ 0x10ffff || return Cint(30)  # code point above 0x10ffff, the top of the Unicode range
+    ccall(:utf8proc_category, Cint, (UInt32,), u)  # otherwise ask utf8proc for the category of code point u
+end
 
 # more human-readable representations of the category code
-category_abbrev(c) = unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), c))
+function category_abbrev(c)
+    ismalformed(c) && return "Ma"
+    (u = UInt32(c)) ≤ 0x10ffff || return "In"
+    unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), u))
+end
+
 category_string(c) = category_strings[category_code(c)+1]
 
 """
@@ -321,7 +355,7 @@ julia> islower('❤')
 false
 ```
 """
-islower(c::Char) = (category_code(c) == UTF8PROC_CATEGORY_LL)
+islower(c::Char) = category_code(c) == UTF8PROC_CATEGORY_LL
 
 # true for Unicode upper and mixed case
 
@@ -347,8 +381,8 @@ false
 ```
 """
 function isupper(c::Char)
-    ccode = category_code(c)
-    return ccode == UTF8PROC_CATEGORY_LU || ccode == UTF8PROC_CATEGORY_LT
+    cat = category_code(c)
+    cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LT
 end
 
 """
@@ -370,7 +404,7 @@ julia> isdigit('α')
 false
 ```
 """
-isdigit(c::Char)  = ('0' <= c <= '9')
+isdigit(c::Char) = '0' <= c <= '9'
 
 """
     isalpha(c::Char) -> Bool
@@ -393,7 +427,7 @@ julia> isalpha('9')
 false
 ```
 """
-isalpha(c::Char)  = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_LO)
+isalpha(c::Char) = UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_LO
 
 """
     isnumeric(c::Char) -> Bool
@@ -422,7 +456,7 @@ julia> isnumeric('❤')
 false
 ```
 """
-isnumeric(c::Char) = (UTF8PROC_CATEGORY_ND <= category_code(c) <= UTF8PROC_CATEGORY_NO)
+isnumeric(c::Char) = UTF8PROC_CATEGORY_ND <= category_code(c) <= UTF8PROC_CATEGORY_NO
 
 """
     isalnum(c::Char) -> Bool
@@ -446,9 +480,9 @@ true
 ```
 """
 function isalnum(c::Char)
-    ccode = category_code(c)
-    return (UTF8PROC_CATEGORY_LU <= ccode <= UTF8PROC_CATEGORY_LO) ||
-           (UTF8PROC_CATEGORY_ND <= ccode <= UTF8PROC_CATEGORY_NO)
+    cat = category_code(c)
+    UTF8PROC_CATEGORY_LU <= cat <= UTF8PROC_CATEGORY_LO ||
+    UTF8PROC_CATEGORY_ND <= cat <= UTF8PROC_CATEGORY_NO
 end
 
 # following C++ only control characters from the Latin-1 subset return true
@@ -470,7 +504,7 @@ julia> iscntrl('a')
 false
 ```
 """
-iscntrl(c::Char) = (c <= Char(0x1f) || Char(0x7f) <= c <= Char(0x9f))
+iscntrl(c::Char) = c <= '\x1f' || '\x7f' <= c <= '\u9f'
 
 """
     ispunct(c::Char) -> Bool
@@ -492,7 +526,7 @@ julia> ispunct(';')
 true
 ```
 """
-ispunct(c::Char) = (UTF8PROC_CATEGORY_PC <= category_code(c) <= UTF8PROC_CATEGORY_PO)
+ispunct(c::Char) = UTF8PROC_CATEGORY_PC <= category_code(c) <= UTF8PROC_CATEGORY_PO
 
 # \u85 is the Unicode Next Line (NEL) character
 
@@ -520,7 +554,9 @@ julia> isspace('\\x20')
 true
 ```
 """
-@inline isspace(c::Char) = c == ' ' || '\t' <= c <='\r' || c == '\u85' || '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS
+@inline isspace(c::Char) =
+    c == ' ' || '\t' <= c <= '\r' || c == '\u85' ||
+    '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS
 
 """
     isprint(c::Char) -> Bool
@@ -538,7 +574,7 @@ julia> isprint('A')
 true
 ```
 """
-isprint(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_ZS)
+isprint(c::Char) = UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_ZS
 
 # true in principal if a printer would use ink
 
@@ -560,7 +596,7 @@ julia> isgraph('A')
 true
 ```
 """
-isgraph(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_SO)
+isgraph(c::Char) = UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_SO
 
 """
     isascii(c::Union{Char,AbstractString}) -> Bool
@@ -585,7 +621,7 @@ julia> isascii("αβγ")
 false
 ```
 """
-isascii(c::Char) = c < Char(0x80)
+isascii(c::Char) = bswap(reinterpret(UInt32, c)) < 0x80
 isascii(s::AbstractString) = all(isascii, s)
 
 """
@@ -640,7 +676,7 @@ julia> lowercase("STRINGS AND THINGS")
 lowercase(s::AbstractString) = map(lowercase, s)
 
 """
-    titlecase(s::AbstractString)
+    titlecase(s::AbstractString) -> String
 
 Capitalize the first character of each word in `s`.
 See also [`ucfirst`](@ref) to capitalize only the first
@@ -648,9 +684,7 @@ character in `s`.
 
 # Examples
 ```jldoctest
-julia> using Unicode
-
-julia> titlecase("the julia programming language")
+julia> titlecase("the Julia programming language")
 "The Julia Programming Language"
 ```
 """
@@ -670,56 +704,67 @@ function titlecase(s::AbstractString)
 end
 
 """
-    ucfirst(s::AbstractString)
+    ucfirst(s::AbstractString) -> String
 
-Return `string` with the first character converted to uppercase
-(technically "title case" for Unicode).
-See also [`titlecase`](@ref) to capitalize the first character of
-every word in `s`.
+Return `s` with the first character converted to uppercase (technically "title
+case" for Unicode). See also [`titlecase`](@ref) to capitalize the first
+character of every word in `s`.
+
+See also: `lcfirst`, `uppercase`, `lowercase`, `titlecase`
 
 # Examples
 ```jldoctest
-julia> using Unicode
-
 julia> ucfirst("python")
 "Python"
 ```
 """
 function ucfirst(s::AbstractString)
-    isempty(s) && return s
+    isempty(s) && return ""
     c = s[1]
-    tc = titlecase(c)
-    return c==tc ? s : string(tc,s[nextind(s,1):end])
+    c′ = titlecase(c)
+    c == c′ ? convert(String, s) :
+    string(c′, SubString(s, nextind(s, 1)))
 end
 
 """
     lcfirst(s::AbstractString)
 
-Return `string` with the first character converted to lowercase.
+Return `s` with the first character converted to lowercase.
+
+See also: `ucfirst`, `uppercase`, `lowercase`, `titlecase`
 
 # Examples
 ```jldoctest
-julia> using Unicode
-
 julia> lcfirst("Julia")
 "julia"
 ```
 """
 function lcfirst(s::AbstractString)
-    isempty(s) || islower(s[1]) ? s : string(lowercase(s[1]),s[nextind(s,1):end])
+    isempty(s) && return ""
+    c = s[1]
+    c′ = lowercase(c)
+    c == c′ ? convert(String, s) :
+    string(c′, SubString(s, nextind(s, 1)))
 end
 
 ############################################################################
 # iterators for grapheme segmentation
 
 isgraphemebreak(c1::Char, c2::Char) =
+    ismalformed(c1) || ismalformed(c2) ||
     ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2)
 
 # Stateful grapheme break required by Unicode-9 rules: the string
 # must be processed in sequence, with state initialized to Ref{Int32}(0).
 # Requires utf8proc v2.0 or later.
-isgraphemebreak!(state::Ref{Int32}, c1::Char, c2::Char) =
-    ccall(:utf8proc_grapheme_break_stateful, Bool, (UInt32, UInt32, Ref{Int32}), c1, c2, state)
+function isgraphemebreak!(state::Ref{Int32}, c1::Char, c2::Char)
+    if ismalformed(c1) || ismalformed(c2)  # malformed data is never handed to utf8proc
+        state[] = 0                         # reset to the value the state is initialized with
+        return true                         # and always report a break around malformed data
+    end
+    ccall(:utf8proc_grapheme_break_stateful, Bool,
+          (UInt32, UInt32, Ref{Int32}), c1, c2, state)
+end
 
 struct GraphemeIterator{S<:AbstractString}
     s::S # original string (for generation of SubStrings)
@@ -739,7 +784,7 @@ eltype(::Type{GraphemeIterator{S}}) where {S} = SubString{S}
 eltype(::Type{GraphemeIterator{SubString{S}}}) where {S} = SubString{S}
 
 function length(g::GraphemeIterator)
-    c0 = Char(0x00ad) # soft hyphen (grapheme break always allowed after this)
+    c0 = typemax(Char)
     n = 0
     state = Ref{Int32}(0)
     for c in g.s
diff --git a/base/strings/util.jl b/base/strings/util.jl
index 1f6777e7c6c0f..da299d538a55f 100644
--- a/base/strings/util.jl
+++ b/base/strings/util.jl
@@ -58,10 +58,12 @@ function endswith(a::AbstractString, b::AbstractString)
 end
 endswith(str::AbstractString, chars::Chars) = !isempty(str) && last(str) in chars
 
-startswith(a::String, b::String) =
-    (sizeof(a) >= sizeof(b) && ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, sizeof(b)) == 0)
-startswith(a::Vector{UInt8}, b::Vector{UInt8}) =
-    (length(a) >= length(b) && ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, length(b)) == 0)
+# FIXME: check that end of `b` doesn't match a partial character in `a`
+startswith(a::String, b::String) = sizeof(a) ≥ sizeof(b) &&
+    ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, sizeof(b)) == 0
+
+startswith(a::Vector{UInt8}, b::Vector{UInt8}) = length(a) ≥ length(b) &&
+    ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, length(b)) == 0
 
 # TODO: fast endswith
 
@@ -88,15 +90,9 @@ julia> chop(a, 5, 5)
 ""
 ```
 """
-function chop(s::AbstractString, head::Integer, tail::Integer)
-    # negative values of head/tail will throw error in nextind/prevind
-    headidx = head == 0 ? start(s) : nextind(s, start(s), head)
-    tailidx = tail == 0 ? endof(s) : prevind(s, endof(s), tail)
-    SubString(s, headidx, tailidx)
-end
-
-# no head/tail version left for performance reasons
 chop(s::AbstractString) = SubString(s, start(s), prevind(s, endof(s)))
+chop(s::AbstractString, head::Integer, tail::Integer) =
+    SubString(s, nextind(s, start(s), head), prevind(s, endof(s), tail))
 
 """
     chomp(s::AbstractString)
@@ -127,17 +123,6 @@ function chomp(s::String)
     end
 end
 
-# NOTE: use with caution -- breaks the immutable string convention!
-# TODO: this is hard to provide with the new representation
-#function chomp!(s::String)
-#    if !isempty(s) && codeunit(s,sizeof(s)) == 0x0a
-#        n = (endof(s) < 2 || s.data[end-1] != 0x0d) ? 1 : 2
-#        ccall(:jl_array_del_end, Void, (Any, UInt), s.data, n)
-#    end
-#    return s
-#end
-chomp!(s::AbstractString) = chomp(s) # copying fallback for other string types
-
 const _default_delims = [' ','\t','\n','\v','\f','\r']
 
 """
@@ -449,6 +434,7 @@ replace(s::AbstractString, pat, f) = replace_new(String(s), pat, f, typemax(Int)
 # replace(s::AbstractString, pat, f, count::Integer=typemax(Int)) =
 #     replace(String(s), pat, f, count)
 
+# TODO: allow transform as the first argument to replace?
 
 # hex <-> bytes conversion
 
@@ -550,7 +536,8 @@ end
 # check for pure ASCII-ness
 
 function ascii(s::String)
-    for (i, b) in enumerate(Vector{UInt8}(s))
+    for i = 1:sizeof(s)
+        b = codeunit(s,i)
         b < 0x80 || throw(ArgumentError("invalid ASCII at index $i in $(repr(s))"))
     end
     return s
diff --git a/src/ast.c b/src/ast.c
index d54e5581fab89..fba225b231feb 100644
--- a/src/ast.c
+++ b/src/ast.c
@@ -557,7 +557,17 @@ static jl_value_t *scm_to_julia_(fl_context_t *fl_ctx, value_t e, jl_module_t *m
         return (jl_value_t*)ex;
     }
     if (iscprim(e) && cp_class((cprim_t*)ptr(e)) == fl_ctx->wchartype) {
-        return jl_box32(jl_char_type, *(int32_t*)cp_data((cprim_t*)ptr(e)));
+        uint32_t c, u = *(uint32_t*)cp_data((cprim_t*)ptr(e)); // u: value stored in the wchar cprim (presumably a code point)
+        if (u < 0x80) {
+            c = u << 24; // single byte: goes in the most significant byte of c
+        } else {
+            c = ((u << 0) & 0x0000003f) | ((u << 2) & 0x00003f00) | // spread u into 6-bit groups, one group per byte
+                ((u << 4) & 0x003f0000) | ((u << 6) & 0x3f000000);
+            c = u < 0x00000800 ? (c << 16) | 0xc0800000 : // 2 bytes: 110xxxxx 10xxxxxx, shifted to the top of the word
+                u < 0x00010000 ? (c <<  8) | 0xe0808000 : // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+                                 (c <<  0) | 0xf0808080 ; // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        }
+        return jl_box_char(c); // c now holds the UTF-8 bytes of u, most significant byte first
     }
     if (iscvalue(e) && cv_class((cvalue_t*)ptr(e)) == jl_ast_ctx(fl_ctx)->jvtype) {
         return *(jl_value_t**)cv_data((cvalue_t*)ptr(e));
diff --git a/src/datatype.c b/src/datatype.c
index 41f5cdb62ac70..edf94df39591c 100644
--- a/src/datatype.c
+++ b/src/datatype.c
@@ -640,7 +640,6 @@ SIBOX_FUNC(int16,  int16_t, 1)
 SIBOX_FUNC(int32,  int32_t, 1)
 UIBOX_FUNC(uint16, uint16_t, 1)
 UIBOX_FUNC(uint32, uint32_t, 1)
-UIBOX_FUNC(char,   uint32_t, 1)
 UIBOX_FUNC(ssavalue, size_t, 1)
 UIBOX_FUNC(slotnumber, size_t, 1)
 #ifdef _P64
@@ -651,6 +650,17 @@ SIBOX_FUNC(int64,  int64_t, 2)
 UIBOX_FUNC(uint64, uint64_t, 2)
 #endif
 
+static jl_value_t *boxed_char_cache[128]; // pre-boxed chars, indexed by the char's leading byte
+JL_DLLEXPORT jl_value_t *jl_box_char(uint32_t x)
+{
+    jl_ptls_t ptls = jl_get_ptls_states();
+    if ((x & 0x80ffffff) == 0) // cache hit only for a lone byte < 0x80: high bit clear AND no trailing bytes
+        return boxed_char_cache[x >> 24];
+    jl_value_t *v = jl_gc_alloc(ptls, sizeof(void*), jl_char_type); // everything else gets a fresh box
+    *(uint32_t*)jl_data_ptr(v) = x; // store the raw 32-bit char representation unchanged
+    return v;
+}
+
 static jl_value_t *boxed_int8_cache[256];
 JL_DLLEXPORT jl_value_t *jl_box_int8(int8_t x)
 {
@@ -684,14 +694,16 @@ void jl_init_int32_int64_cache(void)
 void jl_init_box_caches(void)
 {
     int64_t i;
+    for(i=0; i < 128; i++) {
+        boxed_char_cache[i] = jl_permbox32(jl_char_type, i << 24);
+    }
     for(i=0; i < 256; i++) {
-        boxed_int8_cache[i]  = jl_permbox8(jl_int8_type, i);
+        boxed_int8_cache[i] = jl_permbox8(jl_int8_type, i);
     }
     for(i=0; i < NBOX_C; i++) {
         boxed_int16_cache[i]  = jl_permbox16(jl_int16_type, i-NBOX_C/2);
         boxed_uint16_cache[i] = jl_permbox16(jl_uint16_type, i);
         boxed_uint32_cache[i] = jl_permbox32(jl_uint32_type, i);
-        boxed_char_cache[i]   = jl_permbox32(jl_char_type, i);
         boxed_uint64_cache[i] = jl_permbox64(jl_uint64_type, i);
     }
 }
diff --git a/src/jl_uv.c b/src/jl_uv.c
index 77719693eb943..4753655bbdd9d 100644
--- a/src/jl_uv.c
+++ b/src/jl_uv.c
@@ -490,10 +490,21 @@ JL_DLLEXPORT void jl_uv_putb(uv_stream_t *stream, uint8_t b)
     jl_uv_puts(stream, (char*)&b, 1);
 }
 
-JL_DLLEXPORT void jl_uv_putc(uv_stream_t *stream, uint32_t wchar)
+JL_DLLEXPORT void jl_uv_putc(uv_stream_t *stream, uint32_t c)
 {
     char s[4];
-    jl_uv_puts(stream, s, u8_wc_toutf8(s, wchar));
+    int n = 1; // the leading byte is always written, even when it is zero
+    s[0] = c >> 24; // bytes are taken from c most significant first
+    if ((s[1] = c >> 16)) { // each further byte is stored, and counted only if it is nonzero
+        n++;
+        if ((s[2] = c >> 8)) {
+            n++;
+            if ((s[3] = c)) {
+                n++;
+            }
+        }
+    }
+    jl_uv_puts(stream, s, n); // pass the first n bytes of s on to jl_uv_puts
 }
 
 extern int vasprintf(char **str, const char *fmt, va_list ap);
diff --git a/stdlib/Test/src/Test.jl b/stdlib/Test/src/Test.jl
index 916834c42bf3b..97d46e237343e 100644
--- a/stdlib/Test/src/Test.jl
+++ b/stdlib/Test/src/Test.jl
@@ -1396,8 +1396,11 @@ with string types besides the standard `String` type.
 struct GenericString <: AbstractString
     string::AbstractString
 end
-Base.endof(s::GenericString) = endof(s.string)
-Base.next(s::GenericString, i::Int) = next(s.string, i)
+Base.ncodeunits(s::GenericString) = ncodeunits(s.string)
+Base.codeunit(s::GenericString) = codeunit(s.string)
+Base.codeunit(s::GenericString, i::Integer) = codeunit(s.string, i)
+Base.isvalid(s::GenericString, i::Integer) = isvalid(s.string, i)
+Base.next(s::GenericString, i::Integer) = next(s.string, i)
 Base.reverse(s::GenericString) = GenericString(reverse(s.string))
 Base.reverse(s::SubString{GenericString}) =
     GenericString(typeof(s.string)(reverse(String(s))))
diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl
index 1a332e638a971..bcf3943c8b423 100644
--- a/stdlib/Unicode/test/runtests.jl
+++ b/stdlib/Unicode/test/runtests.jl
@@ -382,8 +382,8 @@ end
     foobar(ch) = Char(0xd800)
     foobaz(ch) = reinterpret(Char, typemax(UInt32))
     @test_throws ArgumentError map(foomap, GenericString(str))
-    @test map(foobar, GenericString(str)) == String(repeat(b"\ud800", outer=[17]))
-    @test map(foobaz, GenericString(str)) == String(repeat(b"\ufffd", outer=[17]))
+    @test map(foobar, GenericString(str)) == String(repeat(b"\ud800", outer=[length(str)]))
+    @test map(foobaz, GenericString(str)) == String(repeat([0xff], outer=[4*length(str)]))
 
     @test "a".*["b","c"] == ["ab","ac"]
     @test ["b","c"].*"a" == ["ba","ca"]
diff --git a/test/char.jl b/test/char.jl
index c40f60de3be23..85b2acf5385ef 100644
--- a/test/char.jl
+++ b/test/char.jl
@@ -198,3 +198,25 @@ end
 
 @test sprint(show, "text/plain", '$') == "'\$': ASCII/Unicode U+0024 (category Sc: Symbol, currency)"
 @test repr('$') == "'\$'"
+
+@testset "read incomplete character at end of stream or file" begin
+    local file = tempname()
+    local iob = IOBuffer([0xf0])  # a single 0xf0 byte: a UTF-8 lead byte with nothing after it
+    local bytes(c::Char) = Vector{UInt8}(string(c))
+    @test bytes(read(iob, Char)) == [0xf0]
+    @test eof(iob)
+    try
+        write(file, 0xf0)  # same truncated input, this time read back from a file on disk
+        open(file) do io
+            @test bytes(read(io, Char)) == [0xf0]
+            @test eof(io)
+        end
+        let io = Base.Filesystem.open(file, Base.Filesystem.JL_O_RDONLY)
+            @test bytes(read(io, Char)) == [0xf0]
+            @test eof(io)
+            close(io)
+        end
+    finally
+        rm(file, force=true)  # always remove the temporary file
+    end
+end
diff --git a/test/intfuncs.jl b/test/intfuncs.jl
index 779ce240add9a..062d1103c530f 100644
--- a/test/intfuncs.jl
+++ b/test/intfuncs.jl
@@ -134,7 +134,7 @@ end
     @test base(2, 5, 7) == "0000101"
 
     @test bitstring(Int16(3)) == "0000000000000011"
-    @test bitstring('3') == "00000000000000000000000000110011"
+    @test bitstring('3') == "00110011000000000000000000000000"
     @test bitstring(1035) == (Int == Int32 ? "00000000000000000000010000001011" :
         "0000000000000000000000000000000000000000000000000000010000001011")
     @test bitstring(Int128(3)) == "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000011"
diff --git a/test/lineedit.jl b/test/lineedit.jl
index de997ccfbcb98..571ed8a75d36b 100644
--- a/test/lineedit.jl
+++ b/test/lineedit.jl
@@ -16,8 +16,8 @@ function new_state()
     LineEdit.init_state(term, ModalInterface([Prompt("test> ")]))
 end
 
-charseek(buf, i) = seek(buf, Base.unsafe_chr2ind(content(buf), i+1)-1)
-charpos(buf, pos=position(buf)) = Base.unsafe_ind2chr(content(buf), pos+1)-1
+charseek(buf, i) = seek(buf, chr2ind(content(buf), i+1)-1)
+charpos(buf, pos=position(buf)) = ind2chr(content(buf), pos+1)-1
 
 function transform!(f, s, i = -1) # i is char-based (not bytes) buffer position
     buf = buffer(s)
diff --git a/test/strings/basic.jl b/test/strings/basic.jl
index 1ad8ca9184e74..de46027782d8c 100644
--- a/test/strings/basic.jl
+++ b/test/strings/basic.jl
@@ -99,14 +99,14 @@ end
 end
 
 @testset "issue #7248" begin
-    @test_throws BoundsError ind2chr("hello", -1)
-    @test_throws BoundsError chr2ind("hello", -1)
-    @test_throws BoundsError ind2chr("hellø", -1)
-    @test_throws BoundsError chr2ind("hellø", -1)
-    @test_throws BoundsError ind2chr("hello", 10)
-    @test_throws BoundsError chr2ind("hello", 10)
-    @test_throws BoundsError ind2chr("hellø", 10)
-    @test_throws BoundsError chr2ind("hellø", 10)
+    @test ind2chr("hello", -1) == -1
+    @test chr2ind("hello", -1) == -1
+    @test ind2chr("hellø", -1) == -1
+    @test chr2ind("hellø", -1) == -1
+    @test ind2chr("hello", 10) == 10
+    @test chr2ind("hello", 10) == 10
+    @test ind2chr("hellø", 10) == 9
+    @test chr2ind("hellø", 10) == 11
     @test_throws BoundsError checkbounds("hello", 0)
     @test_throws BoundsError checkbounds("hello", 6)
     @test_throws BoundsError checkbounds("hello", 0:3)
@@ -127,7 +127,6 @@ end
     @test SubString("hellø", 1, 5)[10:9] == ""
     @test SubString("hellø", 1, 0)[10:9] == ""
     @test SubString("", 1, 0)[10:9] == ""
-
     @test_throws BoundsError SubString("", 1, 6)
     @test_throws BoundsError SubString("", 1, 1)
 end
@@ -143,8 +142,8 @@ end
     @test get(utf8_str, -1, 'X') == 'X'
     @test get(utf8_str, 1000, 'X') == 'X'
 
-    # Test that indexing into the middle of a character returns the default
-    @test get(utf8_str, 2, 'X') == 'X'
+    # Test that indexing into the middle of a character throws
+    @test_throws UnicodeError get(utf8_str, 2, 'X')
 end
 
 #=
@@ -172,8 +171,10 @@ end
 
 # make sure substrings do not accept code unit if it is not start of codepoint
 let s = "x\u0302"
+    @test s[1:2] == s
+    @test_throws BoundsError s[0:3]
+    @test_throws BoundsError s[1:4]
     @test_throws UnicodeError s[1:3]
-    @test s[1:2]==s
 end
 
 @testset "issue #9781" begin
@@ -193,8 +194,15 @@ struct tstStringType <: AbstractString
 end
 @testset "AbstractString functions" begin
     tstr = tstStringType(Vector{UInt8}("12"))
-    @test_throws ErrorException endof(tstr)
-    @test_throws ErrorException next(tstr, Bool(1))
+    @test_throws MethodError ncodeunits(tstr)
+    @test_throws MethodError codeunit(tstr)
+    @test_throws MethodError codeunit(tstr, 1)
+    @test_throws MethodError codeunit(tstr, true)
+    @test_throws MethodError isvalid(tstr, 1)
+    @test_throws MethodError isvalid(tstr, true)
+    @test_throws MethodError next(tstr, 1)
+    @test_throws MethodError next(tstr, true)
+    @test_throws MethodError endof(tstr)
 
     gstr = GenericString("12")
     @test string(gstr) isa GenericString
@@ -213,18 +221,19 @@ end
     @test done(eachindex("foobar"),7)
     @test eltype(Base.EachStringIndex) == Int
     @test map(Base.Unicode.uppercase, "foó") == "FOÓ"
-    @test chr2ind("fóobar",3) == 4
-
-    @test Symbol(gstr)==Symbol("12")
+    @test chr2ind("fóobar", 3) == 4
 
-    @test_throws ErrorException sizeof(gstr)
+    @test Symbol(gstr) == Symbol("12")
 
-    @test length(GenericString(""))==0
+    @test sizeof(gstr) == 2
+    @test ncodeunits(gstr) == 2
+    @test length(gstr) == 2
+    @test length(GenericString("")) == 0
 
     @test nextind(1:1, 1) == 2
     @test nextind([1], 1) == 2
 
-    @test ind2chr(gstr,2)==2
+    @test ind2chr(gstr, 2) == 2
 
     # tests promote_rule
     let svec = [s"12", GenericString("12"), SubString("123", 1, 2)]
@@ -421,7 +430,7 @@ end
     @test_throws ArgumentError ascii(GenericString("Hello, ∀"))
 end
 @testset "issue #17271: endof() doesn't throw an error even with invalid strings" begin
-    @test endof(String(b"\x90")) == 0
+    @test endof(String(b"\x90")) == 1
     @test endof(String(b"\xce")) == 1
 end
 # issue #17624, missing getindex method for String
@@ -503,7 +512,7 @@ end
                    SubString("123∀α>β:α+1>β123", 4, 18),
                    SubString(s"123∀α>β:α+1>β123", 4, 18)]
         for s in strs
-            @test thisind(s, -2) == 0
+            @test thisind(s, -2) == -2
             @test thisind(s, 0) == 0
             @test thisind(s, 1) == 1
             @test thisind(s, 2) == 1
@@ -514,13 +523,13 @@ end
             @test thisind(s, 15) == 15
             @test thisind(s, 16) == 15
             @test thisind(s, 17) == 17
-            @test thisind(s, 30) == 17
+            @test thisind(s, 30) == 30
         end
     end
 
     let strs = Any["", s"", SubString("123", 2, 1), SubString(s"123", 2, 1)]
         for s in strs, i in -2:2
-            @test thisind(s, i) == (i > 0)
+            @test thisind(s, i) == i
         end
     end
 end
@@ -545,17 +554,18 @@ end
             @test prevind(strs[i], 15, 4) == 10
             @test prevind(strs[i], 15, 10) == 0
             @test prevind(strs[i], 15, 9) == 1
-            @test prevind(strs[i], 15, 10) == 0
             @test prevind(strs[i], 16) == 15
             @test prevind(strs[i], 16, 1) == 15
             @test prevind(strs[i], 16, 2) == 14
-            @test prevind(strs[i], 20) == 15
-            @test prevind(strs[i], 20, 1) == 15
-            @test prevind(strs[i], 20, 10) == 1
-            @test_throws ArgumentError prevind(strs[i], 20, 0)
-
-            @test nextind(strs[i], -1) == 1
-            @test nextind(strs[i], -1, 1) == 1
+            @test prevind(strs[i], 20) == 19
+            @test prevind(strs[i], 20, 1) == 19
+            @test prevind(strs[i], 20, 10) == 7
+            @test prevind(strs[i], 20, 0) == 20
+
+            @test nextind(strs[i], -1) == 0
+            @test nextind(strs[i], -1, 1) == 0
+            @test nextind(strs[i], -1, 2) == 1
+            @test nextind(strs[i], -1, 3) == 4
             @test nextind(strs[i], 0, 2) == 4
             @test nextind(strs[i], 0, 20) == 26
             @test nextind(strs[i], 0, 10) == 15
@@ -576,7 +586,7 @@ end
             @test nextind(strs[i], 15, 1) == 17
             @test nextind(strs[i], 20) == 21
             @test nextind(strs[i], 20, 1) == 21
-            @test_throws ArgumentError nextind(strs[i], 20, 0)
+            @test nextind(strs[i], 20, 0) == 20
 
             for x in -10:20
                 n = p = x
@@ -591,8 +601,8 @@ end
         @test prevind(strs[1], -1) == -2
         @test prevind(strs[1], -1, 1) == -2
 
-        @test prevind(strs[2], -1) == 0
-        @test prevind(strs[2], -1, 1) == 0
+        @test prevind(strs[2], -1) == -2
+        @test prevind(strs[2], -1, 1) == -2
     end
 end
 
@@ -605,7 +615,7 @@ end
     @test first(s, 3) == "∀ϵ≠"
     @test first(s, 4) == "∀ϵ≠0"
     @test first(s, length(s)) == s
-    @test_throws BoundsError first(s, length(s)+1)
+    @test first(s, length(s)+1) == s
     @test_throws ArgumentError last(s, -1)
     @test last(s, 0) == ""
     @test last(s, 1) == "0"
@@ -613,21 +623,13 @@ end
     @test last(s, 3) == "²>0"
     @test last(s, 4) == "ϵ²>0"
     @test last(s, length(s)) == s
-    @test_throws BoundsError last(s, length(s)+1)
+    @test last(s, length(s)+1) == s
 end
 
 @testset "invalid code point" begin
     s = String([0x61, 0xba, 0x41])
     @test !isvalid(s)
-    @test_throws UnicodeError s[2]
-    e = try
-        s[2]
-    catch e
-        e
-    end
-    b = IOBuffer()
-    show(b, e)
-    @test String(take!(b)) == "UnicodeError: invalid character index 2 (0xba is a continuation byte)"
+    @test s[2] == reinterpret(Char, UInt32(0xba) << 24)
 end
 
 @testset "ncodeunits" begin
diff --git a/test/strings/io.jl b/test/strings/io.jl
index ad770804d8e21..e320e97712242 100644
--- a/test/strings/io.jl
+++ b/test/strings/io.jl
@@ -172,8 +172,7 @@ myio = IOBuffer()
 join(myio, "", "", 1)
 @test isempty(take!(myio))
 
-@testset "unescape_chars" begin
-    @test Base.unescape_chars("\\t","t") == "t"
+@testset "unescape_string ArgumentErrors" begin
     @test_throws ArgumentError unescape_string(IOBuffer(), string('\\',"xZ"))
     @test_throws ArgumentError unescape_string(IOBuffer(), string('\\',"777"))
 end
diff --git a/test/strings/types.jl b/test/strings/types.jl
index 12dd75a1bd421..00bac71f826b8 100644
--- a/test/strings/types.jl
+++ b/test/strings/types.jl
@@ -32,12 +32,21 @@ for idx in 0:1
 end
 
 # Substring provided with invalid end index throws BoundsError
-@test_throws BoundsError SubString("∀", 1, 2)
-@test_throws BoundsError SubString("∀", 1, 3)
+@test_throws UnicodeError SubString("∀", 1, 2)
+@test_throws UnicodeError SubString("∀", 1, 3)
 @test_throws BoundsError SubString("∀", 1, 4)
 
 # Substring provided with invalid start index throws BoundsError
-@test_throws BoundsError SubString("∀∀", 2:4)
+@test SubString("∀∀", 1:1) == "∀"
+@test SubString("∀∀", 1:4) == "∀∀"
+@test SubString("∀∀", 4:4) == "∀"
+@test_throws UnicodeError SubString("∀∀", 1:2)
+@test_throws UnicodeError SubString("∀∀", 1:5)
+@test_throws UnicodeError SubString("∀∀", 2:4)
+@test_throws BoundsError SubString("∀∀", 0:1)
+@test_throws BoundsError SubString("∀∀", 0:4)
+@test_throws BoundsError SubString("∀∀", 1:7)
+@test_throws BoundsError SubString("∀∀", 4:7)
 
 # tests for SubString of more than one multibyte `Char` string
 # we are consistent with `getindex` for `String`
@@ -46,10 +55,12 @@ for idx in [0, 1, 4]
     @test SubString("∀∀", 4, idx) == "∀∀"[4:idx]
 end
 
-# second index beyond endof("∀∀")
-for idx in 5:8
+# index beyond endof("∀∀")
+for idx in [2:3; 5:6]
+    @test_throws UnicodeError SubString("∀∀", 1, idx)
+end
+for idx in 7:8
     @test_throws BoundsError SubString("∀∀", 1, idx)
-    @test_throws BoundsError SubString("∀∀", 4, idx)
 end
 
 let str="tempus fugit"              #length(str)==12
@@ -65,13 +76,13 @@ let str="tempus fugit"              #length(str)==12
     ss=SubString(str,1:0)
     @test length(ss)==0
 
-    @test_throws BoundsError SubString(str,14,20)  #start indexing beyond source string length
-    @test_throws BoundsError SubString(str,10,16)  #end indexing beyond source string length
+    @test_throws BoundsError SubString(str, 14, 20)  #start indexing beyond source string length
+    @test_throws BoundsError SubString(str, 10, 16)  #end indexing beyond source string length
 
     @test_throws BoundsError SubString("", 1, 4)  #empty source string
     @test_throws BoundsError SubString("", 1, 1)  #empty source string, identical start and end index
     @test_throws BoundsError SubString("", 10, 12)
-    @test SubString("",12,10) == ""
+    @test SubString("", 12, 10) == ""
 end
 
 @test SubString("foobar", big(1), big(3)) == "foo"
@@ -83,7 +94,7 @@ let str = "aa\u2200\u2222bb"
     write(b, u)
     @test String(take!(b)) == "\u2200\u2222"
 
-    @test_throws BoundsError SubString(str, 4, 5)
+    @test_throws UnicodeError SubString(str, 4, 5)
     @test_throws BoundsError next(u, 0)
     @test_throws BoundsError next(u, 7)
     @test_throws BoundsError getindex(u, 0)
@@ -147,64 +158,69 @@ end
 @test ismatch(Regex(""), SubString("",1,0))
 
 # isvalid(), chr2ind() and ind2chr() for SubString{String}
-let ss, s="lorem ipsum",
-    sdict=Dict(SubString(s,1,11)=>s,
-               SubString(s,1,6)=>"lorem ",
-               SubString(s,1,0)=>"",
-               SubString(s,2,4)=>"ore",
-               SubString(s,2,11)=>"orem ipsum",
-               SubString(s,15,14)=>""
-               )
-    for (ss,s) in sdict
-        local ss
-        for i in -1:12
-            @test isvalid(ss,i)==isvalid(s,i)
+let s = "lorem ipsum", sdict = Dict(
+    SubString(s, 1, 11)  => "lorem ipsum",
+    SubString(s, 1, 6)   => "lorem ",
+    SubString(s, 1, 0)   => "",
+    SubString(s, 2, 4)   => "ore",
+    SubString(s, 2, 11)  => "orem ipsum",
+    SubString(s, 15, 14) => "",
+)
+    for (ss, s) in sdict
+        @test ncodeunits(ss) == ncodeunits(s)
+        for i in -2:13
+            if 1 ≤ i ≤ ncodeunits(ss)
+                @test isvalid(ss, i) == isvalid(s, i)
+            else
+                @test_throws BoundsError isvalid(ss, i)
+                @test_throws BoundsError isvalid(s, i)
+            end
         end
-    end
-    for (ss,s) in sdict
-        local ss
-        for i in 1:length(ss)
-            @test ind2chr(ss,i)==ind2chr(s,i)
+        for i in 1:ncodeunits(ss)
+            @test ind2chr(ss, i) == ind2chr(s, i)
         end
     end
-    for (ss,s) in sdict
-        local ss
+    for (ss, s) in sdict
+        @test length(ss) == length(s)
         for i in 1:length(ss)
-            @test chr2ind(ss,i)==chr2ind(s,i)
+            @test chr2ind(ss, i) == chr2ind(s, i)
         end
     end
-end #let
+end
 
-#for isvalid(SubString{String})
+# for isvalid(SubString{String})
 let s = "Σx + βz - 2"
-    for i in -1:(length(s)+2)
-        if isvalid(s, i)
-            ss=SubString(s,1,i)
-            # make sure isvalid gives equivalent results for SubString and String
-            @test isvalid(ss,i)==isvalid(s,i)
-        else
-            if i > 0
-                @test_throws BoundsError SubString(s,1,i)
+    for i in -1:ncodeunits(s)+2
+        if checkbounds(Bool, s, i)
+            if isvalid(s, i)
+                ss = SubString(s, 1, i)
+                for j = 1:ncodeunits(ss)
+                    @test isvalid(ss, j) == isvalid(s, j)
+                end
             else
-                @test SubString(s,1,i) == ""
+                @test_throws UnicodeError SubString(s, 1, i)
             end
+        elseif i > 0
+            @test_throws BoundsError SubString(s, 1, i)
+        else
+            @test SubString(s, 1, i) == ""
         end
     end
 end
 
-let ss=SubString("hello",1,5)
-    @test_throws BoundsError ind2chr(ss, -1)
-    @test_throws BoundsError chr2ind(ss, -1)
-    @test_throws BoundsError chr2ind(ss, 10)
-    @test_throws BoundsError ind2chr(ss, 10)
+let ss = SubString("hello", 1, 5)
+    @test ind2chr(ss, -1) == -1
+    @test chr2ind(ss, -1) == -1
+    @test chr2ind(ss, 10) == 10
+    @test ind2chr(ss, 10) == 10
 end
 
 # length(SubString{String}) performance specialization
 let s = "|η(α)-ϕ(κ)| < ε"
-    @test length(SubString(s,1,0))==length(s[1:0])
-    @test length(SubString(s,4,4))==length(s[4:4])
-    @test length(SubString(s,1,7))==length(s[1:7])
-    @test length(SubString(s,4,11))==length(s[4:11])
+    @test length(SubString(s, 1, 0)) == length(s[1:0])
+    @test length(SubString(s, 4, 4)) == length(s[4:4])
+    @test length(SubString(s, 1, 7)) == length(s[1:7])
+    @test length(SubString(s, 4, 11)) == length(s[4:11])
 end
 
 @testset "reverseind" for T in (String, SubString, GenericString)
@@ -217,7 +233,8 @@ end
                 @test c == s[reverseind(s, ri)] == r[ri]
                 s = convert(T, string(prefix, prefix, c, suffix, suffix))
                 pre = convert(T, prefix)
-                sb = SubString(s, nextind(pre, endof(pre)), endof(convert(T, string(prefix, prefix, c, suffix))))
+                sb = SubString(s, nextind(pre, endof(pre)),
+                               endof(convert(T, string(prefix, prefix, c, suffix))))
                 r = reverse(sb)
                 ri = search(r, c)
                 @test c == sb[reverseind(sb, ri)] == r[ri]
diff --git a/test/unicode/utf8.jl b/test/unicode/utf8.jl
index a9db6316d2fa9..c65934217dfb9 100644
--- a/test/unicode/utf8.jl
+++ b/test/unicode/utf8.jl
@@ -1,24 +1,13 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
-@testset "cesu8 input" begin
-    let ch = 0x10000
-        for hi = 0xd800:0xdbff
-            for lo = 0xdc00:0xdfff
-                @test String(Vector{UInt8}(String(Char[hi, lo]))) == string(Char(ch))
-                ch += 1
-            end
-        end
-    end
-end
-
 @testset "string indexing" begin
     let str = String(b"this is a test\xed\x80")
-        @test next(str, 15) == ('\ufffd', 16)
+        @test next(str, 15) == (reinterpret(Char, 0xed800000), 17)
         @test_throws BoundsError getindex(str, 0:3)
         @test_throws BoundsError getindex(str, 17:18)
         @test_throws BoundsError getindex(str, 2:17)
-        @test_throws UnicodeError getindex(str, 16:17)
-        @test string(Char(0x110000)) == "\ufffd"
+        @test_throws BoundsError getindex(str, 16:17)
+        @test string(Char(0x110000)) == String(b"\xf4\x90\x80\x80")
     end
 end
 
@@ -36,12 +25,12 @@ end
         b"xyz\xf0\x80"      => b"\xf0\x80zyx",
         b"xyz\xf0\x80\x80"  => b"\xf0\x80\x80zyx",
     ]
-        @test_broken reverse(String(s)) == String(r)
+        @test reverse(String(s)) == String(r)
     end
 end
 
 @testset "string convert" begin
     @test String(b"this is a test\xed\x80\x80") == "this is a test\ud000"
-    ## Specifically check UTF-8 string whose lead byte is same as a surrogate
+    # Specifically check UTF-8 string whose lead byte is same as a surrogate
     @test String(b"\xed\x9f\xbf") == "\ud7ff"
 end

From 274cf84e51873063be282d9297eab4d55edd8eac Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Fri, 8 Dec 2017 18:44:23 -0500
Subject: [PATCH 02/22] LineEdit: use character syntax for wildcard character

---
 base/repl/LineEdit.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/base/repl/LineEdit.jl b/base/repl/LineEdit.jl
index 607740573b0e9..deda700e72e80 100644
--- a/base/repl/LineEdit.jl
+++ b/base/repl/LineEdit.jl
@@ -1197,12 +1197,12 @@ end
 
 ### Keymap Support
 
-const wildcard = Char(0x0010f7ff) # "Private Use" Char
+const wildcard = '\U10f7ff' # "Private Use" Char
 
 normalize_key(key::Char) = string(key)
 normalize_key(key::Integer) = normalize_key(Char(key))
 function normalize_key(key::AbstractString)
-    wildcard in key && error("Matching Char(0x0010f7ff) not supported.")
+    wildcard in key && error("Matching '\\U10f7ff' not supported.") # escaped: show the syntax, not the raw char
     buf = IOBuffer()
     i = start(key)
     while !done(key, i)

From 4d8b90f73cd4117a4f302ee928296378d97e1f72 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Fri, 8 Dec 2017 18:45:08 -0500
Subject: [PATCH 03/22] strings/string: use checkbounds helper in a few places

---
 base/strings/string.jl | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 0e1ef86e6a759..f48fd5ebee764 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -64,7 +64,7 @@ ncodeunits(s::String) = Core.sizeof(s)
 codeunit(s::String) = UInt8
 
 @inline function codeunit(s::String, i::Integer)
-    @boundscheck between(i, 1, ncodeunits(s)) || throw(BoundsError(s, i))
+    @boundscheck checkbounds(s, i)
     @gc_preserve s unsafe_load(pointer(s, i))
 end
 
@@ -144,7 +144,7 @@ is_valid_continuation(c) = c & 0xc0 == 0x80
 ## required core functionality ##
 
 function next(s::String, i::Int)
-    @boundscheck 1 ≤ i ≤ sizeof(s) || throw(BoundsError(s, i))
+    @boundscheck checkbounds(s, i)
     @inbounds b = codeunit(s, i)
     # TODO: check index validity
     u = UInt32(b) << 24
@@ -178,9 +178,8 @@ end
 end
 
 function getindex(s::String, i::Int)
-    @boundscheck 1 ≤ i ≤ ncodeunits(s) || throw(BoundsError(s, i))
+    @boundscheck checkbounds(s, i)
     @inbounds b = codeunit(s, i)
-    # TODO: check index validity
     u = UInt32(b) << 24
     (b < 0x80) | (0xf8 ≤ b) && return reinterpret(Char, u)
     return getindex_continued(s, i, u)

From 6e15be871bc7284d26181f8995c6b1faae18c4f9 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Fri, 8 Dec 2017 18:45:34 -0500
Subject: [PATCH 04/22] repeat(Char, Integer): update & simplify for new Char
 rep

---
 base/strings/string.jl | 67 ++++++++++++++++++------------------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index f48fd5ebee764..3bf718c79865e 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -381,11 +381,12 @@ function repeat(s::String, r::Integer)
     r < 0 && throw(ArgumentError("can't repeat a string $r times"))
     n = sizeof(s)
     out = _string_n(n*r)
-    if n == 1 # common case: repeating a single ASCII char
-        @inbounds ccall(:memset, Ptr{Void}, (Ptr{UInt8}, Cint, Csize_t), out, codeunit(s, 1), r)
+    if n == 1 # common case: repeating a single-byte string
+        @inbounds b = codeunit(s, 1)
+        ccall(:memset, Ptr{Void}, (Ptr{UInt8}, Cint, Csize_t), out, b, r)
     else
-        for i=1:r
-            unsafe_copy!(pointer(out, 1+(i-1)*n), pointer(s), n)
+        for i = 0:r-1
+            unsafe_copy!(pointer(out, i*n+1), pointer(s), n)
         end
     end
     return out
@@ -403,43 +404,33 @@ julia> repeat('A', 3)
 ```
 """
 function repeat(c::Char, r::Integer)
-    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
     r == 0 && return ""
-    ch = UInt(c)
-    if ch < 0x80
-        out = _string_n(r)
-        ccall(:memset, Ptr{Void}, (Ptr{UInt8}, Cint, Csize_t), out, c, r)
-    elseif ch < 0x800
-        out = _string_n(2r)
-        p16 = reinterpret(Ptr{UInt16}, pointer(out))
-        u16 = ((ch >> 0x6) | (ch & 0x3f) << 0x8) % UInt16 | 0x80c0
-        @inbounds for i = 1:r
-            unsafe_store!(p16, u16, i)
+    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
+    u = bswap(reinterpret(UInt32, c)) # byte-swapped so the first byte to store is the low-order byte
+    n = 4 - (leading_zeros(u | 0xff) >> 3) # bytes written per repetition (1 to 4)
+    s = _string_n(n*r)
+    p = pointer(s)
+    @gc_preserve s if n == 1 # keep `s` rooted while writing through the raw pointer `p`
+        ccall(:memset, Ptr{Void}, (Ptr{UInt8}, Cint, Csize_t), p, u % UInt8, r)
+    elseif n == 2
+        p16 = reinterpret(Ptr{UInt16}, p)
+        for i = 1:r
+            unsafe_store!(p16, u % UInt16, i)
         end
-    elseif ch < 0x10000
-        (0xd800 ≥ ch ≤ 0xdfff) || throw(ArgumentError("invalid character 0x$(hex(ch))"))
-        out = _string_n(3r)
-        p = pointer(out)
-        b1 = (ch >> 0xc) % UInt8 | 0xe0
-        b2 = ((ch >> 0x6) & 0x3f) % UInt8 | 0x80
-        b3 = (ch & 0x3f) % UInt8 | 0x80
-        @inbounds for i = 1:r
-            unsafe_store!(p, b1)
-            unsafe_store!(p, b2, 2)
-            unsafe_store!(p, b3, 3)
-            p += 3
+    elseif n == 3
+        b1 = (u >> 0) % UInt8
+        b2 = (u >> 8) % UInt8
+        b3 = (u >> 16) % UInt8
+        for i = 0:r-1
+            unsafe_store!(p, b1, 3i + 1)
+            unsafe_store!(p, b2, 3i + 2)
+            unsafe_store!(p, b3, 3i + 3)
         end
-    elseif ch < 0x110000
-        out = _string_n(4r)
-        p32 = reinterpret(Ptr{UInt32}, pointer(out))
-        u32 = ((ch >> 0x12) | ((ch >> 0x4) & 0x03f00) |
-            ((ch << 0xa) & 0x3f0000) | ((ch & 0x3f) << 0x18)) % UInt32 | 0x808080f0
-        @inbounds for i = 1:r
-            unsafe_store!(p32, u32)
-            p32 += 4
+    elseif n == 4
+        p32 = reinterpret(Ptr{UInt32}, p) # reuse `p`, matching the 2-byte branch
+        for i = 1:r
+            unsafe_store!(p32, u, i)
         end
-    else
-        throw(ArgumentError("invalid character 0x$(hex(ch))"))
     end
-    return out
+    return s
 end

From 5cd9263b686848f88d5ebb1c44f6b3d39ec37f8f Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Fri, 8 Dec 2017 19:54:34 -0500
Subject: [PATCH 05/22] replace UnicodeError with StringIndexError

StringIndexError indicates that someone has indexed into a non-leading
code unit of a character in a string, regardless of the string's encoding.
The error is no longer used to signal any other Unicode-related problem,
so the name UnicodeError was no longer fitting.
---
 base/exports.jl              |  2 +-
 base/strings/basic.jl        |  2 +-
 base/strings/errors.jl       | 15 ---------------
 base/strings/string.jl       | 19 ++++++++++++-------
 base/strings/strings.jl      |  1 -
 base/strings/substring.jl    |  6 ++----
 test/choosetests.jl          |  2 +-
 test/strings/basic.jl        |  6 +++---
 test/strings/search.jl       |  6 +++---
 test/strings/types.jl        | 16 ++++++++--------
 test/unicode/UnicodeError.jl |  9 ---------
 11 files changed, 31 insertions(+), 53 deletions(-)
 delete mode 100644 base/strings/errors.jl
 delete mode 100644 test/unicode/UnicodeError.jl

diff --git a/base/exports.jl b/base/exports.jl
index e5193eff91570..b884a3641f7f3 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -154,7 +154,7 @@ export
     NullException,
     ParseError,
     SystemError,
-    UnicodeError,
+    StringIndexError,
 
 # Global constants and variables
     ARGS,
diff --git a/base/strings/basic.jl b/base/strings/basic.jl
index 7ff30d09bc027..1e1b05fba1e71 100644
--- a/base/strings/basic.jl
+++ b/base/strings/basic.jl
@@ -107,7 +107,7 @@ julia> isvalid(str, 2)
 false
 
 julia> str[2]
-ERROR: UnicodeError: invalid character index
+ERROR: StringIndexError: invalid character index
 Stacktrace:
 [...]
 ```
diff --git a/base/strings/errors.jl b/base/strings/errors.jl
deleted file mode 100644
index f97d21dc19321..0000000000000
--- a/base/strings/errors.jl
+++ /dev/null
@@ -1,15 +0,0 @@
-# This file is a part of Julia. License is MIT: https://julialang.org/license
-
-##    Error messages for Unicode / UTF support
-
-const UTF_ERR_SHORT             = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)"
-const UTF_ERR_INVALID_INDEX     = "invalid character index <<1>> (0x<<2>> is a continuation byte)"
-
-struct UnicodeError <: Exception
-    errmsg::AbstractString   ##< A UTF_ERR_ message
-    errpos::Int32            ##< Position of invalid character
-    errchr::UInt32           ##< Invalid character
-end
-
-show(io::IO, exc::UnicodeError) = print(io, replace(replace(string("UnicodeError: ",exc.errmsg),
-    "<<1>>",string(exc.errpos)),"<<2>>",hex(exc.errchr)))
diff --git a/base/strings/string.jl b/base/strings/string.jl
index 3bf718c79865e..47dc0326914cb 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -1,5 +1,12 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
+struct StringIndexError <: Exception
+    string::AbstractString
+    index::Integer
+end
+@noinline string_index_err(s::AbstractString, i::Integer) =
+    throw(StringIndexError(s, Int(i)))
+
 const ByteArray = Union{Vector{UInt8},Vector{Int8}}
 
 @inline between(b::T, lo::T, hi::T) where {T<:Integer} = (lo ≤ b) & (b ≤ hi)
@@ -155,7 +162,7 @@ end
 @noinline function next_continued(s::String, i::Int, u::UInt32)
     if u < 0xc0000000
         isvalid(s, i) && (i += 1; @goto ret)
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, (u >> 24) % UInt8))
+        string_index_err(s, i)
     end
     n = ncodeunits(s)
     # first continuation byte
@@ -188,7 +195,7 @@ end
 @noinline function getindex_continued(s::String, i::Int, u::UInt32)
     if u < 0xc0000000
         isvalid(s, i) && @goto ret
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, (u >> 24) % UInt8))
+        string_index_err(s, i)
     end
     n = ncodeunits(s)
     # first continuation byte
@@ -217,10 +224,8 @@ function getindex(s::String, r::UnitRange{Int})
     i, j = first(r), last(r)
     @boundscheck begin
         checkbounds(s, r)
-        @inbounds isvalid(s, i) ||
-            throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s, i)))
-        @inbounds isvalid(s, j) ||
-            throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, codeunit(s, j)))
+        @inbounds isvalid(s, i) || string_index_err(s, i)
+        @inbounds isvalid(s, j) || string_index_err(s, j)
     end
     j = nextind(s, j) - 1
     n = j - i + 1
@@ -284,7 +289,7 @@ function search(s::String, c::Char, i::Integer = 1)
         throw(BoundsError(s, i))
     end
     @inbounds if is_valid_continuation(codeunit(s,i))
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s,i)))
+        string_index_err(s, i)
     end
     c ≤ '\x7f' && return search(s, c % UInt8, i)
     while true
diff --git a/base/strings/strings.jl b/base/strings/strings.jl
index 961f05cdc675a..91f10436a4e73 100644
--- a/base/strings/strings.jl
+++ b/base/strings/strings.jl
@@ -1,6 +1,5 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
-include("strings/errors.jl")
 include("strings/substring.jl")
 include("strings/basic.jl")
 include("strings/search.jl")
diff --git a/base/strings/substring.jl b/base/strings/substring.jl
index 75dc64e8b01d7..4d33f89754d1c 100644
--- a/base/strings/substring.jl
+++ b/base/strings/substring.jl
@@ -28,10 +28,8 @@ struct SubString{T<:AbstractString} <: AbstractString
         i ≤ j || return new(s, i-1, 0)
         @boundscheck begin
             checkbounds(s, i:j)
-            @inbounds isvalid(s, i) ||
-                throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s, i)))
-            @inbounds isvalid(s, j) ||
-                throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, codeunit(s, j)))
+            @inbounds isvalid(s, i) || string_index_err(s, i)
+            @inbounds isvalid(s, j) || string_index_err(s, j)
         end
         return new(s, i-1, nextind(s,j)-i)
     end
diff --git a/test/choosetests.jl b/test/choosetests.jl
index 4e324af14d7aa..b1057030069e4 100644
--- a/test/choosetests.jl
+++ b/test/choosetests.jl
@@ -82,7 +82,7 @@ function choosetests(choices = [])
     end
 
 
-    unicodetests = ["unicode/UnicodeError", "unicode/utf8"]
+    unicodetests = ["unicode/utf8"]
     if "unicode" in skip_tests
         filter!(x -> (x != "unicode" && !(x in unicodetests)), tests)
     elseif "unicode" in tests
diff --git a/test/strings/basic.jl b/test/strings/basic.jl
index de46027782d8c..f6165cacaef27 100644
--- a/test/strings/basic.jl
+++ b/test/strings/basic.jl
@@ -143,7 +143,7 @@ end
     @test get(utf8_str, 1000, 'X') == 'X'
 
     # Test that indexing into the middle of a character throws
-    @test_throws UnicodeError get(utf8_str, 2, 'X')
+    @test_throws StringIndexError get(utf8_str, 2, 'X')
 end
 
 #=
@@ -174,7 +174,7 @@ let s = "x\u0302"
     @test s[1:2] == s
     @test_throws BoundsError s[0:3]
     @test_throws BoundsError s[1:4]
-    @test_throws UnicodeError s[1:3]
+    @test_throws StringIndexError s[1:3]
 end
 
 @testset "issue #9781" begin
@@ -215,7 +215,7 @@ end
     @test gstr[[1]] == "1"
 
     @test s"∀∃"[big(1)] == '∀'
-    @test_throws UnicodeError GenericString("∀∃")[Int8(2)]
+    @test_throws StringIndexError GenericString("∀∃")[Int8(2)]
     @test_throws BoundsError GenericString("∀∃")[UInt16(10)]
 
     @test done(eachindex("foobar"),7)
diff --git a/test/strings/search.jl b/test/strings/search.jl
index 251d32ba5b256..c609066c4f05c 100644
--- a/test/strings/search.jl
+++ b/test/strings/search.jl
@@ -84,16 +84,16 @@ for str in (u8str, GenericString(u8str))
     @test search(str, '\u80') == 0
     @test search(str, '∄') == 0
     @test search(str, '∀') == 1
-    @test_throws UnicodeError search(str, '∀', 2)
+    @test_throws StringIndexError search(str, '∀', 2)
     @test search(str, '∀', 4) == 0
     @test search(str, '∃') == 13
-    @test_throws UnicodeError search(str, '∃', 15)
+    @test_throws StringIndexError search(str, '∃', 15)
     @test search(str, '∃', 16) == 0
     @test search(str, 'x') == 26
     @test search(str, 'x', 27) == 43
     @test search(str, 'x', 44) == 0
     @test search(str, 'δ') == 17
-    @test_throws UnicodeError search(str, 'δ', 18)
+    @test_throws StringIndexError search(str, 'δ', 18)
     @test search(str, 'δ', nextind(str,17)) == 33
     @test search(str, 'δ', nextind(str,33)) == 0
     @test search(str, 'ε') == 5
diff --git a/test/strings/types.jl b/test/strings/types.jl
index 00bac71f826b8..f89907c6f5b88 100644
--- a/test/strings/types.jl
+++ b/test/strings/types.jl
@@ -32,17 +32,17 @@ for idx in 0:1
 end
 
 # Substring provided with invalid end index throws BoundsError
-@test_throws UnicodeError SubString("∀", 1, 2)
-@test_throws UnicodeError SubString("∀", 1, 3)
+@test_throws StringIndexError SubString("∀", 1, 2)
+@test_throws StringIndexError SubString("∀", 1, 3)
 @test_throws BoundsError SubString("∀", 1, 4)
 
 # Substring provided with invalid start index throws BoundsError
 @test SubString("∀∀", 1:1) == "∀"
 @test SubString("∀∀", 1:4) == "∀∀"
 @test SubString("∀∀", 4:4) == "∀"
-@test_throws UnicodeError SubString("∀∀", 1:2)
-@test_throws UnicodeError SubString("∀∀", 1:5)
-@test_throws UnicodeError SubString("∀∀", 2:4)
+@test_throws StringIndexError SubString("∀∀", 1:2)
+@test_throws StringIndexError SubString("∀∀", 1:5)
+@test_throws StringIndexError SubString("∀∀", 2:4)
 @test_throws BoundsError SubString("∀∀", 0:1)
 @test_throws BoundsError SubString("∀∀", 0:4)
 @test_throws BoundsError SubString("∀∀", 1:7)
@@ -57,7 +57,7 @@ end
 
 # index beyond endof("∀∀")
 for idx in [2:3; 5:6]
-    @test_throws UnicodeError SubString("∀∀", 1, idx)
+    @test_throws StringIndexError SubString("∀∀", 1, idx)
 end
 for idx in 7:8
     @test_throws BoundsError SubString("∀∀", 1, idx)
@@ -94,7 +94,7 @@ let str = "aa\u2200\u2222bb"
     write(b, u)
     @test String(take!(b)) == "\u2200\u2222"
 
-    @test_throws UnicodeError SubString(str, 4, 5)
+    @test_throws StringIndexError SubString(str, 4, 5)
     @test_throws BoundsError next(u, 0)
     @test_throws BoundsError next(u, 7)
     @test_throws BoundsError getindex(u, 0)
@@ -198,7 +198,7 @@ let s = "Σx + βz - 2"
                     @test isvalid(ss, j) == isvalid(s, j)
                 end
             else
-                @test_throws UnicodeError SubString(s, 1, i)
+                @test_throws StringIndexError SubString(s, 1, i)
             end
         elseif i > 0
             @test_throws BoundsError SubString(s, 1, i)
diff --git a/test/unicode/UnicodeError.jl b/test/unicode/UnicodeError.jl
deleted file mode 100644
index a5665dff49bbe..0000000000000
--- a/test/unicode/UnicodeError.jl
+++ /dev/null
@@ -1,9 +0,0 @@
-# This file is a part of Julia. License is MIT: https://julialang.org/license
-
-@testset "invalid utf8" begin
-    let io = IOBuffer()
-        show(io, UnicodeError(Base.UTF_ERR_SHORT, 1, 10))
-        check = "UnicodeError: invalid UTF-8 sequence starting at index 1 (0xa missing one or more continuation bytes)"
-        @test String(take!(io)) == check
-    end
-end

From 37aff062a620594df2df55e398d7daa009369efb Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Sat, 9 Dec 2017 13:53:42 -0500
Subject: [PATCH 06/22] See also: use `[name](@ref)` cross reference links.

---
 base/strings/basic.jl   | 14 +++++++-------
 base/strings/unicode.jl |  6 ++++--
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/base/strings/basic.jl b/base/strings/basic.jl
index 1e1b05fba1e71..4ec5c62e707ed 100644
--- a/base/strings/basic.jl
+++ b/base/strings/basic.jl
@@ -30,7 +30,7 @@ indexing functions include those intended for index arithmetic: `thisind`,
 bounds indices as intermediate values so long as one never uses them to retrieve
 a character, which often helps avoid needing to code around edge cases.
 
-See also: `codeunit`, `ncodeunits`, `thisind`, `nextind`, `prevind`
+See also: [`codeunit`](@ref), [`ncodeunits`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref)
 """
 AbstractString
 
@@ -44,7 +44,7 @@ access this string must satisfy `1 ≤ i ≤ ncodeunits(s)`. Not all such indic
 are valid – they may not be the start of a character, but they will return a
 code unit value when calling `codeunit(s,i)`.
 
-See also: `codeunit`, `checkbounds`, `sizeof`, `length`, `endof`
+See also: [`codeunit`](@ref), [`checkbounds`](@ref), [`sizeof`](@ref), [`length`](@ref), [`endof`](@ref)
 """
 ncodeunits(s::AbstractString)
 
@@ -58,7 +58,7 @@ limited to these three types, but it's hard to think of widely used string
 encodings that don't use one of these units. `codeunit(s)` is the same as
 `typeof(codeunit(s,1))` when `s` is a non-empty string.
 
-See also: `ncodeunits`
+See also: [`ncodeunits`](@ref)
 """
 codeunit(s::AbstractString)
 
@@ -72,7 +72,7 @@ Return the code unit value in the string `s` at index `i`. Note that
 I.e. the value returned by `codeunit(s, i)` is of the type returned by
 `codeunit(s)`.
 
-See also: `ncodeunits`, `checkbounds`
+See also: [`ncodeunits`](@ref), [`checkbounds`](@ref)
 """
 codeunit(s::AbstractString, i::Integer) = typeof(i) === Int ?
     throw(MethodError(codeunit, Tuple{typeof(s),Int})) :
@@ -90,7 +90,7 @@ In order for `isvalid(s, i)` to be an O(1) function, the encoding of `s` must
 be [self-synchronizing](https://en.wikipedia.org/wiki/Self-synchronizing_code);
 this is a basic assumption of Julia's generic string support.
 
-See also: `getindex`, `next`, `thisind`, `nextind`, `prevind`, `length`
+See also: [`getindex`](@ref), [`next`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref), [`length`](@ref)
 
 # Examples
 
@@ -125,7 +125,7 @@ be iterated, yielding a sequences of characters. If `i` is out of bounds in `s`
 then a bounds error is raised; if `i` is not a valid character index in `s` then
 a Unicode index error is raised.
 
-See also: `getindex`, `start`, `done`, `checkbounds`
+See also: [`getindex`](@ref), [`start`](@ref), [`done`](@ref), [`checkbounds`](@ref)
 """
 next(s::AbstractString, i::Integer) = typeof(i) === Int ?
     throw(MethodError(next, Tuple{typeof(s),Int})) :
@@ -303,7 +303,7 @@ number of characters in the entire string. If `lo` or `hi` are out of ranges
 each out of range code unit is considered to be one character. This matches the
 "loose" indexing model of `thisind`, `nextind` and `prevind`.
 
-See also: `isvalid`, `ncodeunits`, `endof`, `thisind`, `nextind`, `prevind`
+See also: [`isvalid`](@ref), [`ncodeunits`](@ref), [`endof`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref)
 
 # Examples
 ```jldoctest
diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl
index 202b481896a7e..64b5f6d5611ed 100644
--- a/base/strings/unicode.jl
+++ b/base/strings/unicode.jl
@@ -710,7 +710,8 @@ Return `s` with the first character converted to uppercase (technically "title
 case" for Unicode). See also [`titlecase`](@ref) to capitalize the first
 character of every word in `s`.
 
-See also: `lcfirst`, `uppercase`, `lowercase`, `titlecase`
+See also: [`lcfirst`](@ref), [`uppercase`](@ref), [`lowercase`](@ref),
+[`titlecase`](@ref)
 
 # Examples
 ```jldoctest
@@ -731,7 +732,8 @@ end
 
 Return `s` with the first character converted to lowercase.
 
-See also: `ucfirst`, `uppercase`, `lowercase`, `titlecase`
+See also: [`ucfirst`](@ref), [`uppercase`](@ref), [`lowercase`](@ref),
+[`titlecase`](@ref)
 
 # Examples
 ```jldoctest

From 7eb0def29c64d34b67766e66bc818a51748a5505 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Sat, 9 Dec 2017 14:27:59 -0500
Subject: [PATCH 07/22] address Milan's review comments

---
 base/strings/basic.jl | 53 +++++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/base/strings/basic.jl b/base/strings/basic.jl
index 4ec5c62e707ed..e190db531970e 100644
--- a/base/strings/basic.jl
+++ b/base/strings/basic.jl
@@ -16,19 +16,24 @@ about strings:
   * Each `Char` in a string is encoded by one or more code units
   * Only the index of the first code unit of a `Char` is a valid index
   * The encoding of a `Char` is independent of what precedes or follows it
-  * String encodings are "self-synchronizing" – i.e. `isvalid(s,i)` is O(1)
-
-Some string functions error if you use an out-of-bounds or invalid string index,
-including code unit extraction `codeunit(s,i)`, string indexing `s[i]`, and
-string iteration `next(s,i)`. Other string functions take a more relaxed
-approach to indexing and give you the closest valid string index when in-bounds,
-or when out-of-bounds, behave as if there were an infinite number of characters
-padding each side of the string. Usually these imaginary padding characters have
-code unit length `1`, but string types may choose different sizes. Relaxed
-indexing functions include those intended for index arithmetic: `thisind`,
-`nextind` and `prevind`. This model allows index arithmetic to work with out-of-
-bounds indices as intermediate values so long as one never uses them to retrieve
-a character, which often helps avoid needing to code around edge cases.
+  * String encodings are [self-synchronizing] – i.e. `isvalid(s,i)` is O(1)
+
+[self-synchronizing]: https://en.wikipedia.org/wiki/Self-synchronizing_code
+
+Some string functions that extract code units, characters or substrings from
+strings error if you pass them out-of-bounds or invalid string indices. This
+includes `codeunit(s, i)`, `s[i]`, and `next(s, i)`. Functions that do string
+index arithmetic take a more relaxed approach to indexing and give you the
+closest valid string index when in-bounds, or when out-of-bounds, behave as if
+there were an infinite number of characters padding each side of the string.
+Usually these imaginary padding characters have code unit length `1` but string
+types may choose different "imaginary" character sizes as makes sense for their
+implementations (e.g. substrings may pass index arithmetic through to the
+underlying string they provide a view into). Relaxed indexing functions include
+those intended for index arithmetic: `thisind`, `nextind` and `prevind`. This
+model allows index arithmetic to work with out-of-bounds indices as
+intermediate values so long as one never uses them to retrieve a character,
+which often helps avoid needing to code around edge cases.
 
 See also: [`codeunit`](@ref), [`ncodeunits`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref)
 """
@@ -75,8 +80,7 @@ I.e. the value returned by `codeunit(s, i)` is of the type returned by
 See also: [`ncodeunits`](@ref), [`checkbounds`](@ref)
 """
 codeunit(s::AbstractString, i::Integer) = typeof(i) === Int ?
-    throw(MethodError(codeunit, Tuple{typeof(s),Int})) :
-        codeunit(s, Int(i))
+    throw(MethodError(codeunit, Tuple{typeof(s),Int})) : codeunit(s, Int(i))
 
 """
     isvalid(s::AbstractString, i::Integer) -> Bool
@@ -113,8 +117,7 @@ Stacktrace:
 ```
 """
 isvalid(s::AbstractString, i::Integer) = typeof(i) === Int ?
-    throw(MethodError(isvalid, Tuple{typeof(s),Int})) :
-        isvalid(s, Int(i))
+    throw(MethodError(isvalid, Tuple{typeof(s),Int})) : isvalid(s, Int(i))
 
 """
     next(s::AbstractString, i::Integer) -> Tuple{Char, Int}
@@ -128,8 +131,7 @@ a Unicode index error is raised.
 See also: [`getindex`](@ref), [`start`](@ref), [`done`](@ref), [`checkbounds`](@ref)
 """
 next(s::AbstractString, i::Integer) = typeof(i) === Int ?
-    throw(MethodError(next, Tuple{typeof(s),Int})) :
-        next(s, Int(i))
+    throw(MethodError(next, Tuple{typeof(s),Int})) : next(s, Int(i))
 
 ## basic generic definitions ##
 
@@ -182,10 +184,12 @@ promote_rule(::Type{<:AbstractString}, ::Type{<:AbstractString}) = String
 ## string & character concatenation ##
 
 """
-    *(s::Union{AbstractString, Char}, t::Union{AbstractString, Char}...) -> String
+    *(s::Union{AbstractString, Char}, t::Union{AbstractString, Char}...) -> AbstractString
 
 Concatenate strings and/or characters, producing a [`String`](@ref). This is equivalent
-to calling the [`string`](@ref) function on the arguments.
+to calling the [`string`](@ref) function on the arguments. Concatenation of built-in
+string types always produces a value of type `String` but other string types may choose
+to return a string of a different type as appropriate.
 
 # Examples
 ```jldoctest
@@ -299,9 +303,10 @@ isless(a::Symbol, b::Symbol) = cmp(a, b) < 0
 The number of characters in string `s` from indices `lo` through `hi`. This is
 computed as the number of code unit indices from `lo` to `hi` which are valid
 character indices. Without only a single string argument, this computes the
-number of characters in the entire string. If `lo` or `hi` are out of ranges
-each out of range code unit is considered to be one character. This matches the
-"loose" indexing model of `thisind`, `nextind` and `prevind`.
+number of characters in the entire string. With `lo` and `hi` arguments it computes
+the number of indices between `lo` and `hi` inclusive that are valid indices in
+the string `s`. Note that the trailing character may include code units past `hi`
+and still be counted.
 
 See also: [`isvalid`](@ref), [`ncodeunits`](@ref), [`endof`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref)
 

From fcae4238fce26454728943f8c8a6ffa458981954 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Sat, 9 Dec 2017 14:27:39 -0500
Subject: [PATCH 08/22] use stevengj's suggested `hash(Char)` definition

---
 base/char.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/base/char.jl b/base/char.jl
index 6d21af949ebe8..92c52efc2d66a 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -81,7 +81,8 @@ in(x::Char, y::Char) = x == y
 
 ==(x::Char, y::Char) = reinterpret(UInt32, x) == reinterpret(UInt32, y)
 isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y)
-hash(x::Char, h::UInt) = hash(reinterpret(UInt32, x), hash(Char, h))
+hash(x::Char, h::UInt) =
+    hash_uint64(((reinterpret(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))
 
 -(x::Char, y::Char) = Int(x) - Int(y)
 -(x::Char, y::Integer) = Char(Int32(x) - Int32(y))

From f65c90b3a6fd22b705cdcd917ed16b0836467d18 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Sat, 9 Dec 2017 23:17:02 -0500
Subject: [PATCH 09/22] add ncodeunits to stdlib docs

---
 doc/src/stdlib/strings.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/src/stdlib/strings.md b/doc/src/stdlib/strings.md
index 39f2f15a06b3b..98f53c3b27dad 100644
--- a/doc/src/stdlib/strings.md
+++ b/doc/src/stdlib/strings.md
@@ -13,7 +13,8 @@ Core.String(::AbstractString)
 Base.SubString
 Base.transcode
 Base.unsafe_string
-Base.codeunit(::AbstractString, ::Integer)
+Base.ncodeunits(::AbstractString)
+Base.codeunit
 Base.ascii
 Base.@r_str
 Base.@raw_str

From 100d8192b8de4aca344dd54080486ad970315ae4 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Sun, 10 Dec 2017 12:12:05 -0500
Subject: [PATCH 10/22] strings: some formatting tweaks

---
 base/strings/basic.jl | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/base/strings/basic.jl b/base/strings/basic.jl
index e190db531970e..b32b90ff6bd91 100644
--- a/base/strings/basic.jl
+++ b/base/strings/basic.jl
@@ -16,7 +16,7 @@ about strings:
   * Each `Char` in a string is encoded by one or more code units
   * Only the index of the first code unit of a `Char` is a valid index
   * The encoding of a `Char` is independent of what precedes or follows it
-  * String encodings are [self-synchronizing] – i.e. `isvalid(s,i)` is O(1)
+  * String encodings are [self-synchronizing] – i.e. `isvalid(s, i)` is O(1)
 
 [self-synchronizing]: https://en.wikipedia.org/wiki/Self-synchronizing_code
 
@@ -35,7 +35,8 @@ model allows index arithmetic to work with out-of- bounds indices as
 intermediate values so long as one never uses them to retrieve a character,
 which often helps avoid needing to code around edge cases.
 
-See also: [`codeunit`](@ref), [`ncodeunits`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref)
+See also: [`codeunit`](@ref), [`ncodeunits`](@ref), [`thisind`](@ref),
+[`nextind`](@ref), [`prevind`](@ref)
 """
 AbstractString
 
@@ -49,7 +50,8 @@ access this string must satisfy `1 ≤ i ≤ ncodeunits(s)`. Not all such indic
 are valid – they may not be the start of a character, but they will return a
 code unit value when calling `codeunit(s,i)`.
 
-See also: [`codeunit`](@ref), [`checkbounds`](@ref), [`sizeof`](@ref), [`length`](@ref), [`endof`](@ref)
+See also: [`codeunit`](@ref), [`checkbounds`](@ref), [`sizeof`](@ref),
+[`length`](@ref), [`endof`](@ref)
 """
 ncodeunits(s::AbstractString)
 
@@ -94,7 +96,8 @@ In order for `isvalid(s, i)` to be an O(1) function, the encoding of `s` must
 be [self-synchronizing](https://en.wikipedia.org/wiki/Self-synchronizing_code);
 this is a basic assumption of Julia's generic string support.
 
-See also: [`getindex`](@ref), [`next`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref), [`length`](@ref)
+See also: [`getindex`](@ref), [`next`](@ref), [`thisind`](@ref),
+[`nextind`](@ref), [`prevind`](@ref), [`length`](@ref)
 
 # Examples
 
@@ -128,7 +131,8 @@ be iterated, yielding a sequences of characters. If `i` is out of bounds in `s`
 then a bounds error is raised; if `i` is not a valid character index in `s` then
 a Unicode index error is raised.
 
-See also: [`getindex`](@ref), [`start`](@ref), [`done`](@ref), [`checkbounds`](@ref)
+See also: [`getindex`](@ref), [`start`](@ref), [`done`](@ref),
+[`checkbounds`](@ref)
 """
 next(s::AbstractString, i::Integer) = typeof(i) === Int ?
     throw(MethodError(next, Tuple{typeof(s),Int})) : next(s, Int(i))
@@ -308,7 +312,8 @@ the number of indices between `lo` and `hi` inclusive that are valid indices in
 the string `s`. Note that the trailing character may include code units past `hi`
 and still be counted.
 
-See also: [`isvalid`](@ref), [`ncodeunits`](@ref), [`endof`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref)
+See also: [`isvalid`](@ref), [`ncodeunits`](@ref), [`endof`](@ref),
+[`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref)
 
 # Examples
 ```jldoctest
@@ -561,9 +566,9 @@ last(s::AbstractString, n::Integer) = s[max(1, prevind(s, ncodeunits(s)+1, n)):e
 """
     reverseind(v, i)
 
-Given an index `i` in [`reverse(v)`](@ref), return the corresponding index in `v` so that
-`v[reverseind(v,i)] == reverse(v)[i]`. (This can be nontrivial in cases where `v` contains
-non-ASCII characters.)
+Given an index `i` in [`reverse(v)`](@ref), return the corresponding index in
+`v` so that `v[reverseind(v,i)] == reverse(v)[i]`. (This can be nontrivial in
+cases where `v` contains non-ASCII characters.)
 
 # Examples
 ```jldoctest
@@ -581,7 +586,9 @@ reverseind(s::AbstractString, i::Integer) = thisind(s, ncodeunits(s)-i+1)
 """
     repeat(s::AbstractString, r::Integer)
 
-Repeat a string `r` times. This can equivalently be accomplished by calling [`s^r`](@ref ^).
+Repeat a string `r` times. This can be written as `s^r`.
+
+See also: [`^`](@ref)
 
 # Examples
 ```jldoctest
@@ -594,8 +601,9 @@ repeat(s::AbstractString, r::Integer) = repeat(convert(String, s), r)
 """
     ^(s::Union{AbstractString,Char}, n::Integer)
 
-Repeat a string or character `n` times.
-The [`repeat`](@ref) function is an alias to this operator.
+Repeat a string or character `n` times. This can also be written as `repeat(s, n)`.
+
+See also: [`repeat`](@ref)
 
 # Examples
 ```jldoctest

From 166924cea254ab1f0ce701e328fef01cfeb35bc9 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Sun, 10 Dec 2017 13:04:25 -0500
Subject: [PATCH 11/22] delete bswap(::Char)

---
 base/char.jl |  2 --
 test/char.jl | 11 ++++-------
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/base/char.jl b/base/char.jl
index 92c52efc2d66a..76f87409a656d 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -89,8 +89,6 @@ hash(x::Char, h::UInt) =
 +(x::Char, y::Integer) = Char(Int32(x) + Int32(y))
 +(x::Integer, y::Char) = y + x
 
-bswap(x::Char) = Char(bswap(UInt32(x)))
-
 print(io::IO, c::Char) = (write(io, c); nothing)
 
 const hex_chars = UInt8['0':'9';'a':'z']
diff --git a/test/char.jl b/test/char.jl
index 85b2acf5385ef..b6548183891ff 100644
--- a/test/char.jl
+++ b/test/char.jl
@@ -5,13 +5,10 @@
 @test typemin(Char) == Char(0)
 @test ndims(Char) == 0
 @test getindex('a', 1) == 'a'
-@test_throws BoundsError getindex('a',2)
-# This is current behavior, but it seems incorrect
-@test getindex('a',1,1,1) == 'a'
-@test_throws BoundsError getindex('a',1,1,2)
-# bswap of a Char should be removed, only the underlying codeunit (UInt32)
-# should be swapped
-@test bswap('\U10200') == '\U20100'
+@test_throws BoundsError getindex('a', 2)
+# This is current behavior, but it seems questionable
+@test getindex('a', 1, 1, 1) == 'a'
+@test_throws BoundsError getindex('a', 1, 1, 2)
 
 @test 'b' + 1 == 'c'
 @test typeof('b' + 1) == Char

From dcf9552ace3331cbd5426f91a5c84c8e810f9a91 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Sun, 10 Dec 2017 15:07:37 -0500
Subject: [PATCH 12/22] deprecate chr2ind and ind2chr

---
 base/deprecated.jl            |  4 ++++
 base/exports.jl               |  2 --
 base/strings/basic.jl         | 42 -----------------------------------
 base/strings/util.jl          | 22 +++++-------------
 doc/src/manual/strings.md     |  7 +++---
 doc/src/stdlib/strings.md     |  2 --
 stdlib/Dates/src/parse.jl     |  2 +-
 stdlib/Profile/src/Profile.jl |  4 ++--
 test/lineedit.jl              |  4 ++--
 test/strings/basic.jl         | 20 ++++++++---------
 test/strings/types.jl         | 19 ++++++++--------
 11 files changed, 39 insertions(+), 89 deletions(-)

diff --git a/base/deprecated.jl b/base/deprecated.jl
index 2980f3ff14155..fe15713c47fac 100644
--- a/base/deprecated.jl
+++ b/base/deprecated.jl
@@ -2992,6 +2992,10 @@ end
 @deprecate_binding Complex64  ComplexF32
 @deprecate_binding Complex128 ComplexF64
 
+# PR #24999
+@deprecate ind2chr(s::AbstractString, i::Integer) length(s, 1, i)
+@deprecate chr2ind(s::AbstractString, n::Integer) nextind(s, 0, n)
+
 # END 0.7 deprecations
 
 # BEGIN 1.0 deprecations
diff --git a/base/exports.jl b/base/exports.jl
index b884a3641f7f3..9f8a56bd2b9a5 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -716,7 +716,6 @@ export
     bytes2hex,
     chomp,
     chop,
-    chr2ind,
     codeunit,
     dec,
     digits,
@@ -728,7 +727,6 @@ export
     hex,
     hex2bytes,
     hex2bytes!,
-    ind2chr,
     info,
     ismatch,
     isvalid,
diff --git a/base/strings/basic.jl b/base/strings/basic.jl
index b32b90ff6bd91..4f8a09dbd26a9 100644
--- a/base/strings/basic.jl
+++ b/base/strings/basic.jl
@@ -446,48 +446,6 @@ function nextind(s::AbstractString, i::Integer, n::Integer=1)
     return i + n
 end
 
-"""
-    ind2chr(s::AbstractString, i::Integer)
-
-Convert a byte index `i` to a character index with
-respect to string `s`.
-
-See also [`chr2ind`](@ref).
-
-# Examples
-```jldoctest
-julia> str = "αβγdef";
-
-julia> ind2chr(str, 3)
-2
-
-julia> chr2ind(str, 2)
-3
-```
-"""
-ind2chr(s::AbstractString, i::Integer) = length(s, 1, i)
-
-"""
-    chr2ind(s::AbstractString, i::Integer)
-
-Convert a character index `i` to a byte index.
-
-See also [`ind2chr`](@ref).
-
-# Examples
-```jldoctest
-julia> str = "αβγdef";
-
-julia> chr2ind(str, 2)
-3
-
-julia> ind2chr(str, 3)
-2
-```
-"""
-chr2ind(s::AbstractString, n::Integer) =
-    n < 0 ? prevind(s, 0, -n) : nextind(s, 0, n)
-
 ## string index iteration type ##
 
 struct EachStringIndex{T<:AbstractString}
diff --git a/base/strings/util.jl b/base/strings/util.jl
index da299d538a55f..d92969de12ee1 100644
--- a/base/strings/util.jl
+++ b/base/strings/util.jl
@@ -206,28 +206,18 @@ strip(s::AbstractString, chars::Chars) = lstrip(rstrip(s, chars), chars)
 
 function lpad(s::AbstractString, n::Integer, p::AbstractString=" ")
     m = n - Unicode.textwidth(s)
-    (m <= 0) && (return s)
+    m ≤ 0 && return s
     l = Unicode.textwidth(p)
-    if l==1
-        return string(p^m, s)
-    end
-    q = div(m,l)
-    r = m - q*l
-    i = r != 0 ? chr2ind(p, r) : -1
-    string(p^q, p[1:i], s)
+    q, r = divrem(m, l)
+    string(p^q, first(p, r), s)
 end
 
 function rpad(s::AbstractString, n::Integer, p::AbstractString=" ")
     m = n - Unicode.textwidth(s)
-    (m <= 0) && (return s)
+    m ≤ 0 && return s
     l = Unicode.textwidth(p)
-    if l==1
-        return string(s, p^m)
-    end
-    q = div(m,l)
-    r = m - q*l
-    i = r != 0 ? chr2ind(p, r) : -1
-    string(s, p^q, p[1:i])
+    q, r = divrem(m, l)
+    string(s, p^q, first(p, r))
 end
 
 """
diff --git a/doc/src/manual/strings.md b/doc/src/manual/strings.md
index 9feb04345114c..3178c638a84ca 100644
--- a/doc/src/manual/strings.md
+++ b/doc/src/manual/strings.md
@@ -565,14 +565,15 @@ Some other useful functions include:
 
   * [`endof(str)`](@ref) gives the maximal (byte) index that can be used to index into `str`.
   * [`length(str)`](@ref) the number of characters in `str`.
+  * [`length(str, i, j)`](@ref) the number of valid character indices in `str` from `i` to `j`.
   * [`i = start(str)`](@ref start) gives the first valid index at which a character can be found in `str`
     (typically 1).
   * [`c, j = next(str,i)`](@ref next) returns next character at or after the index `i` and the next valid
     character index following that. With [`start`](@ref) and [`endof`](@ref), can be used to iterate
     through the characters in `str`.
-  * [`ind2chr(str,i)`](@ref) gives the number of characters in `str` up to and including any at index
-    `i`.
-  * [`chr2ind(str,j)`](@ref) gives the index at which the `j`th character in `str` occurs.
+  * [`thisind(str, i)`](@ref) given an arbitrary index into a string, finds the first index of the character into which the index points.
+  * [`nextind(str, i, n=1)`](@ref) finds the start of the `n`th character starting after index `i`.
+  * [`prevind(str, i, n=1)`](@ref) finds the start of the `n`th character starting before index `i`.
 
 ## [Non-Standard String Literals](@id non-standard-string-literals)
 
diff --git a/doc/src/stdlib/strings.md b/doc/src/stdlib/strings.md
index 98f53c3b27dad..0426b5ba3f6de 100644
--- a/doc/src/stdlib/strings.md
+++ b/doc/src/stdlib/strings.md
@@ -51,8 +51,6 @@ Base.last(::AbstractString, ::Integer)
 Base.join
 Base.chop
 Base.chomp
-Base.ind2chr
-Base.chr2ind
 Base.thisind
 Base.nextind
 Base.prevind
diff --git a/stdlib/Dates/src/parse.jl b/stdlib/Dates/src/parse.jl
index 4f4cb0d7891ea..35829f00e7b5a 100644
--- a/stdlib/Dates/src/parse.jl
+++ b/stdlib/Dates/src/parse.jl
@@ -181,7 +181,7 @@ end
 
 @inline function tryparsenext_word(str::AbstractString, i, len, locale, maxchars=0)
     word_start, word_end = i, 0
-    max_pos = maxchars <= 0 ? len : min(chr2ind(str, ind2chr(str,i) + maxchars - 1), len)
+    max_pos = maxchars <= 0 ? len : min(len, nextind(str, i, maxchars-1))
     @inbounds while i <= max_pos
         c, ii = next(str, i)
         if Base.Unicode.isalpha(c)
diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index 7986e910de0a7..299439b1c1d76 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -645,14 +645,14 @@ function rtruncto(str::String, w::Int)
     if length(str) <= w
         return str
     else
-        return string("...", str[chr2ind(str, length(str)-w+4):end])
+        return string("...", str[prevind(str, end, w-4):end])
     end
 end
 function ltruncto(str::String, w::Int)
     if length(str) <= w
         return str
     else
-        return string(str[1:chr2ind(str,w-4)], "...")
+        return string(str[1:nextind(str, 1, w-4)], "...")
     end
 end
 
diff --git a/test/lineedit.jl b/test/lineedit.jl
index 571ed8a75d36b..61e1d3bd42562 100644
--- a/test/lineedit.jl
+++ b/test/lineedit.jl
@@ -16,8 +16,8 @@ function new_state()
     LineEdit.init_state(term, ModalInterface([Prompt("test> ")]))
 end
 
-charseek(buf, i) = seek(buf, chr2ind(content(buf), i+1)-1)
-charpos(buf, pos=position(buf)) = ind2chr(content(buf), pos+1)-1
+charseek(buf, i) = seek(buf, nextind(content(buf), 0, i+1)-1)
+charpos(buf, pos=position(buf)) = length(content(buf), 1, pos+1)-1
 
 function transform!(f, s, i = -1) # i is char-based (not bytes) buffer position
     buf = buffer(s)
diff --git a/test/strings/basic.jl b/test/strings/basic.jl
index f6165cacaef27..27323005e7f31 100644
--- a/test/strings/basic.jl
+++ b/test/strings/basic.jl
@@ -99,14 +99,14 @@ end
 end
 
 @testset "issue #7248" begin
-    @test ind2chr("hello", -1) == -1
-    @test chr2ind("hello", -1) == -1
-    @test ind2chr("hellø", -1) == -1
-    @test chr2ind("hellø", -1) == -1
-    @test ind2chr("hello", 10) == 10
-    @test chr2ind("hello", 10) == 10
-    @test ind2chr("hellø", 10) == 9
-    @test chr2ind("hellø", 10) == 11
+    @test length("hello", 1, -1) == -1
+    @test prevind("hello", 0, 1) == -1
+    @test length("hellø", 1, -1) == -1
+    @test prevind("hellø", 0, 1) == -1
+    @test length("hello", 1, 10) == 10
+    @test nextind("hello", 0, 10) == 10
+    @test length("hellø", 1, 10) == 9
+    @test nextind("hellø", 0, 10) == 11
     @test_throws BoundsError checkbounds("hello", 0)
     @test_throws BoundsError checkbounds("hello", 6)
     @test_throws BoundsError checkbounds("hello", 0:3)
@@ -221,7 +221,7 @@ end
     @test done(eachindex("foobar"),7)
     @test eltype(Base.EachStringIndex) == Int
     @test map(Base.Unicode.uppercase, "foó") == "FOÓ"
-    @test chr2ind("fóobar", 3) == 4
+    @test nextind("fóobar", 0, 3) == 4
 
     @test Symbol(gstr) == Symbol("12")
 
@@ -233,7 +233,7 @@ end
     @test nextind(1:1, 1) == 2
     @test nextind([1], 1) == 2
 
-    @test ind2chr(gstr, 2) == 2
+    @test length(gstr, 1, 2) == 2
 
     # tests promote_rule
     let svec = [s"12", GenericString("12"), SubString("123", 1, 2)]
diff --git a/test/strings/types.jl b/test/strings/types.jl
index f89907c6f5b88..5d4d492dd0fea 100644
--- a/test/strings/types.jl
+++ b/test/strings/types.jl
@@ -157,7 +157,7 @@ end
 @test !ismatch(Regex("aa"), SubString("",1,0))
 @test ismatch(Regex(""), SubString("",1,0))
 
-# isvalid(), chr2ind() and ind2chr() for SubString{String}
+# isvalid(), length() and nextind() (formerly ind2chr() and chr2ind()) for SubString{String}
 let s = "lorem ipsum", sdict = Dict(
     SubString(s, 1, 11)  => "lorem ipsum",
     SubString(s, 1, 6)   => "lorem ",
@@ -176,14 +176,15 @@ let s = "lorem ipsum", sdict = Dict(
                 @test_throws BoundsError isvalid(s, i)
             end
         end
-        for i in 1:ncodeunits(ss)
-            @test ind2chr(ss, i) == ind2chr(s, i)
+        for i in 1:ncodeunits(ss), j = i-1:ncodeunits(ss)
+            @test length(ss, i, j) == length(s, i, j)
         end
     end
     for (ss, s) in sdict
         @test length(ss) == length(s)
-        for i in 1:length(ss)
-            @test chr2ind(ss, i) == chr2ind(s, i)
+        for i in 0:length(ss)+1, j = 0:length(ss)+1
+            @test nextind(ss, i, j) == nextind(s, i, j)
+            @test prevind(ss, i, j) == prevind(s, i, j)
         end
     end
 end
@@ -209,10 +210,10 @@ let s = "Σx + βz - 2"
 end
 
 let ss = SubString("hello", 1, 5)
-    @test ind2chr(ss, -1) == -1
-    @test chr2ind(ss, -1) == -1
-    @test chr2ind(ss, 10) == 10
-    @test ind2chr(ss, 10) == 10
+    @test length(ss, 1, -1) == -1
+    @test length(ss, 1, 10) == 10
+    @test prevind(ss, 0, 1) == -1
+    @test nextind(ss, 0, 10) == 10
 end
 
 # length(SubString{String}) performance specialization

From 61d5003957c6d713300d8846fc1e3223f61a1221 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Sun, 10 Dec 2017 22:11:37 -0500
Subject: [PATCH 13/22] move string search functions into strings/search.jl

---
 base/strings/search.jl | 68 ++++++++++++++++++++++++++++++++++++++++++
 base/strings/string.jl | 68 ------------------------------------------
 2 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/base/strings/search.jl b/base/strings/search.jl
index 43e880a26b9e5..d2731d1e9d022 100644
--- a/base/strings/search.jl
+++ b/base/strings/search.jl
@@ -1,5 +1,73 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
+function search(s::String, c::Char, i::Integer = 1)
+    if i < 1 || i > sizeof(s)
+        i == sizeof(s) + 1 && return 0
+        throw(BoundsError(s, i))
+    end
+    @inbounds if is_valid_continuation(codeunit(s,i))
+        string_index_err(s, i)
+    end
+    c ≤ '\x7f' && return search(s, c % UInt8, i)
+    while true
+        i = search(s, first_utf8_byte(c), i)
+        (i == 0 || s[i] == c) && return i
+        i = next(s, i)[2]
+    end
+end
+
+function search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1)
+    if i < 1
+        throw(BoundsError(a, i))
+    end
+    n = sizeof(a)
+    if i > n
+        return i == n+1 ? 0 : throw(BoundsError(a, i))
+    end
+    p = pointer(a)
+    q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1)
+    q == C_NULL ? 0 : Int(q-p+1)
+end
+
+function search(a::ByteArray, b::Char, i::Integer = 1)
+    if Unicode.isascii(b)
+        search(a,UInt8(b),i)
+    else
+        search(a,Vector{UInt8}(string(b)),i).start
+    end
+end
+
+function rsearch(s::String, c::Char, i::Integer = sizeof(s))
+    c ≤ '\x7f' && return rsearch(s, c % UInt8, i)
+    b = first_utf8_byte(c)
+    while true
+        i = rsearch(s, b, i)
+        (i == 0 || s[i] == c) && return i
+        i = prevind(s, i)
+    end
+end
+
+function rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = sizeof(a))
+    if i < 1 # index 0 yields "not found" (0); anything lower is a bounds error
+        return i == 0 ? 0 : throw(BoundsError(a, i))
+    end
+    n = sizeof(a)
+    if i > n # one past the end yields "not found" (0); anything higher is a bounds error
+        return i == n+1 ? 0 : throw(BoundsError(a, i))
+    end
+    p = pointer(a)
+    q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i) # last `b` within the first `i` bytes
+    q == C_NULL ? 0 : Int(q-p+1) # turn the match pointer into a 1-based byte index, 0 if absent
+end
+
+function rsearch(a::ByteArray, b::Char, i::Integer = length(a))
+    if Unicode.isascii(b)
+        rsearch(a,UInt8(b),i)
+    else
+        rsearch(a,Vector{UInt8}(string(b)),i).start
+    end
+end
+
 const Chars = Union{Char,Tuple{Vararg{Char}},AbstractVector{Char},Set{Char}}
 
 """
diff --git a/base/strings/string.jl b/base/strings/string.jl
index 47dc0326914cb..1e0aafa803c5c 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -283,74 +283,6 @@ function isvalid(s::String, i::Int)
 end
 isvalid(s::String, i::Integer) = isvalid(s, Int(i))
 
-function search(s::String, c::Char, i::Integer = 1)
-    if i < 1 || i > sizeof(s)
-        i == sizeof(s) + 1 && return 0
-        throw(BoundsError(s, i))
-    end
-    @inbounds if is_valid_continuation(codeunit(s,i))
-        string_index_err(s, i)
-    end
-    c ≤ '\x7f' && return search(s, c % UInt8, i)
-    while true
-        i = search(s, first_utf8_byte(c), i)
-        (i == 0 || s[i] == c) && return i
-        i = next(s, i)[2]
-    end
-end
-
-function search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1)
-    if i < 1
-        throw(BoundsError(a, i))
-    end
-    n = sizeof(a)
-    if i > n
-        return i == n+1 ? 0 : throw(BoundsError(a, i))
-    end
-    p = pointer(a)
-    q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1)
-    q == C_NULL ? 0 : Int(q-p+1)
-end
-
-function search(a::ByteArray, b::Char, i::Integer = 1)
-    if Unicode.isascii(b)
-        search(a,UInt8(b),i)
-    else
-        search(a,Vector{UInt8}(string(b)),i).start
-    end
-end
-
-function rsearch(s::String, c::Char, i::Integer = sizeof(s))
-    c ≤ '\x7f' && return rsearch(s, c % UInt8, i)
-    b = first_utf8_byte(c)
-    while true
-        i = rsearch(s, b, i)
-        (i == 0 || s[i] == c) && return i
-        i = prevind(s, i)
-    end
-end
-
-function rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = sizeof(s))
-    if i < 1
-        return i == 0 ? 0 : throw(BoundsError(a, i))
-    end
-    n = sizeof(a)
-    if i > n
-        return i == n+1 ? 0 : throw(BoundsError(a, i))
-    end
-    p = pointer(a)
-    q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i)
-    q == C_NULL ? 0 : Int(q-p+1)
-end
-
-function rsearch(a::ByteArray, b::Char, i::Integer = length(a))
-    if Unicode.isascii(b)
-        rsearch(a,UInt8(b),i)
-    else
-        rsearch(a,Vector{UInt8}(string(b)),i).start
-    end
-end
-
 ## optimized concatenation, reverse, repeat ##
 
 function string(a::String...)

From c713dffdaaa29bce1327cebe09a6a2a2714a5054 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Sun, 10 Dec 2017 23:58:28 -0500
Subject: [PATCH 14/22] optimize the length(::String) method better

---
 base/strings/string.jl | 60 ++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 29 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 1e0aafa803c5c..44e7d86f25eb4 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -184,9 +184,8 @@ end
     return reinterpret(Char, u), i
 end
 
-function getindex(s::String, i::Int)
-    @boundscheck checkbounds(s, i)
-    @inbounds b = codeunit(s, i)
+@propagate_inbounds function getindex(s::String, i::Int)
+    b = codeunit(s, i)
     u = UInt32(b) << 24
     (b < 0x80) | (0xf8 ≤ b) && return reinterpret(Char, u)
     return getindex_continued(s, i, u)
@@ -241,35 +240,38 @@ function length(s::String, lo::Int, hi::Int)
     z = ncodeunits(s)
     i = Int(max(1, min(z, lo)))
     n = Int(min(z, max(1, hi)))
-    c = i - n
-    if i ≤ n
-        i, j = thisind(s, i), i
-        c -= i < j
-        i -= 1
+    c = hi - lo + 1
+    i < n || return c
+    @inbounds i, j = thisind(s, i), i
+    c -= i < j
+    _length(s, i, n, c)
+end
+
+length(s::String) = _length(s, 1, ncodeunits(s), ncodeunits(s))
+
+function _length(s::String, i::Int, n::Int, c::Int)
+    i < n || return c
+    @inbounds b = codeunit(s, i)
+    @inbounds while true
         while true
-            (i += 1) ≤ n || break
-            @inbounds b = codeunit(s, i) # lead byte
-        @label L
-            c += 1
-            (0xc0 ≤ b) & (b < 0xf8) || continue
-            l = b
-
-            (i += 1) ≤ n || break
-            @inbounds b = codeunit(s, i) # cont byte 1
-            b & 0xc0 == 0x80 || @goto L
-            l ≥ 0xe0 || continue
-
-            (i += 1) ≤ n || break
-            @inbounds b = codeunit(s, i) # cont byte 2
-            b & 0xc0 == 0x80 || @goto L
-            l ≥ 0xf0 || continue
-
-            (i += 1) ≤ n || break
-            @inbounds b = codeunit(s, i) # cont byte 3
-            b & 0xc0 == 0x80 || @goto L
+            (i += 1) ≤ n || return c
+            between(b, 0xc0, 0xf7) && break
+            b = codeunit(s, i)
         end
+        l = b
+        b = codeunit(s, i) # cont byte 1
+        c -= (x = b & 0xc0 == 0x80)
+        x & (l ≥ 0xe0) || continue
+
+        (i += 1) ≤ n || return c
+        b = codeunit(s, i) # cont byte 2
+        c -= (x = b & 0xc0 == 0x80)
+        x & (l ≥ 0xf0) || continue
+
+        (i += 1) ≤ n || return c
+        b = codeunit(s, i) # cont byte 3
+        c -= (b & 0xc0 == 0x80)
     end
-    return c + hi - lo
 end
 
 # TODO: delete or move to char.jl

From b8cd96e03248f828b2169708043a6a1ece5a4c4f Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Mon, 11 Dec 2017 00:34:50 -0500
Subject: [PATCH 15/22] =?UTF-8?q?make=20length(string,=20i,=20j)=20?=
 =?UTF-8?q?=E2=89=A5=C2=A00=20for=20all=20i,=20j?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 base/strings/basic.jl  |  1 +
 base/strings/string.jl | 11 +++++++----
 test/strings/basic.jl  |  4 ++--
 test/strings/types.jl  |  2 +-
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/base/strings/basic.jl b/base/strings/basic.jl
index 4f8a09dbd26a9..18a87c9226e8f 100644
--- a/base/strings/basic.jl
+++ b/base/strings/basic.jl
@@ -322,6 +322,7 @@ julia> length("jμΛIα")
 ```
 """
 function length(s::AbstractString, lo::Integer=1, hi::Integer=ncodeunits(s))
+    lo ≤ hi || return 0
     z = ncodeunits(s)
     a = Int(max(1, min(z, lo)))
     b = Int(min(z, max(1, hi)))
diff --git a/base/strings/string.jl b/base/strings/string.jl
index 44e7d86f25eb4..638ba8dfae73a 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -237,10 +237,13 @@ function getindex(s::String, r::UnitRange{Int})
 end
 
 function length(s::String, lo::Int, hi::Int)
-    z = ncodeunits(s)
-    i = Int(max(1, min(z, lo)))
-    n = Int(min(z, max(1, hi)))
-    c = hi - lo + 1
+    i, n = lo, hi
+    c = max(0, hi - lo + 1)
+    @boundscheck begin
+        z = ncodeunits(s)
+        i = Int(max(1, min(z, lo)))
+        n = Int(min(z, max(1, hi)))
+    end
     i < n || return c
     @inbounds i, j = thisind(s, i), i
     c -= i < j
diff --git a/test/strings/basic.jl b/test/strings/basic.jl
index 27323005e7f31..49df4fe48252d 100644
--- a/test/strings/basic.jl
+++ b/test/strings/basic.jl
@@ -99,9 +99,9 @@ end
 end
 
 @testset "issue #7248" begin
-    @test length("hello", 1, -1) == -1
+    @test length("hello", 1, -1) == 0
     @test prevind("hello", 0, 1) == -1
-    @test length("hellø", 1, -1) == -1
+    @test length("hellø", 1, -1) == 0
     @test prevind("hellø", 0, 1) == -1
     @test length("hello", 1, 10) == 10
     @test nextind("hello", 0, 10) == 10
diff --git a/test/strings/types.jl b/test/strings/types.jl
index 5d4d492dd0fea..452da4cbc2ed6 100644
--- a/test/strings/types.jl
+++ b/test/strings/types.jl
@@ -210,7 +210,7 @@ let s = "Σx + βz - 2"
 end
 
 let ss = SubString("hello", 1, 5)
-    @test length(ss, 1, -1) == -1
+    @test length(ss, 1, -1) == 0
     @test length(ss, 1, 10) == 10
     @test prevind(ss, 0, 1) == -1
     @test nextind(ss, 0, 10) == 10

From f9e1acb4007b7896440b05538ce985c2d64747d3 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Mon, 11 Dec 2017 12:20:40 -0500
Subject: [PATCH 16/22] moar Unicode operators

---
 base/strings/string.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index 638ba8dfae73a..fdbf1b37ade87 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -124,13 +124,13 @@ function nextind(s::String, i::Int)
     end
     # first continuation byte
     @inbounds b = codeunit(s, i += 1)
-    (b & 0xc0 != 0x80) | ((i += 1) > n) | (l < 0xe0) && return i
+    (b & 0xc0 ≠ 0x80) | ((i += 1) > n) | (l < 0xe0) && return i
     # second continuation byte
     @inbounds b = codeunit(s, i)
-    (b & 0xc0 != 0x80) | ((i += 1) > n) | (l < 0xf0) && return i
+    (b & 0xc0 ≠ 0x80) | ((i += 1) > n) | (l < 0xf0) && return i
     # third continuation byte
     @inbounds b = codeunit(s, i)
-    ifelse(b & 0xc0 != 0x80, i, i+1)
+    ifelse(b & 0xc0 ≠ 0x80, i, i+1)
 end
 
 ## checking UTF-8 & ACSII validity ##
@@ -143,7 +143,7 @@ byte_string_classify(s::String) =
     # 1: valid ASCII
     # 2: valid UTF-8
 
-isvalid(::Type{String}, s::Union{Vector{UInt8},String}) = byte_string_classify(s) != 0
+isvalid(::Type{String}, s::Union{Vector{UInt8},String}) = byte_string_classify(s) ≠ 0
 isvalid(s::String) = isvalid(String, s)
 
 is_valid_continuation(c) = c & 0xc0 == 0x80

From a7face95824c65eceeb73f775833c4cc333ab526 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Tue, 12 Dec 2017 00:06:48 -0500
Subject: [PATCH 17/22] change some code point conversions to char comparisons

---
 base/strings/unicode.jl | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl
index 64b5f6d5611ed..5c28f51189560 100644
--- a/base/strings/unicode.jl
+++ b/base/strings/unicode.jl
@@ -43,11 +43,10 @@ true
 """
 isvalid(T,value)
 
-isvalid(::Type{Char}, ch::Unsigned) = !((ch - 0xd800 < 0x800) | (ch > 0x10ffff))
-isvalid(::Type{Char}, ch::Integer) = isvalid(Char, Unsigned(ch))
-isvalid(::Type{Char}, ch::Char) = isvalid(Char, UInt32(ch))
-
-isvalid(ch::Char) = isvalid(Char, ch)
+isvalid(c::Char) = !ismalformed(c) & ((c ≤ '\ud7ff') | ('\ue000' ≤ c) & (c ≤ '\U10ffff'))
+isvalid(::Type{Char}, c::Unsigned) = ((c ≤  0xd7ff ) | ( 0xe000  ≤ c) & (c ≤  0x10ffff ))
+isvalid(::Type{Char}, c::Integer)  = isvalid(Char, Unsigned(c))
+isvalid(::Type{Char}, c::Char)     = isvalid(c)
 
 # utf8 category constants
 const UTF8PROC_CATEGORY_CN = 0
@@ -301,15 +300,15 @@ titlecase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) :
 # returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
 function category_code(c::Char)
     ismalformed(c) && return Cint(31)
-    (u = UInt32(c)) ≤ 0x10ffff || return Cint(30)
-    ccall(:utf8proc_category, Cint, (UInt32,), u)
+    c ≤ '\U10ffff' || return Cint(30)
+    ccall(:utf8proc_category, Cint, (UInt32,), c)
 end
 
 # more human-readable representations of the category code
 function category_abbrev(c)
     ismalformed(c) && return "Ma"
-    (u = UInt32(c)) ≤ 0x10ffff || return "In"
-    unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), u))
+    c ≤ '\U10ffff' || return "In"
+    unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), c))
 end
 
 category_string(c) = category_strings[category_code(c)+1]

From 1f0c6fa35ab64ad66a5bb413fad474e2c722c290 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Mon, 11 Dec 2017 12:23:47 -0500
Subject: [PATCH 18/22] various String performance tweaks

- {next,getindex}_continued don't need to re-check bounds
- short-circuiting is slightly faster in length
---
 base/strings/string.jl | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index fdbf1b37ade87..91497d5c68c81 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -150,12 +150,10 @@ is_valid_continuation(c) = c & 0xc0 == 0x80
 
 ## required core functionality ##
 
-function next(s::String, i::Int)
-    @boundscheck checkbounds(s, i)
-    @inbounds b = codeunit(s, i)
-    # TODO: check index validity
+@propagate_inbounds function next(s::String, i::Int)
+    b = codeunit(s, i)
     u = UInt32(b) << 24
-    (b < 0x80) | (0xf8 ≤ b) && return reinterpret(Char, u), i+1
+    between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1
     return next_continued(s, i, u)
 end
 
@@ -187,29 +185,30 @@ end
 @propagate_inbounds function getindex(s::String, i::Int)
     b = codeunit(s, i)
     u = UInt32(b) << 24
-    (b < 0x80) | (0xf8 ≤ b) && return reinterpret(Char, u)
+    between(b, 0x80, 0xf7) || return reinterpret(Char, u)
     return getindex_continued(s, i, u)
 end
 
-@noinline function getindex_continued(s::String, i::Int, u::UInt32)
+function getindex_continued(s::String, i::Int, u::UInt32)
     if u < 0xc0000000
-        isvalid(s, i) && @goto ret
+        # called from `getindex` which checks bounds
+        @inbounds isvalid(s, i) && @goto ret
         string_index_err(s, i)
     end
     n = ncodeunits(s)
-    # first continuation byte
+
     (i += 1) > n && @goto ret
-    @inbounds b = codeunit(s, i)
+    @inbounds b = codeunit(s, i) # cont byte 1
     b & 0xc0 == 0x80 || @goto ret
     u |= UInt32(b) << 16
-    # second continuation byte
+
     ((i += 1) > n) | (u < 0xe0000000) && @goto ret
-    @inbounds b = codeunit(s, i)
+    @inbounds b = codeunit(s, i) # cont byte 2
     b & 0xc0 == 0x80 || @goto ret
     u |= UInt32(b) << 8
-    # third continuation byte
+
     ((i += 1) > n) | (u < 0xf0000000) && @goto ret
-    @inbounds b = codeunit(s, i)
+    @inbounds b = codeunit(s, i) # cont byte 3
     b & 0xc0 == 0x80 || @goto ret
     u |= UInt32(b)
 @label ret
@@ -252,13 +251,13 @@ end
 
 length(s::String) = _length(s, 1, ncodeunits(s), ncodeunits(s))
 
-function _length(s::String, i::Int, n::Int, c::Int)
+@inline function _length(s::String, i::Int, n::Int, c::Int)
     i < n || return c
     @inbounds b = codeunit(s, i)
     @inbounds while true
         while true
             (i += 1) ≤ n || return c
-            between(b, 0xc0, 0xf7) && break
+            0xc0 ≤ b ≤ 0xf7 && break
             b = codeunit(s, i)
         end
         l = b

From 6f10ca2184c74de8d4e1380c4d9e6221feea5a34 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Tue, 12 Dec 2017 00:28:04 -0500
Subject: [PATCH 19/22] add various `@propagate_inbounds` annotations

also: avoid second bounds check in `get` with default
---
 base/strings/basic.jl | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/base/strings/basic.jl b/base/strings/basic.jl
index 18a87c9226e8f..667d884bb10d9 100644
--- a/base/strings/basic.jl
+++ b/base/strings/basic.jl
@@ -81,7 +81,7 @@ I.e. the value returned by `codeunit(s, i)` is of the type returned by
 
 See also: [`ncodeunits`](@ref), [`checkbounds`](@ref)
 """
-codeunit(s::AbstractString, i::Integer) = typeof(i) === Int ?
+@propagate_inbounds codeunit(s::AbstractString, i::Integer) = typeof(i) === Int ?
     throw(MethodError(codeunit, Tuple{typeof(s),Int})) : codeunit(s, Int(i))
 
 """
@@ -119,7 +119,7 @@ Stacktrace:
 [...]
 ```
 """
-isvalid(s::AbstractString, i::Integer) = typeof(i) === Int ?
+@propagate_inbounds isvalid(s::AbstractString, i::Integer) = typeof(i) === Int ?
     throw(MethodError(isvalid, Tuple{typeof(s),Int})) : isvalid(s, Int(i))
 
 """
@@ -134,7 +134,7 @@ a Unicode index error is raised.
 See also: [`getindex`](@ref), [`start`](@ref), [`done`](@ref),
 [`checkbounds`](@ref)
 """
-next(s::AbstractString, i::Integer) = typeof(i) === Int ?
+@propagate_inbounds next(s::AbstractString, i::Integer) = typeof(i) === Int ?
     throw(MethodError(next, Tuple{typeof(s),Int})) : next(s, Int(i))
 
 ## basic generic definitions ##
@@ -148,13 +148,21 @@ endof(s::AbstractString) = thisind(s, ncodeunits(s))
 getindex(s::AbstractString, i::Integer) = next(s, i)[1]
 getindex(s::AbstractString, i::Colon) = s
 # TODO: handle other ranges with stride ±1 specially?
+# TODO: add more @propagate_inbounds annotations?
 getindex(s::AbstractString, r::UnitRange{<:Integer}) = SubString(s, r)
 getindex(s::AbstractString, v::AbstractVector{<:Integer}) =
     sprint(length(v), io->(for i in v; write(io, s[i]) end))
 getindex(s::AbstractString, v::AbstractVector{Bool}) =
     throw(ArgumentError("logical indexing not supported for strings"))
 
-get(s::AbstractString, i::Integer, default) = checkbounds(Bool, s, i) ? s[i] : default
+function get(s::AbstractString, i::Integer, default)
+# TODO: use ternary once @inbounds is expression-like
+    if checkbounds(Bool, s, i)
+        @inbounds return s[i]
+    else
+        return default
+    end
+end
 
 ## bounds checking ##
 

From 937c3ada77398f45c8a425b5b62265bef7f4db8f Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Tue, 12 Dec 2017 02:59:32 -0500
Subject: [PATCH 20/22] isvalid: return false out of bounds instead of throwing

also: `next` can assume that incoming indices are valid
---
 base/strings/basic.jl     | 25 ++++++++++++++-----------
 base/strings/search.jl    | 15 +++++----------
 base/strings/string.jl    | 13 +++----------
 base/strings/substring.jl |  5 +++--
 test/strings/types.jl     |  7 +------
 5 files changed, 26 insertions(+), 39 deletions(-)

diff --git a/base/strings/basic.jl b/base/strings/basic.jl
index 667d884bb10d9..a9dcb14f54cde 100644
--- a/base/strings/basic.jl
+++ b/base/strings/basic.jl
@@ -87,14 +87,13 @@ See also: [`ncodeunits`](@ref), [`checkbounds`](@ref)
 """
     isvalid(s::AbstractString, i::Integer) -> Bool
 
-Predicate indicating whether the given index is the start of the encoding of
-a character in `s` or not. If `isvalid(s, i)` is true then `s[i]` will return
-the character whose encoding starts at that index, if it's false, then `s[i]`
-will raise an invalid index error. Behavior of `next(s, i)` is similar except
-that the character is returned along with the index of the following character.
-In order for `isvalid(s, i)` to be an O(1) function, the encoding of `s` must
-be [self-synchronizing](https://en.wikipedia.org/wiki/Self-synchronizing_code);
-this is a basic assumption of Julia's generic string support.
+Predicate indicating whether the given index is the start of the encoding of a
+character in `s` or not. If `isvalid(s, i)` is true then `s[i]` will return the
+character whose encoding starts at that index; if it's false, then `s[i]` will
+raise an invalid index error or a bounds error depending on whether `i` is in
+bounds. In order for `isvalid(s, i)` to be an O(1) function, the encoding of `s`
+must be [self-synchronizing](https://en.wikipedia.org/wiki/Self-synchronizing_code);
+this is a basic assumption of Julia's generic string support.
 
 See also: [`getindex`](@ref), [`next`](@ref), [`thisind`](@ref),
 [`nextind`](@ref), [`prevind`](@ref), [`length`](@ref)
@@ -128,8 +127,8 @@ Stacktrace:
 Return a tuple of the character in `s` at index `i` with the index of the start
 of the following character in `s`. This is the key method that allows strings to
 be iterated, yielding a sequences of characters. If `i` is out of bounds in `s`
-then a bounds error is raised; if `i` is not a valid character index in `s` then
-a Unicode index error is raised.
+then a bounds error is raised. The `next` function, as part of the iteration
+protocol, may assume that `i` is the start of a character in `s`.
 
 See also: [`getindex`](@ref), [`start`](@ref), [`done`](@ref),
 [`checkbounds`](@ref)
@@ -145,7 +144,11 @@ eltype(::Type{<:AbstractString}) = Char
 sizeof(s::AbstractString) = ncodeunits(s) * sizeof(codeunit(s))
 endof(s::AbstractString) = thisind(s, ncodeunits(s))
 
-getindex(s::AbstractString, i::Integer) = next(s, i)[1]
+function getindex(s::AbstractString, i::Integer)
+    @boundscheck checkbounds(s, i)
+    @inbounds return isvalid(s, i) ? next(s, i)[1] : string_index_err(s, i)
+end
+
 getindex(s::AbstractString, i::Colon) = s
 # TODO: handle other ranges with stride ±1 specially?
 # TODO: add more @propagate_inbounds annotations?
diff --git a/base/strings/search.jl b/base/strings/search.jl
index d2731d1e9d022..5b0fa167a783f 100644
--- a/base/strings/search.jl
+++ b/base/strings/search.jl
@@ -5,9 +5,7 @@ function search(s::String, c::Char, i::Integer = 1)
         i == sizeof(s) + 1 && return 0
         throw(BoundsError(s, i))
     end
-    @inbounds if is_valid_continuation(codeunit(s,i))
-        string_index_err(s, i)
-    end
+    @inbounds isvalid(s, i) || string_index_err(s, i)
     c ≤ '\x7f' && return search(s, c % UInt8, i)
     while true
         i = search(s, first_utf8_byte(c), i)
@@ -94,13 +92,10 @@ julia> search("JuliaLang","Julia")
 ```
 """
 function search(s::AbstractString, c::Chars, i::Integer)
-    if isempty(c)
-        return 1 <= i <= nextind(s,endof(s)) ? i :
-               throw(BoundsError(s, i))
-    end
-    if i < 1 || i > nextind(s,endof(s))
-        throw(BoundsError(s, i))
-    end
+    z = ncodeunits(s) + 1
+    isempty(c) && return 1 ≤ i ≤ z ? i : throw(BoundsError(s, i))
+    1 ≤ i ≤ z || throw(BoundsError(s, i))
+    @inbounds i == z || isvalid(s, i) || string_index_err(s, i)
     while !done(s,i)
         d, j = next(s,i)
         if d in c
diff --git a/base/strings/string.jl b/base/strings/string.jl
index 91497d5c68c81..b1d365bb35b17 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -157,11 +157,8 @@ is_valid_continuation(c) = c & 0xc0 == 0x80
     return next_continued(s, i, u)
 end
 
-@noinline function next_continued(s::String, i::Int, u::UInt32)
-    if u < 0xc0000000
-        isvalid(s, i) && (i += 1; @goto ret)
-        string_index_err(s, i)
-    end
+function next_continued(s::String, i::Int, u::UInt32)
+    u < 0xc0000000 && (i += 1; @goto ret)
     n = ncodeunits(s)
     # first continuation byte
     (i += 1) > n && @goto ret
@@ -281,11 +278,7 @@ first_utf8_byte(c::Char) = (reinterpret(UInt32, c) >> 24) % UInt8
 
 ## overload methods for efficiency ##
 
-function isvalid(s::String, i::Int)
-    @boundscheck checkbounds(s, i)
-    return thisind(s, i) == i
-end
-isvalid(s::String, i::Integer) = isvalid(s, Int(i))
+isvalid(s::String, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i
 
 ## optimized concatenation, reverse, repeat ##
 
diff --git a/base/strings/substring.jl b/base/strings/substring.jl
index 4d33f89754d1c..e389de6518b49 100644
--- a/base/strings/substring.jl
+++ b/base/strings/substring.jl
@@ -73,8 +73,9 @@ function getindex(s::SubString, i::Integer)
 end
 
 function isvalid(s::SubString, i::Integer)
-    @boundscheck checkbounds(s, i)
-    @inbounds return isvalid(s.string, s.offset + i)
+    ib = true
+    @boundscheck ib = checkbounds(Bool, s, i)
+    @inbounds return ib && isvalid(s.string, s.offset + i)
 end
 
 thisind(s::SubString, i::Integer) = thisind(s.string, s.offset + i) - s.offset
diff --git a/test/strings/types.jl b/test/strings/types.jl
index 452da4cbc2ed6..0af2713349465 100644
--- a/test/strings/types.jl
+++ b/test/strings/types.jl
@@ -169,12 +169,7 @@ let s = "lorem ipsum", sdict = Dict(
     for (ss, s) in sdict
         @test ncodeunits(ss) == ncodeunits(s)
         for i in -2:13
-            if 1 ≤ i ≤ ncodeunits(ss)
-                @test isvalid(ss, i) == isvalid(s, i)
-            else
-                @test_throws BoundsError isvalid(ss, i)
-                @test_throws BoundsError isvalid(s, i)
-            end
+            @test isvalid(ss, i) == isvalid(s, i)
         end
         for i in 1:ncodeunits(ss), j = i-1:ncodeunits(ss)
             @test length(ss, i, j) == length(s, i, j)

From feb1f6827124dcaed1f734a310bf5f43837a104b Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Mon, 11 Dec 2017 18:24:03 -0500
Subject: [PATCH 21/22] bounds checks on string length(s, i, j)

---
 base/strings/basic.jl  | 39 +++++++++++++++++++++++----------------
 base/strings/string.jl | 18 ++++++++----------
 test/lineedit.jl       |  2 +-
 test/strings/basic.jl  |  8 ++++----
 test/strings/types.jl  |  6 ++++--
 5 files changed, 40 insertions(+), 33 deletions(-)

diff --git a/base/strings/basic.jl b/base/strings/basic.jl
index a9dcb14f54cde..fa607b6003d32 100644
--- a/base/strings/basic.jl
+++ b/base/strings/basic.jl
@@ -313,15 +313,17 @@ isless(a::Symbol, b::Symbol) = cmp(a, b) < 0
 ## character index arithmetic ##
 
 """
-    length(s::AbstractString, lo::Integer=1, hi::Integer=ncodeunits(s)) -> Integer
+    length(s::AbstractString) -> Int
+    length(s::AbstractString, i::Integer, j::Integer) -> Int
 
-The number of characters in string `s` from indices `lo` through `hi`. This is
-computed as the number of code unit indices from `lo` to `hi` which are valid
+The number of characters in string `s` from indices `i` through `j`. This is
+computed as the number of code unit indices from `i` to `j` which are valid
 character indices. Without only a single string argument, this computes the
-number of characters in the entire string. With `lo` and `hi` arguments it computes
-the number of indices between `lo` and `hi` inclusive that are valid indices in
-the string `s`. Note that the trailing character may include code units past `hi`
-and still be counted.
+number of characters in the entire string. With `i` and `j` arguments it
+computes the number of indices between `i` and `j` inclusive that are valid
+indices in the string `s`. In addition to in-bounds values, `i` may take the
+out-of-bounds value `ncodeunits(s) + 1` and `j` may take the out-of-bounds
+value `0`.
 
 See also: [`isvalid`](@ref), [`ncodeunits`](@ref), [`endof`](@ref),
 [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref)
@@ -332,18 +334,23 @@ julia> length("jμΛIα")
 5
 ```
 """
-function length(s::AbstractString, lo::Integer=1, hi::Integer=ncodeunits(s))
-    lo ≤ hi || return 0
-    z = ncodeunits(s)
-    a = Int(max(1, min(z, lo)))
-    b = Int(min(z, max(1, hi)))
-    n = a - b
-    for i = a:b
-        n += isvalid(s, i)
+length(s::AbstractString) = @inbounds return length(s, 1, ncodeunits(s))
+
+function length(s::AbstractString, i::Int, j::Int)
+    @boundscheck begin
+        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
+        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
     end
-    return n + hi - lo
+    n = 0
+    for k = i:j
+        @inbounds n += isvalid(s, k)
+    end
+    return n
 end
 
+@propagate_inbounds length(s::AbstractString, i::Integer, j::Integer) =
+    length(s, Int(i), Int(j))
+
 """
     thisind(s::AbstractString, i::Integer) -> Int
 
diff --git a/base/strings/string.jl b/base/strings/string.jl
index b1d365bb35b17..2cc20a714ea69 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -232,18 +232,16 @@ function getindex(s::String, r::UnitRange{Int})
     return ss
 end
 
-function length(s::String, lo::Int, hi::Int)
-    i, n = lo, hi
-    c = max(0, hi - lo + 1)
+function length(s::String, i::Int, j::Int)
     @boundscheck begin
-        z = ncodeunits(s)
-        i = Int(max(1, min(z, lo)))
-        n = Int(min(z, max(1, hi)))
+        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
+        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
     end
-    i < n || return c
-    @inbounds i, j = thisind(s, i), i
-    c -= i < j
-    _length(s, i, n, c)
+    j < i && return 0
+    c = j - i + 1
+    @inbounds i, k = thisind(s, i), i
+    c -= i < k
+    _length(s, i, j, c)
 end
 
 length(s::String) = _length(s, 1, ncodeunits(s), ncodeunits(s))
diff --git a/test/lineedit.jl b/test/lineedit.jl
index 61e1d3bd42562..cb870b8842422 100644
--- a/test/lineedit.jl
+++ b/test/lineedit.jl
@@ -17,7 +17,7 @@ function new_state()
 end
 
 charseek(buf, i) = seek(buf, nextind(content(buf), 0, i+1)-1)
-charpos(buf, pos=position(buf)) = length(content(buf), 1, pos+1)-1
+charpos(buf, pos=position(buf)) = length(content(buf), 1, pos)
 
 function transform!(f, s, i = -1) # i is char-based (not bytes) buffer position
     buf = buffer(s)
diff --git a/test/strings/basic.jl b/test/strings/basic.jl
index 49df4fe48252d..512eee29e0866 100644
--- a/test/strings/basic.jl
+++ b/test/strings/basic.jl
@@ -99,13 +99,13 @@ end
 end
 
 @testset "issue #7248" begin
-    @test length("hello", 1, -1) == 0
+    @test_throws BoundsError length("hello", 1, -1) == 0
     @test prevind("hello", 0, 1) == -1
-    @test length("hellø", 1, -1) == 0
+    @test_throws BoundsError length("hellø", 1, -1) == 0
     @test prevind("hellø", 0, 1) == -1
-    @test length("hello", 1, 10) == 10
+    @test_throws BoundsError length("hello", 1, 10) == 10
     @test nextind("hello", 0, 10) == 10
-    @test length("hellø", 1, 10) == 9
+    @test_throws BoundsError length("hellø", 1, 10) == 9
     @test nextind("hellø", 0, 10) == 11
     @test_throws BoundsError checkbounds("hello", 0)
     @test_throws BoundsError checkbounds("hello", 6)
diff --git a/test/strings/types.jl b/test/strings/types.jl
index 0af2713349465..f3c549ba6b36a 100644
--- a/test/strings/types.jl
+++ b/test/strings/types.jl
@@ -205,8 +205,10 @@ let s = "Σx + βz - 2"
 end
 
 let ss = SubString("hello", 1, 5)
-    @test length(ss, 1, -1) == 0
-    @test length(ss, 1, 10) == 10
+    @test length(ss, 1, 0) == 0
+    @test_throws BoundsError length(ss, 1, -1) == 0
+    @test_throws BoundsError length(ss, 1, 6)
+    @test_throws BoundsError length(ss, 1, 10)
     @test prevind(ss, 0, 1) == -1
     @test nextind(ss, 0, 10) == 10
 end

From 8de25f5ac6c8a8ef9a8872f2d9aaaee9ddbf6bf7 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Tue, 12 Dec 2017 16:31:25 -0500
Subject: [PATCH 22/22] bounds check thisind, nextind and prevind as well

---
 base/repl/REPL.jl         |   6 +-
 base/strings/basic.jl     |  30 ++++---
 base/strings/search.jl    |   3 +-
 base/strings/string.jl    |  11 ++-
 base/strings/substring.jl |  23 +++++-
 base/strings/util.jl      |  29 +++----
 test/strings/basic.jl     | 160 ++++++++++++++++++++------------------
 test/strings/types.jl     |  24 ++++--
 8 files changed, 168 insertions(+), 118 deletions(-)

diff --git a/base/repl/REPL.jl b/base/repl/REPL.jl
index f7585feaa2a05..c22010c168503 100644
--- a/base/repl/REPL.jl
+++ b/base/repl/REPL.jl
@@ -609,7 +609,11 @@ function history_search(hist::REPLHistoryProvider, query_buffer::IOBuffer, respo
 
     # Alright, first try to see if the current match still works
     a = position(response_buffer) + 1 # position is zero-indexed
-    b = min(endof(response_str), prevind(response_str, a + sizeof(searchdata))) # ensure that b is valid
+    # FIXME: I'm pretty sure this is broken since it uses an index
+    # into the search data to index into the response string
+    b = a + sizeof(searchdata)
+    b = b ≤ ncodeunits(response_str) ? prevind(response_str, b) : b-1
+    b = min(endof(response_str), b) # ensure that b is valid
 
     !skip_current && searchdata == response_str[a:b] && return true
 
diff --git a/base/strings/basic.jl b/base/strings/basic.jl
index fa607b6003d32..407ebb3638e7b 100644
--- a/base/strings/basic.jl
+++ b/base/strings/basic.jl
@@ -383,8 +383,12 @@ julia> thisind("αβγdef", 10)
 julia> thisind("αβγdef", 20)
 20
 """
-function thisind(s::AbstractString, i::Integer)
-    i ≤ ncodeunits(s) || return i
+thisind(s::AbstractString, i::Integer) = thisind(s, Int(i))
+
+function thisind(s::AbstractString, i::Int)
+    z = ncodeunits(s) + 1
+    i == z && return i
+    @boundscheck 0 ≤ i ≤ z || throw(BoundsError(s, i))
     @inbounds while 1 < i && !isvalid(s, i)
         i -= 1
     end
@@ -415,13 +419,14 @@ julia> prevind("αβγdef", 3, 2)
 0
 ```
 """
-function prevind(s::AbstractString, i::Integer, n::Integer=1)
+prevind(s::AbstractString, i::Integer, n::Integer) = prevind(s, Int(i), Int(n))
+prevind(s::AbstractString, i::Integer)             = prevind(s, Int(i))
+prevind(s::AbstractString, i::Int)                 = prevind(s, i, 1)
+
+function prevind(s::AbstractString, i::Int, n::Int)
     n < 0 && throw(ArgumentError("n cannot be negative: $n"))
     z = ncodeunits(s) + 1
-    if i > z
-        n -= i - z
-        i = z
-    end
+    @boundscheck 0 < i ≤ z || throw(BoundsError(s, i))
     while n > 0 && 1 < i
         @inbounds n -= isvalid(s, i -= 1)
     end
@@ -452,13 +457,14 @@ julia> nextind(str, 9)
 10
 ```
 """
-function nextind(s::AbstractString, i::Integer, n::Integer=1)
+nextind(s::AbstractString, i::Integer, n::Integer) = nextind(s, Int(i), Int(n))
+nextind(s::AbstractString, i::Integer)             = nextind(s, Int(i))
+nextind(s::AbstractString, i::Int)                 = nextind(s, i, 1)
+
+function nextind(s::AbstractString, i::Int, n::Int)
     n < 0 && throw(ArgumentError("n cannot be negative: $n"))
-    if i < 1
-        n += i - 1
-        i = 1
-    end
     z = ncodeunits(s)
+    @boundscheck 0 ≤ i ≤ z || throw(BoundsError(s, i))
     while n > 0 && i < z
         @inbounds n -= isvalid(s, i += 1)
     end
diff --git a/base/strings/search.jl b/base/strings/search.jl
index 5b0fa167a783f..4b108e4e0a0b5 100644
--- a/base/strings/search.jl
+++ b/base/strings/search.jl
@@ -412,7 +412,8 @@ function rsearchindex(s::String, t::String, i::Integer)
     if endof(t) == 1
         rsearch(s, t[1], i)
     elseif endof(t) != 0
-        _rsearchindex(s, t, nextind(s, i)-1)
+        j = i ≤ ncodeunits(s) ? nextind(s, i)-1 : i
+        _rsearchindex(s, t, j)
     elseif i > sizeof(s)
         return 0
     elseif i == 0
diff --git a/base/strings/string.jl b/base/strings/string.jl
index 2cc20a714ea69..223f0fc817b63 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -92,14 +92,12 @@ function ==(a::String, b::String)
     al == sizeof(b) && 0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, al)
 end
 
-## thisind, nextind, prevind ##
-
-thisind(s::String, i::Integer) = oftype(i, thisind(s, Int(i)))
-nextind(s::String, i::Integer) = oftype(i, nextind(s, Int(i)))
+## thisind, prevind, nextind ##
 
 function thisind(s::String, i::Int)
     n = ncodeunits(s)
-    between(i, 2, n) || return i
+    i == n + 1 && return i
+    @boundscheck between(i, 0, n) || throw(BoundsError(s, i))
     @inbounds b = codeunit(s, i)
     b & 0xc0 == 0x80 || return i
     @inbounds b = codeunit(s, i-1)
@@ -114,8 +112,9 @@ function thisind(s::String, i::Int)
 end
 
 function nextind(s::String, i::Int)
+    i == 0 && return 1
     n = ncodeunits(s)
-    between(i, 1, n-1) || return i+1
+    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
     @inbounds l = codeunit(s, i)
     (l < 0x80) | (0xf8 ≤ l) && return i+1
     if l < 0xc0
diff --git a/base/strings/substring.jl b/base/strings/substring.jl
index e389de6518b49..f7abe7dfb1e4f 100644
--- a/base/strings/substring.jl
+++ b/base/strings/substring.jl
@@ -78,9 +78,26 @@ function isvalid(s::SubString, i::Integer)
     @inbounds return ib && isvalid(s.string, s.offset + i)
 end
 
-thisind(s::SubString, i::Integer) = thisind(s.string, s.offset + i) - s.offset
-nextind(s::SubString, i::Integer) = nextind(s.string, s.offset + i) - s.offset
-prevind(s::SubString, i::Integer) = prevind(s.string, s.offset + i) - s.offset
+function thisind(s::SubString, i::Int)  # start index, in `s`, of the character that code unit `i` belongs to
+    @boundscheck 0 ≤ i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))  # 0 and ncodeunits(s)+1 are accepted
+    @inbounds return i == 0 ? 0 : thisind(s.string, s.offset + i) - s.offset  # 0 maps to 0; the parent byte at s.offset is not part of `s`
+end
+function nextind(s::SubString, i::Int, n::Int)  # index reached by advancing `n` characters from `i`
+    @boundscheck 0 ≤ i < ncodeunits(s)+1 || throw(BoundsError(s, i))  # accepts 0 through ncodeunits(s)
+    @inbounds return invoke(nextind, Tuple{AbstractString,Int,Int}, s, i, n)  # generic walk over `s` itself, so steps past its end never follow parent characters
+end
+function nextind(s::SubString, i::Int)  # index following `i`, computed on the parent string
+    @boundscheck 0 ≤ i < ncodeunits(s)+1 || throw(BoundsError(s, i))  # accepts 0 through ncodeunits(s)
+    @inbounds return nextind(s.string, s.offset + i) - s.offset  # shift into parent coordinates, step, shift back
+end
+function prevind(s::SubString, i::Int, n::Int)  # index reached by moving back `n` characters from `i`
+    @boundscheck 0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))  # accepts 1 through ncodeunits(s)+1
+    @inbounds return invoke(prevind, Tuple{AbstractString,Int,Int}, s, i, n)  # generic walk over `s` itself, so steps before its start never follow parent characters
+end
+function prevind(s::SubString, i::Int)  # index of the character preceding position `i` of `s`, or 0 if there is none
+    @boundscheck 0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))  # accepts 1 through ncodeunits(s)+1
+    @inbounds return i == 1 ? 0 : prevind(s.string, s.offset + i) - s.offset  # from 1 the answer is 0, whatever precedes `s` in the parent
+end
 
 function cmp(a::SubString{String}, b::SubString{String})
     na = sizeof(a)
diff --git a/base/strings/util.jl b/base/strings/util.jl
index d92969de12ee1..43e96fc1a8b22 100644
--- a/base/strings/util.jl
+++ b/base/strings/util.jl
@@ -281,17 +281,20 @@ function _split(str::AbstractString, splitter, limit::Integer, keep_empty::Bool,
     i = start(str)
     n = endof(str)
     r = search(str,splitter,i)
-    j, k = first(r), nextind(str,last(r))
-    while 0 < j <= n && length(strs) != limit-1
-        if i < k
-            if keep_empty || i < j
-                push!(strs, SubString(str,i,prevind(str,j)))
+    if r != 0:-1  # presumably 0:-1 is search's "no match" range; skipping here avoids nextind(str, -1)
+        j, k = first(r), nextind(str,last(r))
+        while 0 < j <= n && length(strs) != limit-1
+            if i < k
+                if keep_empty || i < j
+                    push!(strs, SubString(str,i,prevind(str,j)))  # piece from i up to the index before the match start j
+                end
+                i = k
             end
-            i = k
+            (k <= j) && (k = nextind(str,j))  # when k has not moved past j, restart the search after j
+            r = search(str,splitter,k)
+            r == 0:-1 && break  # stop before first/last/nextind are applied to the 0:-1 result
+            j, k = first(r), nextind(str,last(r))
         end
-        (k <= j) && (k = nextind(str,j))
-        r = search(str,splitter,k)
-        j, k = first(r), nextind(str,last(r))
     end
     if keep_empty || !done(str,i)
         push!(strs, SubString(str,i))
@@ -377,18 +380,16 @@ function replace_new(str::String, pattern, repl, count::Integer)
             unsafe_write(out, pointer(str, i), UInt(j-i))
             _replace(out, repl, str, r, pattern)
         end
-        if k<j
+        if k < j
             i = j
+            j > e && break  # leave before nextind(str, j) is evaluated for j > e
             k = nextind(str, j)
         else
             i = k = nextind(str, k)
         end
-        if j > e
-            break
-        end
         r = search(str,pattern,k)
+        (r == 0:-1 || n == count) && break  # parenthesized: `&&` binds tighter than `||`, so without them a 0:-1 result would not break here
         j, k = first(r), last(r)
-        n == count && break
         n += 1
     end
     write(out, SubString(str,i))
diff --git a/test/strings/basic.jl b/test/strings/basic.jl
index 512eee29e0866..512bbd0805943 100644
--- a/test/strings/basic.jl
+++ b/test/strings/basic.jl
@@ -99,11 +99,11 @@ end
 end
 
 @testset "issue #7248" begin
-    @test_throws BoundsError length("hello", 1, -1) == 0
-    @test prevind("hello", 0, 1) == -1
-    @test_throws BoundsError length("hellø", 1, -1) == 0
-    @test prevind("hellø", 0, 1) == -1
-    @test_throws BoundsError length("hello", 1, 10) == 10
+    @test_throws BoundsError length("hello", 1, -1)
+    @test_throws BoundsError prevind("hello", 0, 1)
+    @test_throws BoundsError length("hellø", 1, -1)
+    @test_throws BoundsError prevind("hellø", 0, 1)
+    @test_throws BoundsError length("hello", 1, 10)
     @test nextind("hello", 0, 10) == 10
     @test_throws BoundsError length("hellø", 1, 10) == 9
     @test nextind("hellø", 0, 10) == 11
@@ -512,7 +512,8 @@ end
                    SubString("123∀α>β:α+1>β123", 4, 18),
                    SubString(s"123∀α>β:α+1>β123", 4, 18)]
         for s in strs
-            @test thisind(s, -2) == -2
+            @test_throws BoundsError thisind(s, -2)
+            @test_throws BoundsError thisind(s, -1)
             @test thisind(s, 0) == 0
             @test thisind(s, 1) == 1
             @test thisind(s, 2) == 1
@@ -523,86 +524,97 @@ end
             @test thisind(s, 15) == 15
             @test thisind(s, 16) == 15
             @test thisind(s, 17) == 17
-            @test thisind(s, 30) == 30
+            @test_throws BoundsError thisind(s, 18)
+            @test_throws BoundsError thisind(s, 19)
         end
     end
 
     let strs = Any["", s"", SubString("123", 2, 1), SubString(s"123", 2, 1)]
-        for s in strs, i in -2:2
-            @test thisind(s, i) == i
+        for s in strs  # every entry is an empty string
+            @test_throws BoundsError thisind(s, -1)  # below 0 is out of bounds
+            @test thisind(s, 0) == 0
+            @test thisind(s, 1) == 1  # 1 == ncodeunits(s)+1 is still accepted
+            @test_throws BoundsError thisind(s, 2)  # beyond ncodeunits(s)+1 is out of bounds
         end
     end
 end
 
 @testset "prevind and nextind" begin
-    let strs = Any["∀α>β:α+1>β", GenericString("∀α>β:α+1>β")]
-        for i in 1:2
-            @test prevind(strs[i], 1) == 0
-            @test prevind(strs[i], 1, 1) == 0
-            @test prevind(strs[i], 2) == 1
-            @test prevind(strs[i], 2, 1) == 1
-            @test prevind(strs[i], 4) == 1
-            @test prevind(strs[i], 4, 1) == 1
-            @test prevind(strs[i], 5) == 4
-            @test prevind(strs[i], 5, 1) == 4
-            @test prevind(strs[i], 5, 2) == 1
-            @test prevind(strs[i], 5, 3) == 0
-            @test prevind(strs[i], 15) == 14
-            @test prevind(strs[i], 15, 1) == 14
-            @test prevind(strs[i], 15, 2) == 13
-            @test prevind(strs[i], 15, 3) == 12
-            @test prevind(strs[i], 15, 4) == 10
-            @test prevind(strs[i], 15, 10) == 0
-            @test prevind(strs[i], 15, 9) == 1
-            @test prevind(strs[i], 16) == 15
-            @test prevind(strs[i], 16, 1) == 15
-            @test prevind(strs[i], 16, 2) == 14
-            @test prevind(strs[i], 20) == 19
-            @test prevind(strs[i], 20, 1) == 19
-            @test prevind(strs[i], 20, 10) == 7
-            @test prevind(strs[i], 20, 0) == 20
-
-            @test nextind(strs[i], -1) == 0
-            @test nextind(strs[i], -1, 1) == 0
-            @test nextind(strs[i], -1, 2) == 1
-            @test nextind(strs[i], -1, 3) == 4
-            @test nextind(strs[i], 0, 2) == 4
-            @test nextind(strs[i], 0, 20) == 26
-            @test nextind(strs[i], 0, 10) == 15
-            @test nextind(strs[i], 1) == 4
-            @test nextind(strs[i], 1, 1) == 4
-            @test nextind(strs[i], 1, 2) == 6
-            @test nextind(strs[i], 1, 9) == 15
-            @test nextind(strs[i], 1, 10) == 17
-            @test nextind(strs[i], 2) == 4
-            @test nextind(strs[i], 2, 1) == 4
-            @test nextind(strs[i], 3) == 4
-            @test nextind(strs[i], 3, 1) == 4
-            @test nextind(strs[i], 4) == 6
-            @test nextind(strs[i], 4, 1) == 6
-            @test nextind(strs[i], 14) == 15
-            @test nextind(strs[i], 14, 1) == 15
-            @test nextind(strs[i], 15) == 17
-            @test nextind(strs[i], 15, 1) == 17
-            @test nextind(strs[i], 20) == 21
-            @test nextind(strs[i], 20, 1) == 21
-            @test nextind(strs[i], 20, 0) == 20
-
-            for x in -10:20
-                n = p = x
-                for j in 1:40
-                    p = prevind(strs[i], p)
-                    @test prevind(strs[i], x, j) == p
-                    n = nextind(strs[i], n)
-                    @test nextind(strs[i], x, j) == n
+    for s in Any["∀α>β:α+1>β", GenericString("∀α>β:α+1>β")]  # 10 characters, 16 code units in UTF-8
+        @test_throws BoundsError prevind(s, 0)  # prevind rejects 0, with or without a step count
+        @test_throws BoundsError prevind(s, 0, 0)
+        @test_throws BoundsError prevind(s, 0, 1)
+        @test prevind(s, 1) == 0
+        @test prevind(s, 1, 1) == 0
+        @test prevind(s, 1, 0) == 1  # zero steps from a valid index returns that index
+        @test prevind(s, 2) == 1
+        @test prevind(s, 2, 1) == 1
+        @test prevind(s, 4) == 1
+        @test prevind(s, 4, 1) == 1
+        @test prevind(s, 5) == 4
+        @test prevind(s, 5, 1) == 4
+        @test prevind(s, 5, 2) == 1
+        @test prevind(s, 5, 3) == 0
+        @test prevind(s, 15) == 14
+        @test prevind(s, 15, 1) == 14
+        @test prevind(s, 15, 2) == 13
+        @test prevind(s, 15, 3) == 12
+        @test prevind(s, 15, 4) == 10
+        @test prevind(s, 15, 10) == 0
+        @test prevind(s, 15, 9) == 1
+        @test prevind(s, 16) == 15
+        @test prevind(s, 16, 1) == 15
+        @test prevind(s, 16, 2) == 14
+        @test prevind(s, 17) == 15  # 17 == ncodeunits(s)+1 is the largest accepted start
+        @test prevind(s, 17, 1) == 15
+        @test prevind(s, 17, 2) == 14
+        @test_throws BoundsError prevind(s, 18)  # beyond ncodeunits(s)+1
+        @test_throws BoundsError prevind(s, 18, 0)
+        @test_throws BoundsError prevind(s, 18, 1)
+
+        @test_throws BoundsError nextind(s, -1)  # nextind rejects negative starts, with or without a step count
+        @test_throws BoundsError nextind(s, -1, 0)
+        @test_throws BoundsError nextind(s, -1, 1)
+        @test nextind(s, 0, 2) == 4
+        @test nextind(s, 0, 20) == 26  # more steps than characters: the result runs past ncodeunits(s)
+        @test nextind(s, 0, 10) == 15
+        @test nextind(s, 1) == 4
+        @test nextind(s, 1, 1) == 4
+        @test nextind(s, 1, 2) == 6
+        @test nextind(s, 1, 9) == 15
+        @test nextind(s, 1, 10) == 17
+        @test nextind(s, 2) == 4
+        @test nextind(s, 2, 1) == 4
+        @test nextind(s, 3) == 4
+        @test nextind(s, 3, 1) == 4
+        @test nextind(s, 4) == 6
+        @test nextind(s, 4, 1) == 6
+        @test nextind(s, 14) == 15
+        @test nextind(s, 14, 1) == 15
+        @test nextind(s, 15) == 17
+        @test nextind(s, 15, 1) == 17
+        @test nextind(s, 15, 2) == 18
+        @test nextind(s, 16) == 17
+        @test nextind(s, 16, 1) == 17
+        @test nextind(s, 16, 2) == 18
+        @test nextind(s, 16, 3) == 19
+        @test_throws BoundsError nextind(s, 17)  # 17 == ncodeunits(s)+1 is not an accepted start for nextind
+        @test_throws BoundsError nextind(s, 17, 0)
+        @test_throws BoundsError nextind(s, 17, 1)
+
+        for x in 0:ncodeunits(s)+1  # check that j single steps agree with one j-step call
+            n = p = x
+            for j in 1:40
+                if 1 ≤ p  # prevind throws for 0, so stop stepping back once p reaches 0
+                    p = prevind(s, p)
+                    @test prevind(s, x, j) == p
+                end
+                if n ≤ ncodeunits(s)  # nextind throws past ncodeunits(s), so stop stepping forward there
+                    n = nextind(s, n)
+                    @test nextind(s, x, j) == n
                 end
             end
         end
-        @test prevind(strs[1], -1) == -2
-        @test prevind(strs[1], -1, 1) == -2
-
-        @test prevind(strs[2], -1) == -2
-        @test prevind(strs[2], -1, 1) == -2
     end
 end
 
diff --git a/test/strings/types.jl b/test/strings/types.jl
index f3c549ba6b36a..b849ddac07573 100644
--- a/test/strings/types.jl
+++ b/test/strings/types.jl
@@ -146,7 +146,7 @@ end
 @test prevind(SubString("{var}",2,4),4) == 3
 
 # issue #4183
-@test split(SubString("x", 2, 0), "y") == AbstractString[""]
+@test split(SubString("x", 2, 0), "y") == [""]
 
 # issue #6772
 @test parse(Float64, SubString("10",1,1)) === 1.0
@@ -157,7 +157,7 @@ end
 @test !ismatch(Regex("aa"), SubString("",1,0))
 @test ismatch(Regex(""), SubString("",1,0))
 
-# isvalid(), formerly length() and nextind() for SubString{String}
+# isvalid, length, prevind, nextind for SubString{String}
 let s = "lorem ipsum", sdict = Dict(
     SubString(s, 1, 11)  => "lorem ipsum",
     SubString(s, 1, 6)   => "lorem ",
@@ -177,10 +177,14 @@ let s = "lorem ipsum", sdict = Dict(
     end
     for (ss, s) in sdict
         @test length(ss) == length(s)
-        for i in 0:length(ss)+1, j = 0:length(ss)+1
+        for i in 0:ncodeunits(ss), j = 0:length(ss)+1  # i covers nextind's accepted starts; i+1 covers prevind's
+            @test prevind(ss, i+1, j) == prevind(s, i+1, j)  # SubString must agree with the equal String
             @test nextind(ss, i, j) == nextind(s, i, j)
-            @test prevind(ss, i, j) == prevind(s, i, j)
         end
+        @test_throws BoundsError prevind(s, 0)  # the first index outside each accepted range throws for both types
+        @test_throws BoundsError prevind(ss, 0)
+        @test_throws BoundsError nextind(s, ncodeunits(ss)+1)
+        @test_throws BoundsError nextind(ss, ncodeunits(ss)+1)
     end
 end
 
@@ -206,11 +210,17 @@ end
 
 let ss = SubString("hello", 1, 5)
     @test length(ss, 1, 0) == 0
-    @test_throws BoundsError length(ss, 1, -1) == 0
+    @test_throws BoundsError length(ss, 1, -1)  # the trailing `== 0` was dropped; only the throw is checked
     @test_throws BoundsError length(ss, 1, 6)
     @test_throws BoundsError length(ss, 1, 10)
-    @test prevind(ss, 0, 1) == -1
-    @test nextind(ss, 0, 10) == 10
+    @test_throws BoundsError prevind(ss, 0, 1)  # prevind accepts 1 through ncodeunits(ss)+1 == 6
+    @test prevind(ss, 1, 1) == 0
+    @test prevind(ss, 6, 1) == 5
+    @test_throws BoundsError prevind(ss, 7, 1)
+    @test_throws BoundsError nextind(ss, -1, 1)  # nextind accepts 0 through ncodeunits(ss) == 5
+    @test nextind(ss, 0, 1) == 1
+    @test nextind(ss, 5, 1) == 6
+    @test_throws BoundsError nextind(ss, 6, 1)
 end
 
 # length(SubString{String}) performance specialization