From e596847b8648529e29a95fe38b4866eff08f41c2 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Fri, 8 Dec 2017 17:23:17 -0500 Subject: [PATCH 01/22] string overhaul: new Char representation, revamped core string API --- base/char.jl | 86 ++++- base/filesystem.jl | 20 ++ base/intfuncs.jl | 4 +- base/io.jl | 66 ++-- base/iostream.jl | 32 +- base/parse.jl | 4 +- base/regex.jl | 8 +- base/repl/REPLCompletions.jl | 2 +- base/stream.jl | 8 + base/strings/basic.jl | 580 +++++++++++++++++--------------- base/strings/io.jl | 74 ++-- base/strings/string.jl | 433 ++++++++++-------------- base/strings/substring.jl | 144 ++------ base/strings/unicode.jl | 135 +++++--- base/strings/util.jl | 35 +- src/ast.c | 12 +- src/datatype.c | 18 +- src/jl_uv.c | 15 +- stdlib/Test/src/Test.jl | 7 +- stdlib/Unicode/test/runtests.jl | 4 +- test/char.jl | 22 ++ test/intfuncs.jl | 2 +- test/lineedit.jl | 4 +- test/strings/basic.jl | 94 +++--- test/strings/io.jl | 3 +- test/strings/types.jl | 119 ++++--- test/unicode/utf8.jl | 21 +- 27 files changed, 1001 insertions(+), 951 deletions(-) diff --git a/base/char.jl b/base/char.jl index 9b99bb50b086f..6d21af949ebe8 100644 --- a/base/char.jl +++ b/base/char.jl @@ -1,8 +1,58 @@ # This file is a part of Julia. 
License is MIT: https://julialang.org/license -convert(::Type{Char}, x::UInt32) = reinterpret(Char, x) +struct MalformedCharError <: Exception + char::Char +end +struct CodePointError <: Exception + code::Integer +end +@noinline malformed_char(c::Char) = throw(MalformedCharError(c)) +@noinline code_point_err(u::UInt32) = throw(CodePointError(u)) + +function ismalformed(c::Char) + u = reinterpret(UInt32, c) + l1 = leading_ones(u) << 3 + t0 = trailing_zeros(u) & 56 + (l1 == 8) | (l1 + t0 > 32) | + (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) +end + +function convert(::Type{UInt32}, c::Char) + # TODO: use optimized inline LLVM + u = reinterpret(UInt32, c) + u < 0x80000000 && return reinterpret(UInt32, u >> 24) + l1 = leading_ones(u) + t0 = trailing_zeros(u) & 56 + (l1 == 1) | (8l1 + t0 > 32) | + (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) && + malformed_char(c)::Union{} + u &= 0xffffffff >> l1 + u >>= t0 + (u & 0x0000007f >> 0) | (u & 0x00007f00 >> 2) | + (u & 0x007f0000 >> 4) | (u & 0x7f000000 >> 6) +end + +function convert(::Type{Char}, u::UInt32) + u < 0x80 && return reinterpret(Char, u << 24) + u < 0x00200000 || code_point_err(u)::Union{} + c = ((u << 0) & 0x0000003f) | ((u << 2) & 0x00003f00) | + ((u << 4) & 0x003f0000) | ((u << 6) & 0x3f000000) + c = u < 0x00000800 ? (c << 16) | 0xc0800000 : + u < 0x00010000 ? (c << 08) | 0xe0808000 : + (c << 00) | 0xf0808080 + reinterpret(Char, c) +end + +function convert(::Type{T}, c::Char) where T <: Union{Int8,UInt8} + i = reinterpret(Int32, c) + i ≥ 0 ? ((i >>> 24) % T) : T(UInt32(c)) +end + +function convert(::Type{Char}, b::Union{Int8,UInt8}) + 0 ≤ b ≤ 0x7f ? 
reinterpret(Char, (b % UInt32) << 24) : Char(UInt32(b)) +end + convert(::Type{Char}, x::Number) = Char(UInt32(x)) -convert(::Type{UInt32}, x::Char) = reinterpret(UInt32, x) convert(::Type{T}, x::Char) where {T<:Number} = convert(T, UInt32(x)) rem(x::Char, ::Type{T}) where {T<:Number} = rem(UInt32(x), T) @@ -29,11 +79,9 @@ done(c::Char, state) = state isempty(c::Char) = false in(x::Char, y::Char) = x == y -==(x::Char, y::Char) = UInt32(x) == UInt32(y) -isless(x::Char, y::Char) = UInt32(x) < UInt32(y) - -const hashchar_seed = 0xd4d64234 -hash(x::Char, h::UInt) = hash_uint64(((UInt64(x)+hashchar_seed)<<32) ⊻ UInt64(h)) +==(x::Char, y::Char) = reinterpret(UInt32, x) == reinterpret(UInt32, y) +isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y) +hash(x::Char, h::UInt) = hash(reinterpret(UInt32, x), hash(Char, h)) -(x::Char, y::Char) = Int(x) - Int(y) -(x::Char, y::Integer) = Char(Int32(x) - Int32(y)) @@ -66,7 +114,7 @@ function show(io::IO, c::Char) end if Unicode.isprint(c) write(io, 0x27, c, 0x27) - else + elseif !ismalformed(c) u = UInt32(c) write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55) d = max(2, 8 - (leading_zeros(u) >> 2)) @@ -74,13 +122,29 @@ function show(io::IO, c::Char) write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1]) end write(io, 0x27) + else # malformed + write(io, 0x27) + u = reinterpret(UInt32, c) + while true + a = hex_chars[((u >> 28) & 0xf) + 1] + b = hex_chars[((u >> 24) & 0xf) + 1] + write(io, 0x5c, 'x', a, b) + (u <<= 8) == 0 && break + end + write(io, 0x27) end return end function show(io::IO, ::MIME"text/plain", c::Char) show(io, c) - u = UInt32(c) - print(io, ": ", Unicode.isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4)) - print(io, " (category ", Unicode.category_abbrev(c), ": ", Unicode.category_string(c), ")") + if !ismalformed(c) + u = UInt32(c) + print(io, ": ", Unicode.isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 
6 : 4)) + else + print(io, ": Malformed UTF-8") + end + abr = Unicode.category_abbrev(c) + str = Unicode.category_string(c) + print(io, " (category ", abr, ": ", str, ")") end diff --git a/base/filesystem.jl b/base/filesystem.jl index c5f8e4b10854d..6268d1d420752 100644 --- a/base/filesystem.jl +++ b/base/filesystem.jl @@ -149,6 +149,26 @@ function read(f::File, ::Type{UInt8}) return ret % UInt8 end +function read(f::File, ::Type{Char}) + b0 = read(f, UInt8) + l = 8(4-leading_ones(b0)) + c = UInt32(b0) << 24 + if l < 24 + s = 16 + while s ≥ l && !eof(f) + p = position(f) + b = read(f, UInt8) + if b & 0xc0 != 0x80 + seek(f, p) + break + end + c |= UInt32(b) << s + s -= 8 + end + end + return reinterpret(Char, c) +end + function unsafe_read(f::File, p::Ptr{UInt8}, nel::UInt) check_open(f) ret = ccall(:jl_fs_read, Int32, (Int32, Ptr{Void}, Csize_t), diff --git a/base/intfuncs.jl b/base/intfuncs.jl index abc1fd95b3e6a..76b45f90cf4e8 100644 --- a/base/intfuncs.jl +++ b/base/intfuncs.jl @@ -654,8 +654,8 @@ for sym in (:bin, :oct, :dec, :hex) @eval begin ($sym)(x::Unsigned, p::Int) = ($sym)(x,p,false) ($sym)(x::Unsigned) = ($sym)(x,1,false) - ($sym)(x::Char, p::Int) = ($sym)(unsigned(x),p,false) - ($sym)(x::Char) = ($sym)(unsigned(x),1,false) + ($sym)(x::Char, p::Int) = ($sym)(UInt32(x),p,false) + ($sym)(x::Char) = ($sym)(UInt32(x),1,false) ($sym)(x::Integer, p::Int) = ($sym)(unsigned(abs(x)),p,x<0) ($sym)(x::Integer) = ($sym)(unsigned(abs(x)),1,x<0) end diff --git a/base/io.jl b/base/io.jl index 4f88d24cd7a0a..0cd51fd955bdc 100644 --- a/base/io.jl +++ b/base/io.jl @@ -535,25 +535,13 @@ function write(s::IO, a::SubArray{T,N,<:Array}) where {T,N} end end - -function write(s::IO, ch::Char) - c = reinterpret(UInt32, ch) - if c < 0x80 - return write(s, c%UInt8) - elseif c < 0x800 - return (write(s, (( c >> 6 ) | 0xC0)%UInt8)) + - (write(s, (( c & 0x3F ) | 0x80)%UInt8)) - elseif c < 0x10000 - return (write(s, (( c >> 12 ) | 0xE0)%UInt8)) + - (write(s, (((c >> 6) & 0x3F ) | 
0x80)%UInt8)) + - (write(s, (( c & 0x3F ) | 0x80)%UInt8)) - elseif c < 0x110000 - return (write(s, (( c >> 18 ) | 0xF0)%UInt8)) + - (write(s, (((c >> 12) & 0x3F ) | 0x80)%UInt8)) + - (write(s, (((c >> 6) & 0x3F ) | 0x80)%UInt8)) + - (write(s, (( c & 0x3F ) | 0x80)%UInt8)) - else - return write(s, '\ufffd') +function write(io::IO, c::Char) + u = bswap(reinterpret(UInt32, c)) + n = 1 + while true + write(io, u % UInt8) + (u >>= 8) == 0 && return n + n += 1 end end @@ -596,23 +584,20 @@ function read!(s::IO, a::Array{T}) where T return a end -function read(s::IO, ::Type{Char}) - ch = read(s, UInt8) - if ch < 0x80 - return Char(ch) - end - - # mimic utf8.next function - trailing = Base.utf8_trailing[ch+1] - c::UInt32 = 0 - for j = 1:trailing - c += ch - c <<= 6 - ch = read(s, UInt8) +function read(io::IO, ::Type{Char}) + b0 = read(io, UInt8) + l = 8(4-leading_ones(b0)) + c = UInt32(b0) << 24 + if l < 24 + s = 16 + while s ≥ l && !eof(io) + peek(io) & 0xc0 == 0x80 || break + b = read(io, UInt8) + c |= UInt32(b) << s + s -= 8 + end end - c += ch - c -= Base.utf8_offset[trailing+1] - return Char(c) + return reinterpret(Char, c) end # readuntil_string is useful below since it has @@ -620,7 +605,7 @@ end readuntil_string(s::IO, delim::UInt8) = String(readuntil(s, delim)) function readuntil(s::IO, delim::Char) - if delim < Char(0x80) + if delim ≤ '\x7f' return readuntil_string(s, delim % UInt8) end out = IOBuffer() @@ -701,7 +686,7 @@ function readuntil(io::IO, target::AbstractString) i = start(target) done(target, i) && return "" c, i = next(target, start(target)) - if done(target, i) && c < Char(0x80) + if done(target, i) && c <= '\x7f' return readuntil_string(io, c % UInt8) end # decide how we can index target @@ -728,12 +713,11 @@ function readuntil(io::IO, target::AbstractVector{T}) where T return out end - """ readchomp(x) -Read the entirety of `x` as a string and remove a single trailing newline. -Equivalent to `chomp!(read(x, String))`. 
+Read the entirety of `x` as a string and remove a single trailing newline +if there is one. Equivalent to `chomp(read(x, String))`. # Examples ```jldoctest @@ -747,7 +731,7 @@ julia> readchomp("my_file.txt") julia> rm("my_file.txt"); ``` """ -readchomp(x) = chomp!(read(x, String)) +readchomp(x) = chomp(read(x, String)) # read up to nb bytes into nb, returning # bytes read diff --git a/base/iostream.jl b/base/iostream.jl index 117bf77e7f8a6..347b86ca10f34 100644 --- a/base/iostream.jl +++ b/base/iostream.jl @@ -315,12 +315,13 @@ end ## low-level calls ## -write(s::IOStream, b::UInt8) = Int(ccall(:ios_putc, Cint, (Cint, Ptr{Void}), b, s.ios)) +function write(s::IOStream, b::UInt8) + iswritable(s) || throw(ArgumentError("write failed, IOStream is not writeable")) + Int(ccall(:ios_putc, Cint, (Cint, Ptr{Void}), b, s.ios)) +end function unsafe_write(s::IOStream, p::Ptr{UInt8}, nb::UInt) - if !iswritable(s) - throw(ArgumentError("write failed, IOStream is not writeable")) - end + iswritable(s) || throw(ArgumentError("write failed, IOStream is not writeable")) return Int(ccall(:ios_write, Csize_t, (Ptr{Void}, Ptr{Void}, Csize_t), s.ios, p, nb)) end @@ -353,14 +354,6 @@ end ## text I/O ## -function write(s::IOStream, c::Char) - if !iswritable(s) - throw(ArgumentError("write failed, IOStream is not writeable")) - end - Int(ccall(:ios_pututf8, Cint, (Ptr{Void}, UInt32), s.ios, c)) -end -read(s::IOStream, ::Type{Char}) = Char(ccall(:jl_getutf8, UInt32, (Ptr{Void},), s.ios)) - take!(s::IOStream) = ccall(:jl_take_buffer, Vector{UInt8}, (Ptr{Void},), s.ios) @@ -452,14 +445,23 @@ function read(s::IOStream, nb::Integer; all::Bool=true) end ## Character streams ## -const _chtmp = Ref{Char}() + function peekchar(s::IOStream) - if ccall(:ios_peekutf8, Cint, (Ptr{Void}, Ptr{Char}), s, _chtmp) < 0 + chref = Ref{UInt32}() + if ccall(:ios_peekutf8, Cint, (Ptr{Void}, Ptr{UInt32}), s, chref) < 0 return typemax(Char) end - return _chtmp[] + return Char(chref[]) end function 
peek(s::IOStream) ccall(:ios_peekc, Cint, (Ptr{Void},), s) end + +function peek(s::IO) + mark(s) + try read(s, UInt8) + finally + reset(s) + end +end diff --git a/base/parse.jl b/base/parse.jl index ddbf833cb162f..7181b3538c457 100644 --- a/base/parse.jl +++ b/base/parse.jl @@ -224,12 +224,12 @@ end ## string to float functions ## tryparse(::Type{Float64}, s::String) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s)) -tryparse(::Type{Float64}, s::SubString{String}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof) +tryparse(::Type{Float64}, s::SubString{String}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.ncodeunits) tryparse_internal(::Type{Float64}, s::String, startpos::Int, endpos::Int) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, startpos-1, endpos-startpos+1) tryparse_internal(::Type{Float64}, s::SubString{String}, startpos::Int, endpos::Int) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset+startpos-1, endpos-startpos+1) tryparse(::Type{Float32}, s::String) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s)) -tryparse(::Type{Float32}, s::SubString{String}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof) +tryparse(::Type{Float32}, s::SubString{String}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.ncodeunits) tryparse_internal(::Type{Float32}, s::String, startpos::Int, endpos::Int) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, startpos-1, endpos-startpos+1) tryparse_internal(::Type{Float32}, s::SubString{String}, startpos::Int, endpos::Int) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset+startpos-1, 
endpos-startpos+1) diff --git a/base/regex.jl b/base/regex.jl index 555032ef30d23..2fd5a804bf1af 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -303,8 +303,12 @@ struct SubstitutionString{T<:AbstractString} <: AbstractString string::T end -endof(s::SubstitutionString) = endof(s.string) -next(s::SubstitutionString, idx::Int) = next(s.string, idx) +ncodeunits(s::SubstitutionString) = ncodeunits(s.string) +codeunit(s::SubstitutionString) = codeunit(s.string) +codeunit(s::SubstitutionString, i::Integer) = codeunit(s.string, i) +isvalid(s::SubstitutionString, i::Integer) = isvalid(s.string, i) +next(s::SubstitutionString, i::Integer) = next(s.string, i) + function show(io::IO, s::SubstitutionString) print(io, "s") show(io, s.string) diff --git a/base/repl/REPLCompletions.jl b/base/repl/REPLCompletions.jl index 3e5056d613f26..2c4ba328093fa 100644 --- a/base/repl/REPLCompletions.jl +++ b/base/repl/REPLCompletions.jl @@ -106,7 +106,7 @@ const sorted_keywords = [ "primitive type", "quote", "return", "struct", "true", "try", "using", "while"] -function complete_keyword(s::String) +function complete_keyword(s::Union{String,SubString{String}}) r = searchsorted(sorted_keywords, s) i = first(r) n = length(sorted_keywords) diff --git a/base/stream.jl b/base/stream.jl index 24831e3ce83da..e60d5386ce7a5 100644 --- a/base/stream.jl +++ b/base/stream.jl @@ -1148,6 +1148,14 @@ unmark(x::LibuvStream) = unmark(x.buffer) reset(x::LibuvStream) = reset(x.buffer) ismarked(x::LibuvStream) = ismarked(x.buffer) +function peek(s::LibuvStream) + mark(s) + try read(s, UInt8) + finally + reset(s) + end +end + # BufferStream's are non-OS streams, backed by a regular IOBuffer mutable struct BufferStream <: LibuvStream buffer::IOBuffer diff --git a/base/strings/basic.jl b/base/strings/basic.jl index 0c3044573a215..7ff30d09bc027 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -1,57 +1,188 @@ # This file is a part of Julia. 
License is MIT: https://julialang.org/license -## core string functions ## +""" +The `AbstractString` type is the supertype of all string implementations in +Julia. Strings are encodings of sequences of [Unicode](https://unicode.org/) +code points as represented by the `Char` type. Julia makes a few assumptions +about strings: + +* Strings are encoded in terms of fixed-size "code units" + * Code units can be extracted with `codeunit(s, i)` + * The first code unit has index `1` + * The last code unit has index `ncodeunits(s)` + * Any index `i` such that `1 ≤ i ≤ ncodeunits(s)` is in bounds +* String indexing is done in terms of these code units: + * Characters are extracted by `s[i]` with a valid string index `i` + * Each `Char` in a string is encoded by one or more code units + * Only the index of the first code unit of a `Char` is a valid index + * The encoding of a `Char` is independent of what precedes or follows it + * String encodings are "self-synchronizing" – i.e. `isvalid(s,i)` is O(1) + +Some string functions error if you use an out-of-bounds or invalid string index, +including code unit extraction `codeunit(s,i)`, string indexing `s[i]`, and +string iteration `next(s,i)`. Other string functions take a more relaxed +approach to indexing and give you the closest valid string index when in-bounds, +or when out-of-bounds, behave as if there were an infinite number of characters +padding each side of the string. Usually these imaginary padding characters have +code unit length `1`, but string types may choose different sizes. Relaxed +indexing functions include those intended for index arithmetic: `thisind`, +`nextind` and `prevind`. This model allows index arithmetic to work with out-of- +bounds indices as intermediate values so long as one never uses them to retrieve +a character, which often helps avoid needing to code around edge cases. 
+ +See also: `codeunit`, `ncodeunits`, `thisind`, `nextind`, `prevind` +""" +AbstractString -endof(s::AbstractString) = error("you must implement endof(", typeof(s), ")") -next(s::AbstractString, i::Int) = error("you must implement next(", typeof(s), ",Int)") -next(s::AbstractString, i::Integer) = next(s,Int(i)) +## required string functions ## -string() = "" -string(s::AbstractString) = s +""" + ncodeunits(s::AbstractString) -> Int -(::Type{Vector{UInt8}})(s::AbstractString) = Vector{UInt8}(String(s)) -(::Type{Array{UInt8}})(s::AbstractString) = Vector{UInt8}(s) -(::Type{Vector{Char}})(s::AbstractString) = collect(s) +Return the number of code units in a string. Indices that are in bounds to +access this string must satisfy `1 ≤ i ≤ ncodeunits(s)`. Not all such indices +are valid – they may not be the start of a character, but they will return a +code unit value when calling `codeunit(s,i)`. -Symbol(s::AbstractString) = Symbol(String(s)) +See also: `codeunit`, `checkbounds`, `sizeof`, `length`, `endof` +""" +ncodeunits(s::AbstractString) -# string types are convertible -convert(::Type{T}, s::T) where {T<:AbstractString} = s -convert(::Type{T}, s::AbstractString) where {T<:AbstractString} = T(s) +""" + codeunit(s::AbstractString) -> Type{<:Union{UInt8, UInt16, UInt32}} -## generic supplied functions ## +Return the code unit type of the given string object. For ASCII, Latin-1, or +UTF-8 encoded strings, this would be `UInt8`; for UCS-2 and UTF-16 it would be +`UInt16`; for UTF-32 it would be `UInt32`. The unit code type need not be +limited to these three types, but it's hard to think of widely used string +encodings that don't use one of these units. `codeunit(s)` is the same as +`typeof(codeunit(s,1))` when `s` is a non-empty string. 
-start(s::AbstractString) = 1 -done(s::AbstractString,i) = (i > endof(s)) -getindex(s::AbstractString, i::Int) = next(s,i)[1] -getindex(s::AbstractString, i::Integer) = s[Int(i)] -getindex(s::AbstractString, i::Colon) = s -getindex(s::AbstractString, r::UnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))] -# TODO: handle other ranges with stride ±1 specially? -getindex(s::AbstractString, v::AbstractVector{<:Integer}) = - sprint(length(v), io->(for i in v; write(io,s[i]) end)) -getindex(s::AbstractString, v::AbstractVector{Bool}) = - throw(ArgumentError("logical indexing not supported for strings")) +See also: `ncodeunits` +""" +codeunit(s::AbstractString) -get(s::AbstractString, i::Integer, default) = isvalid(s,i) ? s[i] : default +""" + codeunit(s::AbstractString, i::Integer) -> Union{UInt8, UInt16, UInt32} + +Return the code unit value in the string `s` at index `i`. Note that + + codeunit(s, i) :: codeunit(s) + +I.e. the value returned by `codeunit(s, i)` is of the type returned by +`codeunit(s)`. + +See also: `ncodeunits`, `checkbounds` +""" +codeunit(s::AbstractString, i::Integer) = typeof(i) === Int ? + throw(MethodError(codeunit, Tuple{typeof(s),Int})) : + codeunit(s, Int(i)) """ - sizeof(s::AbstractString) + isvalid(s::AbstractString, i::Integer) -> Bool -The number of bytes in string `s`. +Predicate indicating whether the given index is the start of the encoding of +a character in `s` or not. If `isvalid(s, i)` is true then `s[i]` will return +the character whose encoding starts at that index, if it's false, then `s[i]` +will raise an invalid index error. Behavior of `next(s, i)` is similar except +that the character is returned along with the index of the following character. +In order for `isvalid(s, i)` to be an O(1) function, the encoding of `s` must +be [self-synchronizing](https://en.wikipedia.org/wiki/Self-synchronizing_code); +this is a basic assumption of Julia's generic string support. 
+ +See also: `getindex`, `next`, `thisind`, `nextind`, `prevind`, `length` # Examples + ```jldoctest -julia> sizeof("❤") -3 +julia> str = "αβγdef"; + +julia> isvalid(str, 1) +true + +julia> str[1] +'α': Unicode U+03b1 (category Ll: Letter, lowercase) + +julia> isvalid(str, 2) +false + +julia> str[2] +ERROR: UnicodeError: invalid character index +Stacktrace: +[...] ``` """ -sizeof(s::AbstractString) = error("type $(typeof(s)) has no canonical binary representation") +isvalid(s::AbstractString, i::Integer) = typeof(i) === Int ? + throw(MethodError(isvalid, Tuple{typeof(s),Int})) : + isvalid(s, Int(i)) + +""" + next(s::AbstractString, i::Integer) -> Tuple{Char, Int} +Return a tuple of the character in `s` at index `i` with the index of the start +of the following character in `s`. This is the key method that allows strings to +be iterated, yielding a sequences of characters. If `i` is out of bounds in `s` +then a bounds error is raised; if `i` is not a valid character index in `s` then +a Unicode index error is raised. + +See also: `getindex`, `start`, `done`, `checkbounds` +""" +next(s::AbstractString, i::Integer) = typeof(i) === Int ? + throw(MethodError(next, Tuple{typeof(s),Int})) : + next(s, Int(i)) + +## basic generic definitions ## + +start(s::AbstractString) = 1 +done(s::AbstractString, i::Integer) = i > ncodeunits(s) eltype(::Type{<:AbstractString}) = Char +sizeof(s::AbstractString) = ncodeunits(s) * sizeof(codeunit(s)) +endof(s::AbstractString) = thisind(s, ncodeunits(s)) + +getindex(s::AbstractString, i::Integer) = next(s, i)[1] +getindex(s::AbstractString, i::Colon) = s +# TODO: handle other ranges with stride ±1 specially? 
+getindex(s::AbstractString, r::UnitRange{<:Integer}) = SubString(s, r) +getindex(s::AbstractString, v::AbstractVector{<:Integer}) = + sprint(length(v), io->(for i in v; write(io, s[i]) end)) +getindex(s::AbstractString, v::AbstractVector{Bool}) = + throw(ArgumentError("logical indexing not supported for strings")) + +get(s::AbstractString, i::Integer, default) = checkbounds(Bool, s, i) ? s[i] : default + +## bounds checking ## + +checkbounds(::Type{Bool}, s::AbstractString, i::Integer) = + 1 ≤ i ≤ ncodeunits(s) +checkbounds(::Type{Bool}, s::AbstractString, r::AbstractRange{<:Integer}) = + isempty(r) || (1 ≤ minimum(r) && maximum(r) ≤ ncodeunits(s)) +checkbounds(::Type{Bool}, s::AbstractString, I::AbstractArray{<:Real}) = + all(i -> checkbounds(s, i), I) +checkbounds(::Type{Bool}, s::AbstractString, I::AbstractArray{<:Integer}) = + all(i -> checkbounds(s, i), I) +checkbounds(s::AbstractString, I::Union{Integer,AbstractArray}) = + checkbounds(Bool, s, I) || throw(BoundsError(s, I)) + +## construction, conversion, promotion ## + +string() = "" +string(s::AbstractString) = s + +(::Type{Vector{UInt8}})(s::AbstractString) = Vector{UInt8}(String(s)) +(::Type{Array{UInt8}})(s::AbstractString) = Vector{UInt8}(s) +(::Type{Vector{Char}})(s::AbstractString) = collect(s) + +Symbol(s::AbstractString) = Symbol(String(s)) + +convert(::Type{T}, s::T) where {T<:AbstractString} = s +convert(::Type{T}, s::AbstractString) where {T<:AbstractString} = T(s) + +promote_rule(::Type{<:AbstractString}, ::Type{<:AbstractString}) = String + +## string & character concatenation ## """ - *(s::Union{AbstractString, Char}, t::Union{AbstractString, Char}...) + *(s::Union{AbstractString, Char}, t::Union{AbstractString, Char}...) -> String Concatenate strings and/or characters, producing a [`String`](@ref). This is equivalent to calling the [`string`](@ref) function on the arguments. 
@@ -69,49 +200,16 @@ julia> 'j' * "ulia" one(::Union{T,Type{T}}) where {T<:AbstractString} = convert(T, "") -# generic number of code units; implementations generally know how long a string -# is though and should override this with a more efficient method -ncodeunits(s::AbstractString) = nextind(s, endof(s)) - 1 - -""" - length(s::AbstractString) - -The number of characters in string `s`. - -# Examples -```jldoctest -julia> length("jμΛIα") -5 -``` -""" -function length(s::AbstractString) - i = start(s) - if done(s,i) - return 0 - end - n = 1 - while true - c, j = next(s,i) - if done(s,j) - return n - end - n += 1 - i = j - end -end +## generic string comparison ## -## string comparison functions ## """ - cmp(a::AbstractString, b::AbstractString) - -Compare two strings for equality. + cmp(a::AbstractString, b::AbstractString) -> Int -Return `0` if both strings have the same length and the character -at each index is the same in both strings. -Return `-1` if `a` is a substring of `b`, or if `a` comes before -`b` in alphabetical order. -Return `1` if `b` is a substring of `a`, or if `b` comes before -`a` in alphabetical order. +Compare two strings for equality. Return `0` if both strings have the same +length and the character at each index is the same in both strings. Return `-1` +if `a` is a substring of `b`, or if `a` comes before `b` in alphabetical order. +Return `1` if `b` is a substring of `a`, or if `b` comes before `a` in +alphabetical order (technically, lexicographical order by Unicode code points). # Examples ```jldoctest @@ -138,28 +236,23 @@ julia> cmp("b", "β") ``` """ function cmp(a::AbstractString, b::AbstractString) - if a === b - return 0 - end + a === b && return 0 i = start(a) j = start(b) - while !done(a,i) - if done(b,j) - return +1 - end - c, i = next(a,i) - d, j = next(b,j) - if c != d - return c < d ? 
-1 : +1 - end + while !done(a, i) + done(b, j) && return 1 + c, i = next(a, i) + d, j = next(b, j) + c ≠ d && return ifelse(c < d, -1, 1) end - done(b,j) ? 0 : -1 + return ifelse(done(b, j), 0, -1) end """ - ==(a::AbstractString, b::AbstractString) + ==(a::AbstractString, b::AbstractString) -> Bool -Test whether two strings are equal character by character. +Test whether two strings are equal character by character (technically, Unicode +code point by code point). # Examples ```jldoctest @@ -170,12 +263,13 @@ julia> "abc" == "αβγ" false ``` """ -==(a::AbstractString, b::AbstractString) = cmp(a,b) == 0 +==(a::AbstractString, b::AbstractString) = cmp(a, b) == 0 """ - isless(a::AbstractString, b::AbstractString) + isless(a::AbstractString, b::AbstractString) -> Bool -Test whether string `a` comes before string `b` in alphabetical order. +Test whether string `a` comes before string `b` in alphabetical order +(technically, in lexicographical order by Unicode code points). # Examples ```jldoctest @@ -189,64 +283,58 @@ julia> isless("a", "a") false ``` """ -isless(a::AbstractString, b::AbstractString) = cmp(a,b) < 0 +isless(a::AbstractString, b::AbstractString) = cmp(a, b) < 0 # faster comparisons for symbols cmp(a::Symbol, b::Symbol) = Int(sign(ccall(:strcmp, Int32, (Cstring, Cstring), a, b))) -isless(a::Symbol, b::Symbol) = cmp(a,b) < 0 +isless(a::Symbol, b::Symbol) = cmp(a, b) < 0 -## Generic validation functions ## +## character index arithmetic ## """ - isvalid(str::AbstractString, i::Integer) + length(s::AbstractString, lo::Integer=1, hi::Integer=ncodeunits(s)) -> Integer + +The number of characters in string `s` from indices `lo` through `hi`. This is +computed as the number of code unit indices from `lo` to `hi` which are valid +character indices. Without only a single string argument, this computes the +number of characters in the entire string. If `lo` or `hi` are out of ranges +each out of range code unit is considered to be one character. 
This matches the +"loose" indexing model of `thisind`, `nextind` and `prevind`. -Tell whether index `i` is valid for the given string. +See also: `isvalid`, `ncodeunits`, `endof`, `thisind`, `nextind`, `prevind` # Examples ```jldoctest -julia> str = "αβγdef"; - -julia> isvalid(str, 1) -true - -julia> str[1] -'α': Unicode U+03b1 (category Ll: Letter, lowercase) - -julia> isvalid(str, 2) -false - -julia> str[2] -ERROR: UnicodeError: invalid character index -Stacktrace: -[...] +julia> length("jμΛIα") +5 ``` """ -function isvalid(s::AbstractString, i::Integer) - i < 1 && return false - done(s,i) && return false - try - next(s,i) - true - catch - false +function length(s::AbstractString, lo::Integer=1, hi::Integer=ncodeunits(s)) + z = ncodeunits(s) + a = Int(max(1, min(z, lo))) + b = Int(min(z, max(1, hi))) + n = a - b + for i = a:b + n += isvalid(s, i) end + return n + hi - lo end -## Generic indexing functions ## - """ - thisind(s::AbstractString, i::Integer) + thisind(s::AbstractString, i::Integer) -> Int -If `i` is the index into a character in `s` then `thisind` returns the index of the -start of that character. If `i < start(s)` then it returns `start(s) - 1`. -If `i > ncodeunits(s)` then it returns `ncodeunits(s) + 1`. +If `i` is in bounds in `s` return the index of the start of the character whose +encoding code unit `i` is part of. In other words, if `i` is the start of a +character, return `i`; if `i` is not the start of a character, rewind until the +start of a character and return that index. If `i` is out of bounds in `s` +return `i`. 
# Examples ```jldoctest julia> thisind("αβγdef", -5) -0 +-5 julia> thisind("αβγdef", 1) 1 @@ -264,23 +352,24 @@ julia> thisind("αβγdef", 10) 10 julia> thisind("αβγdef", 20) -10 +20 """ function thisind(s::AbstractString, i::Integer) - j = Int(i) - isvalid(s, j) && return j - j < start(s) && return 0 - n = ncodeunits(s) - j > n && return n + 1 - prevind(s, j) + i ≤ ncodeunits(s) || return i + @inbounds while 1 < i && !isvalid(s, i) + i -= 1 + end + return i end """ - prevind(str::AbstractString, i::Integer, nchar::Integer=1) + prevind(str::AbstractString, i::Integer, n::Integer=1) -> Int -Get the previous valid string index before `i`. -Returns a value less than `1` at the beginning of the string. -If the `nchar` argument is given the function goes back `nchar` characters. +If `i` is in bounds in `s` return the index of the start of the character whose +encoding starts before index `i`. In other words, if `i` is the start of a +character, return the start of the previous character; if `i` is not the start +of a character, rewind until the start of a character and return that index. +If `i` is out of bounds in `s` return `i - 1`. If `n == 0` return `i`. 
# Examples ```jldoctest @@ -290,51 +379,32 @@ julia> prevind("αβγdef", 3) julia> prevind("αβγdef", 1) 0 +julia> prevind("αβγdef", 0) +-1 + julia> prevind("αβγdef", 3, 2) 0 ``` """ -function prevind(s::AbstractString, i::Integer) - e = endof(s) - if i > e - return e +function prevind(s::AbstractString, i::Integer, n::Integer=1) + n < 0 && throw(ArgumentError("n cannot be negative: $n")) + z = ncodeunits(s) + 1 + if i > z + n -= i - z + i = z end - j = Int(i)-1 - while j >= 1 - if isvalid(s,j) - return j - end - j -= 1 - end - return 0 # out of range -end - -function prevind(s::AbstractString, i::Integer, nchar::Integer) - nchar > 0 || throw(ArgumentError("nchar must be greater than 0")) - e = endof(s) - j = Int(i) - j < 1 && return 0 - while nchar > 0 - if j > e - j = e - else - j -= 1 - while j >= 1 && !isvalid(s,j) - j -= 1 - end - end - j < 1 && return 0 - nchar -= 1 + while n > 0 && 1 < i + @inbounds n -= isvalid(s, i -= 1) end - j + return i - n end """ - nextind(str::AbstractString, i::Integer, nchar::Integer=1) + nextind(str::AbstractString, i::Integer, n::Integer=1) -> Int -Get the next valid string index after `i`. -Returns a value greater than `endof(str)` at or after the end of the string. -If the `nchar` argument is given the function goes forward `nchar` characters. +If `i` is in bounds in `s` return the index of the start of the character whose +encoding starts after index `i`. If `i` is out of bounds in `s` return `i + 1`. +If `n == 0` return `i`. 
# Examples ```jldoctest @@ -353,48 +423,19 @@ julia> nextind(str, 9) 10 ``` """ -function nextind(s::AbstractString, i::Integer) - e = endof(s) +function nextind(s::AbstractString, i::Integer, n::Integer=1) + n < 0 && throw(ArgumentError("n cannot be negative: $n")) if i < 1 - return 1 - end - if i > e - return Int(i)+1 - end - for j = Int(i)+1:e - if isvalid(s,j) - return j - end + n += i - 1 + i = 1 end - next(s,e)[2] # out of range -end - -function nextind(s::AbstractString, i::Integer, nchar::Integer) - nchar > 0 || throw(ArgumentError("nchar must be greater than 0")) - e = endof(s) - j = Int(i) - while nchar > 0 - if j < 1 - j = 1 - else - j > e && return j + nchar - j == e && return next(s,e)[2] + nchar - 1 - for outer j = j+1:e - isvalid(s,j) && break - end - end - nchar -= 1 + z = ncodeunits(s) + while n > 0 && i < z + @inbounds n -= isvalid(s, i += 1) end - j + return i + n end -checkbounds(s::AbstractString, i::Integer) = start(s) <= i <= endof(s) || throw(BoundsError(s, i)) -checkbounds(s::AbstractString, r::AbstractRange{<:Integer}) = isempty(r) || (minimum(r) >= start(s) && maximum(r) <= endof(s)) || throw(BoundsError(s, r)) -# The following will end up using a deprecated checkbounds, when the covariant parameter is not Integer -checkbounds(s::AbstractString, I::AbstractArray{<:Real}) = all(i -> checkbounds(s, i), I) -checkbounds(s::AbstractString, I::AbstractArray{<:Integer}) = all(i -> checkbounds(s, i), I) - - """ ind2chr(s::AbstractString, i::Integer) @@ -414,10 +455,7 @@ julia> chr2ind(str, 2) 3 ``` """ -function ind2chr(s::AbstractString, i::Integer) - s[i] # throws error if invalid - unsafe_ind2chr(s, i) -end +ind2chr(s::AbstractString, i::Integer) = length(s, 1, i) """ chr2ind(s::AbstractString, i::Integer) @@ -437,26 +475,10 @@ julia> ind2chr(str, 3) 2 ``` """ -function chr2ind(s::AbstractString, i::Integer) - i < start(s) && throw(BoundsError(s, i)) - k = unsafe_chr2ind(s, i) - s[k] # throws error if invalid - k -end - -function 
map_chr_ind(s::AbstractString, i::Integer, stop, ret) - j = 1 - k = start(s) - while true - i == stop((j, k)) && return ret((j, k)) # k could point after the last character - _, k = next(s, k) - j += 1 - end -end - -unsafe_ind2chr(s::AbstractString, i::Integer) = map_chr_ind(s, i, last, first) -unsafe_chr2ind(s::AbstractString, i::Integer) = map_chr_ind(s, i, first, last) +chr2ind(s::AbstractString, n::Integer) = + n < 0 ? prevind(s, 0, -n) : nextind(s, 0, n) +## string index iteration type ## struct EachStringIndex{T<:AbstractString} s::T @@ -469,32 +491,26 @@ next(e::EachStringIndex, state) = (state, nextind(e.s, state)) done(e::EachStringIndex, state) = done(e.s, state) eltype(::Type{EachStringIndex}) = Int -## string promotion rules ## - -promote_rule(::Type{<:AbstractString}, ::Type{<:AbstractString}) = String - ## string map, filter, has ## function map(f, s::AbstractString) - out = IOBuffer(StringVector(endof(s)),true,true) - truncate(out,0) + out = IOBuffer(StringVector(endof(s)), true, true) + truncate(out, 0) for c in s - c2 = f(c) - if !isa(c2,Char) - throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead")) - end - write(out, c2::Char) + c′ = f(c) + isa(c′, Char) || throw(ArgumentError( + "map(f, s::AbstractString) requires f to return Char; " * + "try map(f, collect(s)) or a comprehension instead")) + write(out, c′::Char) end String(take!(out)) end function filter(f, s::AbstractString) - out = IOBuffer(StringVector(endof(s)),true,true) - truncate(out,0) + out = IOBuffer(StringVector(endof(s)), true, true) + truncate(out, 0) for c in s - if f(c) - write(out, c) - end + f(c) && write(out, c) end String(take!(out)) end @@ -502,9 +518,9 @@ end ## string first and last ## """ - first(str::AbstractString, nchar::Integer) + first(s::AbstractString, n::Integer) -Get a string consisting of the first `nchar` characters of `str`. +Get a string consisting of the first `n` characters of `s`. 
```jldoctest julia> first("∀ϵ≠0: ϵ²>0", 0) @@ -517,17 +533,12 @@ julia> first("∀ϵ≠0: ϵ²>0", 3) "∀ϵ≠" ``` """ -function first(str::AbstractString, nchar::Integer) - if 0 <= nchar <= 1 - return str[1:nchar] - end - str[1:nextind(str, 1, nchar-1)] -end +first(s::AbstractString, n::Integer) = s[1:min(end, nextind(s, 0, n))] """ - last(str::AbstractString, nchar::Integer) + last(s::AbstractString, n::Integer) -Get a string consisting of the last `nchar` characters of `str`. +Get a string consisting of the last `n` characters of `s`. ```jldoctest julia> last("∀ϵ≠0: ϵ²>0", 0) @@ -540,13 +551,54 @@ julia> last("∀ϵ≠0: ϵ²>0", 3) "²>0" ``` """ -function last(str::AbstractString, nchar::Integer) - e = endof(str) - if 0 <= nchar <= 1 - return str[(e-nchar+1):e] - end - str[prevind(str, e, nchar-1):e] -end +last(s::AbstractString, n::Integer) = s[max(1, prevind(s, ncodeunits(s)+1, n)):end] + +""" + reverseind(v, i) + +Given an index `i` in [`reverse(v)`](@ref), return the corresponding index in `v` so that +`v[reverseind(v,i)] == reverse(v)[i]`. (This can be nontrivial in cases where `v` contains +non-ASCII characters.) + +# Examples +```jldoctest +julia> r = reverse("Julia") +"ailuJ" + +julia> for i in 1:length(r) + print(r[reverseind("Julia", i)]) + end +Julia +``` +""" +reverseind(s::AbstractString, i::Integer) = thisind(s, ncodeunits(s)-i+1) + +""" + repeat(s::AbstractString, r::Integer) + +Repeat a string `r` times. This can equivalently be accomplished by calling [`s^r`](@ref ^). + +# Examples +```jldoctest +julia> repeat("ha", 3) +"hahaha" +``` +""" +repeat(s::AbstractString, r::Integer) = repeat(convert(String, s), r) + +""" + ^(s::Union{AbstractString,Char}, n::Integer) + +Repeat a string or character `n` times. +The [`repeat`](@ref) function is an alias to this operator. 
+ +# Examples +```jldoctest +julia> "Test "^3 +"Test Test Test " +``` +""" +(^)(s::Union{AbstractString,Char}, r::Integer) = repeat(s, r) # reverse-order iteration for strings and indices thereof start(r::Iterators.Reverse{<:AbstractString}) = endof(r.itr) diff --git a/base/strings/io.jl b/base/strings/io.jl index 49d223111041b..98648bac824a6 100644 --- a/base/strings/io.jl +++ b/base/strings/io.jl @@ -139,7 +139,7 @@ write(io::IO, s::AbstractString) = (len = 0; for c in s; len += write(io, c); en show(io::IO, s::AbstractString) = print_quoted(io, s) write(to::GenericIOBuffer, s::SubString{String}) = - s.endof==0 ? 0 : unsafe_write(to, pointer(s.string, s.offset + 1), UInt(nextind(s, s.endof) - 1)) + s.ncodeunits ≤ 0 ? 0 : unsafe_write(to, pointer(s.string, s.offset+1), UInt(s.ncodeunits)) ## printing literal quoted string data ## @@ -271,15 +271,23 @@ function escape_string(io, s::AbstractString, esc::AbstractString="") i = start(s) while !done(s,i) c, j = next(s,i) - c == '\0' ? print(io, escape_nul(s,j)) : - c == '\e' ? print(io, "\\e") : - c == '\\' ? print(io, "\\\\") : - c in esc ? print(io, '\\', c) : - '\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) : - Unicode.isprint(c) ? print(io, c) : - c <= '\x7f' ? print(io, "\\x", hex(c, 2)) : - c <= '\uffff' ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 4 : 2)) : - print(io, "\\U", hex(c, need_full_hex(s,j) ? 8 : 4)) + if !ismalformed(c) + c == '\0' ? print(io, escape_nul(s,j)) : + c == '\e' ? print(io, "\\e") : + c == '\\' ? print(io, "\\\\") : + c in esc ? print(io, '\\', c) : + '\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) : + Unicode.isprint(c) ? print(io, c) : + c <= '\x7f' ? print(io, "\\x", hex(c, 2)) : + c <= '\uffff' ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 4 : 2)) : + print(io, "\\U", hex(c, need_full_hex(s,j) ? 
8 : 4)) + else # malformed + u = bswap(reinterpret(UInt32, c)) + while true + print(io, "\\x", hex(u % UInt8, 2)) + (u >>= 8) == 0 && break + end + end i = j end end @@ -290,27 +298,10 @@ function print_quoted(io, s::AbstractString) print(io, '"') end -# bare minimum unescaping function unescapes only given characters - -function print_unescaped_chars(io, s::AbstractString, esc::AbstractString) - if !('\\' in esc) - esc = string("\\", esc) - end - i = start(s) - while !done(s,i) - c, i = next(s,i) - if c == '\\' && !done(s,i) && s[i] in esc - c, i = next(s,i) - end - print(io, c) - end -end - -unescape_chars(s::AbstractString, esc::AbstractString) = - sprint(endof(s), print_unescaped_chars, s, esc) - # general unescaping of traditional C and Unicode escape sequences +# TODO: handle unescaping invalid UTF-8 sequences + """ unescape_string(str::AbstractString) -> AbstractString @@ -334,16 +325,16 @@ function unescape_string(io, s::AbstractString) n = k = 0 m = c == 'x' ? 2 : c == 'u' ? 4 : 8 - while (k+=1) <= m && !done(s,i) + while (k += 1) <= m && !done(s,i) c, j = next(s,i) - n = '0' <= c <= '9' ? n<<4 + c-'0' : - 'a' <= c <= 'f' ? n<<4 + c-'a'+10 : - 'A' <= c <= 'F' ? n<<4 + c-'A'+10 : break + n = '0' <= c <= '9' ? n<<4 + (c-'0') : + 'a' <= c <= 'f' ? n<<4 + (c-'a'+10) : + 'A' <= c <= 'F' ? n<<4 + (c-'A'+10) : break i = j end if k == 1 throw(ArgumentError("invalid $(m == 2 ? "hex (\\x)" : - "unicode (\\u)") escape sequence used in $(repr(s))")) + "unicode (\\u)") escape sequence used in $(repr(s))")) end if m == 2 # \x escape sequence write(io, UInt8(n)) @@ -353,7 +344,7 @@ function unescape_string(io, s::AbstractString) elseif '0' <= c <= '7' k = 1 n = c-'0' - while (k+=1) <= 3 && !done(s,i) + while (k += 1) <= 3 && !done(s,i) c, j = next(s,i) n = ('0' <= c <= '7') ? 
n<<3 + c-'0' : break i = j @@ -503,18 +494,7 @@ end function convert(::Type{String}, chars::AbstractVector{Char}) sprint(length(chars), io->begin - state = start(chars) - while !done(chars, state) - c, state = next(chars, state) - if '\ud7ff' < c && c + 1024 < '\ue000' - d, state = next(chars, state) - if '\ud7ff' < d - 1024 && d < '\ue000' - c = Char(0x10000 + ((UInt32(c) & 0x03ff) << 10) | (UInt32(d) & 0x03ff)) - else - write(io, c) - c = d - end - end + for c in chars write(io, c) end end) diff --git a/base/strings/string.jl b/base/strings/string.jl index 67c238358486f..0e1ef86e6a759 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -2,6 +2,8 @@ const ByteArray = Union{Vector{UInt8},Vector{Int8}} +@inline between(b::T, lo::T, hi::T) where {T<:Integer} = (lo ≤ b) & (b ≤ hi) + ## constructors and conversions ## # String constructor docstring from boot.jl, workaround for #16730 @@ -49,7 +51,6 @@ Convert a string to a contiguous byte array representation encoded as UTF-8 byte This representation is often appropriate for passing strings to C. """ String(s::AbstractString) = print_to_string(s) - String(s::Symbol) = unsafe_string(Cstring(s)) (::Type{Vector{UInt8}})(s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s) @@ -59,48 +60,14 @@ String(s::Symbol) = unsafe_string(Cstring(s)) pointer(s::String) = unsafe_convert(Ptr{UInt8}, s) pointer(s::String, i::Integer) = pointer(s)+(i-1) -sizeof(s::String) = Core.sizeof(s) - -""" - codeunit(s::AbstractString, i::Integer) - -Get the `i`th code unit of an encoded string. For example, -returns the `i`th byte of the representation of a UTF-8 string. 
- -# Examples -```jldoctest -julia> s = "δ=γ"; [codeunit(s, i) for i in 1:sizeof(s)] -5-element Array{UInt8,1}: - 0xce - 0xb4 - 0x3d - 0xce - 0xb3 -``` -""" -codeunit(s::AbstractString, i::Integer) +ncodeunits(s::String) = Core.sizeof(s) +codeunit(s::String) = UInt8 @inline function codeunit(s::String, i::Integer) - @boundscheck if (i < 1) | (i > sizeof(s)) - throw(BoundsError(s,i)) - end + @boundscheck between(i, 1, ncodeunits(s)) || throw(BoundsError(s, i)) @gc_preserve s unsafe_load(pointer(s, i)) end -""" - ncodeunits(s::AbstractString) - -The number of code units in a string. For example, for UTF-8-like data such as -the default `String` type, the number of code units is the number of bytes in -the string, a.k.a. `sizeof(s)`. For a UTF-16 encoded string type, however, the -code unit is `UInt16` so the number of code units is the number of `UInt16` -words in the representation of the string. The expression `codeunit(s, i)` is -valid and safe for precisely the range of `i` values `1:ncodeunits(s)`. - -See also: [`codeunit`](@ref). 
-""" -ncodeunits(s::String) = sizeof(s) - write(io::IO, s::String) = @gc_preserve s unsafe_write(io, pointer(s), reinterpret(UInt, sizeof(s))) @@ -118,81 +85,45 @@ function ==(a::String, b::String) al == sizeof(b) && 0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, al) end -## thisind, prevind and nextind ## +## thisind, nextind, prevind ## -function thisind(s::String, i::Integer) - j = Int(i) - j < 1 && return 0 - n = ncodeunits(s) - j > n && return n + 1 - @inbounds while j > 0 && is_valid_continuation(codeunit(s,j)) - j -= 1 - end - j -end +thisind(s::String, i::Integer) = oftype(i, thisind(s, Int(i))) +nextind(s::String, i::Integer) = oftype(i, nextind(s, Int(i))) -function prevind(s::String, i::Integer) - j = Int(i) - e = sizeof(s) - if j > e - return endof(s) - end - j -= 1 - @inbounds while j > 0 && is_valid_continuation(codeunit(s,j)) - j -= 1 - end - j -end - -function prevind(s::String, i::Integer, nchar::Integer) - nchar > 0 || throw(ArgumentError("nchar must be greater than 0")) - j = Int(i) - e = sizeof(s) - while nchar > 0 - if j > e - j = endof(s) - else - j -= 1 - @inbounds while j > 0 && is_valid_continuation(codeunit(s,j)) - j -= 1 - end - end - nchar -= 1 - j <= 0 && return j - nchar - end - j -end - -function nextind(s::String, i::Integer) - j = Int(i) - if j < 1 - return 1 - end - e = sizeof(s) - j += 1 - @inbounds while j <= e && is_valid_continuation(codeunit(s,j)) - j += 1 - end - j +function thisind(s::String, i::Int) + n = ncodeunits(s) + between(i, 2, n) || return i + @inbounds b = codeunit(s, i) + b & 0xc0 == 0x80 || return i + @inbounds b = codeunit(s, i-1) + between(b, 0b11000000, 0b11110111) && return i-1 + (b & 0xc0 == 0x80) & (i-2 > 0) || return i + @inbounds b = codeunit(s, i-2) + between(b, 0b11100000, 0b11110111) && return i-2 + (b & 0xc0 == 0x80) & (i-3 > 0) || return i + @inbounds b = codeunit(s, i-3) + between(b, 0b11110000, 0b11110111) && return i-3 + return i end -function nextind(s::String, i::Integer, 
nchar::Integer) - nchar > 0 || throw(ArgumentError("nchar must be greater than 0")) - j = Int(i) - e = sizeof(s) - while nchar > 0 - if j < 1 - j = 1 - else - j += 1 - @inbounds while j <= e && is_valid_continuation(codeunit(s,j)) - j += 1 - end - end - nchar -= 1 - j > e && return j + nchar - end - j +function nextind(s::String, i::Int) + n = ncodeunits(s) + between(i, 1, n-1) || return i+1 + @inbounds l = codeunit(s, i) + (l < 0x80) | (0xf8 ≤ l) && return i+1 + if l < 0xc0 + i′ = thisind(s, i) + return i′ < i ? nextind(s, i′) : i+1 + end + # first continuation byte + @inbounds b = codeunit(s, i += 1) + (b & 0xc0 != 0x80) | ((i += 1) > n) | (l < 0xe0) && return i + # second continuation byte + @inbounds b = codeunit(s, i) + (b & 0xc0 != 0x80) | ((i += 1) > n) | (l < 0xf0) && return i + # third continuation byte + @inbounds b = codeunit(s, i) + ifelse(b & 0xc0 != 0x80, i, i+1) end ## checking UTF-8 & ACSII validity ## @@ -208,121 +139,146 @@ byte_string_classify(s::String) = isvalid(::Type{String}, s::Union{Vector{UInt8},String}) = byte_string_classify(s) != 0 isvalid(s::String) = isvalid(String, s) -## basic UTF-8 decoding & iteration ## - -is_surrogate_lead(c::Unsigned) = ((c & ~0x003ff) == 0xd800) -is_surrogate_trail(c::Unsigned) = ((c & ~0x003ff) == 0xdc00) -is_surrogate_codeunit(c::Unsigned) = ((c & ~0x007ff) == 0xd800) -is_valid_continuation(c) = ((c & 0xc0) == 0x80) - -const utf8_offset = [ - 0x00000000, 0x00003080, - 0x000e2080, 0x03c82080, - 0xfa082080, 0x82082080, -] - -const utf8_trailing = [ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5, -] +is_valid_continuation(c) = c & 0xc0 == 0x80 ## required core functionality ## -function endof(s::String) - i = sizeof(s) - @inbounds while i > 0 && is_valid_continuation(codeunit(s, i)) - i -= 1 - end - i +function next(s::String, i::Int) + @boundscheck 1 ≤ i ≤ sizeof(s) || throw(BoundsError(s, i)) + @inbounds b = codeunit(s, i) + # TODO: check index validity + u = UInt32(b) << 24 + (b < 0x80) | (0xf8 ≤ b) && return reinterpret(Char, u), i+1 + return next_continued(s, i, u) end -function length(s::String) - cnum = 0 - @inbounds for i = 1:sizeof(s) - cnum += !is_valid_continuation(codeunit(s, i)) +@noinline function next_continued(s::String, i::Int, u::UInt32) + if u < 0xc0000000 + isvalid(s, i) && (i += 1; @goto ret) + throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, (u >> 24) % UInt8)) end - cnum + n = ncodeunits(s) + # first continuation byte + (i += 1) > n && @goto ret + @inbounds b = codeunit(s, i) + b & 0xc0 == 0x80 || @goto ret + u |= UInt32(b) << 16 + # second continuation byte + ((i += 1) > n) | (u < 0xe0000000) && @goto ret + @inbounds b = codeunit(s, i) + b & 0xc0 == 0x80 || @goto ret + u |= UInt32(b) << 8 + # third continuation byte + ((i += 1) > n) | (u < 0xf0000000) && @goto ret + @inbounds b = codeunit(s, i) + b & 0xc0 == 0x80 || @goto ret + u |= UInt32(b); i += 1 +@label ret + return reinterpret(Char, u), i end -@noinline function slow_utf8_next(s::String, b::UInt8, i::Int, l::Int) - @inbounds if is_valid_continuation(b) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s, i))) - end - trailing = utf8_trailing[b + 1] - if l < i + trailing - return '\ufffd', i+1 - end - c::UInt32 = 0 - @inbounds for j = 1:(trailing + 1) - c <<= 6 - c += codeunit(s, i) - i += 1 - end - c -= utf8_offset[trailing + 1] - return Char(c), i +function getindex(s::String, i::Int) + @boundscheck 1 ≤ i ≤ ncodeunits(s) || throw(BoundsError(s, i)) + @inbounds b = codeunit(s, 
i) + # TODO: check index validity + u = UInt32(b) << 24 + (b < 0x80) | (0xf8 ≤ b) && return reinterpret(Char, u) + return getindex_continued(s, i, u) end -# This implementation relies on `next` returning a value past the end of the -# String's underlying data, which is true for valid Strings -done(s::String, state) = state > sizeof(s) - -@inline function next(s::String, i::Int) - # function is split into this critical fast-path - # for pure ascii data, such as parsing numbers, - # and a longer function that can handle any utf8 data - @boundscheck if (i < 1) | (i > sizeof(s)) - throw(BoundsError(s,i)) +@noinline function getindex_continued(s::String, i::Int, u::UInt32) + if u < 0xc0000000 + isvalid(s, i) && @goto ret + throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, (u >> 24) % UInt8)) end + n = ncodeunits(s) + # first continuation byte + (i += 1) > n && @goto ret @inbounds b = codeunit(s, i) - if b < 0x80 - return Char(b), i + 1 - end - return slow_utf8_next(s, b, i, sizeof(s)) + b & 0xc0 == 0x80 || @goto ret + u |= UInt32(b) << 16 + # second continuation byte + ((i += 1) > n) | (u < 0xe0000000) && @goto ret + @inbounds b = codeunit(s, i) + b & 0xc0 == 0x80 || @goto ret + u |= UInt32(b) << 8 + # third continuation byte + ((i += 1) > n) | (u < 0xf0000000) && @goto ret + @inbounds b = codeunit(s, i) + b & 0xc0 == 0x80 || @goto ret + u |= UInt32(b) +@label ret + return reinterpret(Char, u) end -function first_utf8_byte(ch::Char) - c = UInt32(ch) - b = c < 0x80 ? c%UInt8 : - c < 0x800 ? ((c>>6) | 0xc0)%UInt8 : - c < 0x10000 ? 
((c>>12) | 0xe0)%UInt8 : - ((c>>18) | 0xf0)%UInt8 - return b -end - -## overload methods for efficiency ## - -isvalid(s::String, i::Integer) = - (1 <= i <= sizeof(s)) && ((@inbounds b = codeunit(s, i)); !is_valid_continuation(b)) +getindex(s::String, r::UnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))] function getindex(s::String, r::UnitRange{Int}) isempty(r) && return "" - l = sizeof(s) - i = first(r) - if i < 1 || i > l - throw(BoundsError(s, i)) - end - @inbounds si = codeunit(s, i) - if is_valid_continuation(si) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, si)) - end - j = last(r) - if j > l - throw(BoundsError(s, j)) - end - @inbounds sj = codeunit(s, j) - if is_valid_continuation(sj) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, sj)) + i, j = first(r), last(r) + @boundscheck begin + checkbounds(s, r) + @inbounds isvalid(s, i) || + throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s, i))) + @inbounds isvalid(s, j) || + throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, codeunit(s, j))) + end + j = nextind(s, j) - 1 + n = j - i + 1 + ss = _string_n(n) + p = pointer(ss) + for k = 1:n + unsafe_store!(p, codeunit(s, i + k - 1), k) + end + return ss +end + +function length(s::String, lo::Int, hi::Int) + z = ncodeunits(s) + i = Int(max(1, min(z, lo))) + n = Int(min(z, max(1, hi))) + c = i - n + if i ≤ n + i, j = thisind(s, i), i + c -= i < j + i -= 1 + while true + (i += 1) ≤ n || break + @inbounds b = codeunit(s, i) # lead byte + @label L + c += 1 + (0xc0 ≤ b) & (b < 0xf8) || continue + l = b + + (i += 1) ≤ n || break + @inbounds b = codeunit(s, i) # cont byte 1 + b & 0xc0 == 0x80 || @goto L + l ≥ 0xe0 || continue + + (i += 1) ≤ n || break + @inbounds b = codeunit(s, i) # cont byte 2 + b & 0xc0 == 0x80 || @goto L + l ≥ 0xf0 || continue + + (i += 1) ≤ n || break + @inbounds b = codeunit(s, i) # cont byte 3 + b & 0xc0 == 0x80 || @goto L + end end - j = nextind(s,j) - unsafe_string(pointer(s,i), j-i) + return c + hi - lo end +# TODO: delete or move to char.jl 
+first_utf8_byte(c::Char) = (reinterpret(UInt32, c) >> 24) % UInt8 + +## overload methods for efficiency ## + +function isvalid(s::String, i::Int) + @boundscheck checkbounds(s, i) + return thisind(s, i) == i +end +isvalid(s::String, i::Integer) = isvalid(s, Int(i)) + function search(s::String, c::Char, i::Integer = 1) if i < 1 || i > sizeof(s) i == sizeof(s) + 1 && return 0 @@ -331,11 +287,11 @@ function search(s::String, c::Char, i::Integer = 1) @inbounds if is_valid_continuation(codeunit(s,i)) throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s,i))) end - c < Char(0x80) && return search(s, c%UInt8, i) + c ≤ '\x7f' && return search(s, c % UInt8, i) while true i = search(s, first_utf8_byte(c), i) - (i==0 || s[i] == c) && return i - i = next(s,i)[2] + (i == 0 || s[i] == c) && return i + i = next(s, i)[2] end end @@ -361,12 +317,12 @@ function search(a::ByteArray, b::Char, i::Integer = 1) end function rsearch(s::String, c::Char, i::Integer = sizeof(s)) - c < Char(0x80) && return rsearch(s, c%UInt8, i) + c ≤ '\x7f' && return rsearch(s, c % UInt8, i) b = first_utf8_byte(c) while true i = rsearch(s, b, i) - (i==0 || s[i] == c) && return i - i = prevind(s,i) + (i == 0 || s[i] == c) && return i + i = prevind(s, i) end end @@ -411,62 +367,15 @@ function string(a::String...) end # UTF-8 encoding length of a character -function codelen(d::Char) - c = UInt32(d) - if c < 0x80 - return 1 - elseif c < 0x800 - return 2 - elseif c < 0x10000 - return 3 - elseif c < 0x110000 - return 4 - end - return 3 # '\ufffd' -end +# TODO: delete or move to char.jl +codelen(c::Char) = 4 - (trailing_zeros(0xff000000 | reinterpret(UInt32, c)) >> 3) function string(a::Union{String,Char}...) 
- n = 0 - for d in a - if isa(d,Char) - n += codelen(d::Char) - else - n += sizeof(d::String) + sprint() do io + for x in a + write(io, x) end end - out = _string_n(n) - offs = 1 - p = pointer(out) - for d in a - if isa(d,Char) - c = UInt32(d::Char) - if c < 0x80 - unsafe_store!(p, c%UInt8, offs); offs += 1 - elseif c < 0x800 - unsafe_store!(p, (( c >> 6 ) | 0xC0)%UInt8, offs); offs += 1 - unsafe_store!(p, (( c & 0x3F ) | 0x80)%UInt8, offs); offs += 1 - elseif c < 0x10000 - unsafe_store!(p, (( c >> 12 ) | 0xE0)%UInt8, offs); offs += 1 - unsafe_store!(p, (((c >> 6) & 0x3F ) | 0x80)%UInt8, offs); offs += 1 - unsafe_store!(p, (( c & 0x3F ) | 0x80)%UInt8, offs); offs += 1 - elseif c < 0x110000 - unsafe_store!(p, (( c >> 18 ) | 0xF0)%UInt8, offs); offs += 1 - unsafe_store!(p, (((c >> 12) & 0x3F ) | 0x80)%UInt8, offs); offs += 1 - unsafe_store!(p, (((c >> 6) & 0x3F ) | 0x80)%UInt8, offs); offs += 1 - unsafe_store!(p, (( c & 0x3F ) | 0x80)%UInt8, offs); offs += 1 - else - # '\ufffd' - unsafe_store!(p, 0xef, offs); offs += 1 - unsafe_store!(p, 0xbf, offs); offs += 1 - unsafe_store!(p, 0xbd, offs); offs += 1 - end - else - l = sizeof(d::String) - unsafe_copy!(pointer(out,offs), pointer(d::String), l) - offs += l - end - end - return out end function repeat(s::String, r::Integer) diff --git a/base/strings/substring.jl b/base/strings/substring.jl index b5fabef1788dc..75dc64e8b01d7 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -22,13 +22,18 @@ julia> SubString("abc", 2) struct SubString{T<:AbstractString} <: AbstractString string::T offset::Int - endof::Int + ncodeunits::Int function SubString{T}(s::T, i::Int, j::Int) where T<:AbstractString - i > j && return new(s, i - 1, 0) # always allow i > j as it is consistent with getindex - isvalid(s, i) || throw(BoundsError(s, i)) - isvalid(s, j) || throw(BoundsError(s, j)) - new(s, i-1, j-i+1) + i ≤ j || return new(s, i-1, 0) + @boundscheck begin + checkbounds(s, i:j) + @inbounds isvalid(s, i) || + 
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s, i))) + @inbounds isvalid(s, j) || + throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, codeunit(s, j))) + end + return new(s, i-1, nextind(s,j)-i) end end @@ -37,11 +42,8 @@ SubString(s::AbstractString, i::Integer, j::Integer=endof(s)) = SubString(s, Int SubString(s::AbstractString, r::UnitRange{<:Integer}) = SubString(s, first(r), last(r)) function SubString(s::SubString, i::Int, j::Int) - # always allow i > j as it is consistent with getindex - i > j && return SubString(s.string, s.offset + i, s.offset + j) - i >= 1 || throw(BoundsError(s, i)) - j <= endof(s) || throw(BoundsError(s, j)) - SubString(s.string, s.offset + i, s.offset + j) + @boundscheck i ≤ j && checkbounds(s, i:j) + SubString(s.string, s.offset+i, s.offset+j) end SubString(s::AbstractString) = SubString(s, 1, endof(s)) @@ -50,78 +52,56 @@ SubString{T}(s::T) where {T<:AbstractString} = SubString{T}(s, 1, endof(s)) convert(::Type{SubString{S}}, s::AbstractString) where {S<:AbstractString} = SubString(convert(S, s)) -String(p::SubString{String}) = - unsafe_string(pointer(p.string, p.offset+1), nextind(p, p.endof)-1) +String(s::SubString{String}) = unsafe_string(pointer(s.string, s.offset+1), s.ncodeunits) -sizeof(s::SubString{String}) = s.endof == 0 ? 0 : nextind(s, s.endof) - 1 +ncodeunits(s::SubString) = s.ncodeunits +codeunit(s::SubString) = codeunit(s.string) +length(s::SubString) = length(s.string, s.offset+1, s.offset+s.ncodeunits) -# TODO: length(s::SubString) = ?? -# default implementation will work but it's slow -# can this be delegated efficiently somehow? -# that may require additional string interfaces -function length(s::SubString{String}) - return s.endof==0 ? 
0 : Int(ccall(:u8_charnum, Csize_t, (Ptr{UInt8}, Csize_t), - pointer(s), nextind(s, s.endof) - 1)) +function codeunit(s::SubString, i::Integer) + @boundscheck checkbounds(s, i) + @inbounds return codeunit(s.string, s.offset + i) end -function next(s::SubString, i::Int) - if i < 1 || i > s.endof - throw(BoundsError(s, i)) - end - c, i = next(s.string, i+s.offset) - c, i-s.offset +function next(s::SubString, i::Integer) + @boundscheck checkbounds(s, i) + @inbounds c, i = next(s.string, s.offset + i) + return c, i - s.offset end -function getindex(s::SubString, i::Int) - if i < 1 || i > s.endof - throw(BoundsError(s, i)) - end - getindex(s.string, i+s.offset) +function getindex(s::SubString, i::Integer) + @boundscheck checkbounds(s, i) + @inbounds return getindex(s.string, s.offset + i) end -endof(s::SubString) = s.endof - function isvalid(s::SubString, i::Integer) - return (start(s) <= i <= endof(s)) && isvalid(s.string, s.offset+i) + @boundscheck checkbounds(s, i) + @inbounds return isvalid(s.string, s.offset + i) end -function thisind(s::SubString{String}, i::Integer) - j = Int(i) - j < start(s) && return 0 - n = ncodeunits(s) - j > n && return n + 1 - offset = s.offset - str = s.string - j += offset - @inbounds while j > offset && is_valid_continuation(codeunit(str, j)) - j -= 1 - end - j - offset -end - -nextind(s::SubString, i::Integer) = nextind(s.string, i+s.offset)-s.offset -prevind(s::SubString, i::Integer) = prevind(s.string, i+s.offset)-s.offset - -function getindex(s::AbstractString, r::UnitRange{Int}) - checkbounds(s, r) || throw(BoundsError(s, r)) - SubString(s, first(r), last(r)) -end +thisind(s::SubString, i::Integer) = thisind(s.string, s.offset + i) - s.offset +nextind(s::SubString, i::Integer) = nextind(s.string, s.offset + i) - s.offset +prevind(s::SubString, i::Integer) = prevind(s.string, s.offset + i) - s.offset function cmp(a::SubString{String}, b::SubString{String}) na = sizeof(a) nb = sizeof(b) c = ccall(:memcmp, Int32, (Ptr{UInt8}, 
Ptr{UInt8}, UInt), - pointer(a), pointer(b), min(na,nb)) - c < 0 ? -1 : c > 0 ? +1 : cmp(na,nb) + pointer(a), pointer(b), min(na, nb)) + return c < 0 ? -1 : c > 0 ? +1 : cmp(na, nb) end # don't make unnecessary copies when passing substrings to C functions cconvert(::Type{Ptr{UInt8}}, s::SubString{String}) = s cconvert(::Type{Ptr{Int8}}, s::SubString{String}) = s + function unsafe_convert(::Type{Ptr{R}}, s::SubString{String}) where R<:Union{Int8, UInt8} convert(Ptr{R}, pointer(s.string)) + s.offset end +pointer(x::SubString{String}) = pointer(x.string) + x.offset +pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1) + """ reverse(s::AbstractString) -> AbstractString @@ -159,53 +139,3 @@ function reverse(s::Union{String,SubString{String}})::String end end end - -""" - reverseind(v, i) - -Given an index `i` in [`reverse(v)`](@ref), return the corresponding index in `v` so that -`v[reverseind(v,i)] == reverse(v)[i]`. (This can be nontrivial in cases where `v` contains -non-ASCII characters.) - -# Examples -```jldoctest -julia> r = reverse("Julia") -"ailuJ" - -julia> for i in 1:length(r) - print(r[reverseind("Julia", i)]) - end -Julia -``` -""" -reverseind(s::AbstractString, i::Integer) = thisind(s, ncodeunits(s)-i+1) - -""" - repeat(s::AbstractString, r::Integer) - -Repeat a string `r` times. This can equivalently be accomplished by calling [`s^r`](@ref ^). - -# Examples -```jldoctest -julia> repeat("ha", 3) -"hahaha" -``` -""" -repeat(s::AbstractString, r::Integer) = repeat(convert(String, s), r) - -""" - ^(s::Union{AbstractString,Char}, n::Integer) - -Repeat a string or character `n` times. -The [`repeat`](@ref) function is an alias to this operator. 
- -# Examples -```jldoctest -julia> "Test "^3 -"Test Test Test " -``` -""" -(^)(s::Union{AbstractString,Char}, r::Integer) = repeat(s,r) - -pointer(x::SubString{String}) = pointer(x.string) + x.offset -pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1) diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index 67859b41b54c9..202b481896a7e 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -3,7 +3,8 @@ # Various Unicode functionality from the utf8proc library module Unicode -import Base: show, ==, hash, string, Symbol, isless, length, eltype, start, next, done, convert, isvalid +import Base: show, ==, hash, string, Symbol, isless, length, eltype, start, + next, done, convert, isvalid, MalformedCharError, ismalformed # whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff @@ -111,7 +112,9 @@ const category_strings = [ "Other, control", "Other, format", "Other, surrogate", - "Other, private use" + "Other, private use", + "Invalid, too high", + "Malformed, bad data", ] const UTF8PROC_STABLE = (1<<1) @@ -148,10 +151,26 @@ end utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(String(s), flags) -function normalize(s::AbstractString; stable::Bool=false, compat::Bool=false, compose::Bool=true, decompose::Bool=false, stripignore::Bool=false, rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, lump::Bool=false, stripmark::Bool=false) +function normalize( + s::AbstractString; + stable::Bool=false, + compat::Bool=false, + compose::Bool=true, + decompose::Bool=false, + stripignore::Bool=false, + rejectna::Bool=false, + newline2ls::Bool=false, + newline2ps::Bool=false, + newline2lf::Bool=false, + stripcc::Bool=false, + casefold::Bool=false, + lump::Bool=false, + stripmark::Bool=false, +) flags = 0 stable && (flags = flags | UTF8PROC_STABLE) compat && (flags = flags | UTF8PROC_COMPAT) + # TODO: 
error if compose & decompose? if decompose flags = flags | UTF8PROC_DECOMPOSE elseif compose @@ -250,7 +269,10 @@ julia> textwidth('❤') 2 ``` """ -textwidth(c::Char) = Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c)) +function textwidth(c::Char) + ismalformed(c) && (c = '\ufffd') + Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c)) +end """ textwidth(s::AbstractString) @@ -267,17 +289,29 @@ julia> textwidth("March") """ textwidth(s::AbstractString) = mapreduce(textwidth, +, 0, s) -lowercase(c::Char) = isascii(c) ? ('A' <= c <= 'Z' ? c + 0x20 : c) : Char(ccall(:utf8proc_tolower, UInt32, (UInt32,), c)) -uppercase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) : Char(ccall(:utf8proc_toupper, UInt32, (UInt32,), c)) -titlecase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) : Char(ccall(:utf8proc_totitle, UInt32, (UInt32,), c)) +lowercase(c::Char) = isascii(c) ? ('A' <= c <= 'Z' ? c + 0x20 : c) : + Char(ccall(:utf8proc_tolower, UInt32, (UInt32,), c)) +uppercase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) : + Char(ccall(:utf8proc_toupper, UInt32, (UInt32,), c)) +titlecase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? 
c - 0x20 : c) : + Char(ccall(:utf8proc_totitle, UInt32, (UInt32,), c)) ############################################################################ # returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category -category_code(c) = ccall(:utf8proc_category, Cint, (UInt32,), c) +function category_code(c::Char) + ismalformed(c) && return Cint(31) + (u = UInt32(c)) ≤ 0x10ffff || return Cint(30) + ccall(:utf8proc_category, Cint, (UInt32,), u) +end # more human-readable representations of the category code -category_abbrev(c) = unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), c)) +function category_abbrev(c) + ismalformed(c) && return "Ma" + (u = UInt32(c)) ≤ 0x10ffff || return "In" + unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), u)) +end + category_string(c) = category_strings[category_code(c)+1] """ @@ -321,7 +355,7 @@ julia> islower('❤') false ``` """ -islower(c::Char) = (category_code(c) == UTF8PROC_CATEGORY_LL) +islower(c::Char) = category_code(c) == UTF8PROC_CATEGORY_LL # true for Unicode upper and mixed case @@ -347,8 +381,8 @@ false ``` """ function isupper(c::Char) - ccode = category_code(c) - return ccode == UTF8PROC_CATEGORY_LU || ccode == UTF8PROC_CATEGORY_LT + cat = category_code(c) + cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LT end """ @@ -370,7 +404,7 @@ julia> isdigit('α') false ``` """ -isdigit(c::Char) = ('0' <= c <= '9') +isdigit(c::Char) = '0' <= c <= '9' """ isalpha(c::Char) -> Bool @@ -393,7 +427,7 @@ julia> isalpha('9') false ``` """ -isalpha(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_LO) +isalpha(c::Char) = UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_LO """ isnumeric(c::Char) -> Bool @@ -422,7 +456,7 @@ julia> isnumeric('❤') false ``` """ -isnumeric(c::Char) = (UTF8PROC_CATEGORY_ND <= category_code(c) <= UTF8PROC_CATEGORY_NO) +isnumeric(c::Char) = UTF8PROC_CATEGORY_ND <= category_code(c) <= UTF8PROC_CATEGORY_NO """ isalnum(c::Char) -> Bool @@ 
-446,9 +480,9 @@ true ``` """ function isalnum(c::Char) - ccode = category_code(c) - return (UTF8PROC_CATEGORY_LU <= ccode <= UTF8PROC_CATEGORY_LO) || - (UTF8PROC_CATEGORY_ND <= ccode <= UTF8PROC_CATEGORY_NO) + cat = category_code(c) + UTF8PROC_CATEGORY_LU <= cat <= UTF8PROC_CATEGORY_LO || + UTF8PROC_CATEGORY_ND <= cat <= UTF8PROC_CATEGORY_NO end # following C++ only control characters from the Latin-1 subset return true @@ -470,7 +504,7 @@ julia> iscntrl('a') false ``` """ -iscntrl(c::Char) = (c <= Char(0x1f) || Char(0x7f) <= c <= Char(0x9f)) +iscntrl(c::Char) = c <= '\x1f' || '\x7f' <= c <= '\u9f' """ ispunct(c::Char) -> Bool @@ -492,7 +526,7 @@ julia> ispunct(';') true ``` """ -ispunct(c::Char) = (UTF8PROC_CATEGORY_PC <= category_code(c) <= UTF8PROC_CATEGORY_PO) +ispunct(c::Char) = UTF8PROC_CATEGORY_PC <= category_code(c) <= UTF8PROC_CATEGORY_PO # \u85 is the Unicode Next Line (NEL) character @@ -520,7 +554,9 @@ julia> isspace('\\x20') true ``` """ -@inline isspace(c::Char) = c == ' ' || '\t' <= c <='\r' || c == '\u85' || '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS +@inline isspace(c::Char) = + c == ' ' || '\t' <= c <= '\r' || c == '\u85' || + '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS """ isprint(c::Char) -> Bool @@ -538,7 +574,7 @@ julia> isprint('A') true ``` """ -isprint(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_ZS) +isprint(c::Char) = UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_ZS # true in principal if a printer would use ink @@ -560,7 +596,7 @@ julia> isgraph('A') true ``` """ -isgraph(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_SO) +isgraph(c::Char) = UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_SO """ isascii(c::Union{Char,AbstractString}) -> Bool @@ -585,7 +621,7 @@ julia> isascii("αβγ") false ``` """ -isascii(c::Char) = c < Char(0x80) +isascii(c::Char) = bswap(reinterpret(UInt32, c)) < 0x80 isascii(s::AbstractString) = all(isascii, 
s) """ @@ -640,7 +676,7 @@ julia> lowercase("STRINGS AND THINGS") lowercase(s::AbstractString) = map(lowercase, s) """ - titlecase(s::AbstractString) + titlecase(s::AbstractString) -> String Capitalize the first character of each word in `s`. See also [`ucfirst`](@ref) to capitalize only the first @@ -648,9 +684,7 @@ character in `s`. # Examples ```jldoctest -julia> using Unicode - -julia> titlecase("the julia programming language") +julia> titlecase("the Julia programming language") "The Julia Programming Language" ``` """ @@ -670,56 +704,67 @@ function titlecase(s::AbstractString) end """ - ucfirst(s::AbstractString) + ucfirst(s::AbstractString) -> String -Return `string` with the first character converted to uppercase -(technically "title case" for Unicode). -See also [`titlecase`](@ref) to capitalize the first character of -every word in `s`. +Return `s` with the first character converted to uppercase (technically "title +case" for Unicode). See also [`titlecase`](@ref) to capitalize the first +character of every word in `s`. + +See also: `lcfirst`, `uppercase`, `lowercase`, `titlecase` # Examples ```jldoctest -julia> using Unicode - julia> ucfirst("python") "Python" ``` """ function ucfirst(s::AbstractString) - isempty(s) && return s + isempty(s) && return "" c = s[1] - tc = titlecase(c) - return c==tc ? s : string(tc,s[nextind(s,1):end]) + c′ = titlecase(c) + c == c′ ? convert(String, s) : + string(c′, SubString(s, nextind(s, 1))) end """ lcfirst(s::AbstractString) -Return `string` with the first character converted to lowercase. +Return `s` with the first character converted to lowercase. + +See also: `ucfirst`, `uppercase`, `lowercase`, `titlecase` # Examples ```jldoctest -julia> using Unicode - julia> lcfirst("Julia") "julia" ``` """ function lcfirst(s::AbstractString) - isempty(s) || islower(s[1]) ? s : string(lowercase(s[1]),s[nextind(s,1):end]) + isempty(s) && return "" + c = s[1] + c′ = lowercase(c) + c == c′ ? 
convert(String, s) : + string(c′, SubString(s, nextind(s, 1))) end ############################################################################ # iterators for grapheme segmentation isgraphemebreak(c1::Char, c2::Char) = + ismalformed(c1) || ismalformed(c2) || ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2) # Stateful grapheme break required by Unicode-9 rules: the string # must be processed in sequence, with state initialized to Ref{Int32}(0). # Requires utf8proc v2.0 or later. -isgraphemebreak!(state::Ref{Int32}, c1::Char, c2::Char) = - ccall(:utf8proc_grapheme_break_stateful, Bool, (UInt32, UInt32, Ref{Int32}), c1, c2, state) +function isgraphemebreak!(state::Ref{Int32}, c1::Char, c2::Char) + if ismalformed(c1) || ismalformed(c2) + state[] = 0 + return true + end + ccall(:utf8proc_grapheme_break_stateful, Bool, + (UInt32, UInt32, Ref{Int32}), c1, c2, state) +end struct GraphemeIterator{S<:AbstractString} s::S # original string (for generation of SubStrings) @@ -739,7 +784,7 @@ eltype(::Type{GraphemeIterator{S}}) where {S} = SubString{S} eltype(::Type{GraphemeIterator{SubString{S}}}) where {S} = SubString{S} function length(g::GraphemeIterator) - c0 = Char(0x00ad) # soft hyphen (grapheme break always allowed after this) + c0 = typemax(Char) n = 0 state = Ref{Int32}(0) for c in g.s diff --git a/base/strings/util.jl b/base/strings/util.jl index 1f6777e7c6c0f..da299d538a55f 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -58,10 +58,12 @@ function endswith(a::AbstractString, b::AbstractString) end endswith(str::AbstractString, chars::Chars) = !isempty(str) && last(str) in chars -startswith(a::String, b::String) = - (sizeof(a) >= sizeof(b) && ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, sizeof(b)) == 0) -startswith(a::Vector{UInt8}, b::Vector{UInt8}) = - (length(a) >= length(b) && ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, length(b)) == 0) +# FIXME: check that end of `b` doesn't match a partial character 
in `a` +startswith(a::String, b::String) = sizeof(a) ≥ sizeof(b) && + ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, sizeof(b)) == 0 + +startswith(a::Vector{UInt8}, b::Vector{UInt8}) = length(a) ≥ length(b) && + ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, length(b)) == 0 # TODO: fast endswith @@ -88,15 +90,9 @@ julia> chop(a, 5, 5) "" ``` """ -function chop(s::AbstractString, head::Integer, tail::Integer) - # negative values of head/tail will throw error in nextind/prevind - headidx = head == 0 ? start(s) : nextind(s, start(s), head) - tailidx = tail == 0 ? endof(s) : prevind(s, endof(s), tail) - SubString(s, headidx, tailidx) -end - -# no head/tail version left for performance reasons chop(s::AbstractString) = SubString(s, start(s), prevind(s, endof(s))) +chop(s::AbstractString, head::Integer, tail::Integer) = + SubString(s, nextind(s, start(s), head), prevind(s, endof(s), tail)) """ chomp(s::AbstractString) @@ -127,17 +123,6 @@ function chomp(s::String) end end -# NOTE: use with caution -- breaks the immutable string convention! -# TODO: this is hard to provide with the new representation -#function chomp!(s::String) -# if !isempty(s) && codeunit(s,sizeof(s)) == 0x0a -# n = (endof(s) < 2 || s.data[end-1] != 0x0d) ? 1 : 2 -# ccall(:jl_array_del_end, Void, (Any, UInt), s.data, n) -# end -# return s -#end -chomp!(s::AbstractString) = chomp(s) # copying fallback for other string types - const _default_delims = [' ','\t','\n','\v','\f','\r'] """ @@ -449,6 +434,7 @@ replace(s::AbstractString, pat, f) = replace_new(String(s), pat, f, typemax(Int) # replace(s::AbstractString, pat, f, count::Integer=typemax(Int)) = # replace(String(s), pat, f, count) +# TODO: allow transform as the first argument to replace? 
# hex <-> bytes conversion @@ -550,7 +536,8 @@ end # check for pure ASCII-ness function ascii(s::String) - for (i, b) in enumerate(Vector{UInt8}(s)) + for i = 1:sizeof(s) + b = codeunit(s,i) b < 0x80 || throw(ArgumentError("invalid ASCII at index $i in $(repr(s))")) end return s diff --git a/src/ast.c b/src/ast.c index d54e5581fab89..fba225b231feb 100644 --- a/src/ast.c +++ b/src/ast.c @@ -557,7 +557,17 @@ static jl_value_t *scm_to_julia_(fl_context_t *fl_ctx, value_t e, jl_module_t *m return (jl_value_t*)ex; } if (iscprim(e) && cp_class((cprim_t*)ptr(e)) == fl_ctx->wchartype) { - return jl_box32(jl_char_type, *(int32_t*)cp_data((cprim_t*)ptr(e))); + uint32_t c, u = *(uint32_t*)cp_data((cprim_t*)ptr(e)); + if (u < 0x80) { + c = u << 24; + } else { + c = ((u << 0) & 0x0000003f) | ((u << 2) & 0x00003f00) | + ((u << 4) & 0x003f0000) | ((u << 6) & 0x3f000000); + c = u < 0x00000800 ? (c << 16) | 0xc0800000 : + u < 0x00010000 ? (c << 8) | 0xe0808000 : + (c << 0) | 0xf0808080 ; + } + return jl_box_char(c); } if (iscvalue(e) && cv_class((cvalue_t*)ptr(e)) == jl_ast_ctx(fl_ctx)->jvtype) { return *(jl_value_t**)cv_data((cvalue_t*)ptr(e)); diff --git a/src/datatype.c b/src/datatype.c index 41f5cdb62ac70..edf94df39591c 100644 --- a/src/datatype.c +++ b/src/datatype.c @@ -640,7 +640,6 @@ SIBOX_FUNC(int16, int16_t, 1) SIBOX_FUNC(int32, int32_t, 1) UIBOX_FUNC(uint16, uint16_t, 1) UIBOX_FUNC(uint32, uint32_t, 1) -UIBOX_FUNC(char, uint32_t, 1) UIBOX_FUNC(ssavalue, size_t, 1) UIBOX_FUNC(slotnumber, size_t, 1) #ifdef _P64 @@ -651,6 +650,17 @@ SIBOX_FUNC(int64, int64_t, 2) UIBOX_FUNC(uint64, uint64_t, 2) #endif +static jl_value_t *boxed_char_cache[128]; +JL_DLLEXPORT jl_value_t *jl_box_char(uint32_t x) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + if (0 < (int32_t)x) + return boxed_char_cache[x >> 24]; + jl_value_t *v = jl_gc_alloc(ptls, sizeof(void*), jl_char_type); + *(uint32_t*)jl_data_ptr(v) = x; + return v; +} + static jl_value_t *boxed_int8_cache[256]; JL_DLLEXPORT jl_value_t 
*jl_box_int8(int8_t x) { @@ -684,14 +694,16 @@ void jl_init_int32_int64_cache(void) void jl_init_box_caches(void) { int64_t i; + for(i=0; i < 128; i++) { + boxed_char_cache[i] = jl_permbox32(jl_char_type, i << 24); + } for(i=0; i < 256; i++) { - boxed_int8_cache[i] = jl_permbox8(jl_int8_type, i); + boxed_int8_cache[i] = jl_permbox8(jl_int8_type, i); } for(i=0; i < NBOX_C; i++) { boxed_int16_cache[i] = jl_permbox16(jl_int16_type, i-NBOX_C/2); boxed_uint16_cache[i] = jl_permbox16(jl_uint16_type, i); boxed_uint32_cache[i] = jl_permbox32(jl_uint32_type, i); - boxed_char_cache[i] = jl_permbox32(jl_char_type, i); boxed_uint64_cache[i] = jl_permbox64(jl_uint64_type, i); } } diff --git a/src/jl_uv.c b/src/jl_uv.c index 77719693eb943..4753655bbdd9d 100644 --- a/src/jl_uv.c +++ b/src/jl_uv.c @@ -490,10 +490,21 @@ JL_DLLEXPORT void jl_uv_putb(uv_stream_t *stream, uint8_t b) jl_uv_puts(stream, (char*)&b, 1); } -JL_DLLEXPORT void jl_uv_putc(uv_stream_t *stream, uint32_t wchar) +JL_DLLEXPORT void jl_uv_putc(uv_stream_t *stream, uint32_t c) { char s[4]; - jl_uv_puts(stream, s, u8_wc_toutf8(s, wchar)); + int n = 1; + s[0] = c >> 24; + if ((s[1] = c >> 16)) { + n++; + if ((s[2] = c >> 8)) { + n++; + if ((s[3] = c)) { + n++; + } + } + } + jl_uv_puts(stream, s, n); } extern int vasprintf(char **str, const char *fmt, va_list ap); diff --git a/stdlib/Test/src/Test.jl b/stdlib/Test/src/Test.jl index 916834c42bf3b..97d46e237343e 100644 --- a/stdlib/Test/src/Test.jl +++ b/stdlib/Test/src/Test.jl @@ -1396,8 +1396,11 @@ with string types besides the standard `String` type. 
struct GenericString <: AbstractString string::AbstractString end -Base.endof(s::GenericString) = endof(s.string) -Base.next(s::GenericString, i::Int) = next(s.string, i) +Base.ncodeunits(s::GenericString) = ncodeunits(s.string) +Base.codeunit(s::GenericString) = codeunit(s.string) +Base.codeunit(s::GenericString, i::Integer) = codeunit(s.string, i) +Base.isvalid(s::GenericString, i::Integer) = isvalid(s.string, i) +Base.next(s::GenericString, i::Integer) = next(s.string, i) Base.reverse(s::GenericString) = GenericString(reverse(s.string)) Base.reverse(s::SubString{GenericString}) = GenericString(typeof(s.string)(reverse(String(s)))) diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl index 1a332e638a971..bcf3943c8b423 100644 --- a/stdlib/Unicode/test/runtests.jl +++ b/stdlib/Unicode/test/runtests.jl @@ -382,8 +382,8 @@ end foobar(ch) = Char(0xd800) foobaz(ch) = reinterpret(Char, typemax(UInt32)) @test_throws ArgumentError map(foomap, GenericString(str)) - @test map(foobar, GenericString(str)) == String(repeat(b"\ud800", outer=[17])) - @test map(foobaz, GenericString(str)) == String(repeat(b"\ufffd", outer=[17])) + @test map(foobar, GenericString(str)) == String(repeat(b"\ud800", outer=[length(str)])) + @test map(foobaz, GenericString(str)) == String(repeat([0xff], outer=[4*length(str)])) @test "a".*["b","c"] == ["ab","ac"] @test ["b","c"].*"a" == ["ba","ca"] diff --git a/test/char.jl b/test/char.jl index c40f60de3be23..85b2acf5385ef 100644 --- a/test/char.jl +++ b/test/char.jl @@ -198,3 +198,25 @@ end @test sprint(show, "text/plain", '$') == "'\$': ASCII/Unicode U+0024 (category Sc: Symbol, currency)" @test repr('$') == "'\$'" + +@testset "read incomplete character at end of stream or file" begin + local file = tempname() + local iob = IOBuffer([0xf0]) + local bytes(c::Char) = Vector{UInt8}(string(c)) + @test bytes(read(iob, Char)) == [0xf0] + @test eof(iob) + try + write(file, 0xf0) + open(file) do io + @test bytes(read(io, Char)) == 
[0xf0] + @test eof(io) + end + let io = Base.Filesystem.open(file, Base.Filesystem.JL_O_RDONLY) + @test bytes(read(io, Char)) == [0xf0] + @test eof(io) + close(io) + end + finally + rm(file, force=true) + end +end diff --git a/test/intfuncs.jl b/test/intfuncs.jl index 779ce240add9a..062d1103c530f 100644 --- a/test/intfuncs.jl +++ b/test/intfuncs.jl @@ -134,7 +134,7 @@ end @test base(2, 5, 7) == "0000101" @test bitstring(Int16(3)) == "0000000000000011" - @test bitstring('3') == "00000000000000000000000000110011" + @test bitstring('3') == "00110011000000000000000000000000" @test bitstring(1035) == (Int == Int32 ? "00000000000000000000010000001011" : "0000000000000000000000000000000000000000000000000000010000001011") @test bitstring(Int128(3)) == "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000011" diff --git a/test/lineedit.jl b/test/lineedit.jl index de997ccfbcb98..571ed8a75d36b 100644 --- a/test/lineedit.jl +++ b/test/lineedit.jl @@ -16,8 +16,8 @@ function new_state() LineEdit.init_state(term, ModalInterface([Prompt("test> ")])) end -charseek(buf, i) = seek(buf, Base.unsafe_chr2ind(content(buf), i+1)-1) -charpos(buf, pos=position(buf)) = Base.unsafe_ind2chr(content(buf), pos+1)-1 +charseek(buf, i) = seek(buf, chr2ind(content(buf), i+1)-1) +charpos(buf, pos=position(buf)) = ind2chr(content(buf), pos+1)-1 function transform!(f, s, i = -1) # i is char-based (not bytes) buffer position buf = buffer(s) diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 1ad8ca9184e74..de46027782d8c 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -99,14 +99,14 @@ end end @testset "issue #7248" begin - @test_throws BoundsError ind2chr("hello", -1) - @test_throws BoundsError chr2ind("hello", -1) - @test_throws BoundsError ind2chr("hellø", -1) - @test_throws BoundsError chr2ind("hellø", -1) - @test_throws BoundsError ind2chr("hello", 10) - @test_throws BoundsError chr2ind("hello", 
10) - @test_throws BoundsError ind2chr("hellø", 10) - @test_throws BoundsError chr2ind("hellø", 10) + @test ind2chr("hello", -1) == -1 + @test chr2ind("hello", -1) == -1 + @test ind2chr("hellø", -1) == -1 + @test chr2ind("hellø", -1) == -1 + @test ind2chr("hello", 10) == 10 + @test chr2ind("hello", 10) == 10 + @test ind2chr("hellø", 10) == 9 + @test chr2ind("hellø", 10) == 11 @test_throws BoundsError checkbounds("hello", 0) @test_throws BoundsError checkbounds("hello", 6) @test_throws BoundsError checkbounds("hello", 0:3) @@ -127,7 +127,6 @@ end @test SubString("hellø", 1, 5)[10:9] == "" @test SubString("hellø", 1, 0)[10:9] == "" @test SubString("", 1, 0)[10:9] == "" - @test_throws BoundsError SubString("", 1, 6) @test_throws BoundsError SubString("", 1, 1) end @@ -143,8 +142,8 @@ end @test get(utf8_str, -1, 'X') == 'X' @test get(utf8_str, 1000, 'X') == 'X' - # Test that indexing into the middle of a character returns the default - @test get(utf8_str, 2, 'X') == 'X' + # Test that indexing into the middle of a character throws + @test_throws UnicodeError get(utf8_str, 2, 'X') end #= @@ -172,8 +171,10 @@ end # make sure substrings do not accept code unit if it is not start of codepoint let s = "x\u0302" + @test s[1:2] == s + @test_throws BoundsError s[0:3] + @test_throws BoundsError s[1:4] @test_throws UnicodeError s[1:3] - @test s[1:2]==s end @testset "issue #9781" begin @@ -193,8 +194,15 @@ struct tstStringType <: AbstractString end @testset "AbstractString functions" begin tstr = tstStringType(Vector{UInt8}("12")) - @test_throws ErrorException endof(tstr) - @test_throws ErrorException next(tstr, Bool(1)) + @test_throws MethodError ncodeunits(tstr) + @test_throws MethodError codeunit(tstr) + @test_throws MethodError codeunit(tstr, 1) + @test_throws MethodError codeunit(tstr, true) + @test_throws MethodError isvalid(tstr, 1) + @test_throws MethodError isvalid(tstr, true) + @test_throws MethodError next(tstr, 1) + @test_throws MethodError next(tstr, true) + 
@test_throws MethodError endof(tstr) gstr = GenericString("12") @test string(gstr) isa GenericString @@ -213,18 +221,19 @@ end @test done(eachindex("foobar"),7) @test eltype(Base.EachStringIndex) == Int @test map(Base.Unicode.uppercase, "foó") == "FOÓ" - @test chr2ind("fóobar",3) == 4 - - @test Symbol(gstr)==Symbol("12") + @test chr2ind("fóobar", 3) == 4 - @test_throws ErrorException sizeof(gstr) + @test Symbol(gstr) == Symbol("12") - @test length(GenericString(""))==0 + @test sizeof(gstr) == 2 + @test ncodeunits(gstr) == 2 + @test length(gstr) == 2 + @test length(GenericString("")) == 0 @test nextind(1:1, 1) == 2 @test nextind([1], 1) == 2 - @test ind2chr(gstr,2)==2 + @test ind2chr(gstr, 2) == 2 # tests promote_rule let svec = [s"12", GenericString("12"), SubString("123", 1, 2)] @@ -421,7 +430,7 @@ end @test_throws ArgumentError ascii(GenericString("Hello, ∀")) end @testset "issue #17271: endof() doesn't throw an error even with invalid strings" begin - @test endof(String(b"\x90")) == 0 + @test endof(String(b"\x90")) == 1 @test endof(String(b"\xce")) == 1 end # issue #17624, missing getindex method for String @@ -503,7 +512,7 @@ end SubString("123∀α>β:α+1>β123", 4, 18), SubString(s"123∀α>β:α+1>β123", 4, 18)] for s in strs - @test thisind(s, -2) == 0 + @test thisind(s, -2) == -2 @test thisind(s, 0) == 0 @test thisind(s, 1) == 1 @test thisind(s, 2) == 1 @@ -514,13 +523,13 @@ end @test thisind(s, 15) == 15 @test thisind(s, 16) == 15 @test thisind(s, 17) == 17 - @test thisind(s, 30) == 17 + @test thisind(s, 30) == 30 end end let strs = Any["", s"", SubString("123", 2, 1), SubString(s"123", 2, 1)] for s in strs, i in -2:2 - @test thisind(s, i) == (i > 0) + @test thisind(s, i) == i end end end @@ -545,17 +554,18 @@ end @test prevind(strs[i], 15, 4) == 10 @test prevind(strs[i], 15, 10) == 0 @test prevind(strs[i], 15, 9) == 1 - @test prevind(strs[i], 15, 10) == 0 @test prevind(strs[i], 16) == 15 @test prevind(strs[i], 16, 1) == 15 @test prevind(strs[i], 16, 2) == 14 - 
@test prevind(strs[i], 20) == 15 - @test prevind(strs[i], 20, 1) == 15 - @test prevind(strs[i], 20, 10) == 1 - @test_throws ArgumentError prevind(strs[i], 20, 0) - - @test nextind(strs[i], -1) == 1 - @test nextind(strs[i], -1, 1) == 1 + @test prevind(strs[i], 20) == 19 + @test prevind(strs[i], 20, 1) == 19 + @test prevind(strs[i], 20, 10) == 7 + @test prevind(strs[i], 20, 0) == 20 + + @test nextind(strs[i], -1) == 0 + @test nextind(strs[i], -1, 1) == 0 + @test nextind(strs[i], -1, 2) == 1 + @test nextind(strs[i], -1, 3) == 4 @test nextind(strs[i], 0, 2) == 4 @test nextind(strs[i], 0, 20) == 26 @test nextind(strs[i], 0, 10) == 15 @@ -576,7 +586,7 @@ end @test nextind(strs[i], 15, 1) == 17 @test nextind(strs[i], 20) == 21 @test nextind(strs[i], 20, 1) == 21 - @test_throws ArgumentError nextind(strs[i], 20, 0) + @test nextind(strs[i], 20, 0) == 20 for x in -10:20 n = p = x @@ -591,8 +601,8 @@ end @test prevind(strs[1], -1) == -2 @test prevind(strs[1], -1, 1) == -2 - @test prevind(strs[2], -1) == 0 - @test prevind(strs[2], -1, 1) == 0 + @test prevind(strs[2], -1) == -2 + @test prevind(strs[2], -1, 1) == -2 end end @@ -605,7 +615,7 @@ end @test first(s, 3) == "∀ϵ≠" @test first(s, 4) == "∀ϵ≠0" @test first(s, length(s)) == s - @test_throws BoundsError first(s, length(s)+1) + @test first(s, length(s)+1) == s @test_throws ArgumentError last(s, -1) @test last(s, 0) == "" @test last(s, 1) == "0" @@ -613,21 +623,13 @@ end @test last(s, 3) == "²>0" @test last(s, 4) == "ϵ²>0" @test last(s, length(s)) == s - @test_throws BoundsError last(s, length(s)+1) + @test last(s, length(s)+1) == s end @testset "invalid code point" begin s = String([0x61, 0xba, 0x41]) @test !isvalid(s) - @test_throws UnicodeError s[2] - e = try - s[2] - catch e - e - end - b = IOBuffer() - show(b, e) - @test String(take!(b)) == "UnicodeError: invalid character index 2 (0xba is a continuation byte)" + @test s[2] == reinterpret(Char, UInt32(0xba) << 24) end @testset "ncodeunits" begin diff --git 
a/test/strings/io.jl b/test/strings/io.jl index ad770804d8e21..e320e97712242 100644 --- a/test/strings/io.jl +++ b/test/strings/io.jl @@ -172,8 +172,7 @@ myio = IOBuffer() join(myio, "", "", 1) @test isempty(take!(myio)) -@testset "unescape_chars" begin - @test Base.unescape_chars("\\t","t") == "t" +@testset "unescape_string ArgumentErrors" begin @test_throws ArgumentError unescape_string(IOBuffer(), string('\\',"xZ")) @test_throws ArgumentError unescape_string(IOBuffer(), string('\\',"777")) end diff --git a/test/strings/types.jl b/test/strings/types.jl index 12dd75a1bd421..00bac71f826b8 100644 --- a/test/strings/types.jl +++ b/test/strings/types.jl @@ -32,12 +32,21 @@ for idx in 0:1 end # Substring provided with invalid end index throws BoundsError -@test_throws BoundsError SubString("∀", 1, 2) -@test_throws BoundsError SubString("∀", 1, 3) +@test_throws UnicodeError SubString("∀", 1, 2) +@test_throws UnicodeError SubString("∀", 1, 3) @test_throws BoundsError SubString("∀", 1, 4) # Substring provided with invalid start index throws BoundsError -@test_throws BoundsError SubString("∀∀", 2:4) +@test SubString("∀∀", 1:1) == "∀" +@test SubString("∀∀", 1:4) == "∀∀" +@test SubString("∀∀", 4:4) == "∀" +@test_throws UnicodeError SubString("∀∀", 1:2) +@test_throws UnicodeError SubString("∀∀", 1:5) +@test_throws UnicodeError SubString("∀∀", 2:4) +@test_throws BoundsError SubString("∀∀", 0:1) +@test_throws BoundsError SubString("∀∀", 0:4) +@test_throws BoundsError SubString("∀∀", 1:7) +@test_throws BoundsError SubString("∀∀", 4:7) # tests for SubString of more than one multibyte `Char` string # we are consistent with `getindex` for `String` @@ -46,10 +55,12 @@ for idx in [0, 1, 4] @test SubString("∀∀", 4, idx) == "∀∀"[4:idx] end -# second index beyond endof("∀∀") -for idx in 5:8 +# index beyond endof("∀∀") +for idx in [2:3; 5:6] + @test_throws UnicodeError SubString("∀∀", 1, idx) +end +for idx in 7:8 @test_throws BoundsError SubString("∀∀", 1, idx) - @test_throws BoundsError 
SubString("∀∀", 4, idx) end let str="tempus fugit" #length(str)==12 @@ -65,13 +76,13 @@ let str="tempus fugit" #length(str)==12 ss=SubString(str,1:0) @test length(ss)==0 - @test_throws BoundsError SubString(str,14,20) #start indexing beyond source string length - @test_throws BoundsError SubString(str,10,16) #end indexing beyond source string length + @test_throws BoundsError SubString(str, 14, 20) #start indexing beyond source string length + @test_throws BoundsError SubString(str, 10, 16) #end indexing beyond source string length @test_throws BoundsError SubString("", 1, 4) #empty source string @test_throws BoundsError SubString("", 1, 1) #empty source string, identical start and end index @test_throws BoundsError SubString("", 10, 12) - @test SubString("",12,10) == "" + @test SubString("", 12, 10) == "" end @test SubString("foobar", big(1), big(3)) == "foo" @@ -83,7 +94,7 @@ let str = "aa\u2200\u2222bb" write(b, u) @test String(take!(b)) == "\u2200\u2222" - @test_throws BoundsError SubString(str, 4, 5) + @test_throws UnicodeError SubString(str, 4, 5) @test_throws BoundsError next(u, 0) @test_throws BoundsError next(u, 7) @test_throws BoundsError getindex(u, 0) @@ -147,64 +158,69 @@ end @test ismatch(Regex(""), SubString("",1,0)) # isvalid(), chr2ind() and ind2chr() for SubString{String} -let ss, s="lorem ipsum", - sdict=Dict(SubString(s,1,11)=>s, - SubString(s,1,6)=>"lorem ", - SubString(s,1,0)=>"", - SubString(s,2,4)=>"ore", - SubString(s,2,11)=>"orem ipsum", - SubString(s,15,14)=>"" - ) - for (ss,s) in sdict - local ss - for i in -1:12 - @test isvalid(ss,i)==isvalid(s,i) +let s = "lorem ipsum", sdict = Dict( + SubString(s, 1, 11) => "lorem ipsum", + SubString(s, 1, 6) => "lorem ", + SubString(s, 1, 0) => "", + SubString(s, 2, 4) => "ore", + SubString(s, 2, 11) => "orem ipsum", + SubString(s, 15, 14) => "", +) + for (ss, s) in sdict + @test ncodeunits(ss) == ncodeunits(s) + for i in -2:13 + if 1 ≤ i ≤ ncodeunits(ss) + @test isvalid(ss, i) == isvalid(s, i) + 
else + @test_throws BoundsError isvalid(ss, i) + @test_throws BoundsError isvalid(s, i) + end end - end - for (ss,s) in sdict - local ss - for i in 1:length(ss) - @test ind2chr(ss,i)==ind2chr(s,i) + for i in 1:ncodeunits(ss) + @test ind2chr(ss, i) == ind2chr(s, i) end end - for (ss,s) in sdict - local ss + for (ss, s) in sdict + @test length(ss) == length(s) for i in 1:length(ss) - @test chr2ind(ss,i)==chr2ind(s,i) + @test chr2ind(ss, i) == chr2ind(s, i) end end -end #let +end -#for isvalid(SubString{String}) +# for isvalid(SubString{String}) let s = "Σx + βz - 2" - for i in -1:(length(s)+2) - if isvalid(s, i) - ss=SubString(s,1,i) - # make sure isvalid gives equivalent results for SubString and String - @test isvalid(ss,i)==isvalid(s,i) - else - if i > 0 - @test_throws BoundsError SubString(s,1,i) + for i in -1:ncodeunits(s)+2 + if checkbounds(Bool, s, i) + if isvalid(s, i) + ss = SubString(s, 1, i) + for j = 1:ncodeunits(ss) + @test isvalid(ss, j) == isvalid(s, j) + end else - @test SubString(s,1,i) == "" + @test_throws UnicodeError SubString(s, 1, i) end + elseif i > 0 + @test_throws BoundsError SubString(s, 1, i) + else + @test SubString(s, 1, i) == "" end end end -let ss=SubString("hello",1,5) - @test_throws BoundsError ind2chr(ss, -1) - @test_throws BoundsError chr2ind(ss, -1) - @test_throws BoundsError chr2ind(ss, 10) - @test_throws BoundsError ind2chr(ss, 10) +let ss = SubString("hello", 1, 5) + @test ind2chr(ss, -1) == -1 + @test chr2ind(ss, -1) == -1 + @test chr2ind(ss, 10) == 10 + @test ind2chr(ss, 10) == 10 end # length(SubString{String}) performance specialization let s = "|η(α)-ϕ(κ)| < ε" - @test length(SubString(s,1,0))==length(s[1:0]) - @test length(SubString(s,4,4))==length(s[4:4]) - @test length(SubString(s,1,7))==length(s[1:7]) - @test length(SubString(s,4,11))==length(s[4:11]) + @test length(SubString(s, 1, 0)) == length(s[1:0]) + @test length(SubString(s, 4, 4)) == length(s[4:4]) + @test length(SubString(s, 1, 7)) == length(s[1:7]) + @test 
length(SubString(s, 4, 11)) == length(s[4:11]) end @testset "reverseind" for T in (String, SubString, GenericString) @@ -217,7 +233,8 @@ end @test c == s[reverseind(s, ri)] == r[ri] s = convert(T, string(prefix, prefix, c, suffix, suffix)) pre = convert(T, prefix) - sb = SubString(s, nextind(pre, endof(pre)), endof(convert(T, string(prefix, prefix, c, suffix)))) + sb = SubString(s, nextind(pre, endof(pre)), + endof(convert(T, string(prefix, prefix, c, suffix)))) r = reverse(sb) ri = search(r, c) @test c == sb[reverseind(sb, ri)] == r[ri] diff --git a/test/unicode/utf8.jl b/test/unicode/utf8.jl index a9db6316d2fa9..c65934217dfb9 100644 --- a/test/unicode/utf8.jl +++ b/test/unicode/utf8.jl @@ -1,24 +1,13 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -@testset "cesu8 input" begin - let ch = 0x10000 - for hi = 0xd800:0xdbff - for lo = 0xdc00:0xdfff - @test String(Vector{UInt8}(String(Char[hi, lo]))) == string(Char(ch)) - ch += 1 - end - end - end -end - @testset "string indexing" begin let str = String(b"this is a test\xed\x80") - @test next(str, 15) == ('\ufffd', 16) + @test next(str, 15) == (reinterpret(Char, 0xed800000), 17) @test_throws BoundsError getindex(str, 0:3) @test_throws BoundsError getindex(str, 17:18) @test_throws BoundsError getindex(str, 2:17) - @test_throws UnicodeError getindex(str, 16:17) - @test string(Char(0x110000)) == "\ufffd" + @test_throws BoundsError getindex(str, 16:17) + @test string(Char(0x110000)) == String(b"\xf4\x90\x80\x80") end end @@ -36,12 +25,12 @@ end b"xyz\xf0\x80" => b"\xf0\x80zyx", b"xyz\xf0\x80\x80" => b"\xf0\x80\x80zyx", ] - @test_broken reverse(String(s)) == String(r) + @test reverse(String(s)) == String(r) end end @testset "string convert" begin @test String(b"this is a test\xed\x80\x80") == "this is a test\ud000" - ## Specifically check UTF-8 string whose lead byte is same as a surrogate + # Specifically check UTF-8 string whose lead byte is same as a surrogate @test 
String(b"\xed\x9f\xbf") == "\ud7ff" end From 274cf84e51873063be282d9297eab4d55edd8eac Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Fri, 8 Dec 2017 18:44:23 -0500 Subject: [PATCH 02/22] LineEdit: use character syntax for wildcard character --- base/repl/LineEdit.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/base/repl/LineEdit.jl b/base/repl/LineEdit.jl index 607740573b0e9..deda700e72e80 100644 --- a/base/repl/LineEdit.jl +++ b/base/repl/LineEdit.jl @@ -1197,12 +1197,12 @@ end ### Keymap Support -const wildcard = Char(0x0010f7ff) # "Private Use" Char +const wildcard = '\U10f7ff' # "Private Use" Char normalize_key(key::Char) = string(key) normalize_key(key::Integer) = normalize_key(Char(key)) function normalize_key(key::AbstractString) - wildcard in key && error("Matching Char(0x0010f7ff) not supported.") + wildcard in key && error("Matching '\U10f7ff' not supported.") buf = IOBuffer() i = start(key) while !done(key, i) From 4d8b90f73cd4117a4f302ee928296378d97e1f72 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Fri, 8 Dec 2017 18:45:08 -0500 Subject: [PATCH 03/22] strings/string: use checkbounds helper in a few places --- base/strings/string.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 0e1ef86e6a759..f48fd5ebee764 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -64,7 +64,7 @@ ncodeunits(s::String) = Core.sizeof(s) codeunit(s::String) = UInt8 @inline function codeunit(s::String, i::Integer) - @boundscheck between(i, 1, ncodeunits(s)) || throw(BoundsError(s, i)) + @boundscheck checkbounds(s, i) @gc_preserve s unsafe_load(pointer(s, i)) end @@ -144,7 +144,7 @@ is_valid_continuation(c) = c & 0xc0 == 0x80 ## required core functionality ## function next(s::String, i::Int) - @boundscheck 1 ≤ i ≤ sizeof(s) || throw(BoundsError(s, i)) + @boundscheck checkbounds(s, i) @inbounds b = codeunit(s, i) # TODO: check index validity u = 
UInt32(b) << 24 @@ -178,9 +178,8 @@ end end function getindex(s::String, i::Int) - @boundscheck 1 ≤ i ≤ ncodeunits(s) || throw(BoundsError(s, i)) + @boundscheck checkbounds(s, i) @inbounds b = codeunit(s, i) - # TODO: check index validity u = UInt32(b) << 24 (b < 0x80) | (0xf8 ≤ b) && return reinterpret(Char, u) return getindex_continued(s, i, u) From 6e15be871bc7284d26181f8995c6b1faae18c4f9 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Fri, 8 Dec 2017 18:45:34 -0500 Subject: [PATCH 04/22] repeat(Char, Integer): update & simplify for new Char rep --- base/strings/string.jl | 67 ++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index f48fd5ebee764..3bf718c79865e 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -381,11 +381,12 @@ function repeat(s::String, r::Integer) r < 0 && throw(ArgumentError("can't repeat a string $r times")) n = sizeof(s) out = _string_n(n*r) - if n == 1 # common case: repeating a single ASCII char - @inbounds ccall(:memset, Ptr{Void}, (Ptr{UInt8}, Cint, Csize_t), out, codeunit(s, 1), r) + if n == 1 # common case: repeating a single-byte string + @inbounds b = codeunit(s, 1) + ccall(:memset, Ptr{Void}, (Ptr{UInt8}, Cint, Csize_t), out, b, r) else - for i=1:r - unsafe_copy!(pointer(out, 1+(i-1)*n), pointer(s), n) + for i = 0:r-1 + unsafe_copy!(pointer(out, i*n+1), pointer(s), n) end end return out @@ -403,43 +404,33 @@ julia> repeat('A', 3) ``` """ function repeat(c::Char, r::Integer) - r < 0 && throw(ArgumentError("can't repeat a character $r times")) r == 0 && return "" - ch = UInt(c) - if ch < 0x80 - out = _string_n(r) - ccall(:memset, Ptr{Void}, (Ptr{UInt8}, Cint, Csize_t), out, c, r) - elseif ch < 0x800 - out = _string_n(2r) - p16 = reinterpret(Ptr{UInt16}, pointer(out)) - u16 = ((ch >> 0x6) | (ch & 0x3f) << 0x8) % UInt16 | 0x80c0 - @inbounds for i = 1:r - unsafe_store!(p16, u16, i) + r < 0 && 
throw(ArgumentError("can't repeat a character $r times")) + u = bswap(reinterpret(UInt32, c)) + n = 4 - (leading_zeros(u | 0xff) >> 3) + s = _string_n(n*r) + p = pointer(s) + if n == 1 + ccall(:memset, Ptr{Void}, (Ptr{UInt8}, Cint, Csize_t), p, u % UInt8, r) + elseif n == 2 + p16 = reinterpret(Ptr{UInt16}, p) + for i = 1:r + unsafe_store!(p16, u % UInt16, i) end - elseif ch < 0x10000 - (0xd800 ≥ ch ≤ 0xdfff) || throw(ArgumentError("invalid character 0x$(hex(ch))")) - out = _string_n(3r) - p = pointer(out) - b1 = (ch >> 0xc) % UInt8 | 0xe0 - b2 = ((ch >> 0x6) & 0x3f) % UInt8 | 0x80 - b3 = (ch & 0x3f) % UInt8 | 0x80 - @inbounds for i = 1:r - unsafe_store!(p, b1) - unsafe_store!(p, b2, 2) - unsafe_store!(p, b3, 3) - p += 3 + elseif n == 3 + b1 = (u >> 0) % UInt8 + b2 = (u >> 8) % UInt8 + b3 = (u >> 16) % UInt8 + for i = 0:r-1 + unsafe_store!(p, b1, 3i + 1) + unsafe_store!(p, b2, 3i + 2) + unsafe_store!(p, b3, 3i + 3) end - elseif ch < 0x110000 - out = _string_n(4r) - p32 = reinterpret(Ptr{UInt32}, pointer(out)) - u32 = ((ch >> 0x12) | ((ch >> 0x4) & 0x03f00) | - ((ch << 0xa) & 0x3f0000) | ((ch & 0x3f) << 0x18)) % UInt32 | 0x808080f0 - @inbounds for i = 1:r - unsafe_store!(p32, u32) - p32 += 4 + elseif n == 4 + p32 = reinterpret(Ptr{UInt32}, pointer(s)) + for i = 1:r + unsafe_store!(p32, u, i) end - else - throw(ArgumentError("invalid character 0x$(hex(ch))")) end - return out + return s end From 5cd9263b686848f88d5ebb1c44f6b3d39ec37f8f Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Fri, 8 Dec 2017 19:54:34 -0500 Subject: [PATCH 05/22] replace UnicodeError with StringIndexError This type is appropriate to indicate that someone has indexed into the non-leading code unit of a character in a string, regardless of its encoding and the error is no longer used to indicate any other Unicode-related errors, so the name was no longer fitting. 
--- base/exports.jl | 2 +- base/strings/basic.jl | 2 +- base/strings/errors.jl | 15 --------------- base/strings/string.jl | 19 ++++++++++++------- base/strings/strings.jl | 1 - base/strings/substring.jl | 6 ++---- test/choosetests.jl | 2 +- test/strings/basic.jl | 6 +++--- test/strings/search.jl | 6 +++--- test/strings/types.jl | 16 ++++++++-------- test/unicode/UnicodeError.jl | 9 --------- 11 files changed, 31 insertions(+), 53 deletions(-) delete mode 100644 base/strings/errors.jl delete mode 100644 test/unicode/UnicodeError.jl diff --git a/base/exports.jl b/base/exports.jl index e5193eff91570..b884a3641f7f3 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -154,7 +154,7 @@ export NullException, ParseError, SystemError, - UnicodeError, + StringIndexError, # Global constants and variables ARGS, diff --git a/base/strings/basic.jl b/base/strings/basic.jl index 7ff30d09bc027..1e1b05fba1e71 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -107,7 +107,7 @@ julia> isvalid(str, 2) false julia> str[2] -ERROR: UnicodeError: invalid character index +ERROR: StringIndexError: invalid character index Stacktrace: [...] ``` diff --git a/base/strings/errors.jl b/base/strings/errors.jl deleted file mode 100644 index f97d21dc19321..0000000000000 --- a/base/strings/errors.jl +++ /dev/null @@ -1,15 +0,0 @@ -# This file is a part of Julia. 
License is MIT: https://julialang.org/license - -## Error messages for Unicode / UTF support - -const UTF_ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)" -const UTF_ERR_INVALID_INDEX = "invalid character index <<1>> (0x<<2>> is a continuation byte)" - -struct UnicodeError <: Exception - errmsg::AbstractString ##< A UTF_ERR_ message - errpos::Int32 ##< Position of invalid character - errchr::UInt32 ##< Invalid character -end - -show(io::IO, exc::UnicodeError) = print(io, replace(replace(string("UnicodeError: ",exc.errmsg), - "<<1>>",string(exc.errpos)),"<<2>>",hex(exc.errchr))) diff --git a/base/strings/string.jl b/base/strings/string.jl index 3bf718c79865e..47dc0326914cb 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -1,5 +1,12 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license +struct StringIndexError <: Exception + string::AbstractString + index::Integer +end +@noinline string_index_err(s::AbstractString, i::Integer) = + throw(StringIndexError(s, Int(i))) + const ByteArray = Union{Vector{UInt8},Vector{Int8}} @inline between(b::T, lo::T, hi::T) where {T<:Integer} = (lo ≤ b) & (b ≤ hi) @@ -155,7 +162,7 @@ end @noinline function next_continued(s::String, i::Int, u::UInt32) if u < 0xc0000000 isvalid(s, i) && (i += 1; @goto ret) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, (u >> 24) % UInt8)) + string_index_err(s, i) end n = ncodeunits(s) # first continuation byte @@ -188,7 +195,7 @@ end @noinline function getindex_continued(s::String, i::Int, u::UInt32) if u < 0xc0000000 isvalid(s, i) && @goto ret - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, (u >> 24) % UInt8)) + string_index_err(s, i) end n = ncodeunits(s) # first continuation byte @@ -217,10 +224,8 @@ function getindex(s::String, r::UnitRange{Int}) i, j = first(r), last(r) @boundscheck begin checkbounds(s, r) - @inbounds isvalid(s, i) || - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s, i))) 
- @inbounds isvalid(s, j) || - throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, codeunit(s, j))) + @inbounds isvalid(s, i) || string_index_err(s, i) + @inbounds isvalid(s, j) || string_index_err(s, j) end j = nextind(s, j) - 1 n = j - i + 1 @@ -284,7 +289,7 @@ function search(s::String, c::Char, i::Integer = 1) throw(BoundsError(s, i)) end @inbounds if is_valid_continuation(codeunit(s,i)) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s,i))) + string_index_err(s, i) end c ≤ '\x7f' && return search(s, c % UInt8, i) while true diff --git a/base/strings/strings.jl b/base/strings/strings.jl index 961f05cdc675a..91f10436a4e73 100644 --- a/base/strings/strings.jl +++ b/base/strings/strings.jl @@ -1,6 +1,5 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -include("strings/errors.jl") include("strings/substring.jl") include("strings/basic.jl") include("strings/search.jl") diff --git a/base/strings/substring.jl b/base/strings/substring.jl index 75dc64e8b01d7..4d33f89754d1c 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -28,10 +28,8 @@ struct SubString{T<:AbstractString} <: AbstractString i ≤ j || return new(s, i-1, 0) @boundscheck begin checkbounds(s, i:j) - @inbounds isvalid(s, i) || - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s, i))) - @inbounds isvalid(s, j) || - throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, codeunit(s, j))) + @inbounds isvalid(s, i) || string_index_err(s, i) + @inbounds isvalid(s, j) || string_index_err(s, j) end return new(s, i-1, nextind(s,j)-i) end diff --git a/test/choosetests.jl b/test/choosetests.jl index 4e324af14d7aa..b1057030069e4 100644 --- a/test/choosetests.jl +++ b/test/choosetests.jl @@ -82,7 +82,7 @@ function choosetests(choices = []) end - unicodetests = ["unicode/UnicodeError", "unicode/utf8"] + unicodetests = ["unicode/utf8"] if "unicode" in skip_tests filter!(x -> (x != "unicode" && !(x in unicodetests)), tests) elseif "unicode" in tests diff --git 
a/test/strings/basic.jl b/test/strings/basic.jl index de46027782d8c..f6165cacaef27 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -143,7 +143,7 @@ end @test get(utf8_str, 1000, 'X') == 'X' # Test that indexing into the middle of a character throws - @test_throws UnicodeError get(utf8_str, 2, 'X') + @test_throws StringIndexError get(utf8_str, 2, 'X') end #= @@ -174,7 +174,7 @@ let s = "x\u0302" @test s[1:2] == s @test_throws BoundsError s[0:3] @test_throws BoundsError s[1:4] - @test_throws UnicodeError s[1:3] + @test_throws StringIndexError s[1:3] end @testset "issue #9781" begin @@ -215,7 +215,7 @@ end @test gstr[[1]] == "1" @test s"∀∃"[big(1)] == '∀' - @test_throws UnicodeError GenericString("∀∃")[Int8(2)] + @test_throws StringIndexError GenericString("∀∃")[Int8(2)] @test_throws BoundsError GenericString("∀∃")[UInt16(10)] @test done(eachindex("foobar"),7) diff --git a/test/strings/search.jl b/test/strings/search.jl index 251d32ba5b256..c609066c4f05c 100644 --- a/test/strings/search.jl +++ b/test/strings/search.jl @@ -84,16 +84,16 @@ for str in (u8str, GenericString(u8str)) @test search(str, '\u80') == 0 @test search(str, '∄') == 0 @test search(str, '∀') == 1 - @test_throws UnicodeError search(str, '∀', 2) + @test_throws StringIndexError search(str, '∀', 2) @test search(str, '∀', 4) == 0 @test search(str, '∃') == 13 - @test_throws UnicodeError search(str, '∃', 15) + @test_throws StringIndexError search(str, '∃', 15) @test search(str, '∃', 16) == 0 @test search(str, 'x') == 26 @test search(str, 'x', 27) == 43 @test search(str, 'x', 44) == 0 @test search(str, 'δ') == 17 - @test_throws UnicodeError search(str, 'δ', 18) + @test_throws StringIndexError search(str, 'δ', 18) @test search(str, 'δ', nextind(str,17)) == 33 @test search(str, 'δ', nextind(str,33)) == 0 @test search(str, 'ε') == 5 diff --git a/test/strings/types.jl b/test/strings/types.jl index 00bac71f826b8..f89907c6f5b88 100644 --- a/test/strings/types.jl +++ b/test/strings/types.jl @@ 
-32,17 +32,17 @@ for idx in 0:1 end # Substring provided with invalid end index throws BoundsError -@test_throws UnicodeError SubString("∀", 1, 2) -@test_throws UnicodeError SubString("∀", 1, 3) +@test_throws StringIndexError SubString("∀", 1, 2) +@test_throws StringIndexError SubString("∀", 1, 3) @test_throws BoundsError SubString("∀", 1, 4) # Substring provided with invalid start index throws BoundsError @test SubString("∀∀", 1:1) == "∀" @test SubString("∀∀", 1:4) == "∀∀" @test SubString("∀∀", 4:4) == "∀" -@test_throws UnicodeError SubString("∀∀", 1:2) -@test_throws UnicodeError SubString("∀∀", 1:5) -@test_throws UnicodeError SubString("∀∀", 2:4) +@test_throws StringIndexError SubString("∀∀", 1:2) +@test_throws StringIndexError SubString("∀∀", 1:5) +@test_throws StringIndexError SubString("∀∀", 2:4) @test_throws BoundsError SubString("∀∀", 0:1) @test_throws BoundsError SubString("∀∀", 0:4) @test_throws BoundsError SubString("∀∀", 1:7) @@ -57,7 +57,7 @@ end # index beyond endof("∀∀") for idx in [2:3; 5:6] - @test_throws UnicodeError SubString("∀∀", 1, idx) + @test_throws StringIndexError SubString("∀∀", 1, idx) end for idx in 7:8 @test_throws BoundsError SubString("∀∀", 1, idx) @@ -94,7 +94,7 @@ let str = "aa\u2200\u2222bb" write(b, u) @test String(take!(b)) == "\u2200\u2222" - @test_throws UnicodeError SubString(str, 4, 5) + @test_throws StringIndexError SubString(str, 4, 5) @test_throws BoundsError next(u, 0) @test_throws BoundsError next(u, 7) @test_throws BoundsError getindex(u, 0) @@ -198,7 +198,7 @@ let s = "Σx + βz - 2" @test isvalid(ss, j) == isvalid(s, j) end else - @test_throws UnicodeError SubString(s, 1, i) + @test_throws StringIndexError SubString(s, 1, i) end elseif i > 0 @test_throws BoundsError SubString(s, 1, i) diff --git a/test/unicode/UnicodeError.jl b/test/unicode/UnicodeError.jl deleted file mode 100644 index a5665dff49bbe..0000000000000 --- a/test/unicode/UnicodeError.jl +++ /dev/null @@ -1,9 +0,0 @@ -# This file is a part of Julia. 
License is MIT: https://julialang.org/license - -@testset "invalid utf8" begin - let io = IOBuffer() - show(io, UnicodeError(Base.UTF_ERR_SHORT, 1, 10)) - check = "UnicodeError: invalid UTF-8 sequence starting at index 1 (0xa missing one or more continuation bytes)" - @test String(take!(io)) == check - end -end From 37aff062a620594df2df55e398d7daa009369efb Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Sat, 9 Dec 2017 13:53:42 -0500 Subject: [PATCH 06/22] See also: use `[name](@ref)` cross reference links. --- base/strings/basic.jl | 14 +++++++------- base/strings/unicode.jl | 6 ++++-- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/base/strings/basic.jl b/base/strings/basic.jl index 1e1b05fba1e71..4ec5c62e707ed 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -30,7 +30,7 @@ indexing functions include those intended for index arithmetic: `thisind`, bounds indices as intermediate values so long as one never uses them to retrieve a character, which often helps avoid needing to code around edge cases. -See also: `codeunit`, `ncodeunits`, `thisind`, `nextind`, `prevind` +See also: [`codeunit`](@ref), [`ncodeunits`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref) """ AbstractString @@ -44,7 +44,7 @@ access this string must satisfy `1 ≤ i ≤ ncodeunits(s)`. Not all such indic are valid – they may not be the start of a character, but they will return a code unit value when calling `codeunit(s,i)`. -See also: `codeunit`, `checkbounds`, `sizeof`, `length`, `endof` +See also: [`codeunit`](@ref), [`checkbounds`](@ref), [`sizeof`](@ref), [`length`](@ref), [`endof`](@ref) """ ncodeunits(s::AbstractString) @@ -58,7 +58,7 @@ limited to these three types, but it's hard to think of widely used string encodings that don't use one of these units. `codeunit(s)` is the same as `typeof(codeunit(s,1))` when `s` is a non-empty string. 
-See also: `ncodeunits` +See also: [`ncodeunits`](@ref) """ codeunit(s::AbstractString) @@ -72,7 +72,7 @@ Return the code unit value in the string `s` at index `i`. Note that I.e. the value returned by `codeunit(s, i)` is of the type returned by `codeunit(s)`. -See also: `ncodeunits`, `checkbounds` +See also: [`ncodeunits`](@ref), [`checkbounds`](@ref) """ codeunit(s::AbstractString, i::Integer) = typeof(i) === Int ? throw(MethodError(codeunit, Tuple{typeof(s),Int})) : @@ -90,7 +90,7 @@ In order for `isvalid(s, i)` to be an O(1) function, the encoding of `s` must be [self-synchronizing](https://en.wikipedia.org/wiki/Self-synchronizing_code); this is a basic assumption of Julia's generic string support. -See also: `getindex`, `next`, `thisind`, `nextind`, `prevind`, `length` +See also: [`getindex`](@ref), [`next`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref), [`length`](@ref) # Examples @@ -125,7 +125,7 @@ be iterated, yielding a sequences of characters. If `i` is out of bounds in `s` then a bounds error is raised; if `i` is not a valid character index in `s` then a Unicode index error is raised. -See also: `getindex`, `start`, `done`, `checkbounds` +See also: [`getindex`](@ref), [`start`](@ref), [`done`](@ref), [`checkbounds`](@ref) """ next(s::AbstractString, i::Integer) = typeof(i) === Int ? throw(MethodError(next, Tuple{typeof(s),Int})) : @@ -303,7 +303,7 @@ number of characters in the entire string. If `lo` or `hi` are out of ranges each out of range code unit is considered to be one character. This matches the "loose" indexing model of `thisind`, `nextind` and `prevind`. 
-See also: `isvalid`, `ncodeunits`, `endof`, `thisind`, `nextind`, `prevind` +See also: [`isvalid`](@ref), [`ncodeunits`](@ref), [`endof`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref) # Examples ```jldoctest diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index 202b481896a7e..64b5f6d5611ed 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -710,7 +710,8 @@ Return `s` with the first character converted to uppercase (technically "title case" for Unicode). See also [`titlecase`](@ref) to capitalize the first character of every word in `s`. -See also: `lcfirst`, `uppercase`, `lowercase`, `titlecase` +See also: [`lcfirst`](@ref), [`uppercase`](@ref), [`lowercase`](@ref), +[`titlecase`](@ref) # Examples ```jldoctest @@ -731,7 +732,8 @@ end Return `s` with the first character converted to lowercase. -See also: `ucfirst`, `uppercase`, `lowercase`, `titlecase` +See also: [`ucfirst`](@ref), [`uppercase`](@ref), [`lowercase`](@ref), +[`titlecase`](@ref) # Examples ```jldoctest From 7eb0def29c64d34b67766e66bc818a51748a5505 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Sat, 9 Dec 2017 14:27:59 -0500 Subject: [PATCH 07/22] address Milan's review comments --- base/strings/basic.jl | 53 +++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/base/strings/basic.jl b/base/strings/basic.jl index 4ec5c62e707ed..e190db531970e 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -16,19 +16,24 @@ about strings: * Each `Char` in a string is encoded by one or more code units * Only the index of the first code unit of a `Char` is a valid index * The encoding of a `Char` is independent of what precedes or follows it - * String encodings are "self-synchronizing" – i.e. 
`isvalid(s,i)` is O(1) - -Some string functions error if you use an out-of-bounds or invalid string index, -including code unit extraction `codeunit(s,i)`, string indexing `s[i]`, and -string iteration `next(s,i)`. Other string functions take a more relaxed -approach to indexing and give you the closest valid string index when in-bounds, -or when out-of-bounds, behave as if there were an infinite number of characters -padding each side of the string. Usually these imaginary padding characters have -code unit length `1`, but string types may choose different sizes. Relaxed -indexing functions include those intended for index arithmetic: `thisind`, -`nextind` and `prevind`. This model allows index arithmetic to work with out-of- -bounds indices as intermediate values so long as one never uses them to retrieve -a character, which often helps avoid needing to code around edge cases. + * String encodings are [self-synchronizing] – i.e. `isvalid(s,i)` is O(1) + +[self-synchronizing]: https://en.wikipedia.org/wiki/Self-synchronizing_code + +Some string functions that extract code units, characters or substrings from +strings error if you pass them out-of-bounds or invalid string indices. This +includes `codeunit(s, i)`, `s[i]`, and `next(s, i)`. Functions that do string +index arithmetic take a more relaxed approach to indexing and give you the +closest valid string index when in-bounds, or when out-of-bounds, behave as if +there were an infinite number of characters padding each side of the string. +Usually these imaginary padding characters have code unit length `1` but string +types may choose different "imaginary" character sizes as makes sense for their +implementations (e.g. substrings may pass index arithmetic through to the +underlying string they provide a view into). Relaxed indexing functions include +those intended for index arithmetic: `thisind`, `nextind` and `prevind`. 
This +model allows index arithmetic to work with out-of- bounds indices as +intermediate values so long as one never uses them to retrieve a character, +which often helps avoid needing to code around edge cases. See also: [`codeunit`](@ref), [`ncodeunits`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref) """ @@ -75,8 +80,7 @@ I.e. the value returned by `codeunit(s, i)` is of the type returned by See also: [`ncodeunits`](@ref), [`checkbounds`](@ref) """ codeunit(s::AbstractString, i::Integer) = typeof(i) === Int ? - throw(MethodError(codeunit, Tuple{typeof(s),Int})) : - codeunit(s, Int(i)) + throw(MethodError(codeunit, Tuple{typeof(s),Int})) : codeunit(s, Int(i)) """ isvalid(s::AbstractString, i::Integer) -> Bool @@ -113,8 +117,7 @@ Stacktrace: ``` """ isvalid(s::AbstractString, i::Integer) = typeof(i) === Int ? - throw(MethodError(isvalid, Tuple{typeof(s),Int})) : - isvalid(s, Int(i)) + throw(MethodError(isvalid, Tuple{typeof(s),Int})) : isvalid(s, Int(i)) """ next(s::AbstractString, i::Integer) -> Tuple{Char, Int} @@ -128,8 +131,7 @@ a Unicode index error is raised. See also: [`getindex`](@ref), [`start`](@ref), [`done`](@ref), [`checkbounds`](@ref) """ next(s::AbstractString, i::Integer) = typeof(i) === Int ? - throw(MethodError(next, Tuple{typeof(s),Int})) : - next(s, Int(i)) + throw(MethodError(next, Tuple{typeof(s),Int})) : next(s, Int(i)) ## basic generic definitions ## @@ -182,10 +184,12 @@ promote_rule(::Type{<:AbstractString}, ::Type{<:AbstractString}) = String ## string & character concatenation ## """ - *(s::Union{AbstractString, Char}, t::Union{AbstractString, Char}...) -> String + *(s::Union{AbstractString, Char}, t::Union{AbstractString, Char}...) -> AbstractString Concatenate strings and/or characters, producing a [`String`](@ref). This is equivalent -to calling the [`string`](@ref) function on the arguments. +to calling the [`string`](@ref) function on the arguments. 
Concatenation of built-in +string types always produces a value of type `String` but other string types may choose +to return a string of a different type as appropriate. # Examples ```jldoctest @@ -299,9 +303,10 @@ isless(a::Symbol, b::Symbol) = cmp(a, b) < 0 The number of characters in string `s` from indices `lo` through `hi`. This is computed as the number of code unit indices from `lo` to `hi` which are valid character indices. Without only a single string argument, this computes the -number of characters in the entire string. If `lo` or `hi` are out of ranges -each out of range code unit is considered to be one character. This matches the -"loose" indexing model of `thisind`, `nextind` and `prevind`. +number of characters in the entire string. With `lo` and `hi` arguments it computes +the number of indices between `lo` and `hi` inclusive that are valid indices in +the string `s`. Note that the trailing character may include code units past `hi` +and still be counted. See also: [`isvalid`](@ref), [`ncodeunits`](@ref), [`endof`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref) From fcae4238fce26454728943f8c8a6ffa458981954 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Sat, 9 Dec 2017 14:27:39 -0500 Subject: [PATCH 08/22] use stevengj's suggested `hash(Char)` definition --- base/char.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/base/char.jl b/base/char.jl index 6d21af949ebe8..92c52efc2d66a 100644 --- a/base/char.jl +++ b/base/char.jl @@ -81,7 +81,8 @@ in(x::Char, y::Char) = x == y ==(x::Char, y::Char) = reinterpret(UInt32, x) == reinterpret(UInt32, y) isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y) -hash(x::Char, h::UInt) = hash(reinterpret(UInt32, x), hash(Char, h)) +hash(x::Char, h::UInt) = + hash_uint64(((reinterpret(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h)) -(x::Char, y::Char) = Int(x) - Int(y) -(x::Char, y::Integer) = Char(Int32(x) - Int32(y)) From 
f65c90b3a6fd22b705cdcd917ed16b0836467d18 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Sat, 9 Dec 2017 23:17:02 -0500 Subject: [PATCH 09/22] add ncodeunits to stdlib docs --- doc/src/stdlib/strings.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/src/stdlib/strings.md b/doc/src/stdlib/strings.md index 39f2f15a06b3b..98f53c3b27dad 100644 --- a/doc/src/stdlib/strings.md +++ b/doc/src/stdlib/strings.md @@ -13,7 +13,8 @@ Core.String(::AbstractString) Base.SubString Base.transcode Base.unsafe_string -Base.codeunit(::AbstractString, ::Integer) +Base.ncodeunits(::AbstractString) +Base.codeunit Base.ascii Base.@r_str Base.@raw_str From 100d8192b8de4aca344dd54080486ad970315ae4 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Sun, 10 Dec 2017 12:12:05 -0500 Subject: [PATCH 10/22] strings: some formatting tweaks --- base/strings/basic.jl | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/base/strings/basic.jl b/base/strings/basic.jl index e190db531970e..b32b90ff6bd91 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -16,7 +16,7 @@ about strings: * Each `Char` in a string is encoded by one or more code units * Only the index of the first code unit of a `Char` is a valid index * The encoding of a `Char` is independent of what precedes or follows it - * String encodings are [self-synchronizing] – i.e. `isvalid(s,i)` is O(1) + * String encodings are [self-synchronizing] – i.e. `isvalid(s, i)` is O(1) [self-synchronizing]: https://en.wikipedia.org/wiki/Self-synchronizing_code @@ -35,7 +35,8 @@ model allows index arithmetic to work with out-of- bounds indices as intermediate values so long as one never uses them to retrieve a character, which often helps avoid needing to code around edge cases. 
-See also: [`codeunit`](@ref), [`ncodeunits`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref) +See also: [`codeunit`](@ref), [`ncodeunits`](@ref), [`thisind`](@ref), +[`nextind`](@ref), [`prevind`](@ref) """ AbstractString @@ -49,7 +50,8 @@ access this string must satisfy `1 ≤ i ≤ ncodeunits(s)`. Not all such indic are valid – they may not be the start of a character, but they will return a code unit value when calling `codeunit(s,i)`. -See also: [`codeunit`](@ref), [`checkbounds`](@ref), [`sizeof`](@ref), [`length`](@ref), [`endof`](@ref) +See also: [`codeunit`](@ref), [`checkbounds`](@ref), [`sizeof`](@ref), +[`length`](@ref), [`endof`](@ref) """ ncodeunits(s::AbstractString) @@ -94,7 +96,8 @@ In order for `isvalid(s, i)` to be an O(1) function, the encoding of `s` must be [self-synchronizing](https://en.wikipedia.org/wiki/Self-synchronizing_code); this is a basic assumption of Julia's generic string support. -See also: [`getindex`](@ref), [`next`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref), [`length`](@ref) +See also: [`getindex`](@ref), [`next`](@ref), [`thisind`](@ref), +[`nextind`](@ref), [`prevind`](@ref), [`length`](@ref) # Examples @@ -128,7 +131,8 @@ be iterated, yielding a sequences of characters. If `i` is out of bounds in `s` then a bounds error is raised; if `i` is not a valid character index in `s` then a Unicode index error is raised. -See also: [`getindex`](@ref), [`start`](@ref), [`done`](@ref), [`checkbounds`](@ref) +See also: [`getindex`](@ref), [`start`](@ref), [`done`](@ref), +[`checkbounds`](@ref) """ next(s::AbstractString, i::Integer) = typeof(i) === Int ? throw(MethodError(next, Tuple{typeof(s),Int})) : next(s, Int(i)) @@ -308,7 +312,8 @@ the number of indices between `lo` and `hi` inclusive that are valid indices in the string `s`. Note that the trailing character may include code units past `hi` and still be counted. 
-See also: [`isvalid`](@ref), [`ncodeunits`](@ref), [`endof`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref) +See also: [`isvalid`](@ref), [`ncodeunits`](@ref), [`endof`](@ref), +[`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref) # Examples ```jldoctest @@ -561,9 +566,9 @@ last(s::AbstractString, n::Integer) = s[max(1, prevind(s, ncodeunits(s)+1, n)):e """ reverseind(v, i) -Given an index `i` in [`reverse(v)`](@ref), return the corresponding index in `v` so that -`v[reverseind(v,i)] == reverse(v)[i]`. (This can be nontrivial in cases where `v` contains -non-ASCII characters.) +Given an index `i` in [`reverse(v)`](@ref), return the corresponding index in +`v` so that `v[reverseind(v,i)] == reverse(v)[i]`. (This can be nontrivial in +cases where `v` contains non-ASCII characters.) # Examples ```jldoctest @@ -581,7 +586,9 @@ reverseind(s::AbstractString, i::Integer) = thisind(s, ncodeunits(s)-i+1) """ repeat(s::AbstractString, r::Integer) -Repeat a string `r` times. This can equivalently be accomplished by calling [`s^r`](@ref ^). +Repeat a string `r` times. This can be written as `s^r`. + +See also: [`^`](@ref) # Examples ```jldoctest @@ -594,8 +601,9 @@ repeat(s::AbstractString, r::Integer) = repeat(convert(String, s), r) """ ^(s::Union{AbstractString,Char}, n::Integer) -Repeat a string or character `n` times. -The [`repeat`](@ref) function is an alias to this operator. +Repeat a string or character `n` times. This can also be written as `repeat(s, n)`. 
+ +See also: [`repeat`](@ref) # Examples ```jldoctest From 166924cea254ab1f0ce701e328fef01cfeb35bc9 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Sun, 10 Dec 2017 13:04:25 -0500 Subject: [PATCH 11/22] delete bswap(::Char) --- base/char.jl | 2 -- test/char.jl | 11 ++++------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/base/char.jl b/base/char.jl index 92c52efc2d66a..76f87409a656d 100644 --- a/base/char.jl +++ b/base/char.jl @@ -89,8 +89,6 @@ hash(x::Char, h::UInt) = +(x::Char, y::Integer) = Char(Int32(x) + Int32(y)) +(x::Integer, y::Char) = y + x -bswap(x::Char) = Char(bswap(UInt32(x))) - print(io::IO, c::Char) = (write(io, c); nothing) const hex_chars = UInt8['0':'9';'a':'z'] diff --git a/test/char.jl b/test/char.jl index 85b2acf5385ef..b6548183891ff 100644 --- a/test/char.jl +++ b/test/char.jl @@ -5,13 +5,10 @@ @test typemin(Char) == Char(0) @test ndims(Char) == 0 @test getindex('a', 1) == 'a' -@test_throws BoundsError getindex('a',2) -# This is current behavior, but it seems incorrect -@test getindex('a',1,1,1) == 'a' -@test_throws BoundsError getindex('a',1,1,2) -# bswap of a Char should be removed, only the underlying codeunit (UInt32) -# should be swapped -@test bswap('\U10200') == '\U20100' +@test_throws BoundsError getindex('a', 2) +# This is current behavior, but it seems questionable +@test getindex('a', 1, 1, 1) == 'a' +@test_throws BoundsError getindex('a', 1, 1, 2) @test 'b' + 1 == 'c' @test typeof('b' + 1) == Char From dcf9552ace3331cbd5426f91a5c84c8e810f9a91 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Sun, 10 Dec 2017 15:07:37 -0500 Subject: [PATCH 12/22] deprecate chr2ind and ind2chr --- base/deprecated.jl | 4 ++++ base/exports.jl | 2 -- base/strings/basic.jl | 42 ----------------------------------- base/strings/util.jl | 22 +++++------------- doc/src/manual/strings.md | 7 +++--- doc/src/stdlib/strings.md | 2 -- stdlib/Dates/src/parse.jl | 2 +- stdlib/Profile/src/Profile.jl | 4 ++-- test/lineedit.jl | 4 ++-- 
test/strings/basic.jl | 20 ++++++++--------- test/strings/types.jl | 19 ++++++++-------- 11 files changed, 39 insertions(+), 89 deletions(-) diff --git a/base/deprecated.jl b/base/deprecated.jl index 2980f3ff14155..fe15713c47fac 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -2992,6 +2992,10 @@ end @deprecate_binding Complex64 ComplexF32 @deprecate_binding Complex128 ComplexF64 +# PR #24999 +@deprecate ind2chr(s::AbstractString, i::Integer) length(s, 1, i) +@deprecate chr2ind(s::AbstractString, n::Integer) nextind(s, 0, n) + # END 0.7 deprecations # BEGIN 1.0 deprecations diff --git a/base/exports.jl b/base/exports.jl index b884a3641f7f3..9f8a56bd2b9a5 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -716,7 +716,6 @@ export bytes2hex, chomp, chop, - chr2ind, codeunit, dec, digits, @@ -728,7 +727,6 @@ export hex, hex2bytes, hex2bytes!, - ind2chr, info, ismatch, isvalid, diff --git a/base/strings/basic.jl b/base/strings/basic.jl index b32b90ff6bd91..4f8a09dbd26a9 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -446,48 +446,6 @@ function nextind(s::AbstractString, i::Integer, n::Integer=1) return i + n end -""" - ind2chr(s::AbstractString, i::Integer) - -Convert a byte index `i` to a character index with -respect to string `s`. - -See also [`chr2ind`](@ref). - -# Examples -```jldoctest -julia> str = "αβγdef"; - -julia> ind2chr(str, 3) -2 - -julia> chr2ind(str, 2) -3 -``` -""" -ind2chr(s::AbstractString, i::Integer) = length(s, 1, i) - -""" - chr2ind(s::AbstractString, i::Integer) - -Convert a character index `i` to a byte index. - -See also [`ind2chr`](@ref). - -# Examples -```jldoctest -julia> str = "αβγdef"; - -julia> chr2ind(str, 2) -3 - -julia> ind2chr(str, 3) -2 -``` -""" -chr2ind(s::AbstractString, n::Integer) = - n < 0 ? 
prevind(s, 0, -n) : nextind(s, 0, n) - ## string index iteration type ## struct EachStringIndex{T<:AbstractString} diff --git a/base/strings/util.jl b/base/strings/util.jl index da299d538a55f..d92969de12ee1 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -206,28 +206,18 @@ strip(s::AbstractString, chars::Chars) = lstrip(rstrip(s, chars), chars) function lpad(s::AbstractString, n::Integer, p::AbstractString=" ") m = n - Unicode.textwidth(s) - (m <= 0) && (return s) + m ≤ 0 && return s l = Unicode.textwidth(p) - if l==1 - return string(p^m, s) - end - q = div(m,l) - r = m - q*l - i = r != 0 ? chr2ind(p, r) : -1 - string(p^q, p[1:i], s) + q, r = divrem(m, l) + string(p^q, first(p, r), s) end function rpad(s::AbstractString, n::Integer, p::AbstractString=" ") m = n - Unicode.textwidth(s) - (m <= 0) && (return s) + m ≤ 0 && return s l = Unicode.textwidth(p) - if l==1 - return string(s, p^m) - end - q = div(m,l) - r = m - q*l - i = r != 0 ? chr2ind(p, r) : -1 - string(s, p^q, p[1:i]) + q, r = divrem(m, l) + string(s, p^q, first(p, r)) end """ diff --git a/doc/src/manual/strings.md b/doc/src/manual/strings.md index 9feb04345114c..3178c638a84ca 100644 --- a/doc/src/manual/strings.md +++ b/doc/src/manual/strings.md @@ -565,14 +565,15 @@ Some other useful functions include: * [`endof(str)`](@ref) gives the maximal (byte) index that can be used to index into `str`. * [`length(str)`](@ref) the number of characters in `str`. + * [`length(str, i, j)`](@ref) the number of valid character indices in `str` from `i` to `j`. * [`i = start(str)`](@ref start) gives the first valid index at which a character can be found in `str` (typically 1). * [`c, j = next(str,i)`](@ref next) returns next character at or after the index `i` and the next valid character index following that. With [`start`](@ref) and [`endof`](@ref), can be used to iterate through the characters in `str`. 
- * [`ind2chr(str,i)`](@ref) gives the number of characters in `str` up to and including any at index - `i`. - * [`chr2ind(str,j)`](@ref) gives the index at which the `j`th character in `str` occurs. + * [`thisind(str, i)`](@ref) given an arbitrary index into a string find the first index of the character into which the index points. + * [`nextind(str, i, n=1)`](@ref) find the start of the `n`th character starting after index `i`. + * [`prevind(str, i, n=1)`](@ref) find the start of the `n`th character starting before index `i`. ## [Non-Standard String Literals](@id non-standard-string-literals) diff --git a/doc/src/stdlib/strings.md b/doc/src/stdlib/strings.md index 98f53c3b27dad..0426b5ba3f6de 100644 --- a/doc/src/stdlib/strings.md +++ b/doc/src/stdlib/strings.md @@ -51,8 +51,6 @@ Base.last(::AbstractString, ::Integer) Base.join Base.chop Base.chomp -Base.ind2chr -Base.chr2ind Base.thisind Base.nextind Base.prevind diff --git a/stdlib/Dates/src/parse.jl b/stdlib/Dates/src/parse.jl index 4f4cb0d7891ea..35829f00e7b5a 100644 --- a/stdlib/Dates/src/parse.jl +++ b/stdlib/Dates/src/parse.jl @@ -181,7 +181,7 @@ end @inline function tryparsenext_word(str::AbstractString, i, len, locale, maxchars=0) word_start, word_end = i, 0 - max_pos = maxchars <= 0 ? len : min(chr2ind(str, ind2chr(str,i) + maxchars - 1), len) + max_pos = maxchars <= 0 ? 
len : min(len, nextind(str, i, maxchars-1)) @inbounds while i <= max_pos c, ii = next(str, i) if Base.Unicode.isalpha(c) diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index 7986e910de0a7..299439b1c1d76 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -645,14 +645,14 @@ function rtruncto(str::String, w::Int) if length(str) <= w return str else - return string("...", str[chr2ind(str, length(str)-w+4):end]) + return string("...", str[prevind(str, end, w-4):end]) end end function ltruncto(str::String, w::Int) if length(str) <= w return str else - return string(str[1:chr2ind(str,w-4)], "...") + return string(str[1:nextind(str, 1, w-4)], "...") end end diff --git a/test/lineedit.jl b/test/lineedit.jl index 571ed8a75d36b..61e1d3bd42562 100644 --- a/test/lineedit.jl +++ b/test/lineedit.jl @@ -16,8 +16,8 @@ function new_state() LineEdit.init_state(term, ModalInterface([Prompt("test> ")])) end -charseek(buf, i) = seek(buf, chr2ind(content(buf), i+1)-1) -charpos(buf, pos=position(buf)) = ind2chr(content(buf), pos+1)-1 +charseek(buf, i) = seek(buf, nextind(content(buf), 0, i+1)-1) +charpos(buf, pos=position(buf)) = length(content(buf), 1, pos+1)-1 function transform!(f, s, i = -1) # i is char-based (not bytes) buffer position buf = buffer(s) diff --git a/test/strings/basic.jl b/test/strings/basic.jl index f6165cacaef27..27323005e7f31 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -99,14 +99,14 @@ end end @testset "issue #7248" begin - @test ind2chr("hello", -1) == -1 - @test chr2ind("hello", -1) == -1 - @test ind2chr("hellø", -1) == -1 - @test chr2ind("hellø", -1) == -1 - @test ind2chr("hello", 10) == 10 - @test chr2ind("hello", 10) == 10 - @test ind2chr("hellø", 10) == 9 - @test chr2ind("hellø", 10) == 11 + @test length("hello", 1, -1) == -1 + @test prevind("hello", 0, 1) == -1 + @test length("hellø", 1, -1) == -1 + @test prevind("hellø", 0, 1) == -1 + @test length("hello", 1, 10) == 10 + @test 
nextind("hello", 0, 10) == 10 + @test length("hellø", 1, 10) == 9 + @test nextind("hellø", 0, 10) == 11 @test_throws BoundsError checkbounds("hello", 0) @test_throws BoundsError checkbounds("hello", 6) @test_throws BoundsError checkbounds("hello", 0:3) @@ -221,7 +221,7 @@ end @test done(eachindex("foobar"),7) @test eltype(Base.EachStringIndex) == Int @test map(Base.Unicode.uppercase, "foó") == "FOÓ" - @test chr2ind("fóobar", 3) == 4 + @test nextind("fóobar", 0, 3) == 4 @test Symbol(gstr) == Symbol("12") @@ -233,7 +233,7 @@ end @test nextind(1:1, 1) == 2 @test nextind([1], 1) == 2 - @test ind2chr(gstr, 2) == 2 + @test length(gstr, 1, 2) == 2 # tests promote_rule let svec = [s"12", GenericString("12"), SubString("123", 1, 2)] diff --git a/test/strings/types.jl b/test/strings/types.jl index f89907c6f5b88..5d4d492dd0fea 100644 --- a/test/strings/types.jl +++ b/test/strings/types.jl @@ -157,7 +157,7 @@ end @test !ismatch(Regex("aa"), SubString("",1,0)) @test ismatch(Regex(""), SubString("",1,0)) -# isvalid(), chr2ind() and ind2chr() for SubString{String} +# isvalid(), formerly length() and nextind() for SubString{String} let s = "lorem ipsum", sdict = Dict( SubString(s, 1, 11) => "lorem ipsum", SubString(s, 1, 6) => "lorem ", @@ -176,14 +176,15 @@ let s = "lorem ipsum", sdict = Dict( @test_throws BoundsError isvalid(s, i) end end - for i in 1:ncodeunits(ss) - @test ind2chr(ss, i) == ind2chr(s, i) + for i in 1:ncodeunits(ss), j = i-1:ncodeunits(ss) + @test length(ss, i, j) == length(s, i, j) end end for (ss, s) in sdict @test length(ss) == length(s) - for i in 1:length(ss) - @test chr2ind(ss, i) == chr2ind(s, i) + for i in 0:length(ss)+1, j = 0:length(ss)+1 + @test nextind(ss, i, j) == nextind(s, i, j) + @test prevind(ss, i, j) == prevind(s, i, j) end end end @@ -209,10 +210,10 @@ let s = "Σx + βz - 2" end let ss = SubString("hello", 1, 5) - @test ind2chr(ss, -1) == -1 - @test chr2ind(ss, -1) == -1 - @test chr2ind(ss, 10) == 10 - @test ind2chr(ss, 10) == 10 + @test 
length(ss, 1, -1) == -1 + @test length(ss, 1, 10) == 10 + @test prevind(ss, 0, 1) == -1 + @test nextind(ss, 0, 10) == 10 end # length(SubString{String}) performance specialization From 61d5003957c6d713300d8846fc1e3223f61a1221 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Sun, 10 Dec 2017 22:11:37 -0500 Subject: [PATCH 13/22] move string search functions into strings/search.jl --- base/strings/search.jl | 68 ++++++++++++++++++++++++++++++++++++++++++ base/strings/string.jl | 68 ------------------------------------------ 2 files changed, 68 insertions(+), 68 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index 43e880a26b9e5..d2731d1e9d022 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -1,5 +1,73 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license +function search(s::String, c::Char, i::Integer = 1) + if i < 1 || i > sizeof(s) + i == sizeof(s) + 1 && return 0 + throw(BoundsError(s, i)) + end + @inbounds if is_valid_continuation(codeunit(s,i)) + string_index_err(s, i) + end + c ≤ '\x7f' && return search(s, c % UInt8, i) + while true + i = search(s, first_utf8_byte(c), i) + (i == 0 || s[i] == c) && return i + i = next(s, i)[2] + end +end + +function search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1) + if i < 1 + throw(BoundsError(a, i)) + end + n = sizeof(a) + if i > n + return i == n+1 ? 0 : throw(BoundsError(a, i)) + end + p = pointer(a) + q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1) + q == C_NULL ? 
0 : Int(q-p+1) +end + +function search(a::ByteArray, b::Char, i::Integer = 1) + if Unicode.isascii(b) + search(a,UInt8(b),i) + else + search(a,Vector{UInt8}(string(b)),i).start + end +end + +function rsearch(s::String, c::Char, i::Integer = sizeof(s)) + c ≤ '\x7f' && return rsearch(s, c % UInt8, i) + b = first_utf8_byte(c) + while true + i = rsearch(s, b, i) + (i == 0 || s[i] == c) && return i + i = prevind(s, i) + end +end + +function rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = sizeof(s)) + if i < 1 + return i == 0 ? 0 : throw(BoundsError(a, i)) + end + n = sizeof(a) + if i > n + return i == n+1 ? 0 : throw(BoundsError(a, i)) + end + p = pointer(a) + q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i) + q == C_NULL ? 0 : Int(q-p+1) +end + +function rsearch(a::ByteArray, b::Char, i::Integer = length(a)) + if Unicode.isascii(b) + rsearch(a,UInt8(b),i) + else + rsearch(a,Vector{UInt8}(string(b)),i).start + end +end + const Chars = Union{Char,Tuple{Vararg{Char}},AbstractVector{Char},Set{Char}} """ diff --git a/base/strings/string.jl b/base/strings/string.jl index 47dc0326914cb..1e0aafa803c5c 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -283,74 +283,6 @@ function isvalid(s::String, i::Int) end isvalid(s::String, i::Integer) = isvalid(s, Int(i)) -function search(s::String, c::Char, i::Integer = 1) - if i < 1 || i > sizeof(s) - i == sizeof(s) + 1 && return 0 - throw(BoundsError(s, i)) - end - @inbounds if is_valid_continuation(codeunit(s,i)) - string_index_err(s, i) - end - c ≤ '\x7f' && return search(s, c % UInt8, i) - while true - i = search(s, first_utf8_byte(c), i) - (i == 0 || s[i] == c) && return i - i = next(s, i)[2] - end -end - -function search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1) - if i < 1 - throw(BoundsError(a, i)) - end - n = sizeof(a) - if i > n - return i == n+1 ? 
0 : throw(BoundsError(a, i)) - end - p = pointer(a) - q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1) - q == C_NULL ? 0 : Int(q-p+1) -end - -function search(a::ByteArray, b::Char, i::Integer = 1) - if Unicode.isascii(b) - search(a,UInt8(b),i) - else - search(a,Vector{UInt8}(string(b)),i).start - end -end - -function rsearch(s::String, c::Char, i::Integer = sizeof(s)) - c ≤ '\x7f' && return rsearch(s, c % UInt8, i) - b = first_utf8_byte(c) - while true - i = rsearch(s, b, i) - (i == 0 || s[i] == c) && return i - i = prevind(s, i) - end -end - -function rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = sizeof(s)) - if i < 1 - return i == 0 ? 0 : throw(BoundsError(a, i)) - end - n = sizeof(a) - if i > n - return i == n+1 ? 0 : throw(BoundsError(a, i)) - end - p = pointer(a) - q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i) - q == C_NULL ? 0 : Int(q-p+1) -end - -function rsearch(a::ByteArray, b::Char, i::Integer = length(a)) - if Unicode.isascii(b) - rsearch(a,UInt8(b),i) - else - rsearch(a,Vector{UInt8}(string(b)),i).start - end -end - ## optimized concatenation, reverse, repeat ## function string(a::String...) 
From c713dffdaaa29bce1327cebe09a6a2a2714a5054 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Sun, 10 Dec 2017 23:58:28 -0500 Subject: [PATCH 14/22] optimize the length(::String) method better --- base/strings/string.jl | 60 ++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 1e0aafa803c5c..44e7d86f25eb4 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -184,9 +184,8 @@ end return reinterpret(Char, u), i end -function getindex(s::String, i::Int) - @boundscheck checkbounds(s, i) - @inbounds b = codeunit(s, i) +@propagate_inbounds function getindex(s::String, i::Int) + b = codeunit(s, i) u = UInt32(b) << 24 (b < 0x80) | (0xf8 ≤ b) && return reinterpret(Char, u) return getindex_continued(s, i, u) @@ -241,35 +240,38 @@ function length(s::String, lo::Int, hi::Int) z = ncodeunits(s) i = Int(max(1, min(z, lo))) n = Int(min(z, max(1, hi))) - c = i - n - if i ≤ n - i, j = thisind(s, i), i - c -= i < j - i -= 1 + c = hi - lo + 1 + i < n || return c + @inbounds i, j = thisind(s, i), i + c -= i < j + _length(s, i, n, c) +end + +length(s::String) = _length(s, 1, ncodeunits(s), ncodeunits(s)) + +function _length(s::String, i::Int, n::Int, c::Int) + i < n || return c + @inbounds b = codeunit(s, i) + @inbounds while true while true - (i += 1) ≤ n || break - @inbounds b = codeunit(s, i) # lead byte - @label L - c += 1 - (0xc0 ≤ b) & (b < 0xf8) || continue - l = b - - (i += 1) ≤ n || break - @inbounds b = codeunit(s, i) # cont byte 1 - b & 0xc0 == 0x80 || @goto L - l ≥ 0xe0 || continue - - (i += 1) ≤ n || break - @inbounds b = codeunit(s, i) # cont byte 2 - b & 0xc0 == 0x80 || @goto L - l ≥ 0xf0 || continue - - (i += 1) ≤ n || break - @inbounds b = codeunit(s, i) # cont byte 3 - b & 0xc0 == 0x80 || @goto L + (i += 1) ≤ n || return c + between(b, 0xc0, 0xf7) && break + b = codeunit(s, i) end + l = b + b = codeunit(s, i) # cont byte 1 + c -= (x = b & 0xc0 
== 0x80) + x & (l ≥ 0xe0) || continue + + (i += 1) ≤ n || return c + b = codeunit(s, i) # cont byte 2 + c -= (x = b & 0xc0 == 0x80) + x & (l ≥ 0xf0) || continue + + (i += 1) ≤ n || return c + b = codeunit(s, i) # cont byte 3 + c -= (b & 0xc0 == 0x80) end - return c + hi - lo end # TODO: delete or move to char.jl From b8cd96e03248f828b2169708043a6a1ece5a4c4f Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Mon, 11 Dec 2017 00:34:50 -0500 Subject: [PATCH 15/22] =?UTF-8?q?make=20length(string,=20i,=20j)=20?= =?UTF-8?q?=E2=89=A5=C2=A00=20for=20all=20i,=20j?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- base/strings/basic.jl | 1 + base/strings/string.jl | 11 +++++++---- test/strings/basic.jl | 4 ++-- test/strings/types.jl | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/base/strings/basic.jl b/base/strings/basic.jl index 4f8a09dbd26a9..18a87c9226e8f 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -322,6 +322,7 @@ julia> length("jμΛIα") ``` """ function length(s::AbstractString, lo::Integer=1, hi::Integer=ncodeunits(s)) + lo ≤ hi || return 0 z = ncodeunits(s) a = Int(max(1, min(z, lo))) b = Int(min(z, max(1, hi))) diff --git a/base/strings/string.jl b/base/strings/string.jl index 44e7d86f25eb4..638ba8dfae73a 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -237,10 +237,13 @@ function getindex(s::String, r::UnitRange{Int}) end function length(s::String, lo::Int, hi::Int) - z = ncodeunits(s) - i = Int(max(1, min(z, lo))) - n = Int(min(z, max(1, hi))) - c = hi - lo + 1 + i, n = lo, hi + c = max(0, hi - lo + 1) + @boundscheck begin + z = ncodeunits(s) + i = Int(max(1, min(z, lo))) + n = Int(min(z, max(1, hi))) + end i < n || return c @inbounds i, j = thisind(s, i), i c -= i < j diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 27323005e7f31..49df4fe48252d 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -99,9 +99,9 @@ end end 
@testset "issue #7248" begin - @test length("hello", 1, -1) == -1 + @test length("hello", 1, -1) == 0 @test prevind("hello", 0, 1) == -1 - @test length("hellø", 1, -1) == -1 + @test length("hellø", 1, -1) == 0 @test prevind("hellø", 0, 1) == -1 @test length("hello", 1, 10) == 10 @test nextind("hello", 0, 10) == 10 diff --git a/test/strings/types.jl b/test/strings/types.jl index 5d4d492dd0fea..452da4cbc2ed6 100644 --- a/test/strings/types.jl +++ b/test/strings/types.jl @@ -210,7 +210,7 @@ let s = "Σx + βz - 2" end let ss = SubString("hello", 1, 5) - @test length(ss, 1, -1) == -1 + @test length(ss, 1, -1) == 0 @test length(ss, 1, 10) == 10 @test prevind(ss, 0, 1) == -1 @test nextind(ss, 0, 10) == 10 From f9e1acb4007b7896440b05538ce985c2d64747d3 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Mon, 11 Dec 2017 12:20:40 -0500 Subject: [PATCH 16/22] moar Unicode operators --- base/strings/string.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 638ba8dfae73a..fdbf1b37ade87 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -124,13 +124,13 @@ function nextind(s::String, i::Int) end # first continuation byte @inbounds b = codeunit(s, i += 1) - (b & 0xc0 != 0x80) | ((i += 1) > n) | (l < 0xe0) && return i + (b & 0xc0 ≠ 0x80) | ((i += 1) > n) | (l < 0xe0) && return i # second continuation byte @inbounds b = codeunit(s, i) - (b & 0xc0 != 0x80) | ((i += 1) > n) | (l < 0xf0) && return i + (b & 0xc0 ≠ 0x80) | ((i += 1) > n) | (l < 0xf0) && return i # third continuation byte @inbounds b = codeunit(s, i) - ifelse(b & 0xc0 != 0x80, i, i+1) + ifelse(b & 0xc0 ≠ 0x80, i, i+1) end ## checking UTF-8 & ACSII validity ## @@ -143,7 +143,7 @@ byte_string_classify(s::String) = # 1: valid ASCII # 2: valid UTF-8 -isvalid(::Type{String}, s::Union{Vector{UInt8},String}) = byte_string_classify(s) != 0 +isvalid(::Type{String}, s::Union{Vector{UInt8},String}) = byte_string_classify(s) ≠ 0 
isvalid(s::String) = isvalid(String, s) is_valid_continuation(c) = c & 0xc0 == 0x80 From a7face95824c65eceeb73f775833c4cc333ab526 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Tue, 12 Dec 2017 00:06:48 -0500 Subject: [PATCH 17/22] change some code point conversions to char comparisons --- base/strings/unicode.jl | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index 64b5f6d5611ed..5c28f51189560 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -43,11 +43,10 @@ true """ isvalid(T,value) -isvalid(::Type{Char}, ch::Unsigned) = !((ch - 0xd800 < 0x800) | (ch > 0x10ffff)) -isvalid(::Type{Char}, ch::Integer) = isvalid(Char, Unsigned(ch)) -isvalid(::Type{Char}, ch::Char) = isvalid(Char, UInt32(ch)) - -isvalid(ch::Char) = isvalid(Char, ch) +isvalid(c::Char) = !ismalformed(c) & ((c ≤ '\ud7ff') | ('\ue000' ≤ c) & (c ≤ '\U10ffff')) +isvalid(::Type{Char}, c::Unsigned) = ((c ≤ 0xd7ff ) | ( 0xe000 ≤ c) & (c ≤ 0x10ffff )) +isvalid(::Type{Char}, c::Integer) = isvalid(Char, Unsigned(c)) +isvalid(::Type{Char}, c::Char) = isvalid(c) # utf8 category constants const UTF8PROC_CATEGORY_CN = 0 @@ -301,15 +300,15 @@ titlecase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? 
c - 0x20 : c) : # returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category function category_code(c::Char) ismalformed(c) && return Cint(31) - (u = UInt32(c)) ≤ 0x10ffff || return Cint(30) - ccall(:utf8proc_category, Cint, (UInt32,), u) + c ≤ '\U10ffff' || return Cint(30) + ccall(:utf8proc_category, Cint, (UInt32,), c) end # more human-readable representations of the category code function category_abbrev(c) ismalformed(c) && return "Ma" - (u = UInt32(c)) ≤ 0x10ffff || return "In" - unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), u)) + c ≤ '\U10ffff' || return "In" + unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), c)) end category_string(c) = category_strings[category_code(c)+1] From 1f0c6fa35ab64ad66a5bb413fad474e2c722c290 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Mon, 11 Dec 2017 12:23:47 -0500 Subject: [PATCH 18/22] various String performance tweaks - {next,getindex}_continued don't need to re-check bounds - short-circuiting is slightly faster in length --- base/strings/string.jl | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index fdbf1b37ade87..91497d5c68c81 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -150,12 +150,10 @@ is_valid_continuation(c) = c & 0xc0 == 0x80 ## required core functionality ## -function next(s::String, i::Int) - @boundscheck checkbounds(s, i) - @inbounds b = codeunit(s, i) - # TODO: check index validity +@propagate_inbounds function next(s::String, i::Int) + b = codeunit(s, i) u = UInt32(b) << 24 - (b < 0x80) | (0xf8 ≤ b) && return reinterpret(Char, u), i+1 + between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1 return next_continued(s, i, u) end @@ -187,29 +185,30 @@ end @propagate_inbounds function getindex(s::String, i::Int) b = codeunit(s, i) u = UInt32(b) << 24 - (b < 0x80) | (0xf8 ≤ b) && return reinterpret(Char, u) + between(b, 0x80, 0xf7) || return 
reinterpret(Char, u) return getindex_continued(s, i, u) end -@noinline function getindex_continued(s::String, i::Int, u::UInt32) +function getindex_continued(s::String, i::Int, u::UInt32) if u < 0xc0000000 - isvalid(s, i) && @goto ret + # called from `getindex` which checks bounds + @inbounds isvalid(s, i) && @goto ret string_index_err(s, i) end n = ncodeunits(s) - # first continuation byte + (i += 1) > n && @goto ret - @inbounds b = codeunit(s, i) + @inbounds b = codeunit(s, i) # cont byte 1 b & 0xc0 == 0x80 || @goto ret u |= UInt32(b) << 16 - # second continuation byte + ((i += 1) > n) | (u < 0xe0000000) && @goto ret - @inbounds b = codeunit(s, i) + @inbounds b = codeunit(s, i) # cont byte 2 b & 0xc0 == 0x80 || @goto ret u |= UInt32(b) << 8 - # third continuation byte + ((i += 1) > n) | (u < 0xf0000000) && @goto ret - @inbounds b = codeunit(s, i) + @inbounds b = codeunit(s, i) # cont byte 3 b & 0xc0 == 0x80 || @goto ret u |= UInt32(b) @label ret @@ -252,13 +251,13 @@ end length(s::String) = _length(s, 1, ncodeunits(s), ncodeunits(s)) -function _length(s::String, i::Int, n::Int, c::Int) +@inline function _length(s::String, i::Int, n::Int, c::Int) i < n || return c @inbounds b = codeunit(s, i) @inbounds while true while true (i += 1) ≤ n || return c - between(b, 0xc0, 0xf7) && break + 0xc0 ≤ b ≤ 0xf7 && break b = codeunit(s, i) end l = b From 6f10ca2184c74de8d4e1380c4d9e6221feea5a34 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Tue, 12 Dec 2017 00:28:04 -0500 Subject: [PATCH 19/22] add various `@propagate_inbounds` annotations also: avoid second bounds check in `get` with default --- base/strings/basic.jl | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/base/strings/basic.jl b/base/strings/basic.jl index 18a87c9226e8f..667d884bb10d9 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -81,7 +81,7 @@ I.e. 
the value returned by `codeunit(s, i)` is of the type returned by See also: [`ncodeunits`](@ref), [`checkbounds`](@ref) """ -codeunit(s::AbstractString, i::Integer) = typeof(i) === Int ? +@propagate_inbounds codeunit(s::AbstractString, i::Integer) = typeof(i) === Int ? throw(MethodError(codeunit, Tuple{typeof(s),Int})) : codeunit(s, Int(i)) """ @@ -119,7 +119,7 @@ Stacktrace: [...] ``` """ -isvalid(s::AbstractString, i::Integer) = typeof(i) === Int ? +@propagate_inbounds isvalid(s::AbstractString, i::Integer) = typeof(i) === Int ? throw(MethodError(isvalid, Tuple{typeof(s),Int})) : isvalid(s, Int(i)) """ @@ -134,7 +134,7 @@ a Unicode index error is raised. See also: [`getindex`](@ref), [`start`](@ref), [`done`](@ref), [`checkbounds`](@ref) """ -next(s::AbstractString, i::Integer) = typeof(i) === Int ? +@propagate_inbounds next(s::AbstractString, i::Integer) = typeof(i) === Int ? throw(MethodError(next, Tuple{typeof(s),Int})) : next(s, Int(i)) ## basic generic definitions ## @@ -148,13 +148,21 @@ endof(s::AbstractString) = thisind(s, ncodeunits(s)) getindex(s::AbstractString, i::Integer) = next(s, i)[1] getindex(s::AbstractString, i::Colon) = s # TODO: handle other ranges with stride ±1 specially? +# TODO: add more @propagate_inbounds annotations? getindex(s::AbstractString, r::UnitRange{<:Integer}) = SubString(s, r) getindex(s::AbstractString, v::AbstractVector{<:Integer}) = sprint(length(v), io->(for i in v; write(io, s[i]) end)) getindex(s::AbstractString, v::AbstractVector{Bool}) = throw(ArgumentError("logical indexing not supported for strings")) -get(s::AbstractString, i::Integer, default) = checkbounds(Bool, s, i) ? 
s[i] : default +function get(s::AbstractString, i::Integer, default) +# TODO: use ternary once @inbounds is expression-like + if checkbounds(Bool, s, i) + @inbounds return s[i] + else + return default + end +end ## bounds checking ## From 937c3ada77398f45c8a425b5b62265bef7f4db8f Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Tue, 12 Dec 2017 02:59:32 -0500 Subject: [PATCH 20/22] isvalid: return false out of bounds instead of throwing also: `next` can assume that incoming indices are valid --- base/strings/basic.jl | 25 ++++++++++++++----------- base/strings/search.jl | 15 +++++---------- base/strings/string.jl | 13 +++---------- base/strings/substring.jl | 5 +++-- test/strings/types.jl | 7 +------ 5 files changed, 26 insertions(+), 39 deletions(-) diff --git a/base/strings/basic.jl b/base/strings/basic.jl index 667d884bb10d9..a9dcb14f54cde 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -87,14 +87,13 @@ See also: [`ncodeunits`](@ref), [`checkbounds`](@ref) """ isvalid(s::AbstractString, i::Integer) -> Bool -Predicate indicating whether the given index is the start of the encoding of -a character in `s` or not. If `isvalid(s, i)` is true then `s[i]` will return -the character whose encoding starts at that index, if it's false, then `s[i]` -will raise an invalid index error. Behavior of `next(s, i)` is similar except -that the character is returned along with the index of the following character. -In order for `isvalid(s, i)` to be an O(1) function, the encoding of `s` must -be [self-synchronizing](https://en.wikipedia.org/wiki/Self-synchronizing_code); -this is a basic assumption of Julia's generic string support. +Predicate indicating whether the given index is the start of the encoding of a +character in `s` or not. If `isvalid(s, i)` is true then `s[i]` will return the +character whose encoding starts at that index, if it's false, then `s[i]` will +raise an invalid index error or a bounds error depending on if `i` is in bounds. 
+In order for `isvalid(s, i)` to be an O(1) function, the encoding of `s` must be +[self-synchronizing](https://en.wikipedia.org/wiki/Self-synchronizing_code); this +is a basic assumption of Julia's generic string support. See also: [`getindex`](@ref), [`next`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref), [`length`](@ref) @@ -128,8 +127,8 @@ Stacktrace: Return a tuple of the character in `s` at index `i` with the index of the start of the following character in `s`. This is the key method that allows strings to be iterated, yielding a sequences of characters. If `i` is out of bounds in `s` -then a bounds error is raised; if `i` is not a valid character index in `s` then -a Unicode index error is raised. +then a bounds error is raised. The `next` function, as part of the iteration +protocol, may assume that `i` is the start of a character in `s`. See also: [`getindex`](@ref), [`start`](@ref), [`done`](@ref), [`checkbounds`](@ref) @@ -145,7 +144,11 @@ eltype(::Type{<:AbstractString}) = Char sizeof(s::AbstractString) = ncodeunits(s) * sizeof(codeunit(s)) endof(s::AbstractString) = thisind(s, ncodeunits(s)) -getindex(s::AbstractString, i::Integer) = next(s, i)[1] +function getindex(s::AbstractString, i::Integer) + @boundscheck checkbounds(s, i) + @inbounds return isvalid(s, i) ? next(s, i)[1] : string_index_err(s, i) +end + getindex(s::AbstractString, i::Colon) = s # TODO: handle other ranges with stride ±1 specially? # TODO: add more @propagate_inbounds annotations? 
diff --git a/base/strings/search.jl b/base/strings/search.jl index d2731d1e9d022..5b0fa167a783f 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -5,9 +5,7 @@ function search(s::String, c::Char, i::Integer = 1) i == sizeof(s) + 1 && return 0 throw(BoundsError(s, i)) end - @inbounds if is_valid_continuation(codeunit(s,i)) - string_index_err(s, i) - end + @inbounds isvalid(s, i) || string_index_err(s, i) c ≤ '\x7f' && return search(s, c % UInt8, i) while true i = search(s, first_utf8_byte(c), i) @@ -94,13 +92,10 @@ julia> search("JuliaLang","Julia") ``` """ function search(s::AbstractString, c::Chars, i::Integer) - if isempty(c) - return 1 <= i <= nextind(s,endof(s)) ? i : - throw(BoundsError(s, i)) - end - if i < 1 || i > nextind(s,endof(s)) - throw(BoundsError(s, i)) - end + z = ncodeunits(s) + 1 + isempty(c) && return 1 ≤ i ≤ z ? i : throw(BoundsError(s, i)) + 1 ≤ i ≤ z || throw(BoundsError(s, i)) + @inbounds i == z || isvalid(s, i) || string_index_err(s, i) while !done(s,i) d, j = next(s,i) if d in c diff --git a/base/strings/string.jl b/base/strings/string.jl index 91497d5c68c81..b1d365bb35b17 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -157,11 +157,8 @@ is_valid_continuation(c) = c & 0xc0 == 0x80 return next_continued(s, i, u) end -@noinline function next_continued(s::String, i::Int, u::UInt32) - if u < 0xc0000000 - isvalid(s, i) && (i += 1; @goto ret) - string_index_err(s, i) - end +function next_continued(s::String, i::Int, u::UInt32) + u < 0xc0000000 && (i += 1; @goto ret) n = ncodeunits(s) # first continuation byte (i += 1) > n && @goto ret @@ -281,11 +278,7 @@ first_utf8_byte(c::Char) = (reinterpret(UInt32, c) >> 24) % UInt8 ## overload methods for efficiency ## -function isvalid(s::String, i::Int) - @boundscheck checkbounds(s, i) - return thisind(s, i) == i -end -isvalid(s::String, i::Integer) = isvalid(s, Int(i)) +isvalid(s::String, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i ## optimized 
concatenation, reverse, repeat ## diff --git a/base/strings/substring.jl b/base/strings/substring.jl index 4d33f89754d1c..e389de6518b49 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -73,8 +73,9 @@ function getindex(s::SubString, i::Integer) end function isvalid(s::SubString, i::Integer) - @boundscheck checkbounds(s, i) - @inbounds return isvalid(s.string, s.offset + i) + ib = true + @boundscheck ib = checkbounds(Bool, s, i) + @inbounds return ib && isvalid(s.string, s.offset + i) end thisind(s::SubString, i::Integer) = thisind(s.string, s.offset + i) - s.offset diff --git a/test/strings/types.jl b/test/strings/types.jl index 452da4cbc2ed6..0af2713349465 100644 --- a/test/strings/types.jl +++ b/test/strings/types.jl @@ -169,12 +169,7 @@ let s = "lorem ipsum", sdict = Dict( for (ss, s) in sdict @test ncodeunits(ss) == ncodeunits(s) for i in -2:13 - if 1 ≤ i ≤ ncodeunits(ss) - @test isvalid(ss, i) == isvalid(s, i) - else - @test_throws BoundsError isvalid(ss, i) - @test_throws BoundsError isvalid(s, i) - end + @test isvalid(ss, i) == isvalid(s, i) end for i in 1:ncodeunits(ss), j = i-1:ncodeunits(ss) @test length(ss, i, j) == length(s, i, j) From feb1f6827124dcaed1f734a310bf5f43837a104b Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Mon, 11 Dec 2017 18:24:03 -0500 Subject: [PATCH 21/22] bounds checks on string length(s, i, j) --- base/strings/basic.jl | 39 +++++++++++++++++++++++---------------- base/strings/string.jl | 18 ++++++++---------- test/lineedit.jl | 2 +- test/strings/basic.jl | 8 ++++---- test/strings/types.jl | 6 ++++-- 5 files changed, 40 insertions(+), 33 deletions(-) diff --git a/base/strings/basic.jl b/base/strings/basic.jl index a9dcb14f54cde..fa607b6003d32 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -313,15 +313,17 @@ isless(a::Symbol, b::Symbol) = cmp(a, b) < 0 ## character index arithmetic ## """ - length(s::AbstractString, lo::Integer=1, hi::Integer=ncodeunits(s)) -> Integer + 
length(s::AbstractString) -> Int + length(s::AbstractString, i::Integer, j::Integer) -> Int -The number of characters in string `s` from indices `lo` through `hi`. This is -computed as the number of code unit indices from `lo` to `hi` which are valid +The number of characters in string `s` from indices `i` through `j`. This is +computed as the number of code unit indices from `i` to `j` which are valid character indices. Without only a single string argument, this computes the -number of characters in the entire string. With `lo` and `hi` arguments it computes -the number of indices between `lo` and `hi` inclusive that are valid indices in -the string `s`. Note that the trailing character may include code units past `hi` -and still be counted. +number of characters in the entire string. With `i` and `j` arguments it +computes the number of indices between `i` and `j` inclusive that are valid +indices in the string `s`. In addition to in-bounds values, `i` may take the +out-of-bounds value `ncodeunits(s) + 1` and `j` may take the out-of-bounds +value `0`. 
See also: [`isvalid`](@ref), [`ncodeunits`](@ref), [`endof`](@ref), [`thisind`](@ref), [`nextind`](@ref), [`prevind`](@ref) @@ -332,18 +334,23 @@ julia> length("jμΛIα") 5 ``` """ -function length(s::AbstractString, lo::Integer=1, hi::Integer=ncodeunits(s)) - lo ≤ hi || return 0 - z = ncodeunits(s) - a = Int(max(1, min(z, lo))) - b = Int(min(z, max(1, hi))) - n = a - b - for i = a:b - n += isvalid(s, i) +length(s::AbstractString) = @inbounds return length(s, 1, ncodeunits(s)) + +function length(s::AbstractString, i::Int, j::Int) + @boundscheck begin + 0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i)) + 0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j)) end - return n + hi - lo + n = 0 + for k = i:j + @inbounds n += isvalid(s, k) + end + return n end +@propagate_inbounds length(s::AbstractString, i::Integer, j::Integer) = + length(s, Int(i), Int(j)) + """ thisind(s::AbstractString, i::Integer) -> Int diff --git a/base/strings/string.jl b/base/strings/string.jl index b1d365bb35b17..2cc20a714ea69 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -232,18 +232,16 @@ function getindex(s::String, r::UnitRange{Int}) return ss end -function length(s::String, lo::Int, hi::Int) - i, n = lo, hi - c = max(0, hi - lo + 1) +function length(s::String, i::Int, j::Int) @boundscheck begin - z = ncodeunits(s) - i = Int(max(1, min(z, lo))) - n = Int(min(z, max(1, hi))) + 0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i)) + 0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j)) end - i < n || return c - @inbounds i, j = thisind(s, i), i - c -= i < j - _length(s, i, n, c) + j < i && return 0 + c = j - i + 1 + @inbounds i, k = thisind(s, i), i + c -= i < k + _length(s, i, j, c) end length(s::String) = _length(s, 1, ncodeunits(s), ncodeunits(s)) diff --git a/test/lineedit.jl b/test/lineedit.jl index 61e1d3bd42562..cb870b8842422 100644 --- a/test/lineedit.jl +++ b/test/lineedit.jl @@ -17,7 +17,7 @@ function new_state() end charseek(buf, i) = seek(buf, nextind(content(buf), 
0, i+1)-1) -charpos(buf, pos=position(buf)) = length(content(buf), 1, pos+1)-1 +charpos(buf, pos=position(buf)) = length(content(buf), 1, pos) function transform!(f, s, i = -1) # i is char-based (not bytes) buffer position buf = buffer(s) diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 49df4fe48252d..512eee29e0866 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -99,13 +99,13 @@ end end @testset "issue #7248" begin - @test length("hello", 1, -1) == 0 + @test_throws BoundsError length("hello", 1, -1) == 0 @test prevind("hello", 0, 1) == -1 - @test length("hellø", 1, -1) == 0 + @test_throws BoundsError length("hellø", 1, -1) == 0 @test prevind("hellø", 0, 1) == -1 - @test length("hello", 1, 10) == 10 + @test_throws BoundsError length("hello", 1, 10) == 10 @test nextind("hello", 0, 10) == 10 - @test length("hellø", 1, 10) == 9 + @test_throws BoundsError length("hellø", 1, 10) == 9 @test nextind("hellø", 0, 10) == 11 @test_throws BoundsError checkbounds("hello", 0) @test_throws BoundsError checkbounds("hello", 6) diff --git a/test/strings/types.jl b/test/strings/types.jl index 0af2713349465..f3c549ba6b36a 100644 --- a/test/strings/types.jl +++ b/test/strings/types.jl @@ -205,8 +205,10 @@ let s = "Σx + βz - 2" end let ss = SubString("hello", 1, 5) - @test length(ss, 1, -1) == 0 - @test length(ss, 1, 10) == 10 + @test length(ss, 1, 0) == 0 + @test_throws BoundsError length(ss, 1, -1) == 0 + @test_throws BoundsError length(ss, 1, 6) + @test_throws BoundsError length(ss, 1, 10) @test prevind(ss, 0, 1) == -1 @test nextind(ss, 0, 10) == 10 end From 8de25f5ac6c8a8ef9a8872f2d9aaaee9ddbf6bf7 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Tue, 12 Dec 2017 16:31:25 -0500 Subject: [PATCH 22/22] bounds check thisind, nextind and prevind as well --- base/repl/REPL.jl | 6 +- base/strings/basic.jl | 30 ++++--- base/strings/search.jl | 3 +- base/strings/string.jl | 11 ++- base/strings/substring.jl | 23 +++++- base/strings/util.jl | 29 +++---- 
test/strings/basic.jl | 160 ++++++++++++++++++++------------------ test/strings/types.jl | 24 ++++-- 8 files changed, 168 insertions(+), 118 deletions(-) diff --git a/base/repl/REPL.jl b/base/repl/REPL.jl index f7585feaa2a05..c22010c168503 100644 --- a/base/repl/REPL.jl +++ b/base/repl/REPL.jl @@ -609,7 +609,11 @@ function history_search(hist::REPLHistoryProvider, query_buffer::IOBuffer, respo # Alright, first try to see if the current match still works a = position(response_buffer) + 1 # position is zero-indexed - b = min(endof(response_str), prevind(response_str, a + sizeof(searchdata))) # ensure that b is valid + # FIXME: I'm pretty sure this is broken since it uses an index + # into the search data to index into the response string + b = a + sizeof(searchdata) + b = b ≤ ncodeunits(response_str) ? prevind(response_str, b) : b-1 + b = min(endof(response_str), b) # ensure that b is valid !skip_current && searchdata == response_str[a:b] && return true diff --git a/base/strings/basic.jl b/base/strings/basic.jl index fa607b6003d32..407ebb3638e7b 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -383,8 +383,12 @@ julia> thisind("αβγdef", 10) julia> thisind("αβγdef", 20) 20 """ -function thisind(s::AbstractString, i::Integer) - i ≤ ncodeunits(s) || return i +thisind(s::AbstractString, i::Integer) = thisind(s, Int(i)) + +function thisind(s::AbstractString, i::Int) + z = ncodeunits(s) + 1 + i == z && return i + @boundscheck 0 ≤ i ≤ z || throw(BoundsError(s, i)) @inbounds while 1 < i && !isvalid(s, i) i -= 1 end @@ -415,13 +419,14 @@ julia> prevind("αβγdef", 3, 2) 0 ``` """ -function prevind(s::AbstractString, i::Integer, n::Integer=1) +prevind(s::AbstractString, i::Integer, n::Integer) = prevind(s, Int(i), Int(n)) +prevind(s::AbstractString, i::Integer) = prevind(s, Int(i)) +prevind(s::AbstractString, i::Int) = prevind(s, i, 1) + +function prevind(s::AbstractString, i::Int, n::Int) n < 0 && throw(ArgumentError("n cannot be negative: $n")) z = 
ncodeunits(s) + 1 - if i > z - n -= i - z - i = z - end + @boundscheck 0 < i ≤ z || throw(BoundsError(s, i)) while n > 0 && 1 < i @inbounds n -= isvalid(s, i -= 1) end @@ -452,13 +457,14 @@ julia> nextind(str, 9) 10 ``` """ -function nextind(s::AbstractString, i::Integer, n::Integer=1) +nextind(s::AbstractString, i::Integer, n::Integer) = nextind(s, Int(i), Int(n)) +nextind(s::AbstractString, i::Integer) = nextind(s, Int(i)) +nextind(s::AbstractString, i::Int) = nextind(s, i, 1) + +function nextind(s::AbstractString, i::Int, n::Int) n < 0 && throw(ArgumentError("n cannot be negative: $n")) - if i < 1 - n += i - 1 - i = 1 - end z = ncodeunits(s) + @boundscheck 0 ≤ i ≤ z || throw(BoundsError(s, i)) while n > 0 && i < z @inbounds n -= isvalid(s, i += 1) end diff --git a/base/strings/search.jl b/base/strings/search.jl index 5b0fa167a783f..4b108e4e0a0b5 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -412,7 +412,8 @@ function rsearchindex(s::String, t::String, i::Integer) if endof(t) == 1 rsearch(s, t[1], i) elseif endof(t) != 0 - _rsearchindex(s, t, nextind(s, i)-1) + j = i ≤ ncodeunits(s) ? 
nextind(s, i)-1 : i + _rsearchindex(s, t, j) elseif i > sizeof(s) return 0 elseif i == 0 diff --git a/base/strings/string.jl b/base/strings/string.jl index 2cc20a714ea69..223f0fc817b63 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -92,14 +92,12 @@ function ==(a::String, b::String) al == sizeof(b) && 0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, al) end -## thisind, nextind, prevind ## - -thisind(s::String, i::Integer) = oftype(i, thisind(s, Int(i))) -nextind(s::String, i::Integer) = oftype(i, nextind(s, Int(i))) +## thisind, prevind, nextind ## function thisind(s::String, i::Int) n = ncodeunits(s) - between(i, 2, n) || return i + i == n + 1 && return i + @boundscheck between(i, 0, n) || throw(BoundsError(s, i)) @inbounds b = codeunit(s, i) b & 0xc0 == 0x80 || return i @inbounds b = codeunit(s, i-1) @@ -114,8 +112,9 @@ function thisind(s::String, i::Int) end function nextind(s::String, i::Int) + i == 0 && return 1 n = ncodeunits(s) - between(i, 1, n-1) || return i+1 + @boundscheck between(i, 1, n) || throw(BoundsError(s, i)) @inbounds l = codeunit(s, i) (l < 0x80) | (0xf8 ≤ l) && return i+1 if l < 0xc0 diff --git a/base/strings/substring.jl b/base/strings/substring.jl index e389de6518b49..f7abe7dfb1e4f 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -78,9 +78,26 @@ function isvalid(s::SubString, i::Integer) @inbounds return ib && isvalid(s.string, s.offset + i) end -thisind(s::SubString, i::Integer) = thisind(s.string, s.offset + i) - s.offset -nextind(s::SubString, i::Integer) = nextind(s.string, s.offset + i) - s.offset -prevind(s::SubString, i::Integer) = prevind(s.string, s.offset + i) - s.offset +function thisind(s::SubString, i::Int) + @boundscheck 0 ≤ i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i)) + @inbounds return thisind(s.string, s.offset + i) - s.offset +end +function nextind(s::SubString, i::Int, n::Int) + @boundscheck 0 ≤ i < ncodeunits(s)+1 || throw(BoundsError(s, i)) + @inbounds 
return nextind(s.string, s.offset + i, n) - s.offset +end +function nextind(s::SubString, i::Int) + @boundscheck 0 ≤ i < ncodeunits(s)+1 || throw(BoundsError(s, i)) + @inbounds return nextind(s.string, s.offset + i) - s.offset +end +function prevind(s::SubString, i::Int, n::Int) + @boundscheck 0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i)) + @inbounds return prevind(s.string, s.offset + i, n) - s.offset +end +function prevind(s::SubString, i::Int) + @boundscheck 0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i)) + @inbounds return prevind(s.string, s.offset + i) - s.offset +end function cmp(a::SubString{String}, b::SubString{String}) na = sizeof(a) diff --git a/base/strings/util.jl b/base/strings/util.jl index d92969de12ee1..43e96fc1a8b22 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -281,17 +281,20 @@ function _split(str::AbstractString, splitter, limit::Integer, keep_empty::Bool, i = start(str) n = endof(str) r = search(str,splitter,i) - j, k = first(r), nextind(str,last(r)) - while 0 < j <= n && length(strs) != limit-1 - if i < k - if keep_empty || i < j - push!(strs, SubString(str,i,prevind(str,j))) + if r != 0:-1 + j, k = first(r), nextind(str,last(r)) + while 0 < j <= n && length(strs) != limit-1 + if i < k + if keep_empty || i < j + push!(strs, SubString(str,i,prevind(str,j))) + end + i = k end - i = k + (k <= j) && (k = nextind(str,j)) + r = search(str,splitter,k) + r == 0:-1 && break + j, k = first(r), nextind(str,last(r)) end - (k <= j) && (k = nextind(str,j)) - r = search(str,splitter,k) - j, k = first(r), nextind(str,last(r)) end if keep_empty || !done(str,i) push!(strs, SubString(str,i)) @@ -377,18 +380,16 @@ function replace_new(str::String, pattern, repl, count::Integer) unsafe_write(out, pointer(str, i), UInt(j-i)) _replace(out, repl, str, r, pattern) end - if k<j + if k < j i = j + j > e && break k = nextind(str, j) else i = k = nextind(str, k) end - if j > e - break - end r = search(str,pattern,k) + r == 0:-1 || n == count && break j, k = first(r), 
last(r) - n == count && break n += 1 end write(out, SubString(str,i)) diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 512eee29e0866..512bbd0805943 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -99,11 +99,11 @@ end end @testset "issue #7248" begin - @test_throws BoundsError length("hello", 1, -1) == 0 - @test prevind("hello", 0, 1) == -1 - @test_throws BoundsError length("hellø", 1, -1) == 0 - @test prevind("hellø", 0, 1) == -1 - @test_throws BoundsError length("hello", 1, 10) == 10 + @test_throws BoundsError length("hello", 1, -1) + @test_throws BoundsError prevind("hello", 0, 1) + @test_throws BoundsError length("hellø", 1, -1) + @test_throws BoundsError prevind("hellø", 0, 1) + @test_throws BoundsError length("hello", 1, 10) @test nextind("hello", 0, 10) == 10 @test_throws BoundsError length("hellø", 1, 10) == 9 @test nextind("hellø", 0, 10) == 11 @@ -512,7 +512,8 @@ end SubString("123∀α>β:α+1>β123", 4, 18), SubString(s"123∀α>β:α+1>β123", 4, 18)] for s in strs - @test thisind(s, -2) == -2 + @test_throws BoundsError thisind(s, -2) + @test_throws BoundsError thisind(s, -1) @test thisind(s, 0) == 0 @test thisind(s, 1) == 1 @test thisind(s, 2) == 1 @@ -523,86 +524,97 @@ end @test thisind(s, 15) == 15 @test thisind(s, 16) == 15 @test thisind(s, 17) == 17 - @test thisind(s, 30) == 30 + @test_throws BoundsError thisind(s, 18) + @test_throws BoundsError thisind(s, 19) end end let strs = Any["", s"", SubString("123", 2, 1), SubString(s"123", 2, 1)] - for s in strs, i in -2:2 - @test thisind(s, i) == i + for s in strs + @test_throws BoundsError thisind(s, -1) + @test thisind(s, 0) == 0 + @test thisind(s, 1) == 1 + @test_throws BoundsError thisind(s, 2) end end end @testset "prevind and nextind" begin - let strs = Any["∀α>β:α+1>β", GenericString("∀α>β:α+1>β")] - for i in 1:2 - @test prevind(strs[i], 1) == 0 - @test prevind(strs[i], 1, 1) == 0 - @test prevind(strs[i], 2) == 1 - @test prevind(strs[i], 2, 1) == 1 - @test prevind(strs[i], 4) 
== 1 - @test prevind(strs[i], 4, 1) == 1 - @test prevind(strs[i], 5) == 4 - @test prevind(strs[i], 5, 1) == 4 - @test prevind(strs[i], 5, 2) == 1 - @test prevind(strs[i], 5, 3) == 0 - @test prevind(strs[i], 15) == 14 - @test prevind(strs[i], 15, 1) == 14 - @test prevind(strs[i], 15, 2) == 13 - @test prevind(strs[i], 15, 3) == 12 - @test prevind(strs[i], 15, 4) == 10 - @test prevind(strs[i], 15, 10) == 0 - @test prevind(strs[i], 15, 9) == 1 - @test prevind(strs[i], 16) == 15 - @test prevind(strs[i], 16, 1) == 15 - @test prevind(strs[i], 16, 2) == 14 - @test prevind(strs[i], 20) == 19 - @test prevind(strs[i], 20, 1) == 19 - @test prevind(strs[i], 20, 10) == 7 - @test prevind(strs[i], 20, 0) == 20 - - @test nextind(strs[i], -1) == 0 - @test nextind(strs[i], -1, 1) == 0 - @test nextind(strs[i], -1, 2) == 1 - @test nextind(strs[i], -1, 3) == 4 - @test nextind(strs[i], 0, 2) == 4 - @test nextind(strs[i], 0, 20) == 26 - @test nextind(strs[i], 0, 10) == 15 - @test nextind(strs[i], 1) == 4 - @test nextind(strs[i], 1, 1) == 4 - @test nextind(strs[i], 1, 2) == 6 - @test nextind(strs[i], 1, 9) == 15 - @test nextind(strs[i], 1, 10) == 17 - @test nextind(strs[i], 2) == 4 - @test nextind(strs[i], 2, 1) == 4 - @test nextind(strs[i], 3) == 4 - @test nextind(strs[i], 3, 1) == 4 - @test nextind(strs[i], 4) == 6 - @test nextind(strs[i], 4, 1) == 6 - @test nextind(strs[i], 14) == 15 - @test nextind(strs[i], 14, 1) == 15 - @test nextind(strs[i], 15) == 17 - @test nextind(strs[i], 15, 1) == 17 - @test nextind(strs[i], 20) == 21 - @test nextind(strs[i], 20, 1) == 21 - @test nextind(strs[i], 20, 0) == 20 - - for x in -10:20 - n = p = x - for j in 1:40 - p = prevind(strs[i], p) - @test prevind(strs[i], x, j) == p - n = nextind(strs[i], n) - @test nextind(strs[i], x, j) == n + for s in Any["∀α>β:α+1>β", GenericString("∀α>β:α+1>β")] + @test_throws BoundsError prevind(s, 0) + @test_throws BoundsError prevind(s, 0, 0) + @test_throws BoundsError prevind(s, 0, 1) + @test prevind(s, 1) == 0 + 
@test prevind(s, 1, 1) == 0 + @test prevind(s, 1, 0) == 1 + @test prevind(s, 2) == 1 + @test prevind(s, 2, 1) == 1 + @test prevind(s, 4) == 1 + @test prevind(s, 4, 1) == 1 + @test prevind(s, 5) == 4 + @test prevind(s, 5, 1) == 4 + @test prevind(s, 5, 2) == 1 + @test prevind(s, 5, 3) == 0 + @test prevind(s, 15) == 14 + @test prevind(s, 15, 1) == 14 + @test prevind(s, 15, 2) == 13 + @test prevind(s, 15, 3) == 12 + @test prevind(s, 15, 4) == 10 + @test prevind(s, 15, 10) == 0 + @test prevind(s, 15, 9) == 1 + @test prevind(s, 16) == 15 + @test prevind(s, 16, 1) == 15 + @test prevind(s, 16, 2) == 14 + @test prevind(s, 17) == 15 + @test prevind(s, 17, 1) == 15 + @test prevind(s, 17, 2) == 14 + @test_throws BoundsError prevind(s, 18) + @test_throws BoundsError prevind(s, 18, 0) + @test_throws BoundsError prevind(s, 18, 1) + + @test_throws BoundsError nextind(s, -1) + @test_throws BoundsError nextind(s, -1, 0) + @test_throws BoundsError nextind(s, -1, 1) + @test nextind(s, 0, 2) == 4 + @test nextind(s, 0, 20) == 26 + @test nextind(s, 0, 10) == 15 + @test nextind(s, 1) == 4 + @test nextind(s, 1, 1) == 4 + @test nextind(s, 1, 2) == 6 + @test nextind(s, 1, 9) == 15 + @test nextind(s, 1, 10) == 17 + @test nextind(s, 2) == 4 + @test nextind(s, 2, 1) == 4 + @test nextind(s, 3) == 4 + @test nextind(s, 3, 1) == 4 + @test nextind(s, 4) == 6 + @test nextind(s, 4, 1) == 6 + @test nextind(s, 14) == 15 + @test nextind(s, 14, 1) == 15 + @test nextind(s, 15) == 17 + @test nextind(s, 15, 1) == 17 + @test nextind(s, 15, 2) == 18 + @test nextind(s, 16) == 17 + @test nextind(s, 16, 1) == 17 + @test nextind(s, 16, 2) == 18 + @test nextind(s, 16, 3) == 19 + @test_throws BoundsError nextind(s, 17) + @test_throws BoundsError nextind(s, 17, 0) + @test_throws BoundsError nextind(s, 17, 1) + + for x in 0:ncodeunits(s)+1 + n = p = x + for j in 1:40 + if 1 ≤ p + p = prevind(s, p) + @test prevind(s, x, j) == p + end + if n ≤ ncodeunits(s) + n = nextind(s, n) + @test nextind(s, x, j) == n end end end - 
@test prevind(strs[1], -1) == -2 - @test prevind(strs[1], -1, 1) == -2 - - @test prevind(strs[2], -1) == -2 - @test prevind(strs[2], -1, 1) == -2 end end diff --git a/test/strings/types.jl b/test/strings/types.jl index f3c549ba6b36a..b849ddac07573 100644 --- a/test/strings/types.jl +++ b/test/strings/types.jl @@ -146,7 +146,7 @@ end @test prevind(SubString("{var}",2,4),4) == 3 # issue #4183 -@test split(SubString("x", 2, 0), "y") == AbstractString[""] +@test split(SubString("x", 2, 0), "y") == [""] # issue #6772 @test parse(Float64, SubString("10",1,1)) === 1.0 @@ -157,7 +157,7 @@ end @test !ismatch(Regex("aa"), SubString("",1,0)) @test ismatch(Regex(""), SubString("",1,0)) -# isvalid(), formerly length() and nextind() for SubString{String} +# isvalid, length, prevind, nextind for SubString{String} let s = "lorem ipsum", sdict = Dict( SubString(s, 1, 11) => "lorem ipsum", SubString(s, 1, 6) => "lorem ", @@ -177,10 +177,14 @@ let s = "lorem ipsum", sdict = Dict( end for (ss, s) in sdict @test length(ss) == length(s) - for i in 0:length(ss)+1, j = 0:length(ss)+1 + for i in 0:ncodeunits(ss), j = 0:length(ss)+1 + @test prevind(ss, i+1, j) == prevind(s, i+1, j) @test nextind(ss, i, j) == nextind(s, i, j) - @test prevind(ss, i, j) == prevind(s, i, j) end + @test_throws BoundsError prevind(s, 0) + @test_throws BoundsError prevind(ss, 0) + @test_throws BoundsError nextind(s, ncodeunits(ss)+1) + @test_throws BoundsError nextind(ss, ncodeunits(ss)+1) end end @@ -206,11 +210,17 @@ end let ss = SubString("hello", 1, 5) @test length(ss, 1, 0) == 0 - @test_throws BoundsError length(ss, 1, -1) == 0 + @test_throws BoundsError length(ss, 1, -1) @test_throws BoundsError length(ss, 1, 6) @test_throws BoundsError length(ss, 1, 10) - @test prevind(ss, 0, 1) == -1 - @test nextind(ss, 0, 10) == 10 + @test_throws BoundsError prevind(ss, 0, 1) + @test prevind(ss, 1, 1) == 0 + @test prevind(ss, 6, 1) == 5 + @test_throws BoundsError prevind(ss, 7, 1) + @test_throws BoundsError nextind(ss, 
-1, 1) + @test nextind(ss, 0, 1) == 1 + @test nextind(ss, 5, 1) == 6 + @test_throws BoundsError nextind(ss, 6, 1) end # length(SubString{String}) performance specialization