From 16d9ef62eac37413e015ecc7e6e507eed92ce0a6 Mon Sep 17 00:00:00 2001 From: tan Date: Mon, 20 Jan 2014 13:28:08 +0530 Subject: [PATCH] - split and rsplit return SubStrings - faster specialized bytestring for SubString - updated split usage that depended on old behavior - updated docs --- base/help.jl | 2 +- base/string.jl | 26 +++++++++++++++----------- doc/helpdb.jl | 2 +- doc/stdlib/base.rst | 2 +- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/base/help.jl b/base/help.jl index 0e559e40b6a88..5c563be82d24c 100644 --- a/base/help.jl +++ b/base/help.jl @@ -15,7 +15,7 @@ function clear_cache() end function decor_help_desc(func::String, mfunc::String, desc::String) - sd = split(desc, '\n') + sd = convert(Array{ByteString,1}, split(desc, '\n')) for i = 1:length(sd) if beginswith(sd[i], func) sd[i] = mfunc * sd[i][length(func)+1:end] diff --git a/base/string.jl b/base/string.jl index ca46faaf0a354..6272820419c51 100644 --- a/base/string.jl +++ b/base/string.jl @@ -642,6 +642,8 @@ prevind(s::SubString, i::Integer) = prevind(s.string, i+s.offset)-s.offset convert{T<:String}(::Type{SubString{T}}, s::T) = SubString(s, 1, endof(s)) +bytestring{T <: ByteString}(p::SubString{T}) = bytestring(pointer(p.string.data)+p.offset, nextind(p, p.endof)-1) + function serialize{T}(s, ss::SubString{T}) # avoid saving a copy of the parent string, keeping the type of ss invoke(serialize, (Any,Any), s, convert(SubString{T}, convert(T,ss))) @@ -1247,11 +1249,12 @@ lpad(s, n::Integer, p=" ") = lpad(string(s),n,string(p)) rpad(s, n::Integer, p=" ") = rpad(string(s),n,string(p)) cpad(s, n::Integer, p=" ") = rpad(lpad(s,div(n+strwidth(s),2),p),n,p) + # splitter can be a Char, Vector{Char}, String, Regex, ... 
# any splitter that provides search(s::String, splitter) - -function split(str::String, splitter, limit::Integer, keep_empty::Bool) - strs = String[] +split{T<:SubString}(str::T, splitter, limit::Integer, keep_empty::Bool) = _split(str, splitter, limit, keep_empty, T[]) +split{T<:String}(str::T, splitter, limit::Integer, keep_empty::Bool) = _split(str, splitter, limit, keep_empty, SubString{T}[]) +function _split{T<:String,U<:Array}(str::T, splitter, limit::Integer, keep_empty::Bool, strs::U) i = start(str) n = endof(str) r = search(str,splitter,i) @@ -1259,7 +1262,7 @@ function split(str::String, splitter, limit::Integer, keep_empty::Bool) while 0 < j <= n && length(strs) != limit-1 if i < k if keep_empty || i < j - push!(strs, str[i:prevind(str,j)]) + push!(strs, SubString(str,i,prevind(str,j))) end i = k end @@ -1268,7 +1271,7 @@ function split(str::String, splitter, limit::Integer, keep_empty::Bool) j, k = first(r), nextind(str,last(r)) end if keep_empty || !done(str,i) - push!(strs, str[i:end]) + push!(strs, SubString(str,i)) end return strs end @@ -1278,10 +1281,12 @@ split(s::String, spl) = split(s, spl, 0, true) # a bit oddball, but standard behavior in Perl, Ruby & Python: const _default_delims = [' ','\t','\n','\v','\f','\r'] -split(str::String) = split(str, _default_delims, 0, false) +split(str::String) = split(str, _default_delims, 0, false) -function rsplit(str::String, splitter, limit::Integer, keep_empty::Bool) - strs = String[] + +rsplit{T<:SubString}(str::T, splitter, limit::Integer, keep_empty::Bool) = _rsplit(str, splitter, limit, keep_empty, T[]) +rsplit{T<:String}(str::T, splitter, limit::Integer, keep_empty::Bool) = _rsplit(str, splitter, limit, keep_empty, SubString{T}[]) +function _rsplit{T<:String,U<:Array}(str::T, splitter, limit::Integer, keep_empty::Bool, strs::U) i = start(str) n = endof(str) r = rsearch(str,splitter) @@ -1289,7 +1294,7 @@ function rsplit(str::String, splitter, limit::Integer, keep_empty::Bool) k = last(r) while((0 <= j 
< n) && (length(strs) != limit-1)) if i <= k - (keep_empty || (k < n)) && unshift!(strs, str[k+1:n]) + (keep_empty || (k < n)) && unshift!(strs, SubString(str,k+1,n)) n = j end (k <= j) && (j = prevind(str,j)) @@ -1297,7 +1302,7 @@ function rsplit(str::String, splitter, limit::Integer, keep_empty::Bool) j = first(r)-1 k = last(r) end - (keep_empty || (n > 0)) && unshift!(strs, str[1:n]) + (keep_empty || (n > 0)) && unshift!(strs, SubString(str,1,n)) return strs end rsplit(s::String, spl, n::Integer) = rsplit(s, spl, n, true) @@ -1305,7 +1310,6 @@ rsplit(s::String, spl, keep::Bool) = rsplit(s, spl, 0, keep) rsplit(s::String, spl) = rsplit(s, spl, 0, true) #rsplit(str::String) = rsplit(str, _default_delims, 0, false) - function replace(str::ByteString, pattern, repl::Function, limit::Integer) n = 1 e = endof(str) diff --git a/doc/helpdb.jl b/doc/helpdb.jl index 3bd15890d9e84..45547e115037f 100644 --- a/doc/helpdb.jl +++ b/doc/helpdb.jl @@ -1494,7 +1494,7 @@ ("Strings","Base","split","split(string, [chars, [limit,] [include_empty]]) - Return an array of strings by splitting the given string on + Return an array of substrings by splitting the given string on occurrences of the given character delimiters, which may be specified in any of the formats allowed by \"search\"'s second argument (i.e. a single character, collection of characters, diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst index 97306a1e07981..ca23d845c6240 100644 --- a/doc/stdlib/base.rst +++ b/doc/stdlib/base.rst @@ -1011,7 +1011,7 @@ Strings .. function:: split(string, [chars, [limit,] [include_empty]]) - Return an array of strings by splitting the given string on occurrences of the given character delimiters, which may be specified in any of the formats allowed by ``search``'s second argument (i.e. a single character, collection of characters, string, or regular expression). If ``chars`` is omitted, it defaults to the set of all space characters, and ``include_empty`` is taken to be false. 
The last two arguments are also optional: they are are a maximum size for the result and a flag determining whether empty fields should be included in the result. + Return an array of substrings by splitting the given string on occurrences of the given character delimiters, which may be specified in any of the formats allowed by ``search``'s second argument (i.e. a single character, collection of characters, string, or regular expression). If ``chars`` is omitted, it defaults to the set of all space characters, and ``include_empty`` is taken to be false. The last two arguments are also optional: they are a maximum size for the result and a flag determining whether empty fields should be included in the result. .. function:: rsplit(string, [chars, [limit,] [include_empty]])