From 16d9ef62eac37413e015ecc7e6e507eed92ce0a6 Mon Sep 17 00:00:00 2001
From: tan <tanmaykm@gmail.com>
Date: Mon, 20 Jan 2014 13:28:08 +0530
Subject: [PATCH] - split and rsplit return SubStrings - faster specialized
 bytestring for SubString - updated split usage that depended on old behavior
 - updated docs

---
 base/help.jl        |  2 +-
 base/string.jl      | 26 +++++++++++++++-----------
 doc/helpdb.jl       |  2 +-
 doc/stdlib/base.rst |  2 +-
 4 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/base/help.jl b/base/help.jl
index 0e559e40b6a88..5c563be82d24c 100644
--- a/base/help.jl
+++ b/base/help.jl
@@ -15,7 +15,7 @@ function clear_cache()
 end
 
 function decor_help_desc(func::String, mfunc::String, desc::String)
-    sd = split(desc, '\n')
+    sd = convert(Array{ByteString,1}, split(desc, '\n'))
     for i = 1:length(sd)
         if beginswith(sd[i], func)
             sd[i] = mfunc * sd[i][length(func)+1:end]
diff --git a/base/string.jl b/base/string.jl
index ca46faaf0a354..6272820419c51 100644
--- a/base/string.jl
+++ b/base/string.jl
@@ -642,6 +642,8 @@ prevind(s::SubString, i::Integer) = prevind(s.string, i+s.offset)-s.offset
 
 convert{T<:String}(::Type{SubString{T}}, s::T) = SubString(s, 1, endof(s))
 
+bytestring{T <: ByteString}(p::SubString{T}) = bytestring(pointer(p.string.data)+p.offset, nextind(p, p.endof)-1)
+
 function serialize{T}(s, ss::SubString{T})
     # avoid saving a copy of the parent string, keeping the type of ss
     invoke(serialize, (Any,Any), s, convert(SubString{T}, convert(T,ss)))
@@ -1247,11 +1249,12 @@ lpad(s, n::Integer, p=" ") = lpad(string(s),n,string(p))
 rpad(s, n::Integer, p=" ") = rpad(string(s),n,string(p))
 cpad(s, n::Integer, p=" ") = rpad(lpad(s,div(n+strwidth(s),2),p),n,p)
 
+
 # splitter can be a Char, Vector{Char}, String, Regex, ...
 # any splitter that provides search(s::String, splitter)
-
-function split(str::String, splitter, limit::Integer, keep_empty::Bool)
-    strs = String[]
+split{T<:SubString}(str::T, splitter, limit::Integer, keep_empty::Bool) = _split(str, splitter, limit, keep_empty, T[])
+split{T<:String}(str::T, splitter, limit::Integer, keep_empty::Bool) = _split(str, splitter, limit, keep_empty, SubString{T}[])
+function _split{T<:String,U<:Array}(str::T, splitter, limit::Integer, keep_empty::Bool, strs::U)
     i = start(str)
     n = endof(str)
     r = search(str,splitter,i)
@@ -1259,7 +1262,7 @@ function split(str::String, splitter, limit::Integer, keep_empty::Bool)
     while 0 < j <= n && length(strs) != limit-1
         if i < k
             if keep_empty || i < j
-                push!(strs, str[i:prevind(str,j)])
+                push!(strs, SubString(str,i,prevind(str,j)))
             end
             i = k
         end
@@ -1268,7 +1271,7 @@ function split(str::String, splitter, limit::Integer, keep_empty::Bool)
         j, k = first(r), nextind(str,last(r))
     end
     if keep_empty || !done(str,i)
-        push!(strs, str[i:end])
+        push!(strs, SubString(str,i))
     end
     return strs
 end
@@ -1278,10 +1281,12 @@ split(s::String, spl)             = split(s, spl, 0, true)
 
 # a bit oddball, but standard behavior in Perl, Ruby & Python:
 const _default_delims = [' ','\t','\n','\v','\f','\r']
-split(str::String) = split(str, _default_delims, 0, false)
+split(str::String)                = split(str, _default_delims, 0, false)
 
-function rsplit(str::String, splitter, limit::Integer, keep_empty::Bool)
-    strs = String[]
+
+rsplit{T<:SubString}(str::T, splitter, limit::Integer, keep_empty::Bool) = _rsplit(str, splitter, limit, keep_empty, T[])
+rsplit{T<:String}(str::T, splitter, limit::Integer, keep_empty::Bool) = _rsplit(str, splitter, limit, keep_empty, SubString{T}[])
+function _rsplit{T<:String,U<:Array}(str::T, splitter, limit::Integer, keep_empty::Bool, strs::U)
     i = start(str)
     n = endof(str)
     r = rsearch(str,splitter)
@@ -1289,7 +1294,7 @@ function rsplit(str::String, splitter, limit::Integer, keep_empty::Bool)
     k = last(r)
     while((0 <= j < n) && (length(strs) != limit-1))
         if i <= k
-            (keep_empty || (k < n)) && unshift!(strs, str[k+1:n])
+            (keep_empty || (k < n)) && unshift!(strs, SubString(str,k+1,n))
             n = j
         end
         (k <= j) && (j = prevind(str,j))
@@ -1297,7 +1302,7 @@ function rsplit(str::String, splitter, limit::Integer, keep_empty::Bool)
         j = first(r)-1
         k = last(r)
     end
-    (keep_empty || (n > 0)) && unshift!(strs, str[1:n])
+    (keep_empty || (n > 0)) && unshift!(strs, SubString(str,1,n))
     return strs
 end
 rsplit(s::String, spl, n::Integer) = rsplit(s, spl, n, true)
@@ -1305,7 +1310,6 @@ rsplit(s::String, spl, keep::Bool) = rsplit(s, spl, 0, keep)
 rsplit(s::String, spl)             = rsplit(s, spl, 0, true)
 #rsplit(str::String) = rsplit(str, _default_delims, 0, false)
 
-
 function replace(str::ByteString, pattern, repl::Function, limit::Integer)
     n = 1
     e = endof(str)
diff --git a/doc/helpdb.jl b/doc/helpdb.jl
index 3bd15890d9e84..45547e115037f 100644
--- a/doc/helpdb.jl
+++ b/doc/helpdb.jl
@@ -1494,7 +1494,7 @@
 
 ("Strings","Base","split","split(string, [chars, [limit,] [include_empty]])
 
-   Return an array of strings by splitting the given string on
+   Return an array of substrings by splitting the given string on
    occurrences of the given character delimiters, which may be
    specified in any of the formats allowed by \"search\"'s second
    argument (i.e. a single character, collection of characters,
diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst
index 97306a1e07981..ca23d845c6240 100644
--- a/doc/stdlib/base.rst
+++ b/doc/stdlib/base.rst
@@ -1011,7 +1011,7 @@ Strings
 
 .. function:: split(string, [chars, [limit,] [include_empty]])
 
-   Return an array of strings by splitting the given string on occurrences of the given character delimiters, which may be specified in any of the formats allowed by ``search``'s second argument (i.e. a single character, collection of characters, string, or regular expression). If ``chars`` is omitted, it defaults to the set of all space characters, and ``include_empty`` is taken to be false. The last two arguments are also optional: they are are a maximum size for the result and a flag determining whether empty fields should be included in the result.
+   Return an array of substrings by splitting the given string on occurrences of the given character delimiters, which may be specified in any of the formats allowed by ``search``'s second argument (i.e. a single character, collection of characters, string, or regular expression). If ``chars`` is omitted, it defaults to the set of all space characters, and ``include_empty`` is taken to be false. The last two arguments are also optional: they are a maximum size for the result and a flag determining whether empty fields should be included in the result.
 
 .. function:: rsplit(string, [chars, [limit,] [include_empty]])