Skip to content

Commit

Permalink
Merge pull request #24999 from JuliaLang/sk/strings
Browse files Browse the repository at this point in the history
string overhaul
  • Loading branch information
StefanKarpinski authored Dec 14, 2017
2 parents ed1ae9e + 8de25f5 commit d192302
Show file tree
Hide file tree
Showing 41 changed files with 1,276 additions and 1,240 deletions.
89 changes: 76 additions & 13 deletions base/char.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,58 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license

convert(::Type{Char}, x::UInt32) = reinterpret(Char, x)
# Exception that carries the offending `Char` when a character's stored
# bytes cannot be decoded into a code point.
struct MalformedCharError <: Exception
    char::Char
end
# Exception that carries an integer which cannot be represented as a `Char`.
#
# The type is parametrized on the integer type so that `code` is stored with a
# concrete type instead of the abstract `Integer`. `CodePointError(x)` and
# `e isa CodePointError` keep working exactly as before.
struct CodePointError{T<:Integer} <: Exception
    code::T
end
# Out-of-line thrower: always raises `MalformedCharError` for `c`.
@noinline function malformed_char(c::Char)
    throw(MalformedCharError(c))
end
# Out-of-line thrower: always raises `CodePointError` for `u`.
@noinline function code_point_err(u::UInt32)
    throw(CodePointError(u))
end

# Return `true` when the bytes stored in `c` do not form a well-shaped
# multi-byte sequence. Three conditions are checked on the raw 32-bit
# representation (first byte in the most significant position):
#   * the first byte has exactly one leading one bit (it looks like a
#     continuation byte),
#   * the number of bytes announced by the first byte plus the number of
#     trailing all-zero bytes exceeds four (sequence is too short),
#   * one of the bytes following the first, above the trailing zero bytes,
#     does not have the bit pattern `10xxxxxx`.
function ismalformed(c::Char)
    u = reinterpret(UInt32, c)
    l1 = leading_ones(u) << 3      # 8 * (number of leading one bits)
    t0 = trailing_zeros(u) & 56    # trailing zero bits rounded down to whole bytes
    return (l1 == 8) | (l1 + t0 > 32) |
           (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0)
end

# Decode the code point stored in `c` and return it as a `UInt32`.
#
# The raw representation keeps the encoded bytes left-aligned (first byte in
# the most significant position). Calls `malformed_char(c)` when the bytes
# fail any of the shape checks below.
function convert(::Type{UInt32}, c::Char)
    # TODO: use optimized inline LLVM
    u = reinterpret(UInt32, c)
    # top bit clear: a single byte stored in the most significant position
    u < 0x80000000 && return u >> 24
    l1 = leading_ones(u)           # leading one bits of the first byte
    t0 = trailing_zeros(u) & 56    # trailing zero bits rounded down to whole bytes
    # reject: first byte looks like a continuation byte, sequence shorter than
    # announced, or a following byte that is not of the form 10xxxxxx
    (l1 == 1) | (8l1 + t0 > 32) |
    (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) &&
        malformed_char(c)::Union{}
    u &= 0xffffffff >> l1          # clear the leading ones of the first byte
    u >>= t0                       # drop the unused trailing bytes
    # Pack the payload bits of each byte together. The mask is parenthesized
    # explicitly so the result does not depend on the relative precedence of
    # `&` and `>>`.
    return ((u & 0x0000007f) >> 0) | ((u & 0x00007f00) >> 2) |
           ((u & 0x007f0000) >> 4) | ((u & 0x7f000000) >> 6)
end

# Encode the code point `u` as a `Char`: values below 0x80 are stored as a
# single byte; larger values are spread over two to four bytes with the
# appropriate lead/continuation marker bits. Values of 0x00200000 and above
# are rejected through `code_point_err`.
function convert(::Type{Char}, u::UInt32)
    if u < 0x80
        return reinterpret(Char, u << 24)
    end
    if u >= 0x00200000
        code_point_err(u)::Union{}
    end
    # spread the payload into 6-bit groups, one group per byte
    bits = ((u << 0) & 0x0000003f) | ((u << 2) & 0x00003f00) |
           ((u << 4) & 0x003f0000) | ((u << 6) & 0x3f000000)
    if u < 0x00000800
        enc = (bits << 16) | 0xc0800000    # two bytes
    elseif u < 0x00010000
        enc = (bits << 8) | 0xe0808000     # three bytes
    else
        enc = (bits << 0) | 0xf0808080     # four bytes
    end
    return reinterpret(Char, enc)
end

# Convert `c` to an 8-bit integer type. When the raw representation is
# non-negative as an `Int32` (top bit clear) the value is just the most
# significant byte; otherwise go through the general `UInt32` conversion.
function convert(::Type{T}, c::Char) where T <: Union{Int8,UInt8}
    i = reinterpret(Int32, c)
    return i >= 0 ? ((i >>> 24) % T) : T(UInt32(c))
end

# Convert an 8-bit integer to a `Char`. Values in 0:0x7f are stored directly
# in the most significant byte; everything else goes through the general
# `UInt32` conversion.
function convert(::Type{Char}, b::Union{Int8,UInt8})
    return 0 <= b <= 0x7f ? reinterpret(Char, (b % UInt32) << 24) : Char(UInt32(b))
end

convert(::Type{Char}, x::Number) = Char(UInt32(x))
convert(::Type{UInt32}, x::Char) = reinterpret(UInt32, x)
convert(::Type{T}, x::Char) where {T<:Number} = convert(T, UInt32(x))

rem(x::Char, ::Type{T}) where {T<:Number} = rem(UInt32(x), T)
Expand All @@ -29,19 +79,16 @@ done(c::Char, state) = state
isempty(c::Char) = false
in(x::Char, y::Char) = x == y

==(x::Char, y::Char) = UInt32(x) == UInt32(y)
isless(x::Char, y::Char) = UInt32(x) < UInt32(y)

const hashchar_seed = 0xd4d64234
# Mix the character value (offset by `hashchar_seed`) into the high 32 bits
# and xor it with the incoming hash state before the final 64-bit mix.
hash(x::Char, h::UInt) = hash_uint64(((UInt64(x)+hashchar_seed)<<32) ⊻ UInt64(h))
==(x::Char, y::Char) = reinterpret(UInt32, x) == reinterpret(UInt32, y)
isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y)
# Hash the raw 32-bit representation of `x`: offset it by a fixed seed, move it
# into the high 32 bits, xor with the incoming hash state `h`, then apply the
# 64-bit mixing function.
hash(x::Char, h::UInt) =
    hash_uint64(((reinterpret(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))

-(x::Char, y::Char) = Int(x) - Int(y)
-(x::Char, y::Integer) = Char(Int32(x) - Int32(y))
+(x::Char, y::Integer) = Char(Int32(x) + Int32(y))
+(x::Integer, y::Char) = y + x

bswap(x::Char) = Char(bswap(UInt32(x)))

print(io::IO, c::Char) = (write(io, c); nothing)

const hex_chars = UInt8['0':'9';'a':'z']
Expand All @@ -66,21 +113,37 @@ function show(io::IO, c::Char)
end
if Unicode.isprint(c)
write(io, 0x27, c, 0x27)
else
elseif !ismalformed(c)
u = UInt32(c)
write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
d = max(2, 8 - (leading_zeros(u) >> 2))
while 0 < d
write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])
end
write(io, 0x27)
else # malformed
write(io, 0x27)
u = reinterpret(UInt32, c)
while true
a = hex_chars[((u >> 28) & 0xf) + 1]
b = hex_chars[((u >> 24) & 0xf) + 1]
write(io, 0x5c, 'x', a, b)
(u <<= 8) == 0 && break
end
write(io, 0x27)
end
return
end

# Rich text display of a `Char`: the quoted form produced by `show(io, c)`,
# followed by either its code point or a "Malformed UTF-8" note, followed by
# its Unicode category.
function show(io::IO, ::MIME"text/plain", c::Char)
    show(io, c)
    # Only decode the code point after checking the character is well formed;
    # `UInt32(c)` is never evaluated for malformed characters.
    if !ismalformed(c)
        u = UInt32(c)
        print(io, ": ", Unicode.isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4))
    else
        print(io, ": Malformed UTF-8")
    end
    abr = Unicode.category_abbrev(c)
    str = Unicode.category_string(c)
    print(io, " (category ", abr, ": ", str, ")")
end
4 changes: 4 additions & 0 deletions base/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2992,6 +2992,10 @@ end
@deprecate_binding Complex64 ComplexF32
@deprecate_binding Complex128 ComplexF64

# PR #24999
@deprecate ind2chr(s::AbstractString, i::Integer) length(s, 1, i)
@deprecate chr2ind(s::AbstractString, n::Integer) nextind(s, 0, n)

# END 0.7 deprecations

# BEGIN 1.0 deprecations
Expand Down
4 changes: 1 addition & 3 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ export
NullException,
ParseError,
SystemError,
UnicodeError,
StringIndexError,

# Global constants and variables
ARGS,
Expand Down Expand Up @@ -716,7 +716,6 @@ export
bytes2hex,
chomp,
chop,
chr2ind,
codeunit,
dec,
digits,
Expand All @@ -728,7 +727,6 @@ export
hex,
hex2bytes,
hex2bytes!,
ind2chr,
info,
ismatch,
isvalid,
Expand Down
20 changes: 20 additions & 0 deletions base/filesystem.jl
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,26 @@ function read(f::File, ::Type{UInt8})
return ret % UInt8
end

# Read one `Char` from `f`: the first byte is always consumed; if it announces
# a multi-byte sequence (two to four leading one bits), following bytes of the
# form 10xxxxxx are consumed as well, up to the announced length or end of
# file. A byte that is not a continuation byte is pushed back by seeking to
# the position recorded before it was read.
function read(f::File, ::Type{Char})
    b0 = read(f, UInt8)
    l = 8(4-leading_ones(b0))      # lowest bit offset a continuation byte may occupy
    c = UInt32(b0) << 24
    # `0 <= l` excludes first bytes with five or more leading ones: for those
    # `l` is negative and the loop would consume a byte only to shift it out.
    if 0 <= l < 24
        s = 16
        while s >= l && !eof(f)
            p = position(f)
            b = read(f, UInt8)
            if b & 0xc0 != 0x80
                seek(f, p)         # not a continuation byte: un-read it
                break
            end
            c |= UInt32(b) << s
            s -= 8
        end
    end
    return reinterpret(Char, c)
end

function unsafe_read(f::File, p::Ptr{UInt8}, nel::UInt)
check_open(f)
ret = ccall(:jl_fs_read, Int32, (Int32, Ptr{Void}, Csize_t),
Expand Down
4 changes: 2 additions & 2 deletions base/intfuncs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -654,8 +654,8 @@ for sym in (:bin, :oct, :dec, :hex)
@eval begin
($sym)(x::Unsigned, p::Int) = ($sym)(x,p,false)
($sym)(x::Unsigned) = ($sym)(x,1,false)
($sym)(x::Char, p::Int) = ($sym)(unsigned(x),p,false)
($sym)(x::Char) = ($sym)(unsigned(x),1,false)
($sym)(x::Char, p::Int) = ($sym)(UInt32(x),p,false)
($sym)(x::Char) = ($sym)(UInt32(x),1,false)
($sym)(x::Integer, p::Int) = ($sym)(unsigned(abs(x)),p,x<0)
($sym)(x::Integer) = ($sym)(unsigned(abs(x)),1,x<0)
end
Expand Down
66 changes: 25 additions & 41 deletions base/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -535,25 +535,13 @@ function write(s::IO, a::SubArray{T,N,<:Array}) where {T,N}
end
end


# Write the bytes stored in `c` to `io` and return how many were written.
#
# The raw representation is byte-swapped so the first stored byte ends up in
# the low 8 bits. Bytes are then emitted from the low end until the remaining
# value is zero. At least one byte is always written.
function write(io::IO, c::Char)
    u = bswap(reinterpret(UInt32, c))
    n = 1
    while true
        write(io, u % UInt8)
        (u >>= 8) == 0 && return n
        n += 1
    end
end

Expand Down Expand Up @@ -596,31 +584,28 @@ function read!(s::IO, a::Array{T}) where T
return a
end

# Read one `Char` from `io`: the first byte is always consumed; if it announces
# a multi-byte sequence (two to four leading one bits), following bytes of the
# form 10xxxxxx are consumed as well, up to the announced length or end of
# stream. The next byte is inspected with `peek` first, so a byte that is not
# a continuation byte is left in the stream.
function read(io::IO, ::Type{Char})
    b0 = read(io, UInt8)
    l = 8(4-leading_ones(b0))      # lowest bit offset a continuation byte may occupy
    c = UInt32(b0) << 24
    # `0 <= l` excludes first bytes with five or more leading ones: for those
    # `l` is negative and the loop would consume a byte only to shift it out.
    if 0 <= l < 24
        s = 16
        while s >= l && !eof(io)
            peek(io) & 0xc0 == 0x80 || break
            b = read(io, UInt8)
            c |= UInt32(b) << s
            s -= 8
        end
    end
    return reinterpret(Char, c)
end

# readuntil_string is useful below since it has
# an optimized method for s::IOStream
readuntil_string(s::IO, delim::UInt8) = String(readuntil(s, delim))

function readuntil(s::IO, delim::Char)
if delim < Char(0x80)
if delim <= '\x7f'
return readuntil_string(s, delim % UInt8)
end
out = IOBuffer()
Expand Down Expand Up @@ -701,7 +686,7 @@ function readuntil(io::IO, target::AbstractString)
i = start(target)
done(target, i) && return ""
c, i = next(target, start(target))
if done(target, i) && c < Char(0x80)
if done(target, i) && c <= '\x7f'
return readuntil_string(io, c % UInt8)
end
# decide how we can index target
Expand All @@ -728,12 +713,11 @@ function readuntil(io::IO, target::AbstractVector{T}) where T
return out
end


"""
readchomp(x)
Read the entirety of `x` as a string and remove a single trailing newline.
Equivalent to `chomp!(read(x, String))`.
Read the entirety of `x` as a string and remove a single trailing newline
if there is one. Equivalent to `chomp(read(x, String))`.
# Examples
```jldoctest
Expand All @@ -747,7 +731,7 @@ julia> readchomp("my_file.txt")
julia> rm("my_file.txt");
```
"""
readchomp(x) = chomp!(read(x, String))
readchomp(x) = chomp(read(x, String))

# read up to nb bytes into nb, returning # bytes read

Expand Down
32 changes: 17 additions & 15 deletions base/iostream.jl
Original file line number Diff line number Diff line change
Expand Up @@ -315,12 +315,13 @@ end

## low-level calls ##

write(s::IOStream, b::UInt8) = Int(ccall(:ios_putc, Cint, (Cint, Ptr{Void}), b, s.ios))
# Write the single byte `b` to `s` via `ios_putc`, returning the C call's
# result as an `Int`. Throws `ArgumentError` when the stream is not writable.
function write(s::IOStream, b::UInt8)
    if !iswritable(s)
        throw(ArgumentError("write failed, IOStream is not writeable"))
    end
    status = ccall(:ios_putc, Cint, (Cint, Ptr{Void}), b, s.ios)
    return Int(status)
end

# Write `nb` bytes starting at pointer `p` to `s` via `ios_write`, returning
# the C call's result as an `Int`. Throws `ArgumentError` when the stream is
# not writable. The writability guard is performed once.
function unsafe_write(s::IOStream, p::Ptr{UInt8}, nb::UInt)
    iswritable(s) || throw(ArgumentError("write failed, IOStream is not writeable"))
    return Int(ccall(:ios_write, Csize_t, (Ptr{Void}, Ptr{Void}, Csize_t), s.ios, p, nb))
end

Expand Down Expand Up @@ -353,14 +354,6 @@ end

## text I/O ##

function write(s::IOStream, c::Char)
if !iswritable(s)
throw(ArgumentError("write failed, IOStream is not writeable"))
end
Int(ccall(:ios_pututf8, Cint, (Ptr{Void}, UInt32), s.ios, c))
end
read(s::IOStream, ::Type{Char}) = Char(ccall(:jl_getutf8, UInt32, (Ptr{Void},), s.ios))

take!(s::IOStream) =
ccall(:jl_take_buffer, Vector{UInt8}, (Ptr{Void},), s.ios)

Expand Down Expand Up @@ -452,14 +445,23 @@ function read(s::IOStream, nb::Integer; all::Bool=true)
end

## Character streams ##
const _chtmp = Ref{Char}()

# Look at the next character of `s` through `ios_peekutf8`. The C routine
# stores a `UInt32` into a locally allocated `Ref`; a negative return value
# yields `typemax(Char)`, otherwise the stored value is converted with `Char`.
function peekchar(s::IOStream)
    chref = Ref{UInt32}()
    if ccall(:ios_peekutf8, Cint, (Ptr{Void}, Ptr{UInt32}), s, chref) < 0
        return typemax(Char)
    end
    return Char(chref[])
end

# Return the `Cint` result of `ios_peekc` for `s`.
peek(s::IOStream) = ccall(:ios_peekc, Cint, (Ptr{Void},), s)

# Generic fallback: mark the current position, read one byte, and always
# reset to the mark afterwards (even if the read throws), so the byte is
# returned without being consumed.
function peek(s::IO)
    mark(s)
    try
        byte = read(s, UInt8)
        return byte
    finally
        reset(s)
    end
end
4 changes: 2 additions & 2 deletions base/parse.jl
Original file line number Diff line number Diff line change
Expand Up @@ -224,12 +224,12 @@ end
## string to float functions ##

tryparse(::Type{Float64}, s::String) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s))
tryparse(::Type{Float64}, s::SubString{String}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof)
tryparse(::Type{Float64}, s::SubString{String}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.ncodeunits)
tryparse_internal(::Type{Float64}, s::String, startpos::Int, endpos::Int) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, startpos-1, endpos-startpos+1)
tryparse_internal(::Type{Float64}, s::SubString{String}, startpos::Int, endpos::Int) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset+startpos-1, endpos-startpos+1)

tryparse(::Type{Float32}, s::String) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s))
tryparse(::Type{Float32}, s::SubString{String}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof)
tryparse(::Type{Float32}, s::SubString{String}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.ncodeunits)
tryparse_internal(::Type{Float32}, s::String, startpos::Int, endpos::Int) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, startpos-1, endpos-startpos+1)
tryparse_internal(::Type{Float32}, s::SubString{String}, startpos::Int, endpos::Int) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset+startpos-1, endpos-startpos+1)

Expand Down
Loading

1 comment on commit d192302

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Executing the daily benchmark build, I will reply here when finished:

@nanosoldier runbenchmarks(ALL, isdaily = true)

Please sign in to comment.