Skip to content

Commit

Permalink
speed up TOML parsing by ~4x (#116)
Browse files Browse the repository at this point in the history
* use a charbuffer to write read chars into, instead of creating Char arrays everywhere
* be more careful with using stream seeking operations like position, mark, seek, this are kinda slow
  • Loading branch information
KristofferC authored Feb 9, 2018
1 parent e589255 commit 4bd1379
Showing 1 changed file with 54 additions and 56 deletions.
110 changes: 54 additions & 56 deletions ext/TOML/src/parser.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
NONE() = Nullable()
NONE(::Type{T}) where {T} = Nullable{T}()
SOME(v::T) where {T} = Nullable{T}(v)

"TOML Table"
mutable struct Table
struct Table
values::Dict{String,Any}
defined::Bool
end
Expand Down Expand Up @@ -41,17 +37,26 @@ struct ParserError <: Exception
end

"TOML Parser"
struct Parser{IO_T <: IO}
mutable struct Parser{IO_T <: IO}
input::IO_T
errors::Vector{ParserError}
charbuffer::Base.AbstractIOBuffer{Array{UInt8,1}}
currentchar::Char

Parser(input::IO_T) where {IO_T <: IO} = new{IO_T}(input, ParserError[])
Parser(input::IO_T) where {IO_T <: IO} = new{IO_T}(input, ParserError[], IOBuffer(), ' ')
end
Parser(input::String) = Parser(IOBuffer(input))
Base.error(p::Parser, l, h, msg) = push!(p.errors, ParserError(l, h, msg))
Base.eof(p::Parser) = eof(p.input)
Base.position(p::Parser) = Int(position(p.input))+1
Base.read(p::Parser) = read(p.input, Char)
Base.write(p::Parser, x) = write(p.charbuffer, x)
Base.read(p::Parser) = p.currentchar = read(p.input, Char)

NONE() = Nullable()
NONE(::Type{T}) where {T} = Nullable{T}()
SOME(v::T) where {T} = Nullable{T}(v)
NONE(::Type{String}, p::Parser) = (take!(p.charbuffer); Nullable{String}())
SOME(::Type{String}, p::Parser) = Nullable{String}(String(take!(p.charbuffer)))

"Rewind parser input on `n` characters."
function rewind(p::Parser, n=1)
Expand Down Expand Up @@ -81,11 +86,7 @@ end
"Determine next position in parser input"
function nextpos(p::Parser)
pos = position(p)
mark(p.input)
skip(p.input, 1)
npos = eof(p) ? pos : pos+1
reset(p.input)
return npos
return pos + 1
end

"Peeks ahead `n` characters"
Expand All @@ -99,12 +100,11 @@ end
"Returns `true` and consumes the next character if it matches `ch`, otherwise do nothing and return `false`"
function consume(p::Parser, ch::Char)
eof(p) && return false
mark(p.input)
if read(p) == ch
unmark(p.input)
c = peek(p)
if get(c) == ch
read(p)
return true
else
reset(p.input)
return false
end
end
Expand Down Expand Up @@ -183,26 +183,25 @@ function keyname(p)
elseif consume(p, '\'')
literalstring(p, s, false)
else
ret = Char[]
while !eof(p)
ch = read(p)
if 'a' <= ch <= 'z' ||
'A' <= ch <= 'Z' ||
isdigit(ch) ||
ch == '_' ||
ch == '-'
push!(ret, ch)
write(p, ch)
else
rewind(p)
break
end
end
SOME(String(ret))
SOME(String, p)
end

if !isnull(key) && isempty(get(key))
error(p, s, s, "expected a key but found an empty string")
key = NONE(String)
key = NONE(String, p)
end
return key
end
Expand Down Expand Up @@ -248,48 +247,47 @@ end

"Parses integer with leading zeros and sign"
function integer(p::Parser, st::Int, allow_leading_zeros::Bool=false, allow_sign::Bool=false)
s = Char[]
if allow_sign
if consume(p, '-')
push!(s, '-')
write(p, '-')
elseif consume(p, '+')
push!(s, '+')
write(p, '+')
end
end
nch = peek(p)
if isnull(nch)
pos = nextpos(p)
error(p, pos, pos, "expected start of a numeric literal")
return NONE(String)
return NONE(String, p)
else
ch = get(nch)
if isdigit(ch)
c = read(p)
if c == '0' && !allow_leading_zeros
push!(s, '0')
write(p, '0')
nch = peek(p)
if !isnull(nch)
ch = get(nch)
if isdigit(ch)
error(p, st, position(p), "leading zeroes are not allowed")
return NONE(String)
return NONE(String, p)
end
end
elseif isdigit(c)
push!(s, c)
write(p, c)
end
else
# non-digit
error(p, st, position(p), "expected a digit, found `$ch`")
return NONE(String)
return NONE(String, p)
end
end

underscore = false
while !eof(p)
ch = read(p)
if isdigit(ch)
push!(s, ch)
write(p, ch)
underscore = false
elseif ch == '_' && !underscore
underscore = true
Expand All @@ -301,9 +299,9 @@ function integer(p::Parser, st::Int, allow_leading_zeros::Bool=false, allow_sign
if underscore
pos = nextpos(p)
error(p, pos, pos, "numeral cannot end with an underscore")
NONE(String)
NONE(String, p)
else
length(s) == 0 ? NONE(String) : SOME(String(s))
p.charbuffer.ptr == 1 ? NONE(String, p) : SOME(String, p)
end
end

Expand Down Expand Up @@ -361,7 +359,7 @@ function numdatetime(p::Parser, st::Int)
isnull(ndecimal) && return NONE()
SOME(get(ndecimal))
else
NONE(String)
NONE(String, p)
end

exponent = if consume(p,'e') || consume(p,'E')
Expand All @@ -370,7 +368,7 @@ function numdatetime(p::Parser, st::Int)
isnull(nexponent) && return NONE()
SOME(get(nexponent))
else
NONE(String)
NONE(String, p)
end

pend = nextpos(p)
Expand Down Expand Up @@ -481,7 +479,7 @@ end

"Parses a single or multi-line string"
function basicstring(p::Parser, st::Int)
!expect(p, '"') && return NONE(String)
!expect(p, '"') && return NONE(String, p)

multiline = false

Expand All @@ -499,37 +497,38 @@ end

"Finish parsing a basic string after the opening quote has been seen"
function basicstring(p::Parser, st::Int, multiline::Bool)
ret = Char[]
while true
while multiline && newline(p)
push!(ret, '\n')
write(p, '\n')
end
pos = position(p)
if eof(p)
pos = position(p)
error(p, st, pos, "unterminated string literal")
return NONE(String)
return NONE(String, p)
else
ch = read(p)
if ch == '"'
if multiline
if !consume(p, '"')
push!(ret, '"')
write(p, '"')
continue
end
if !consume(p, '"')
push!(ret, '"')
push!(ret, '"')
write(p, '"')
write(p, '"')
continue
end
end
return SOME(String(ret))
return SOME(String, p)
elseif ch == '\\'
pos = position(p)
ec = escape(p, pos, multiline)
!isnull(ec) && push!(ret, get(ec))
!isnull(ec) && write(p, get(ec))
elseif ch < '\x1f'
pos = position(p)
error(p, st, pos, "control character `$ch` must be escaped")
else
push!(ret, ch)
write(p, ch)
end
end
end
Expand Down Expand Up @@ -568,11 +567,11 @@ function escape(p::Parser, st::Int, multiline::Bool)
try
if length(snum) < len
error(p, st, st+len, "expected $len hex digits after a `$ch` escape")
thorw()
NONE()
end
if !all(isxdigit, snum)
error(p, st, st+len, "unknown string escape: `$snum`")
thorw()
NONE()
end
c = unescape_string(ucstr * snum)[1]
SOME(c)
Expand All @@ -590,7 +589,7 @@ end

"Parses a single or multi-line literal string"
function literalstring(p::Parser, st::Int)
!expect(p, '\'') && return NONE(String)
!expect(p, '\'') && return NONE(String, p)

multiline = false

Expand All @@ -607,33 +606,32 @@ function literalstring(p::Parser, st::Int)
end

function literalstring(p::Parser, st::Int, multiline::Bool)
ret = UInt8[]
while true
if !multiline && newline(p)
npos = nextpos(p)
error(p, st, npos, "literal strings cannot contain newlines")
return NONE(String)
return NONE(String, p)
end
if eof(p)
error(p, st, position(p), "unterminated string literal")
return NONE(String)
return NONE(String, p)
else
ch = read(p.input, UInt8)
if ch == 0x27
if multiline
if !consume(p, '\'')
push!(ret, 0x27)
write(p, 0x27)
continue
end
if !consume(p, '\'')
push!(ret, 0x27)
push!(ret, 0x27)
write(p, 0x27)
write(p, 0x27)
continue
end
end
return SOME(String(ret))
return SOME(String, p)
else
push!(ret, ch)
write(p, ch)
end
end
end
Expand Down

0 comments on commit 4bd1379

Please sign in to comment.