Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reverse iteration for eachline #42225

Merged
merged 24 commits into from
Dec 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ Standard library changes

* `range` accepts either `stop` or `length` as a sole keyword argument ([#39241])
* `precision` and `setprecision` now accept a `base` keyword ([#42428]).
* `Iterators.reverse` (and hence `last`) now supports `eachline` iterators ([#42225]).
* The `length` function on certain ranges of certain specific element types no longer checks for integer
overflow in most cases. The new function `checked_length` is now available, which will try to use checked
arithmetic to error if the result may be wrapping. Or use a package such as SaferIntegers.jl when
Expand Down
117 changes: 117 additions & 0 deletions base/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,11 @@ closed when the `EachLine` object is garbage collected.

To iterate over each line of a `String`, `eachline(IOBuffer(str))` can be used.

[`Iterators.reverse`](@ref) can be used on an `EachLine` object to read the lines
in reverse order (for files, buffers, and other I/O streams supporting [`seek`](@ref)),
and [`first`](@ref) or [`last`](@ref) can be used to extract the initial or final
lines, respectively.

# Examples
```jldoctest
julia> open("my_file.txt", "w") do io
Expand All @@ -1032,6 +1037,9 @@ JuliaLang is a GitHub organization. It has many members.

julia> rm("my_file.txt");
```

!!! compat "Julia 1.8"
    Julia 1.8 is required to use `Iterators.reverse` or `last` with `eachline` iterators.
"""
function eachline(stream::IO=stdin; keep::Bool=false)
EachLine(stream, keep=keep)::EachLine
Expand All @@ -1053,6 +1061,115 @@ IteratorSize(::Type{<:EachLine}) = SizeUnknown()

isdone(itr::EachLine, state...) = eof(itr.stream)

# Reverse-order iteration for the EachLine iterator over seekable streams.
# It works by reading the stream backwards from its end in chunks of at most 4 KiB.
function iterate(r::Iterators.Reverse{<:EachLine})
    # First step of reverse line iteration: remember where the stream currently is,
    # jump to its end, and read blocks backwards until a line boundary is located
    # (or everything back to the starting position has been read). The actual line
    # extraction is delegated to the two-argument method.
    stream = r.itr.stream
    p0 = position(stream)
    seekend(stream) # throws for streams that do not support seeking
    p = position(stream)

    # Circular list of byte blocks (each at most 4096 bytes) taken from the end of
    # the stream; capacity for two blocks is reserved since that is the common case.
    chunks = empty!(Vector{Vector{UInt8}}(undef, 2))
    inewline = 0
    jnewline = 0
    while p > p0 && inewline == 0
        nbytes = min(4096, p - p0)
        chunk = Vector{UInt8}(undef, nbytes)
        p -= nbytes
        readbytes!(seek(stream, p), chunk)
        pushfirst!(chunks, chunk)
        inewline = something(findlast(==(UInt8('\n')), chunk), 0)
        if length(chunks) == 1 && inewline == length(chunk)
            # The stream ends with a newline: that one terminates the last line,
            # so record it and look for the newline before it instead.
            jnewline = inewline
            inewline = something(findprev(==(UInt8('\n')), chunk, inewline - 1), 0)
        end
    end

    # Without a trailing newline the last line runs to the end of the final block.
    if jnewline == 0 && !isempty(chunks)
        jnewline = length(chunks[end])
    end
    state = (; p0, p, chunks, ichunk=1, inewline, jchunk=length(chunks), jnewline)
    return iterate(r, state)
end
function iterate(r::Iterators.Reverse{<:EachLine}, state)
    # Produce the next line (moving toward the start of the stream) from the chunk
    # ring carried in `state`. Returns `(line, newstate)`, or `nothing` once every
    # line has been emitted (after invoking the iterator's `ondone` callback).
    #
    # `state` holds, in this order:
    #   p0     — stream position at which reverse iteration must stop
    #   p      — stream position of the earliest byte read so far
    #   chunks — circular array of byte buffers read from the stream
    #   ichunk, inewline, jchunk, jnewline — the pending line runs from
    #            chunks[ichunk][inewline+1] through chunks[jchunk][jnewline];
    #            inewline == 0 means no earlier newline exists, i.e. the pending
    #            line is the final one to emit.

    # Return the index of the last byte to keep from data[1:pos]: unless `keep` is
    # set, a trailing "\n" or "\r\n" ending at `pos` is excluded.
    function _stripnewline(keep, pos, data)
        if !keep && pos > 0 && data[pos] == UInt8('\n')
            pos -= 1
            pos -= pos > 0 && data[pos] == UInt8('\r')
        end
        return pos
    end

    p0, p, chunks, ichunk, inewline, jchunk, jnewline = state
    if inewline == 0 # no newline found, remaining line = rest of chunks (if any)
        isempty(chunks) && return (r.itr.ondone(); nothing)
        buf = IOBuffer(sizehint = ichunk == jchunk ? jnewline : 4096)
        while ichunk != jchunk
            write(buf, chunks[ichunk])
            ichunk = ichunk == length(chunks) ? 1 : ichunk + 1
        end
        chunk = chunks[jchunk]
        write(buf, view(chunk, 1:jnewline))
        # Trim the line terminator on the extracted bytes rather than by mutating
        # the IOBuffer's internal fields.
        bytes = take!(buf)
        resize!(bytes, _stripnewline(r.itr.keep, length(bytes), bytes))
        empty!(chunks) # will cause next iteration to terminate
        seekend(r.itr.stream) # reposition to end of stream for isdone
        s = String(bytes)
    else
        # extract the string from chunks[ichunk][inewline+1] to chunks[jchunk][jnewline]
        if ichunk == jchunk # common case: current and previous newline in same chunk
            chunk = chunks[ichunk]
            stop = _stripnewline(r.itr.keep, jnewline, chunk)
            s = String(view(chunk, inewline+1:stop))
        else
            buf = IOBuffer(sizehint=max(128, length(chunks[ichunk])-inewline+jnewline))
            write(buf, view(chunks[ichunk], inewline+1:length(chunks[ichunk])))
            i = ichunk
            while true
                i = i == length(chunks) ? 1 : i + 1
                i == jchunk && break
                write(buf, chunks[i])
            end
            write(buf, view(chunks[jchunk], 1:jnewline))
            bytes = take!(buf)
            resize!(bytes, _stripnewline(r.itr.keep, length(bytes), bytes))
            s = String(bytes)

            # The chunks after ichunk (through jchunk) held only the line just
            # emitted; reuse them, walking backwards, for the data that precedes
            # the earliest byte read so far.
            i = jchunk
            while i != ichunk
                chunk = chunks[i]
                p -= length(resize!(chunk, min(4096, p-p0)))
                readbytes!(seek(r.itr.stream, p), chunk)
                i = i == 1 ? length(chunks) : i - 1
            end
        end

        # find the newline previous to inewline
        jchunk = ichunk
        jnewline = inewline
        while true
            inewline = something(findprev(==(UInt8('\n')), chunks[ichunk], inewline-1), 0)
            inewline > 0 && break
            ichunk = ichunk == 1 ? length(chunks) : ichunk - 1
            ichunk == jchunk && break # found nothing — may need to read more chunks
            inewline = length(chunks[ichunk])+1 # start for next findprev
        end

        if inewline == 0
            if p > p0
                # read more chunks to look for a newline (should rarely happen);
                # each new chunk is inserted right after jchunk so that it becomes
                # the start of the pending line.
                ichunk = jchunk + 1
                while true
                    chunk = Vector{UInt8}(undef, min(4096, p-p0))
                    p -= length(chunk)
                    readbytes!(seek(r.itr.stream, p), chunk)
                    insert!(chunks, ichunk, chunk)
                    inewline = something(findlast(==(UInt8('\n')), chunk), 0)
                    (p == p0 || inewline > 0) && break
                end
            elseif length(chunks) > 1
                # Everything back to p0 is already buffered and the search wrapped
                # around the whole ring without finding a newline, so the final
                # line starts in the chunk that follows jchunk. Leaving
                # ichunk == jchunk here would drop the other chunks' bytes.
                ichunk = jchunk == length(chunks) ? 1 : jchunk + 1
            end
        end
    end
    return (s, (; p0, p, chunks, ichunk, inewline, jchunk, jnewline))
end
# Mid-iteration check: the state's chunk list is emptied when the final line is
# handed out, so an empty list means nothing is left to produce.
function isdone(r::Iterators.Reverse{<:EachLine}, state)
    return isempty(state.chunks)
end

# Before iteration has started, defer to the wrapped EachLine iterator.
function isdone(r::Iterators.Reverse{<:EachLine})
    return isdone(r.itr)
end

# use reverse iteration to get end of EachLines (if possible)
function last(itr::EachLine)
    # The final line is the first element produced by the reversed iterator.
    return first(Iterators.reverse(itr))
end

struct ReadEachIterator{T, IOT <: IO}
stream::IOT
end
Expand Down
1 change: 0 additions & 1 deletion base/iterators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ size(r::Reverse) = size(r.itr)
# A `Reverse{T}` wrapper reports the same size and element-type traits as the
# wrapped iterator type `T`.
IteratorSize(::Type{Reverse{T}}) where {T} = IteratorSize(T)
IteratorEltype(::Type{Reverse{T}}) where {T} = IteratorEltype(T)
# The last element of the reversed view is the first element of the wrapped iterator.
last(r::Reverse) = first(r.itr) # the first shall be last
first(r::Reverse) = last(r.itr) # and the last shall be first

# reverse-order array iterators: assumes more-specialized Reverse for eachindex
@propagate_inbounds function iterate(A::Reverse{<:AbstractArray}, state=(reverse(eachindex(A.itr)),))
Expand Down
38 changes: 38 additions & 0 deletions test/read.jl
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,14 @@ for (name, f) in l
@test collect(eachline(io(), keep=true)) == collect(eachline(filename, keep=true))
@test collect(eachline(io())) == collect(eachline(IOBuffer(text)))
@test collect(@inferred(eachline(io()))) == collect(@inferred(eachline(filename))) #20351
if try; seekend(io()); true; catch; false; end # reverse iteration only supports seekable streams
for keep in (true, false)
lines = readlines(io(); keep)
@test last(lines) == last(eachline(io(); keep))
@test last(lines,2) == last(eachline(io(); keep),2)
@test reverse!(lines) == collect(Iterators.reverse(eachline(io(); keep))) == collect(Iterators.reverse(eachline(IOBuffer(text); keep)))
end
end

cleanup()

Expand Down Expand Up @@ -621,3 +629,33 @@ end
first(itr) # consume the iterator
@test isempty(itr) # now it is empty
end

# more tests for reverse(eachline)
@testset "reverse(eachline)" begin
# Fixture: short numeric lines surrounding six lines that each start with 50000
# spaces, so individual lines are much longer than the 4096-byte read chunks.
lines = vcat(repr.(1:4), ' '^50000 .* repr.(5:10), repr.(11:10^5))
# Exercise both line orders, with and without a trailing terminator, for "\n" and "\r\n".
for lines in (lines, reverse(lines)), finalnewline in (true, false), eol in ("\n", "\r\n")
buf = IOBuffer(join(lines, eol) * (finalnewline ? eol : ""))
# Reversing the reversed iteration must reproduce the original lines.
@test reverse!(collect(Iterators.reverse(eachline(seekstart(buf))))) == lines
@test last(eachline(seekstart(buf))) == last(lines)
@test last(eachline(seekstart(buf)),10^4) == last(lines,10^4)
# Asking for more lines than exist returns all of them.
@test last(eachline(seekstart(buf)),length(lines)*2) == lines
# Start past the first 100 lines: reverse iteration must stop at the starting position.
@test reverse!(collect(Iterators.reverse(eachline(seek(buf, sum(sizeof, lines[1:100]) + 100*sizeof(eol)))))) == lines[101:end]
# `buf` is not rewound here; after the pass above nothing should remain to iterate.
@test isempty(Iterators.reverse(eachline(buf)))
end

# An empty stream yields no lines.
let rempty = Iterators.reverse(eachline(IOBuffer()))
@test isempty(rempty)
@test isempty(collect(rempty))
end

# Reverse iteration covers only what follows the current position: "foo" has
# already been consumed by `readline`, so "bar" is the only line produced.
let buf = IOBuffer("foo\nbar")
@test readline(buf) == "foo"
r = Iterators.reverse(eachline(buf))
line, state = iterate(r)
@test line == "bar"
@test Base.isdone(r, state)
@test Base.isdone(r)
@test isempty(r) && isempty(collect(r))
end
end