From 1c8d4134fc9cff0eec60d2144816bc0e7e97e3f4 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Mon, 20 Dec 2021 12:52:20 -0500 Subject: [PATCH] reverse iteration for eachline (#42225) --- NEWS.md | 1 + base/io.jl | 117 ++++++++++++++++++++++++++++++++++++++++++++++ base/iterators.jl | 1 - test/read.jl | 38 +++++++++++++++ 4 files changed, 156 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index f0310f4cf37e3d..8b71822a34fdae 100644 --- a/NEWS.md +++ b/NEWS.md @@ -80,6 +80,7 @@ Standard library changes * `range` accepts either `stop` or `length` as a sole keyword argument ([#39241]) * `precision` and `setprecision` now accept a `base` keyword ([#42428]). +* `Iterators.reverse` (and hence `last`) now supports `eachline` iterators ([#42225]). * The `length` function on certain ranges of certain specific element types no longer checks for integer overflow in most cases. The new function `checked_length` is now available, which will try to use checked arithmetic to error if the result may be wrapping. Or use a package such as SaferIntegers.jl when diff --git a/base/io.jl b/base/io.jl index 40b38d3183be74..18ae691c92239b 100644 --- a/base/io.jl +++ b/base/io.jl @@ -1019,6 +1019,11 @@ closed when the `EachLine` object is garbage collected. To iterate over each line of a `String`, `eachline(IOBuffer(str))` can be used. +[`Iterators.reverse`](@ref) can be used on an `EachLine` object to read the lines +in reverse order (for files, buffers, and other I/O streams supporting [`seek`](@ref)), +and [`first`](@ref) or [`last`](@ref) can be used to extract the initial or final +lines, respectively. + # Examples ```jldoctest julia> open("my_file.txt", "w") do io @@ -1032,6 +1037,9 @@ JuliaLang is a GitHub organization. It has many members. julia> rm("my_file.txt"); ``` + +!!! compat "Julia 1.8" + Julia 1.8 is required to use `Iterators.reverse` or `last` with `eachline` iterators. """ function eachline(stream::IO=stdin; keep::Bool=false) EachLine(stream, keep=keep)::EachLine @@ -1053,6 +1061,115 @@ IteratorSize(::Type{<:EachLine}) = SizeUnknown() isdone(itr::EachLine, state...) = eof(itr.stream) +# Reverse-order iteration for the EachLine iterator for seekable streams, +# which works by reading the stream from the end in 4kiB chunks. +function iterate(r::Iterators.Reverse{<:EachLine}) + p0 = position(r.itr.stream) + seekend(r.itr.stream) # may throw if io is non-seekable + p = position(r.itr.stream) + # chunks = circular buffer of 4kiB blocks read from end of stream + chunks = empty!(Vector{Vector{UInt8}}(undef, 2)) # allocate space for 2 buffers (common case) + inewline = jnewline = 0 + while p > p0 && inewline == 0 # read chunks until we find a newline or we read whole file + chunk = Vector{UInt8}(undef, min(4096, p-p0)) + p -= length(chunk) + readbytes!(seek(r.itr.stream, p), chunk) + pushfirst!(chunks, chunk) + inewline = something(findlast(==(UInt8('\n')), chunk), 0) + if length(chunks) == 1 && inewline == length(chunks[1]) + # found newline at end of file … keep looking + jnewline = inewline + inewline = something(findprev(==(UInt8('\n')), chunk, inewline-1), 0) + end + end + return iterate(r, (; p0, p, chunks, ichunk=1, inewline, jchunk=length(chunks), jnewline = jnewline == 0 && !isempty(chunks) ? length(chunks[end]) : jnewline)) +end +function iterate(r::Iterators.Reverse{<:EachLine}, state) + function _stripnewline(keep, pos, data) + # strip \n or \r\n from data[pos] by decrementing pos + if !keep && pos > 0 && data[pos] == UInt8('\n') + pos -= 1 + pos -= pos > 0 && data[pos] == UInt8('\r') + end + return pos + end + # state tuple: p0 = initial file position, p = current position, + # chunks = circular array of chunk buffers, + # current line is from chunks[ichunk][inewline+1] to chunks[jchunk][jnewline] + p0, p, chunks, ichunk, inewline, jchunk, jnewline = state + if inewline == 0 # no newline found, remaining line = rest of chunks (if any) + isempty(chunks) && return (r.itr.ondone(); nothing) + buf = IOBuffer(sizehint = ichunk==jchunk ? jnewline : 4096) + while ichunk != jchunk + write(buf, chunks[ichunk]) + ichunk = ichunk == length(chunks) ? 1 : ichunk + 1 + end + chunk = chunks[jchunk] + write(buf, view(chunk, 1:jnewline)) + buf.size = _stripnewline(r.itr.keep, buf.size, buf.data) + empty!(chunks) # will cause next iteration to terminate + seekend(r.itr.stream) # reposition to end of stream for isdone + s = String(take!(buf)) + else + # extract the string from chunks[ichunk][inewline+1] to chunks[jchunk][jnewline] + if ichunk == jchunk # common case: current and previous newline in same chunk + chunk = chunks[ichunk] + s = String(view(chunk, inewline+1:_stripnewline(r.itr.keep, jnewline, chunk))) + else + buf = IOBuffer(sizehint=max(128, length(chunks[ichunk])-inewline+jnewline)) + write(buf, view(chunks[ichunk], inewline+1:length(chunks[ichunk]))) + i = ichunk + while true + i = i == length(chunks) ? 1 : i + 1 + i == jchunk && break + write(buf, chunks[i]) + end + write(buf, view(chunks[jchunk], 1:jnewline)) + buf.size = _stripnewline(r.itr.keep, buf.size, buf.data) + s = String(take!(buf)) + + # overwrite obsolete chunks (ichunk+1:jchunk) + i = jchunk + while i != ichunk + chunk = chunks[i] + p -= length(resize!(chunk, min(4096, p-p0))) + readbytes!(seek(r.itr.stream, p), chunk) + i = i == 1 ? length(chunks) : i - 1 + end + end + + # find the newline previous to inewline + jchunk = ichunk + jnewline = inewline + while true + inewline = something(findprev(==(UInt8('\n')), chunks[ichunk], inewline-1), 0) + inewline > 0 && break + ichunk = ichunk == 1 ? length(chunks) : ichunk - 1 + ichunk == jchunk && break # found nothing — may need to read more chunks + inewline = length(chunks[ichunk])+1 # start for next findprev + end + + # read more chunks to look for a newline (should rarely happen) + if inewline == 0 && p > p0 + ichunk = jchunk + 1 + while true + chunk = Vector{UInt8}(undef, min(4096, p-p0)) + p -= length(chunk) + readbytes!(seek(r.itr.stream, p), chunk) + insert!(chunks, ichunk, chunk) + inewline = something(findlast(==(UInt8('\n')), chunk), 0) + (p == p0 || inewline > 0) && break + end + end + end + return (s, (; p0, p, chunks, ichunk, inewline, jchunk, jnewline)) +end +isdone(r::Iterators.Reverse{<:EachLine}, state) = isempty(state.chunks) +isdone(r::Iterators.Reverse{<:EachLine}) = isdone(r.itr) + +# use reverse iteration to get end of EachLines (if possible) +last(itr::EachLine) = first(Iterators.reverse(itr)) + struct ReadEachIterator{T, IOT <: IO} stream::IOT end diff --git a/base/iterators.jl b/base/iterators.jl index ec47ae8ea0435f..3e339c59bebcb3 100644 --- a/base/iterators.jl +++ b/base/iterators.jl @@ -103,7 +103,6 @@ size(r::Reverse) = size(r.itr) IteratorSize(::Type{Reverse{T}}) where {T} = IteratorSize(T) IteratorEltype(::Type{Reverse{T}}) where {T} = IteratorEltype(T) last(r::Reverse) = first(r.itr) # the first shall be last -first(r::Reverse) = last(r.itr) # and the last shall be first # reverse-order array iterators: assumes more-specialized Reverse for eachindex @propagate_inbounds function iterate(A::Reverse{<:AbstractArray}, state=(reverse(eachindex(A.itr)),)) diff --git a/test/read.jl b/test/read.jl index 81ee7fea21fbaf..d26f2463dcbd1a 100644 --- a/test/read.jl +++ b/test/read.jl @@ -293,6 +293,14 @@ for (name, f) in l @test collect(eachline(io(), keep=true)) == collect(eachline(filename, keep=true)) @test collect(eachline(io())) == collect(eachline(IOBuffer(text))) @test collect(@inferred(eachline(io()))) == collect(@inferred(eachline(filename))) #20351 + if try; seekend(io()); true; catch; false; end # reverse iteration only supports seekable streams + for keep in (true, false) + lines = readlines(io(); keep) + @test last(lines) == last(eachline(io(); keep)) + @test last(lines,2) == last(eachline(io(); keep),2) + @test reverse!(lines) == collect(Iterators.reverse(eachline(io(); keep))) == collect(Iterators.reverse(eachline(IOBuffer(text); keep))) + end + end cleanup() @@ -621,3 +629,33 @@ end first(itr) # consume the iterator @test isempty(itr) # now it is empty end + +# more tests for reverse(eachline) +@testset "reverse(eachline)" begin + lines = vcat(repr.(1:4), ' '^50000 .* repr.(5:10), repr.(11:10^5)) + for lines in (lines, reverse(lines)), finalnewline in (true, false), eol in ("\n", "\r\n") + buf = IOBuffer(join(lines, eol) * (finalnewline ? eol : "")) + @test reverse!(collect(Iterators.reverse(eachline(seekstart(buf))))) == lines + @test last(eachline(seekstart(buf))) == last(lines) + @test last(eachline(seekstart(buf)),10^4) == last(lines,10^4) + @test last(eachline(seekstart(buf)),length(lines)*2) == lines + @test reverse!(collect(Iterators.reverse(eachline(seek(buf, sum(sizeof, lines[1:100]) + 100*sizeof(eol)))))) == lines[101:end] + @test isempty(Iterators.reverse(eachline(buf))) + end + + let rempty = Iterators.reverse(eachline(IOBuffer())) + @test isempty(rempty) + @test isempty(collect(rempty)) + end + + let buf = IOBuffer("foo\nbar") + @test readline(buf) == "foo" + r = Iterators.reverse(eachline(buf)) + line, state = iterate(r) + @test line == "bar" + @test Base.isdone(r, state) + @test Base.isdone(r) + @test isempty(r) && isempty(collect(r)) + end +end +