Skip to content

Commit

Permalink
read_standard_header: improve error checks & messages
Browse files Browse the repository at this point in the history
This is a fairly significant refactoring of the error checking aspect of
header parsing with a few high-level improvements. The first major
improvement is that this now extracts and checks the version string
first, then the checksum and only if those are valid does it try
extracting any other fields. This makes the error message a bit clearer:
previously you would get a cryptic complaint about an invalid octal
digit if you tried to parse something that wasn't a tar file, now at
least you get an "invalid version string for tar file" error.

This now also dumps the entire header data if there is any error parsing
the header, which helps with diagnosis. In addition, the error checking
and reporting for boundary conditions in fields which can be malformed
(i.e. octal integers) is much improved.

Last but not least, the test coverage of all of this is much increased.
  • Loading branch information
StefanKarpinski committed Jun 4, 2021
1 parent 0910203 commit 71ce072
Show file tree
Hide file tree
Showing 4 changed files with 264 additions and 71 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ Tar_jll = "9b64493d-8859-5bf3-93d7-7c32dd38186f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Random", "Tar_jll", "Test", "SimpleBufferStream"]
test = ["Random", "SimpleBufferStream", "Tar_jll", "Test"]
184 changes: 120 additions & 64 deletions src/extract.jl
Original file line number Diff line number Diff line change
Expand Up @@ -518,32 +518,53 @@ end
# For reference see
# https://www.gnu.org/software/tar/manual/html_node/Standard.html

const HEADER_FIELDS = [
const HEADER_FIELDS = (
# field, offset, size
(:name, 0, 100)
(:mode, 100, 8)
(:uid, 108, 8)
(:gid, 116, 8)
(:size, 124, 12)
(:mtime, 136, 12)
(:chksum, 148, 8)
(:typeflag, 156, 1)
(:linkname, 157, 100)
(:magic, 257, 6)
(:version, 263, 2)
(:uname, 265, 32)
(:gname, 297, 32)
(:devmajor, 329, 8)
(:devminor, 337, 8)
(:prefix, 345, 155)
(:rest, 500, 12)
]

index_range(offset::Int, length::Int) = offset .+ (1:length)
(:name, 0, 100),
(:mode, 100, 8),
(:uid, 108, 8),
(:gid, 116, 8),
(:size, 124, 12),
(:mtime, 136, 12),
(:chksum, 148, 8),
(:typeflag, 156, 1),
(:linkname, 157, 100),
(:magic, 257, 6),
(:version, 263, 2),
(:uname, 265, 32),
(:gname, 297, 32),
(:devmajor, 329, 8),
(:devminor, 337, 8),
(:prefix, 345, 155),
(:rest, 500, 12),
)

function index_range(field::Symbol)
for (fld, off, len) in HEADER_FIELDS
fld == field && return off .+ (1:len)
end
error("[internal error] invalid field name: $field")
end

dump_header(buf::AbstractVector{UInt8}) =
[ field => String(buf[index_range(offset, size)])
for (field, offset, size) in HEADER_FIELDS ]
[ fld => String(buf[off .+ (1:len)])
for (fld, off, len) in HEADER_FIELDS ]

function header_error(buf::AbstractVector{UInt8}, msg::AbstractString)
sprint() do io
println(io, msg, "\n[header block data]:")
for (field, value) in dump_header(buf)
print(io, " ", rpad(field, 8), " = ")
show(io, value)
println(io)
end
end |> error
end

function header_error(buf::AbstractVector{UInt8}, fld::Symbol)
value = read_header_str(buf, fld)
header_error(buf, "malformed $fld field: $(repr(value))")
end

function read_standard_header(
io::IO;
Expand All @@ -559,83 +580,118 @@ function read_standard_header(
end
return nothing
end
# extract fields we care about
name = read_header_str(data, 0, 100)
mode = read_header_int(data, 100, 8)
size = data[124+1] & 0x80 == 0 ?
read_header_int(data, 124, 12) :
read_header_bin(data, 124, 12)
chksum = read_header_int(data, 148, 8)
type = read_header_chr(data, 156)
link = read_header_str(data, 157, 100)
version = read_header_str(data, 263, 2)
prefix = read_header_str(data, 345, 155)
# check version field
occursin(r"^0* *$", version) ||
error("unknown version string for tar file: $(repr(version))")
# check checksum field
chksum_orig = data[index_range(148, 8)]
data[index_range(148, 8)] .= ' ' # fill checksum field with spaces
chksum′ = sum(data)
data[index_range(148, 8)] .= chksum_orig
if chksum != chksum′
msg = sprint() do io
println(io, "incorrect header checksum = $chksum; should be $chksum′")
print(io, "header data is ")
show(io, MIME"text/plain"(), dump_header(data))
# verify valid header
try
check_version_field(buf)
check_checksum_field(buf)
catch err
if err isa ErrorException
msg = match(r"^(.*?)\s*\[header block data\]"s, err.msg).captures[1]
msg = "This does not appear to be a TAR file/stream — $msg. Note: Tar.jl does not handle decompression; if the tarball is compressed you must use an external command like `gzcat` or package like CodecZlib.jl to decompress it. See the README file for examples."
err = ErrorException(msg)
end
error(msg)
rethrow(err)
end
# extract fields we care about
size = read_header_size(buf)
name = read_header_str(buf, :name)
mode = read_header_int(buf, :mode)
type = read_header_chr(buf, :typeflag)
link = read_header_str(buf, :linkname)
prefix = read_header_str(buf, :prefix)
# check mode
mode  typemax(typemax(UInt16)) ||
header_error(buf, "mode value too large: $(string(mode, base=8))")
# combine prefix & name fields
path = isempty(prefix) ? name : "$prefix/$name"
return Header(path, to_symbolic_type(type), mode, size, link)
end

round_up(size) = 512 * ((size + 511) ÷ 512)
function check_version_field(buf::AbstractVector{UInt8})
version = read_header_str(buf, :version)
occursin(r"^0* *$", version) && return
header_error(buf, "invalid version string for tar file: $(repr(version))")
end

function skip_data(tar::IO, size::Integer)
size < 0 && throw(ArgumentError("[internal error] negative skip: $size"))
size > 0 && skip(tar, round_up(size))
function check_checksum_field(buf::AbstractVector{UInt8})
chksum = read_header_int(buf, :chksum)
actual = let r = index_range(:chksum)
sum(i in r ? UInt8(' ') : buf[i] for i = 1:512)
end
chksum == actual && return
header_error(buf, "incorrect header checksum = $chksum; should be $actual")
end

read_header_chr(buf::AbstractVector{UInt8}, offset::Int) = Char(buf[offset+1])
function read_header_size(buf::AbstractVector{UInt8})
r = index_range(:size)
b1 = buf[r[1]]
b1 & 0x80 == 0 && return read_header_int(buf, :size)
# high bit set for binary
buf[r[1]] &= ~0x80
n = try read_header_bin(buf, :size)
finally
buf[r[1]] = b1
end
return n
end

function read_header_chr(buf::AbstractVector{UInt8}, fld::Symbol)
r = index_range(fld)
length(r) == 1 || error("[internal error] not a character field: $fld")
return Char(buf[first(r)])
end

function read_header_str(buf::AbstractVector{UInt8}, offset::Int, length::Int)
r = index_range(offset, length)
function read_header_str(buf::AbstractVector{UInt8}, fld::Symbol)
r = index_range(fld)
for i in r
byte = buf[i]
byte == 0 && return String(buf[first(r):i-1])
end
return String(buf[r])
end

function read_header_int(buf::AbstractVector{UInt8}, offset::Int, length::Int)
n = UInt64(0)
function read_header_int(buf::AbstractVector{UInt8}, fld::Symbol)
r = index_range(fld)
n = Int64(0)
before = true
r = index_range(offset, length)
for i in r
byte = buf[i]
before && byte == UInt8(' ') && continue
byte in (0x00, UInt8(' ')) && break
UInt8('0') <= byte <= UInt8('7') ||
error("invalid octal digit: $(repr(Char(byte)))")
UInt8('0') <= byte <= UInt8('7') || header_error(buf, fld)
if leading_zeros(n) <= 3
val = String(buf[r])
header_error(buf, "octal integer $fld value too large: $(repr(val))")
end
n <<= 3
n |= byte - 0x30
before = false
end
before && error("invalid integer value: $(repr(String(buf[r])))")
before && header_error(buf, fld)
return n
end

function read_header_bin(buf::AbstractVector{UInt8}, offset::Int, length::Int)
n = UInt64(0)
for i in index_range(offset, length)
function read_header_bin(buf::AbstractVector{UInt8}, fld::Symbol)
r = index_range(fld)
n = Int64(0)
for i in r
if leading_zeros(n) <= 8
val = String(buf[r])
header_error(buf, "binary integer $fld value too large: $(repr(val))")
end
n <<= 8
n |= buf[i]
end
return n
end

round_up(size) = 512 * ((size + 511) ÷ 512)

function skip_data(tar::IO, size::Integer)
size < 0 && throw(ArgumentError("[internal error] negative skip: $size"))
size > 0 && skip(tar, round_up(size))
end

function read_data(
tar::IO,
file::IO;
Expand Down
103 changes: 97 additions & 6 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -868,10 +868,101 @@ if Sys.iswindows() && Sys.which("icacls") !== nothing && VERSION >= v"1.6"
end
end

@testset "leading spaces" begin
# fragment of https://sparse.tamu.edu/MM/Oberwolfach/LF10.tar.gz
tarball = joinpath(@__DIR__, "data", "LF10-fragment.tar")
@test Tar.list(tarball) == [
Tar.Header("LF10/LF10_B.mtx", :file, 0o100600, 367, "")
]
@testset "header parsing" begin
@testset "leading spaces in integer fields" begin
# fragment of https://sparse.tamu.edu/MM/Oberwolfach/LF10.tar.gz
tarball = joinpath(test_data_dir, "LF10-fragment.tar")
hdr = Tar.Header("LF10/LF10_B.mtx", :file, 0o100600, 367, "")
@test open(Tar.read_header, tarball) == hdr
@test Tar.list(tarball) == [hdr]
end
@testset "header errors" begin
# generate a valid header
buf = IOBuffer()
Tar.write_header(buf, Tar.Header("file", :file, 0o644, 123, ""))
data = Tar.dump_header(take!(buf))
notar = "This does not appear to be a TAR file/stream —"
# test various header problems
tarball = write_modified_header(data, :version => "AB")
test_error_prefix("$notar invalid version string for tar file: \"AB\"") do
open(Tar.read_header, tarball)
end
# malformed checksums
for str in [" ", " "^8, "1HKPhaUq", "\1"]
tarball = write_modified_header(data, :chksum => str)
test_error_prefix("$notar malformed chksum field: $(repr(str))") do
open(Tar.read_header, tarball)
end
end
# incorrect checksum
tarball = write_modified_header(data, :chksum => "123456\0 ")
test_error_prefix("$notar incorrect header checksum = 42798;") do
open(Tar.read_header, tarball)
end
# malformed sizes
for str in [" ", " "^12, "lVonG911HzaL", "\1"]
tarball = write_modified_header(data, :size => str)
test_error_prefix("malformed size field: $(repr(str))") do
open(Tar.read_header, tarball)
end
end
# largest valid binary size
str = lpad("\x7f"*"\xff"^7, 11, "\0")
tarball = write_modified_header(data, :size => "\x80$str")
@test open(Tar.read_header, tarball).size == typemax(Int64)
# smallest too large binary size
str = lpad("\x80"*"\x00"^7, 11, "\0")
tarball = write_modified_header(data, :size => "\x80$str")
test_error_prefix("binary integer size value too large: $(repr("\0$str"))") do
open(Tar.read_header, tarball)
end
# largest binary size (also too large)
str = "\xff"^11
tarball = write_modified_header(data, :size => "\xff$str")
test_error_prefix("binary integer size value too large: $(repr("\x7f$str"))") do
open(Tar.read_header, tarball)
end
# malformed modes
for str in [" ", " "^8, "CbiX4Rkb", "\1"]
tarball = write_modified_header(data, :mode => str)
test_error_prefix("malformed mode field: $(repr(str))") do
open(Tar.read_header, tarball)
end
end
# various valid mode values
for str in [
"0", " 0", " 0", " 0",
"123", " 123", " 00123", " 123",
"177777", " 177777", "00177777", " 0177777",
]
tarball = write_modified_header(data, :mode => str)
@test open(Tar.read_header, tarball).mode == parse(Int, str, base=8)
end
# smallest & largest too large mode values
for str in ["200000", "77777777"]
tarball = write_modified_header(data, :mode => str)
test_error_prefix("mode value too large: $str") do
open(Tar.read_header, tarball)
end
end
end
@testset "octal parsing" begin
buf = fill(0x0, 512)
buf[1:21] .= '7'
# largest valid octal value
@test Tar.read_header_int(buf, :name) == typemax(Int64)
# smallest too large octal value
buf[1] = '1'
buf[2:22] .= '0'
test_error_prefix("octal integer name value too large:") do
Tar.read_header_int(buf, :name)
end
# way too large octal value
for i = 1:length(buf)
buf[i] = '0' + (i % 8)
end
test_error_prefix("octal integer name value too large:") do
Tar.read_header_int(buf, :name)
end
end
end
Loading

0 comments on commit 71ce072

Please sign in to comment.