Skip to content

Commit

Permalink
archive/tar: support reporting and selecting the format
Browse files Browse the repository at this point in the history
The Reader and Writer are now at feature parity,
meaning that everything that can be parsed by the Reader,
can also be composed by the Writer.

This position enables us to support selection of the format
in a backwards compatible way, since it ensures that everything
that can be read can also be round-trip written.

As such, we add the following new API:
    type Format int
            const FormatUnknown Format = 0 ...
    type Header struct { ...; Format Format }

The new Header.Format field is populated by the Reader with its
best guess of what the format is. Note that the Reader is very liberal
in what it permits, so a hybrid TAR file using aspects of multiple
formats can still be decoded, but will be reported as FormatUnknown.

Even though Reader has full support for V7 and basic support for STAR,
it will still report those formats as unknown (and the constants for
those formats are not even exported). The reason for this is that
the Writer has no support for V7 or STAR. Leaving it as unknown allows
the Writer to choose a format (usually USTAR or GNU) that can encode
the equivalent Header.

When writing, the Header.allowedFormats will take the Format field
into consideration if it is a known format.

Fixes #18710

Change-Id: I00980c475d067c6969d3414e1ff0224fdd89cd49
Reviewed-on: https://go-review.googlesource.com/58230
Run-TryBot: Joe Tsai <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Ian Lance Taylor <[email protected]>
  • Loading branch information
dsnet committed Aug 24, 2017
1 parent 9a9a0fc commit 9d3d370
Show file tree
Hide file tree
Showing 11 changed files with 277 additions and 154 deletions.
66 changes: 40 additions & 26 deletions src/archive/tar/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,6 @@
// Package tar implements access to tar archives.
// It aims to cover most of the variations, including those produced
// by GNU and BSD tars.
//
// References:
// http://www.freebsd.org/cgi/man.cgi?query=tar&sektion=5
// http://www.gnu.org/software/tar/manual/html_node/Standard.html
// http://pubs.opengroup.org/onlinepubs/9699919799/utilities/pax.html
package tar

import (
Expand Down Expand Up @@ -76,13 +71,26 @@ type Header struct {
// SparseHoles represents a sequence of holes in a sparse file.
//
// A file is sparse if len(SparseHoles) > 0 or Typeflag is TypeGNUSparse.
// If TypeGNUSparse is set, then the format is GNU, otherwise
// the PAX format with GNU-specific record is used.
//
// A sparse file consists of fragments of data, intermixed with holes
// (described by this field). A hole is semantically a block of NUL-bytes,
// but does not actually exist within the TAR file.
// but does not actually exist within the tar file.
// The logical size of the file is stored in the Size field, while
// the holes must be sorted in ascending order,
// not overlap with each other, and not extend past the specified Size.
SparseHoles []SparseEntry

// Format specifies the format of the tar header.
//
// This is set by Reader.Next as a best-effort guess at the format.
// Since the Reader liberally reads some non-compliant files,
// it is possible for this to be FormatUnknown.
//
// When writing, if this is not FormatUnknown, then Writer.WriteHeader
// uses this as the format to encode the header.
Format Format
}

// SparseEntry represents a Length-sized fragment at Offset in the file.
Expand Down Expand Up @@ -209,12 +217,12 @@ func (h *Header) FileInfo() os.FileInfo {

// allowedFormats determines which formats can be used. The value returned
// is the logical OR of multiple possible formats. If the value is
// formatUnknown, then the input Header cannot be encoded.
// FormatUnknown, then the input Header cannot be encoded.
//
// As a by-product of checking the fields, this function returns paxHdrs, which
// contain all fields that could not be directly encoded.
func (h *Header) allowedFormats() (format int, paxHdrs map[string]string) {
format = formatUSTAR | formatPAX | formatGNU
func (h *Header) allowedFormats() (format Format, paxHdrs map[string]string) {
format = FormatUSTAR | FormatPAX | FormatGNU
paxHdrs = make(map[string]string)

verifyString := func(s string, size int, paxKey string) {
Expand All @@ -224,28 +232,28 @@ func (h *Header) allowedFormats() (format int, paxHdrs map[string]string) {
tooLong := len(s) > size
allowLongGNU := paxKey == paxPath || paxKey == paxLinkpath
if hasNUL(s) || (tooLong && !allowLongGNU) {
format &^= formatGNU // No GNU
format.mustNotBe(FormatGNU)
}
if !isASCII(s) || tooLong {
canSplitUSTAR := paxKey == paxPath
if _, _, ok := splitUSTARPath(s); !canSplitUSTAR || !ok {
format &^= formatUSTAR // No USTAR
format.mustNotBe(FormatUSTAR)
}
if paxKey == paxNone {
format &^= formatPAX // No PAX
format.mustNotBe(FormatPAX)
} else {
paxHdrs[paxKey] = s
}
}
}
verifyNumeric := func(n int64, size int, paxKey string) {
if !fitsInBase256(size, n) {
format &^= formatGNU // No GNU
format.mustNotBe(FormatGNU)
}
if !fitsInOctal(size, n) {
format &^= formatUSTAR // No USTAR
format.mustNotBe(FormatUSTAR)
if paxKey == paxNone {
format &^= formatPAX // No PAX
format.mustNotBe(FormatPAX)
} else {
paxHdrs[paxKey] = strconv.FormatInt(n, 10)
}
Expand All @@ -258,12 +266,12 @@ func (h *Header) allowedFormats() (format int, paxHdrs map[string]string) {
needsNano := ts.Nanosecond() != 0
hasFieldUSTAR := paxKey == paxMtime
if !fitsInBase256(size, ts.Unix()) || needsNano {
format &^= formatGNU // No GNU
format.mustNotBe(FormatGNU)
}
if !fitsInOctal(size, ts.Unix()) || needsNano || !hasFieldUSTAR {
format &^= formatUSTAR // No USTAR
format.mustNotBe(FormatUSTAR)
if paxKey == paxNone {
format &^= formatPAX // No PAX
format.mustNotBe(FormatPAX)
} else {
paxHdrs[paxKey] = formatPAXTime(ts)
}
Expand All @@ -289,34 +297,40 @@ func (h *Header) allowedFormats() (format int, paxHdrs map[string]string) {
verifyTime(h.ChangeTime, len(gnu.ChangeTime()), paxCtime)

if !isHeaderOnlyType(h.Typeflag) && h.Size < 0 {
return formatUnknown, nil
return FormatUnknown, nil
}
if len(h.Xattrs) > 0 {
for k, v := range h.Xattrs {
paxHdrs[paxXattr+k] = v
}
format &= formatPAX // PAX only
format.mayOnlyBe(FormatPAX)
}
for k, v := range paxHdrs {
// Forbid empty values (which represent deletion) since using
// them is nonsensical without global PAX record support.
if !validPAXRecord(k, v) || v == "" {
return formatUnknown, nil // Invalid PAX key
return FormatUnknown, nil // Invalid PAX key
}
}
if len(h.SparseHoles) > 0 || h.Typeflag == TypeGNUSparse {
if isHeaderOnlyType(h.Typeflag) {
return formatUnknown, nil // Cannot have sparse data on header-only file
return FormatUnknown, nil // Cannot have sparse data on header-only file
}
if !validateSparseEntries(h.SparseHoles, h.Size) {
return formatUnknown, nil
return FormatUnknown, nil
}
if h.Typeflag == TypeGNUSparse {
format &= formatGNU // GNU only
format.mayOnlyBe(FormatGNU)
} else {
format &^= formatGNU // No GNU
format.mustNotBe(FormatGNU)
}
format.mustNotBe(FormatUSTAR)
}
if wantFormat := h.Format; wantFormat != FormatUnknown {
if wantFormat.has(FormatPAX) {
wantFormat.mayBe(FormatUSTAR) // PAX implies USTAR allowed too
}
format &^= formatUSTAR // No USTAR
format.mayOnlyBe(wantFormat) // Set intersection of formats allowed and format wanted
}
return format, paxHdrs
}
Expand Down
115 changes: 86 additions & 29 deletions src/archive/tar/format.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,38 +4,95 @@

package tar

import "strings"

// Format represents the tar archive format.
//
// A Format value is a set of bit flags, so several candidate formats
// may be combined into one value with a logical OR.
type Format int

// Constants to identify various tar formats.
const (
// The format is unknown.
formatUnknown = (1 << iota) / 2 // Sequence of 0, 1, 2, 4, 8, etc...
// Deliberately hide the meaning of constants from public API.
_ Format = (1 << iota) / 4 // Sequence of 0, 0, 1, 2, 4, 8, etc...

// FormatUnknown indicates that the format is unknown.
FormatUnknown

// The format of the original Unix V7 tar tool prior to standardization.
formatV7

// The old and new GNU formats, which are incompatible with USTAR.
// This does cover the old GNU sparse extension.
// This does not cover the GNU sparse extensions using PAX headers,
// versions 0.0, 0.1, and 1.0; these fall under the PAX format.
formatGNU
// FormatUSTAR represents the USTAR header format defined in POSIX.1-1988.
//
// While this format is compatible with most tar readers,
// the format has several limitations making it unsuitable for some usages.
// Most notably, it cannot support sparse files, files larger than 8GiB,
// filenames larger than 256 characters, and non-ASCII filenames.
//
// Reference:
// http://pubs.opengroup.org/onlinepubs/9699919799/utilities/pax.html#tag_20_92_13_06
FormatUSTAR

// FormatPAX represents the PAX header format defined in POSIX.1-2001.
//
// PAX extends USTAR by writing a special file with Typeflag TypeXHeader
// preceding the original header. This file contains a set of key-value
// records, which are used to overcome USTAR's shortcomings.
//
// Some newer formats add their own extensions to PAX by defining their
// own keys and assigning certain semantic meaning to the associated values.
// For example, sparse file support in PAX is implemented using keys
// defined by the GNU manual (e.g., "GNU.sparse.map").
//
// Reference:
// http://pubs.opengroup.org/onlinepubs/009695399/utilities/pax.html
FormatPAX

// FormatGNU represents the GNU header format.
//
// The GNU header format is older than the USTAR and PAX standards and
// is not compatible with them. The GNU format supports
// arbitrary file sizes, filenames of arbitrary encoding and length,
// sparse files, and other features.
//
// It is recommended that PAX be chosen over GNU unless the target
// application can only parse GNU formatted archives.
//
// Reference:
// http://www.gnu.org/software/tar/manual/html_node/Standard.html
FormatGNU

// Schily's tar format, which is incompatible with USTAR.
// This does not cover STAR extensions to the PAX format; these fall under
// the PAX format.
formatSTAR

// USTAR is the former standardization of tar defined in POSIX.1-1988.
// This is incompatible with the GNU and STAR formats.
formatUSTAR

// PAX is the latest standardization of tar defined in POSIX.1-2001.
// This is an extension of USTAR and is "backwards compatible" with it.
//
// Some newer formats add their own extensions to PAX, such as GNU sparse
// files and SCHILY extended attributes. Since they are backwards compatible
// with PAX, they will be labelled as "PAX".
formatPAX
formatMax
)

// has reports whether f and f2 share at least one format bit.
func (f Format) has(f2 Format) bool {
	overlap := f & f2
	return overlap != 0
}
// mayBe adds the format bits of f2 to the set held by f.
func (f *Format) mayBe(f2 Format) {
	*f = *f | f2
}
// mayOnlyBe restricts f to the format bits that are also set in f2.
func (f *Format) mayOnlyBe(f2 Format) {
	*f = *f & f2
}
// mustNotBe clears from f every format bit that is set in f2.
func (f *Format) mustNotBe(f2 Format) {
	*f = *f &^ f2
}

// formatNames maps each individual format flag to the name that
// Format.String prints for it.
var formatNames = map[Format]string{
	formatV7:    "V7",
	FormatUSTAR: "USTAR",
	FormatPAX:   "PAX",
	FormatGNU:   "GNU",
	formatSTAR:  "STAR",
}

// String returns a printable name for f.
//
// Every single-bit flag below formatMax that is set in f contributes its
// entry from formatNames. If no such flag is set the result is "<unknown>";
// if exactly one is set the result is that name alone; otherwise the names
// are joined with " | " and wrapped in parentheses.
func (f Format) String() string {
	var names []string
	for bit := Format(1); bit < formatMax; bit <<= 1 {
		if !f.has(bit) {
			continue
		}
		names = append(names, formatNames[bit])
	}

	if len(names) == 0 {
		return "<unknown>"
	}
	if len(names) == 1 {
		return names[0]
	}
	return "(" + strings.Join(names, " | ") + ")"
}

// Magics used to identify various formats.
const (
magicGNU, versionGNU = "ustar ", " \x00"
Expand Down Expand Up @@ -69,14 +126,14 @@ func (b *block) Sparse() sparseArray { return (sparseArray)(b[:]) }

// GetFormat checks that the block is a valid tar header based on the checksum.
// It then attempts to guess the specific format based on magic values.
// If the checksum fails, then formatUnknown is returned.
func (b *block) GetFormat() (format int) {
// If the checksum fails, then FormatUnknown is returned.
func (b *block) GetFormat() Format {
// Verify checksum.
var p parser
value := p.parseOctal(b.V7().Chksum())
chksum1, chksum2 := b.ComputeChecksum()
if p.err != nil || (value != chksum1 && value != chksum2) {
return formatUnknown
return FormatUnknown
}

// Guess the magic values.
Expand All @@ -87,29 +144,29 @@ func (b *block) GetFormat() (format int) {
case magic == magicUSTAR && trailer == trailerSTAR:
return formatSTAR
case magic == magicUSTAR:
return formatUSTAR
return FormatUSTAR | FormatPAX
case magic == magicGNU && version == versionGNU:
return formatGNU
return FormatGNU
default:
return formatV7
}
}

// SetFormat writes the magic values necessary for specified format
// and then updates the checksum accordingly.
func (b *block) SetFormat(format int) {
func (b *block) SetFormat(format Format) {
// Set the magic values.
switch format {
case formatV7:
switch {
case format.has(formatV7):
// Do nothing.
case formatGNU:
case format.has(FormatGNU):
copy(b.GNU().Magic(), magicGNU)
copy(b.GNU().Version(), versionGNU)
case formatSTAR:
case format.has(formatSTAR):
copy(b.STAR().Magic(), magicUSTAR)
copy(b.STAR().Version(), versionUSTAR)
copy(b.STAR().Trailer(), trailerSTAR)
case formatUSTAR, formatPAX:
case format.has(FormatUSTAR | FormatPAX):
copy(b.USTAR().Magic(), magicUSTAR)
copy(b.USTAR().Version(), versionUSTAR)
default:
Expand Down
Loading

0 comments on commit 9d3d370

Please sign in to comment.