-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: handle hardlinks symmetrically #19
base: handle-hardlinks
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -38,6 +38,18 @@ type ExtractInfo struct { | |||||
Context any | ||||||
} | ||||||
|
||||||
// The identifier for the hardlink is unique for each unique base file, | ||||||
// counting from 1. The 0 value is reserved for files that are not hard links. | ||||||
type hardLinkRevMapEntry struct { | ||||||
Target []string | ||||||
Identifier int | ||||||
StagingPath string | ||||||
} | ||||||
|
||||||
type tarMetadata struct { | ||||||
HardLinkRevMap map[string]hardLinkRevMapEntry | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe a comment about this map would be helpful. |
||||||
} | ||||||
|
||||||
func getValidOptions(options *ExtractOptions) (*ExtractOptions, error) { | ||||||
for extractPath, extractInfos := range options.Extract { | ||||||
isGlob := strings.ContainsAny(extractPath, "*?") | ||||||
|
@@ -62,7 +74,7 @@ func getValidOptions(options *ExtractOptions) (*ExtractOptions, error) { | |||||
return options, nil | ||||||
} | ||||||
|
||||||
func Extract(pkgReader io.Reader, options *ExtractOptions) (err error) { | ||||||
func Extract(pkgReader io.ReadSeeker, options *ExtractOptions) (err error) { | ||||||
defer func() { | ||||||
if err != nil { | ||||||
err = fmt.Errorf("cannot extract from package %q: %w", options.Package, err) | ||||||
|
@@ -83,43 +95,75 @@ func Extract(pkgReader io.Reader, options *ExtractOptions) (err error) { | |||||
return err | ||||||
} | ||||||
|
||||||
return extractData(pkgReader, validOpts) | ||||||
} | ||||||
|
||||||
// getDataReader returns a ReadCloser for the data payload of the package. | ||||||
// Calling the Close method must happen outside of this function to prevent | ||||||
// premature closing of the underlying package reader. | ||||||
// The xz.Reader is wrapped with io.NopCloser since it does not implement the Close method. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
nitpick
Comment on lines
+101
to
+104
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In my opinion all of the comment here is unnecessary because it is a mix of Go idiosyncrasies and implementation details. The comment on the function should not state things like "xz.Reader is wrapper with io.NopCloser...", that is something internal that the user does not care about. And the bit about "Calling the Close method must happen outside of this function..." is also something that is taken for granted in Go. |
||||||
func getDataReader(pkgReader io.ReadSeeker) (io.ReadCloser, error) { | ||||||
arReader := ar.NewReader(pkgReader) | ||||||
var dataReader io.Reader | ||||||
var dataReader io.ReadCloser | ||||||
for dataReader == nil { | ||||||
arHeader, err := arReader.Next() | ||||||
if err == io.EOF { | ||||||
return fmt.Errorf("no data payload") | ||||||
return nil, fmt.Errorf("no data payload") | ||||||
} | ||||||
if err != nil { | ||||||
return err | ||||||
return nil, err | ||||||
} | ||||||
switch arHeader.Name { | ||||||
case "data.tar.gz": | ||||||
gzipReader, err := gzip.NewReader(arReader) | ||||||
if err != nil { | ||||||
return err | ||||||
return nil, err | ||||||
} | ||||||
defer gzipReader.Close() | ||||||
dataReader = gzipReader | ||||||
case "data.tar.xz": | ||||||
xzReader, err := xz.NewReader(arReader) | ||||||
if err != nil { | ||||||
return err | ||||||
return nil, err | ||||||
} | ||||||
dataReader = xzReader | ||||||
dataReader = io.NopCloser(xzReader) | ||||||
case "data.tar.zst": | ||||||
zstdReader, err := zstd.NewReader(arReader) | ||||||
if err != nil { | ||||||
return err | ||||||
return nil, err | ||||||
} | ||||||
defer zstdReader.Close() | ||||||
dataReader = zstdReader | ||||||
dataReader = zstdReader.IOReadCloser() | ||||||
} | ||||||
} | ||||||
return extractData(dataReader, validOpts) | ||||||
|
||||||
return dataReader, nil | ||||||
} | ||||||
|
||||||
func extractData(dataReader io.Reader, options *ExtractOptions) error { | ||||||
func extractData(pkgReader io.ReadSeeker, options *ExtractOptions) error { | ||||||
// stagingDir is the directory where the hard link base file, which is not | ||||||
// listed in the pendingPaths, is to be extracted. | ||||||
stagingDir, err := os.MkdirTemp("", "chisel-staging-") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need a staging dir in the first place? I think it should not be needed at all. Correct me if I am wrong but:
As I said in 1. there is also the optimization that target might be extracted so there is no need for a second iteration. The downside is that we need to keep track of the names of the extracted files to know if target was extracted and we already do that in the report, and exposing a good API here might be a challenge, we have to think about that. In fact, because this is a "niche" use case and we do not anticipate it happening a lot, and because the optimization mentioned in 1. is even more niche, I would say let's not worry about it unless it is easy to implement. |
||||||
if err != nil { | ||||||
return err | ||||||
} | ||||||
// First pass over the tarball to read the header of the tarball | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
nitpick |
||||||
// and create its metadata. | ||||||
dataReader, err := getDataReader(pkgReader) | ||||||
if err != nil { | ||||||
return err | ||||||
} | ||||||
tarReader := tar.NewReader(dataReader) | ||||||
tarMetadata, err := readTarMetadata(tarReader) | ||||||
Comment on lines
+154
to
+155
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is always going to read the file twice even if there are no hardlinks. I think a better design is to do the extraction for all files in the first loop and keep track of the hardlinks that we cannot extract then, only if necessary, do we loop over the tarball again. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not regarding "Whether or not we can extract the hardlink", but "How can we have all the files extracted from a hardlink group to be represented symmetrically in the report". Think about a counter-example: When we meet the "regular file" (which is the target of other hardlinks) in the first pass reading the tarball, we don't know if it is associated with a hardlink group. Then in the |
||||||
if err != nil { | ||||||
return err | ||||||
} | ||||||
// Rewind back to the start of the tarball and extract the files. | ||||||
pkgReader.Seek(0, io.SeekStart) | ||||||
// Second pass over the tarball to extract the files. | ||||||
dataReader, err = getDataReader(pkgReader) | ||||||
if err != nil { | ||||||
return err | ||||||
} | ||||||
defer dataReader.Close() | ||||||
|
||||||
oldUmask := syscall.Umask(0) | ||||||
defer func() { | ||||||
|
@@ -143,7 +187,7 @@ func extractData(dataReader io.Reader, options *ExtractOptions) error { | |||||
// before the entry for the file itself. This is the case for .deb files but | ||||||
// not for all tarballs. | ||||||
tarDirMode := make(map[string]fs.FileMode) | ||||||
tarReader := tar.NewReader(dataReader) | ||||||
tarReader = tar.NewReader(dataReader) | ||||||
for { | ||||||
tarHeader, err := tarReader.Next() | ||||||
if err == io.EOF { | ||||||
|
@@ -153,6 +197,9 @@ func extractData(dataReader io.Reader, options *ExtractOptions) error { | |||||
return err | ||||||
} | ||||||
|
||||||
// targetDir is the directory where the file is extracted. | ||||||
// It is either the options.TargetDir or the stagingDir. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
targetDir := options.TargetDir | ||||||
sourcePath := tarHeader.Name | ||||||
if len(sourcePath) < 3 || sourcePath[0] != '.' || sourcePath[1] != '/' { | ||||||
continue | ||||||
|
@@ -187,8 +234,24 @@ func extractData(dataReader io.Reader, options *ExtractOptions) error { | |||||
} | ||||||
} | ||||||
if len(targetPaths) == 0 { | ||||||
// Nothing to do. | ||||||
continue | ||||||
// Extract the hard link base file to the staging directory, when | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
something like this so that it does not give the impression of "either of these". |
||||||
// 1. it is not part of the target paths (len(targetPaths) == 0) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would probably put it like "it's not listed in the slice definition file". |
||||||
// 2. it is required by other hard links (exists as a key in the HardLinkRevMap) | ||||||
// In case that [len(targetPaths) > 0], the hard link base file is extracted normally. | ||||||
// Note that the hard link base file can also be a symlink. | ||||||
// tarHeader.Name is used here since the paths in the HardLinkRevMap are relative. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If it's possible and if it makes sense, let's try to use absolute paths everywhere to avoid confusion. |
||||||
if entry, ok := tarMetadata.HardLinkRevMap[tarHeader.Name]; ok { | ||||||
targetDir = stagingDir | ||||||
entry.StagingPath = filepath.Join(stagingDir, tarHeader.Name) | ||||||
tarMetadata.HardLinkRevMap[tarHeader.Name] = entry | ||||||
targetPaths[sourcePath] = append(targetPaths[sourcePath], ExtractInfo{ | ||||||
Path: sourcePath, | ||||||
Mode: uint(tarHeader.FileInfo().Mode()), | ||||||
}) | ||||||
} else { | ||||||
// Nothing to do. | ||||||
continue | ||||||
} | ||||||
} | ||||||
|
||||||
var contentCache []byte | ||||||
|
@@ -236,7 +299,7 @@ func extractData(dataReader io.Reader, options *ExtractOptions) error { | |||||
delete(tarDirMode, path) | ||||||
|
||||||
createOptions := &fsutil.CreateOptions{ | ||||||
Path: filepath.Join(options.TargetDir, path), | ||||||
Path: filepath.Join(targetDir, path), | ||||||
Mode: mode, | ||||||
MakeParents: true, | ||||||
} | ||||||
|
@@ -247,17 +310,33 @@ func extractData(dataReader io.Reader, options *ExtractOptions) error { | |||||
} | ||||||
// Create the entry itself. | ||||||
link := tarHeader.Linkname | ||||||
hardLinkId := 0 | ||||||
if tarHeader.Typeflag == tar.TypeLink { | ||||||
// A hard link requires the real path of the target file. | ||||||
link = filepath.Join(options.TargetDir, link) | ||||||
// Set the [link] to the absolute path if it's a hard link | ||||||
if entry, ok := tarMetadata.HardLinkRevMap[link]; ok { | ||||||
if entry.StagingPath != "" { | ||||||
link = entry.StagingPath | ||||||
} else { | ||||||
link = filepath.Join(targetDir, link) | ||||||
} | ||||||
// Set the hardLinkId for hard links | ||||||
hardLinkId = int(entry.Identifier) | ||||||
} else { | ||||||
return fmt.Errorf("hard link target %s not found in the tarball header", tarHeader.Linkname) | ||||||
} | ||||||
} | ||||||
// Set the HardLinkId to both the hard link and its counterpart file, | ||||||
// so they are symmetric in the report. | ||||||
if entry, ok := tarMetadata.HardLinkRevMap["."+targetPath]; ok { | ||||||
hardLinkId = int(entry.Identifier) | ||||||
} | ||||||
|
||||||
createOptions := &fsutil.CreateOptions{ | ||||||
Path: filepath.Join(options.TargetDir, targetPath), | ||||||
Path: filepath.Join(targetDir, targetPath), | ||||||
Mode: tarHeader.FileInfo().Mode(), | ||||||
Data: pathReader, | ||||||
Link: link, | ||||||
MakeParents: true, | ||||||
HardLinkId: hardLinkId, | ||||||
} | ||||||
err := options.Create(extractInfos, createOptions) | ||||||
if err != nil { | ||||||
|
@@ -294,3 +373,47 @@ func parentDirs(path string) []string { | |||||
} | ||||||
return parents | ||||||
} | ||||||
|
||||||
func newTarMetadata() tarMetadata { | ||||||
return tarMetadata{ | ||||||
HardLinkRevMap: make(map[string]hardLinkRevMapEntry), | ||||||
} | ||||||
} | ||||||
Comment on lines
+377
to
+381
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You probably don't need this function since we are using it once and it's quite minimal. |
||||||
|
||||||
func readTarMetadata(tarReader *tar.Reader) (tarMetadata, error) { | ||||||
metadata := newTarMetadata() | ||||||
var hardLinkRevMap = make(map[string][]string) | ||||||
for { | ||||||
tarHeader, err := tarReader.Next() | ||||||
if err == io.EOF { | ||||||
break | ||||||
} | ||||||
if err != nil { | ||||||
return metadata, err | ||||||
} | ||||||
|
||||||
if tarHeader.Typeflag == tar.TypeLink { | ||||||
sourcePath := tarHeader.Name | ||||||
linkPath := tarHeader.Linkname | ||||||
hardLinkRevMap[linkPath] = append(hardLinkRevMap[linkPath], sourcePath) | ||||||
} | ||||||
} | ||||||
|
||||||
// Sort the hard link targets to ensure a deterministic HardLinkId in the report | ||||||
targets := make([]string, 0, len(hardLinkRevMap)) | ||||||
for target := range hardLinkRevMap { | ||||||
targets = append(targets, target) | ||||||
} | ||||||
sort.Strings(targets) | ||||||
|
||||||
for idx, target := range targets { | ||||||
sources := hardLinkRevMap[target] | ||||||
metadata.HardLinkRevMap[target] = hardLinkRevMapEntry{ | ||||||
Target: sources, | ||||||
Identifier: idx + 1, | ||||||
StagingPath: "", | ||||||
} | ||||||
} | ||||||
|
||||||
return metadata, nil | ||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We might need to explain the "base file" since that's something we came up with, or try to think of a better name.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If I understand it correctly, it is probably "base file" == "inode".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this means "The identifier for the hard link should be unique for all the file entries in the same hard link group"