-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: handle hardlinks symmetrically #19
base: handle-hardlinks
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -38,6 +38,18 @@ type ExtractInfo struct { | |||||
Context any | ||||||
} | ||||||
|
||||||
// The identifier for the hardlink is unique for each unique base file, | ||||||
// counting from 1. The 0 value is reserved for files that are not hard links. | ||||||
type hardLinkRevMapEntry struct { | ||||||
Target []string | ||||||
Identifier int | ||||||
StagingPath string | ||||||
} | ||||||
|
||||||
type tarMetadata struct { | ||||||
HardLinkRevMap map[string]hardLinkRevMapEntry | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe a comment about this map would be helpful. |
||||||
} | ||||||
|
||||||
func getValidOptions(options *ExtractOptions) (*ExtractOptions, error) { | ||||||
for extractPath, extractInfos := range options.Extract { | ||||||
isGlob := strings.ContainsAny(extractPath, "*?") | ||||||
|
@@ -62,7 +74,7 @@ func getValidOptions(options *ExtractOptions) (*ExtractOptions, error) { | |||||
return options, nil | ||||||
} | ||||||
|
||||||
func Extract(pkgReader io.Reader, options *ExtractOptions) (err error) { | ||||||
func Extract(pkgReader io.ReadSeeker, options *ExtractOptions) (err error) { | ||||||
defer func() { | ||||||
if err != nil { | ||||||
err = fmt.Errorf("cannot extract from package %q: %w", options.Package, err) | ||||||
|
@@ -83,43 +95,75 @@ func Extract(pkgReader io.Reader, options *ExtractOptions) (err error) { | |||||
return err | ||||||
} | ||||||
|
||||||
return extractData(pkgReader, validOpts) | ||||||
} | ||||||
|
||||||
// getDataReader returns a ReadCloser for the data payload of the package. | ||||||
// Calling the Close method must happen outside of this function to prevent | ||||||
// premature closing of the underlying package reader. | ||||||
// The xz.Reader is wrapped with io.NopCloser since it does not implement the Close method. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
nitpick
Comment on lines
+101
to
+104
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In my opinion all of the comment here is unnecessary because it is a mix of Go idiosyncrasies and implementation details. The comment on the function should not state things like "xz.Reader is wrapper with io.NopCloser...", that is something internal that the user does not care about. And the bit about "Calling the Close method must happen outside of this function..." is also something that is taken for granted in Go. |
||||||
func getDataReader(pkgReader io.ReadSeeker) (io.ReadCloser, error) { | ||||||
arReader := ar.NewReader(pkgReader) | ||||||
var dataReader io.Reader | ||||||
var dataReader io.ReadCloser | ||||||
for dataReader == nil { | ||||||
arHeader, err := arReader.Next() | ||||||
if err == io.EOF { | ||||||
return fmt.Errorf("no data payload") | ||||||
return nil, fmt.Errorf("no data payload") | ||||||
} | ||||||
if err != nil { | ||||||
return err | ||||||
return nil, err | ||||||
} | ||||||
switch arHeader.Name { | ||||||
case "data.tar.gz": | ||||||
gzipReader, err := gzip.NewReader(arReader) | ||||||
if err != nil { | ||||||
return err | ||||||
return nil, err | ||||||
} | ||||||
defer gzipReader.Close() | ||||||
dataReader = gzipReader | ||||||
case "data.tar.xz": | ||||||
xzReader, err := xz.NewReader(arReader) | ||||||
if err != nil { | ||||||
return err | ||||||
return nil, err | ||||||
} | ||||||
dataReader = xzReader | ||||||
dataReader = io.NopCloser(xzReader) | ||||||
case "data.tar.zst": | ||||||
zstdReader, err := zstd.NewReader(arReader) | ||||||
if err != nil { | ||||||
return err | ||||||
return nil, err | ||||||
} | ||||||
defer zstdReader.Close() | ||||||
dataReader = zstdReader | ||||||
dataReader = zstdReader.IOReadCloser() | ||||||
} | ||||||
} | ||||||
return extractData(dataReader, validOpts) | ||||||
|
||||||
return dataReader, nil | ||||||
} | ||||||
|
||||||
func extractData(dataReader io.Reader, options *ExtractOptions) error { | ||||||
func extractData(pkgReader io.ReadSeeker, options *ExtractOptions) error { | ||||||
// stagingDir is the directory where the hard link base file, which is not | ||||||
// listed in the pendingPaths, is to be extracted. | ||||||
stagingDir, err := os.MkdirTemp("", "chisel-staging-") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need a staging dir in the first place? I think it should not be needed at all. Correct me if I am wrong but:
As I said in 1. there is also the optimization that target might be extracted so there is no need for a second iteration. The downside is that we need to keep track of the names of the extracted files to know if target was extracted and we already do that in the report, and exposing a good API here might be a challenge, we have to think about that. In fact, because this is a "niche" use case and we do not anticipate it happening a lot, and because the optimization mentioned in 1. is even more niche, I would say let's not worry about it unless it is easy to implement. |
||||||
if err != nil { | ||||||
return err | ||||||
} | ||||||
// First pass over the tarball to read the header of the tarball | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
nitpick |
||||||
// and create its metadata. | ||||||
dataReader, err := getDataReader(pkgReader) | ||||||
if err != nil { | ||||||
return err | ||||||
} | ||||||
tarReader := tar.NewReader(dataReader) | ||||||
tarMetadata, err := readTarMetadata(tarReader) | ||||||
Comment on lines
+154
to
+155
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is always going to read the file twice even if there are no hardlinks. I think a better design is to do the extraction for all files in the first loop and keep track of the hardlinks that we cannot extract then, only if necessary, do we loop over the tarball again. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not regarding "Whether or not we can extract the hardlink", but "How can we have all the files extracted from a hardlink group to be represented symmetrically in the report". Think about a counter-example: When we meet the "regular file" (which is the target of other hardlinks) in the first pass reading the tarball, we don't know if it is associated with a hardlink group. Then in the |
||||||
if err != nil { | ||||||
return err | ||||||
} | ||||||
// Rewind back to the start of the tarball and extract the files. | ||||||
pkgReader.Seek(0, io.SeekStart) | ||||||
// Second pass over the tarball to extract the files. | ||||||
dataReader, err = getDataReader(pkgReader) | ||||||
if err != nil { | ||||||
return err | ||||||
} | ||||||
defer dataReader.Close() | ||||||
|
||||||
oldUmask := syscall.Umask(0) | ||||||
defer func() { | ||||||
|
@@ -143,7 +187,7 @@ func extractData(dataReader io.Reader, options *ExtractOptions) error { | |||||
// before the entry for the file itself. This is the case for .deb files but | ||||||
// not for all tarballs. | ||||||
tarDirMode := make(map[string]fs.FileMode) | ||||||
tarReader := tar.NewReader(dataReader) | ||||||
tarReader = tar.NewReader(dataReader) | ||||||
for { | ||||||
tarHeader, err := tarReader.Next() | ||||||
if err == io.EOF { | ||||||
|
@@ -153,6 +197,9 @@ func extractData(dataReader io.Reader, options *ExtractOptions) error { | |||||
return err | ||||||
} | ||||||
|
||||||
// targetDir is the directory where the file is extracted. | ||||||
// It is either the options.TargetDir or the stagingDir. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
targetDir := options.TargetDir | ||||||
sourcePath := tarHeader.Name | ||||||
if len(sourcePath) < 3 || sourcePath[0] != '.' || sourcePath[1] != '/' { | ||||||
continue | ||||||
|
@@ -187,8 +234,24 @@ func extractData(dataReader io.Reader, options *ExtractOptions) error { | |||||
} | ||||||
} | ||||||
if len(targetPaths) == 0 { | ||||||
// Nothing to do. | ||||||
continue | ||||||
// Extract the hard link base file to the staging directory, when | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
something like this so that it does not give the impression of "either of these". |
||||||
// 1. it is not part of the target paths (len(targetPaths) == 0) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would probably put it like "it's not listed in the slice definition file". |
||||||
// 2. it is required by other hard links (exists as a key in the HardLinkRevMap) | ||||||
// In case that [len(targetPaths) > 0], the hard link base file is extracted normally. | ||||||
// Note that the hard link base file can also be a symlink. | ||||||
// tarHeader.Name is used here since the paths in the HardLinkRevMap are relative. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If it's possible and if it makes sense, let's try to use absolute paths everywhere to avoid confusion. |
||||||
if entry, ok := tarMetadata.HardLinkRevMap[tarHeader.Name]; ok { | ||||||
targetDir = stagingDir | ||||||
entry.StagingPath = filepath.Join(stagingDir, tarHeader.Name) | ||||||
tarMetadata.HardLinkRevMap[tarHeader.Name] = entry | ||||||
targetPaths[sourcePath] = append(targetPaths[sourcePath], ExtractInfo{ | ||||||
Path: sourcePath, | ||||||
Mode: uint(tarHeader.FileInfo().Mode()), | ||||||
}) | ||||||
} else { | ||||||
// Nothing to do. | ||||||
continue | ||||||
} | ||||||
} | ||||||
|
||||||
var contentCache []byte | ||||||
|
@@ -236,7 +299,7 @@ func extractData(dataReader io.Reader, options *ExtractOptions) error { | |||||
delete(tarDirMode, path) | ||||||
|
||||||
createOptions := &fsutil.CreateOptions{ | ||||||
Path: filepath.Join(options.TargetDir, path), | ||||||
Path: filepath.Join(targetDir, path), | ||||||
Mode: mode, | ||||||
MakeParents: true, | ||||||
} | ||||||
|
@@ -247,17 +310,33 @@ func extractData(dataReader io.Reader, options *ExtractOptions) error { | |||||
} | ||||||
// Create the entry itself. | ||||||
link := tarHeader.Linkname | ||||||
hardLinkId := 0 | ||||||
if tarHeader.Typeflag == tar.TypeLink { | ||||||
// A hard link requires the real path of the target file. | ||||||
link = filepath.Join(options.TargetDir, link) | ||||||
// Set the [link] to the absolute path if it's a hard link | ||||||
if entry, ok := tarMetadata.HardLinkRevMap[link]; ok { | ||||||
if entry.StagingPath != "" { | ||||||
link = entry.StagingPath | ||||||
} else { | ||||||
link = filepath.Join(targetDir, link) | ||||||
} | ||||||
// Set the hardLinkId for hard links | ||||||
hardLinkId = int(entry.Identifier) | ||||||
} else { | ||||||
return fmt.Errorf("hard link target %s not found in the tarball header", tarHeader.Linkname) | ||||||
} | ||||||
} | ||||||
// Set the HardLinkId to both the hard link and its counterpart file, | ||||||
// so they are symmetric in the report. | ||||||
if entry, ok := tarMetadata.HardLinkRevMap["."+targetPath]; ok { | ||||||
hardLinkId = int(entry.Identifier) | ||||||
} | ||||||
|
||||||
createOptions := &fsutil.CreateOptions{ | ||||||
Path: filepath.Join(options.TargetDir, targetPath), | ||||||
Path: filepath.Join(targetDir, targetPath), | ||||||
Mode: tarHeader.FileInfo().Mode(), | ||||||
Data: pathReader, | ||||||
Link: link, | ||||||
MakeParents: true, | ||||||
HardLinkId: hardLinkId, | ||||||
} | ||||||
err := options.Create(extractInfos, createOptions) | ||||||
if err != nil { | ||||||
|
@@ -294,3 +373,47 @@ func parentDirs(path string) []string { | |||||
} | ||||||
return parents | ||||||
} | ||||||
|
||||||
func newTarMetadata() tarMetadata { | ||||||
return tarMetadata{ | ||||||
HardLinkRevMap: make(map[string]hardLinkRevMapEntry), | ||||||
} | ||||||
} | ||||||
Comment on lines
+377
to
+381
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You probably don't need this function since we are using it once and it's quite minimal. |
||||||
|
||||||
func readTarMetadata(tarReader *tar.Reader) (tarMetadata, error) { | ||||||
metadata := newTarMetadata() | ||||||
var hardLinkRevMap = make(map[string][]string) | ||||||
for { | ||||||
tarHeader, err := tarReader.Next() | ||||||
if err == io.EOF { | ||||||
break | ||||||
} | ||||||
if err != nil { | ||||||
return metadata, err | ||||||
} | ||||||
|
||||||
if tarHeader.Typeflag == tar.TypeLink { | ||||||
sourcePath := tarHeader.Name | ||||||
linkPath := tarHeader.Linkname | ||||||
hardLinkRevMap[linkPath] = append(hardLinkRevMap[linkPath], sourcePath) | ||||||
} | ||||||
} | ||||||
|
||||||
// Sort the hard link targets to ensure a deterministic HardLinkId in the report | ||||||
targets := make([]string, 0, len(hardLinkRevMap)) | ||||||
for target := range hardLinkRevMap { | ||||||
targets = append(targets, target) | ||||||
} | ||||||
sort.Strings(targets) | ||||||
|
||||||
for idx, target := range targets { | ||||||
sources := hardLinkRevMap[target] | ||||||
metadata.HardLinkRevMap[target] = hardLinkRevMapEntry{ | ||||||
Target: sources, | ||||||
Identifier: idx + 1, | ||||||
StagingPath: "", | ||||||
} | ||||||
} | ||||||
|
||||||
return metadata, nil | ||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We might need to explain the "base file" since that's something we came up with, or try to think of a better name.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If I understand it correctly, it is probably "base file" == "inode".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this means "The identifier for the hard link should be unique for all the file entries in the same hard link group"