Skip to content

Commit

Permalink
Improve diff performance (chainguard-dev#429)
Browse files Browse the repository at this point in the history
* Improve diff performance

Signed-off-by: egibs <[email protected]>

* Appease the linter

Signed-off-by: egibs <[email protected]>

* Copy loop vars

Signed-off-by: egibs <[email protected]>

* Address PR comments; fix channel bug with a WaitGroup

Signed-off-by: egibs <[email protected]>

* Update samples

Signed-off-by: egibs <[email protected]>

* Appease the linter

Signed-off-by: egibs <[email protected]>

* Don't use concurrency for combine

Signed-off-by: egibs <[email protected]>

---------

Signed-off-by: egibs <[email protected]>
  • Loading branch information
egibs authored Aug 27, 2024
1 parent bdcb640 commit 790c304
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 45 deletions.
137 changes: 92 additions & 45 deletions pkg/action/diff.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@ import (
"log/slog"
"math"
"path/filepath"
"regexp"
"sync"

"github.com/agext/levenshtein"
"github.com/chainguard-dev/bincapz/pkg/bincapz"
"github.com/chainguard-dev/clog"
orderedmap "github.com/wk8/go-ordered-map/v2"
"golang.org/x/sync/errgroup"
)

func relFileReport(ctx context.Context, c bincapz.Config, fromPath string) (map[string]*bincapz.FileReport, error) {
Expand Down Expand Up @@ -45,13 +47,28 @@ func Diff(ctx context.Context, c bincapz.Config) (*bincapz.Report, error) {
return nil, fmt.Errorf("diff mode requires 2 paths, you passed in %d path(s)", len(c.ScanPaths))
}

src, err := relFileReport(ctx, c, c.ScanPaths[0])
if err != nil {
return nil, err
}
var g errgroup.Group
g.SetLimit(2) // create src and dest relFileReports concurrently

dest, err := relFileReport(ctx, c, c.ScanPaths[1])
if err != nil {
var src, dest map[string]*bincapz.FileReport
var err error
g.Go(func() error {
src, err = relFileReport(ctx, c, c.ScanPaths[0])
if err != nil {
return err
}
return nil
})

g.Go(func() error {
dest, err = relFileReport(ctx, c, c.ScanPaths[1])
if err != nil {
return err
}
return nil
})

if err := g.Wait(); err != nil {
return nil, err
}

Expand Down Expand Up @@ -164,55 +181,88 @@ func fileDestination(ctx context.Context, c bincapz.Config, fr, tr *bincapz.File
}
}

type diffReports struct {
Added string
AddedFR *bincapz.FileReport
Removed string
RemovedFR *bincapz.FileReport
// filterMap filters orderedmap pairs by checking for matches against a slice of compiled regular expression patterns.
func filterMap(om *orderedmap.OrderedMap[string, *bincapz.FileReport], ps []*regexp.Regexp, c chan<- *orderedmap.Pair[string, *bincapz.FileReport], wg *sync.WaitGroup) {
defer wg.Done()
for pair := om.Oldest(); pair != nil; pair = pair.Next() {
for _, pattern := range ps {
if match := pattern.FindString(filepath.Base(pair.Key)); match != "" {
c <- pair
}
}
}
}

// combineReports builds a flattened slice of added and removed paths and their respective file reports.
func combineReports(d *bincapz.DiffReport) []diffReports {
diffs := make(chan diffReports)
// combine iterates over the removed and added channels to create a diff report to store in the combined channel.
func combine(removed, added <-chan *orderedmap.Pair[string, *bincapz.FileReport], combined chan<- bincapz.CombinedReport) {
for r := range removed {
for a := range added {
score := levenshtein.Match(r.Key, a.Key, levenshtein.NewParams())
if score < 0.9 {
continue
}
combined <- bincapz.CombinedReport{
Added: a.Key,
AddedFR: a.Value,
Removed: r.Key,
RemovedFR: r.Value,
Score: score,
}
}
}
}

// combineReports orchestrates the population of the diffs channel with relevant diffReports.
func combineReports(d *bincapz.DiffReport, combined chan<- bincapz.CombinedReport) {
defer close(combined)

var wg sync.WaitGroup

for removed := d.Removed.Oldest(); removed != nil; removed = removed.Next() {
wg.Add(1)
go func(path string, fr *bincapz.FileReport) {
defer wg.Done()
for added := d.Added.Oldest(); added != nil; added = added.Next() {
diffs <- diffReports{
Added: added.Key,
AddedFR: added.Value,
Removed: path,
RemovedFR: fr,
}
}
}(removed.Key, removed.Value)
// Patterns we care about when handling diffs
patterns := []string{
`^[\w.-]+\.so$`,
`^.+-.*-r\d+\.spdx\.json$`,
}

ps := make([]*regexp.Regexp, len(patterns))
for i, pattern := range patterns {
ps[i] = regexp.MustCompile(pattern)
}

// Build two channels with filtered paths to iterate through in the worker pool
removed := make(chan *orderedmap.Pair[string, *bincapz.FileReport], d.Removed.Len())
added := make(chan *orderedmap.Pair[string, *bincapz.FileReport], d.Added.Len())

wg.Add(1)
go func() {
wg.Wait()
close(diffs)
filterMap(d.Removed, ps, removed, &wg)
close(removed)
}()

var reports []diffReports
for diff := range diffs {
reports = append(reports, diff)
}
return reports
wg.Add(1)
go func() {
filterMap(d.Added, ps, added, &wg)
close(added)
}()

combine(removed, added, combined)

wg.Wait()
}

func inferMoves(ctx context.Context, c bincapz.Config, d *bincapz.DiffReport) {
flattenedDiffs := combineReports(d)
// Create a channel with enough capacity to hold the entirety of the two maps
// This is the worst case since we will always filter out irrelevant filepaths
combined := make(chan bincapz.CombinedReport, d.Removed.Len()+d.Added.Len())

combineReports(d, combined)

for _, fd := range flattenedDiffs {
score := levenshtein.Match(fd.Removed, fd.Added, levenshtein.NewParams())
fileMove(ctx, c, fd.RemovedFR, fd.AddedFR, fd.Removed, fd.Added, score, d)
for dr := range combined {
fileMove(ctx, c, dr.RemovedFR, dr.AddedFR, dr.Removed, dr.Added, d, dr.Score)
}
}

func fileMove(ctx context.Context, c bincapz.Config, fr, tr *bincapz.FileReport, rpath, apath string, score float64, d *bincapz.DiffReport) {
func fileMove(ctx context.Context, c bincapz.Config, fr, tr *bincapz.FileReport, rpath, apath string, d *bincapz.DiffReport, score float64) {
minRisk := int(math.Min(float64(c.MinRisk), float64(c.MinFileRisk)))
if fr.RiskScore < minRisk && tr.RiskScore < minRisk {
clog.FromContext(ctx).Info("diff does not meet min trigger level", slog.Any("path", tr.Path))
Expand Down Expand Up @@ -249,10 +299,7 @@ func fileMove(ctx context.Context, c bincapz.Config, fr, tr *bincapz.FileReport,
}
}

// Move these into the modified list if the files are not completely different (something like ~0.3)
if score > 0.3 {
d.Modified.Set(apath, abs)
d.Modified.Delete(rpath)
d.Modified.Delete(apath)
}
d.Modified.Set(apath, abs)
d.Removed.Delete(rpath)
d.Added.Delete(apath)
}
8 changes: 8 additions & 0 deletions pkg/bincapz/bincapz.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,11 @@ type StrMetric struct {
Total int
Value float64
}

type CombinedReport struct {
Added string
AddedFR *FileReport
Removed string
RemovedFR *FileReport
Score float64
}
Binary file modified samples.tar.gz.aa
Binary file not shown.
Binary file modified samples.tar.gz.ab
Binary file not shown.
Binary file modified samples.tar.gz.ac
Binary file not shown.
Binary file modified samples.tar.gz.ad
Binary file not shown.

0 comments on commit 790c304

Please sign in to comment.