-
Notifications
You must be signed in to change notification settings - Fork 37
/
Copy pathdiff.go
370 lines (320 loc) · 10.5 KB
/
diff.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
// Copyright 2024 Chainguard, Inc.
// SPDX-License-Identifier: Apache-2.0
package action
import (
"context"
"fmt"
"log/slog"
"math"
"os"
"path/filepath"
"regexp"
"sync"
"github.com/agext/levenshtein"
"github.com/chainguard-dev/clog"
"github.com/chainguard-dev/malcontent/pkg/malcontent"
orderedmap "github.com/wk8/go-ordered-map/v2"
"golang.org/x/sync/errgroup"
)
// relFileReport scans fromPath and returns the resulting file reports keyed by
// each file's path relative to the scan root.
//
// The scan is run via recursiveScan using a copy of c whose Renderer is nil and
// whose ScanPaths contains only fromPath. Reports with a non-empty Skipped or
// Error field are left out of the result.
//
// The scan root is fromPath when it is a directory and its parent directory
// otherwise, with symlinks resolved. It is resolved once, lazily, when the
// first eligible report is seen. An error is returned if the scan fails, the
// root cannot be resolved, or a relative path cannot be computed; in that case
// the returned map is nil.
func relFileReport(ctx context.Context, c malcontent.Config, fromPath string) (map[string]*malcontent.FileReport, error) {
	fromConfig := c
	fromConfig.Renderer = nil
	fromConfig.ScanPaths = []string{fromPath}
	fromReport, err := recursiveScan(ctx, fromConfig)
	if err != nil {
		return nil, err
	}

	// resolveRoot determines the directory that report paths are made relative to.
	resolveRoot := func() (string, error) {
		info, err := os.Stat(fromPath)
		if err != nil {
			return "", fmt.Errorf("stat %q: %w", fromPath, err)
		}
		// Remove any filename so the relative path is computed against a directory.
		target := fromPath
		if !info.IsDir() {
			target = filepath.Dir(fromPath)
		}
		// Evaluate symlinks to cover edge cases like macOS' /private/tmp -> /tmp symlink.
		root, err := filepath.EvalSymlinks(target)
		if err != nil {
			return "", fmt.Errorf("evaluate symlinks for %q: %w", target, err)
		}
		// Using "." and "." will show as modifications for completely unrelated files and paths.
		if root == "." {
			root = fromPath
		}
		return root, nil
	}

	var (
		fromRoot     string
		rootResolved bool
		rangeErr     error
	)
	fromRelPath := map[string]*malcontent.FileReport{}
	fromReport.Files.Range(func(key, value any) bool {
		if key == nil || value == nil {
			return true
		}
		fr, ok := value.(*malcontent.FileReport)
		if !ok || fr.Skipped != "" || fr.Error != "" {
			return true
		}
		// fromPath does not change between files, so resolve the root only once.
		if !rootResolved {
			root, err := resolveRoot()
			if err != nil {
				rangeErr = err
				return false
			}
			fromRoot, rootResolved = root, true
		}
		rel, err := filepath.Rel(fromRoot, fr.Path)
		if err != nil {
			rangeErr = fmt.Errorf("relative path of %q from %q: %w", fr.Path, fromRoot, err)
			return false
		}
		fromRelPath[rel] = fr
		return true
	})
	// Surface the failure instead of returning a silently truncated map.
	if rangeErr != nil {
		return nil, rangeErr
	}
	return fromRelPath, nil
}
// Diff scans the two paths in c.ScanPaths concurrently and returns a report
// whose Diff field lists files that were added, removed, or modified between
// the first path (source) and the second path (destination).
//
// It returns an error when c.ScanPaths does not contain exactly two entries or
// when either scan fails.
func Diff(ctx context.Context, c malcontent.Config) (*malcontent.Report, error) {
	if len(c.ScanPaths) != 2 {
		return nil, fmt.Errorf("diff mode requires 2 paths, you passed in %d path(s)", len(c.ScanPaths))
	}

	// Each goroutine writes to its own variable; g.Wait() orders those writes
	// before the reads below, so no channels are needed to hand results back.
	var src, dest map[string]*malcontent.FileReport
	var g errgroup.Group
	g.Go(func() error {
		var err error
		src, err = relFileReport(ctx, c, c.ScanPaths[0])
		return err
	})
	g.Go(func() error {
		var err error
		dest, err = relFileReport(ctx, c, c.ScanPaths[1])
		return err
	})
	if err := g.Wait(); err != nil {
		return nil, err
	}

	d := &malcontent.DiffReport{
		Added:    orderedmap.New[string, *malcontent.FileReport](),
		Removed:  orderedmap.New[string, *malcontent.FileReport](),
		Modified: orderedmap.New[string, *malcontent.FileReport](),
	}

	processSrc(ctx, c, src, dest, d)
	processDest(ctx, c, src, dest, d)

	// A move needs one removed and one added path, so skip inference when
	// either side is empty. Both maps are always non-nil here, which is why
	// this checks length rather than nil.
	if d.Added.Len() > 0 && d.Removed.Len() > 0 {
		inferMoves(ctx, c, d)
	}
	return &malcontent.Report{Diff: d}, nil
}
// processSrc walks every file report found in the source scan. A path that is
// missing from dest is recorded in d.Removed; a path present on both sides is
// passed to handleFile.
//
// src is a plain map, so iteration order (and therefore insertion order into
// d.Removed) is unspecified.
func processSrc(ctx context.Context, c malcontent.Config, src, dest map[string]*malcontent.FileReport, d *malcontent.DiffReport) {
	for relPath, fr := range src {
		if tr, ok := dest[relPath]; ok {
			// present on both sides: compare the two reports
			handleFile(ctx, c, fr, tr, relPath, d)
			continue
		}
		// only in the source: the file was removed
		d.Removed.Set(relPath, fr)
	}
}
// processDest walks every file report found in the destination scan (to). A
// path that is missing from the source scan (from) is recorded in d.Added; a
// path present on both sides is passed to fileDestination.
//
// to is a plain map, so iteration order (and therefore insertion order into
// d.Added) is unspecified.
func processDest(ctx context.Context, c malcontent.Config, from, to map[string]*malcontent.FileReport, d *malcontent.DiffReport) {
	for relPath, tr := range to {
		if fr, ok := from[relPath]; ok {
			// present on both sides: collect behaviors new to the destination
			fileDestination(ctx, c, fr, tr, relPath, d)
			continue
		}
		// only in the destination: the file was added
		d.Added.Set(relPath, tr)
	}
}
// fileDestination handles a file that exists in both source (fr) and
// destination (tr), recording the behaviors that appear only in the
// destination.
//
// Nothing is recorded when both risk scores are below c.MinFileRisk or when
// filterDiff rejects the pair. Otherwise each destination-only behavior is
// flagged DiffAdded and stored under relPath in d.Modified, appended to the
// existing entry when one is already present.
func fileDestination(ctx context.Context, c malcontent.Config, fr, tr *malcontent.FileReport, relPath string, d *malcontent.DiffReport) {
	belowThreshold := fr.RiskScore < c.MinFileRisk && tr.RiskScore < c.MinFileRisk
	if belowThreshold {
		clog.FromContext(ctx).Info("diff does not meet min trigger level", slog.Any("path", tr.Path))
		return
	}

	if filterDiff(ctx, c, fr, tr) {
		return
	}

	added := createFileReport(tr, fr)
	for _, tb := range tr.Behaviors {
		if behaviorExists(tb, fr.Behaviors) {
			continue
		}
		// behavior is new in the destination
		tb.DiffAdded = true
		added.Behaviors = append(added.Behaviors, tb)
	}

	// Merge into an entry already recorded for this path, if there is one.
	if existing, ok := d.Modified.Get(relPath); ok {
		existing.Behaviors = append(existing.Behaviors, added.Behaviors...)
		d.Modified.Set(relPath, existing)
		return
	}
	d.Modified.Set(relPath, added)
}
// handleFile handles a file that exists in both source (fr) and destination
// (tr), recording the source's behaviors under relPath in d.Modified.
//
// Nothing is recorded when both risk scores are below c.MinFileRisk or when
// filterDiff rejects the pair. Every source behavior is included in the
// stored report; those missing from the destination are flagged DiffRemoved,
// and the rest are kept for reference. Any existing d.Modified entry for
// relPath is replaced.
func handleFile(ctx context.Context, c malcontent.Config, fr, tr *malcontent.FileReport, relPath string, d *malcontent.DiffReport) {
	belowThreshold := fr.RiskScore < c.MinFileRisk && tr.RiskScore < c.MinFileRisk
	if belowThreshold {
		clog.FromContext(ctx).Info("diff does not meet min trigger level", slog.Any("path", tr.Path))
		return
	}

	if filterDiff(ctx, c, fr, tr) {
		return
	}

	report := createFileReport(tr, fr)
	for _, fb := range fr.Behaviors {
		// Only behaviors absent from the destination are marked as removed.
		if !behaviorExists(fb, tr.Behaviors) {
			fb.DiffRemoved = true
		}
		report.Behaviors = append(report.Behaviors, fb)
	}
	d.Modified.Set(relPath, report)
}
// createFileReport builds a new FileReport describing a file present on both
// sides of a diff. The current path and risk values come from the destination
// report tr; the "Previous*" fields come from the source report fr. Behaviors
// starts as an empty, non-nil slice.
func createFileReport(tr, fr *malcontent.FileReport) *malcontent.FileReport {
	report := new(malcontent.FileReport)

	// current state, from the destination
	report.Path = tr.Path
	report.RiskScore = tr.RiskScore
	report.RiskLevel = tr.RiskLevel

	// previous state, from the source
	report.PreviousRelPath = fr.Path
	report.PreviousRiskScore = fr.RiskScore
	report.PreviousRiskLevel = fr.RiskLevel

	report.Behaviors = []*malcontent.Behavior{}
	return report
}
// behaviorExists reports whether behaviors contains an entry with the same ID
// as b.
func behaviorExists(b *malcontent.Behavior, behaviors []*malcontent.Behavior) bool {
	for i := range behaviors {
		if behaviors[i].ID == b.ID {
			return true
		}
	}
	return false
}
// filterMap sends to c every pair of om whose key's base name matches at least
// one of the compiled patterns in ps, in om's iteration order.
//
// Each pair is sent at most once, even when several patterns match. This keeps
// the number of sends bounded by om.Len(), so a channel buffered to that size
// never blocks the sender. wg.Done is called when the function returns;
// closing c is left to the caller.
func filterMap(om *orderedmap.OrderedMap[string, *malcontent.FileReport], ps []*regexp.Regexp, c chan<- *orderedmap.Pair[string, *malcontent.FileReport], wg *sync.WaitGroup) {
	defer wg.Done()
	for pair := om.Oldest(); pair != nil; pair = pair.Next() {
		base := filepath.Base(pair.Key)
		for _, pattern := range ps {
			if pattern.MatchString(base) {
				c <- pair
				// One match is enough; further patterns would only produce duplicates.
				break
			}
		}
	}
}
// combine pairs every entry received from removed with every entry received
// from added and returns the pairs whose keys have a Levenshtein match score
// of at least 0.9.
//
// Both channels must be closed by their senders, otherwise combine blocks. The
// returned slice is never nil.
func combine(removed, added <-chan *orderedmap.Pair[string, *malcontent.FileReport]) []malcontent.CombinedReport {
	// A channel can only be consumed once, so buffer the added entries up
	// front. Ranging over the channel inside the outer loop would drain it
	// while handling the first removed entry and leave nothing for the rest.
	candidates := make([]*orderedmap.Pair[string, *malcontent.FileReport], 0, len(added))
	for a := range added {
		candidates = append(candidates, a)
	}

	params := levenshtein.NewParams()
	combined := make([]malcontent.CombinedReport, 0)
	for r := range removed {
		for _, a := range candidates {
			score := levenshtein.Match(r.Key, a.Key, params)
			if score < 0.9 {
				continue
			}
			combined = append(combined, malcontent.CombinedReport{
				Added:     a.Key,
				AddedFR:   a.Value,
				Removed:   r.Key,
				RemovedFR: r.Value,
				Score:     score,
			})
		}
	}
	return combined
}
// combineReports filters d.Removed and d.Added down to the paths whose base
// name matches one of the patterns below, then hands both filtered streams to
// combine and returns its result.
func combineReports(d *malcontent.DiffReport) []malcontent.CombinedReport {
	// Base-name patterns that are considered for move detection.
	ps := []*regexp.Regexp{
		regexp.MustCompile(`^[\w.-]+\.so$`),
		regexp.MustCompile(`^.+-.*-r\d+\.spdx\.json$`),
	}

	// Each channel is buffered to the size of its map so the filter goroutines
	// can finish without a concurrent receiver.
	removed := make(chan *orderedmap.Pair[string, *malcontent.FileReport], d.Removed.Len())
	added := make(chan *orderedmap.Pair[string, *malcontent.FileReport], d.Added.Len())

	var wg sync.WaitGroup
	wg.Add(2)
	go func() {
		defer close(removed)
		filterMap(d.Removed, ps, removed, &wg)
	}()
	go func() {
		defer close(added)
		filterMap(d.Added, ps, added, &wg)
	}()
	wg.Wait()

	return combine(removed, added)
}
// inferMoves calls fileMove for every removed/added pairing returned by
// combineReports.
func inferMoves(ctx context.Context, c malcontent.Config, d *malcontent.DiffReport) {
	moves := combineReports(d)
	for i := range moves {
		m := moves[i]
		fileMove(ctx, c, m.RemovedFR, m.AddedFR, m.Removed, m.Added, d, m.Score)
	}
}
// fileMove records that the file reported by fr at rpath is believed to have
// moved to apath, where it is reported by tr; score is the path-similarity
// score that led to the pairing.
//
// Nothing is recorded when both risk scores are below the smaller of c.MinRisk
// and c.MinFileRisk, or when filterDiff rejects the pair. Otherwise a report
// holding the destination-only behaviors (flagged DiffAdded) followed by the
// source-only behaviors (flagged DiffRemoved) is stored in d.Modified under
// apath, and rpath and apath are deleted from d.Removed and d.Added.
func fileMove(ctx context.Context, c malcontent.Config, fr, tr *malcontent.FileReport, rpath, apath string, d *malcontent.DiffReport, score float64) {
	threshold := int(math.Min(float64(c.MinRisk), float64(c.MinFileRisk)))
	if fr.RiskScore < threshold && tr.RiskScore < threshold {
		clog.FromContext(ctx).Info("diff does not meet min trigger level", slog.Any("path", tr.Path))
		return
	}

	// Filter diffs for files that make it through the combineReports pattern
	// matching, i.e., `.so` and `.spdx.json` files.
	if filterDiff(ctx, c, fr, tr) {
		return
	}

	moved := &malcontent.FileReport{
		Path:                 tr.Path,
		RiskScore:            tr.RiskScore,
		RiskLevel:            tr.RiskLevel,
		PreviousRelPath:      rpath,
		PreviousRelPathScore: score,
		PreviousRiskScore:    fr.RiskScore,
		PreviousRiskLevel:    fr.RiskLevel,
		Behaviors:            []*malcontent.Behavior{},
	}

	// behaviors that exist only in the destination
	for _, tb := range tr.Behaviors {
		if behaviorExists(tb, fr.Behaviors) {
			continue
		}
		tb.DiffAdded = true
		moved.Behaviors = append(moved.Behaviors, tb)
	}

	// behaviors that exist only in the source
	for _, fb := range fr.Behaviors {
		if behaviorExists(fb, tr.Behaviors) {
			continue
		}
		fb.DiffRemoved = true
		moved.Behaviors = append(moved.Behaviors, fb)
	}

	// The pair is now a modification, not a separate add and remove.
	d.Modified.Set(apath, moved)
	d.Removed.Delete(rpath)
	d.Added.Delete(apath)
}
// filterDiff reports whether the diff between the source report fr and the
// destination report tr should be dropped:
//   - true when c.FileRiskChange is set (`--file-risk-change`) and both risk scores are equal
//   - true when c.FileRiskIncrease is set (`--file-risk-increase`) and the source risk score
//     is equal to or greater than the destination risk score
//   - false otherwise
//
// A dropped diff is logged at info level with both paths and scores under the
// "paths" attribute.
func filterDiff(ctx context.Context, c malcontent.Config, fr, tr *malcontent.FileReport) bool {
	// Only built when a result is actually dropped.
	paths := func() slog.Attr {
		return slog.String("paths", fmt.Sprintf("%s (%d) %s (%d)", fr.Path, fr.RiskScore, tr.Path, tr.RiskScore))
	}

	switch {
	case c.FileRiskChange && fr.RiskScore == tr.RiskScore:
		clog.FromContext(ctx).Info("dropping result because diff scores were the same", paths())
		return true
	case c.FileRiskIncrease && fr.RiskScore >= tr.RiskScore:
		clog.FromContext(ctx).Info("dropping result because old score was the same or higher than the new score", paths())
		return true
	}
	return false
}