chainguard-dev · tstromberg · Jul 9, 2024 · Jul 9, 2024 · Jul 9, 2024 · Jul 9, 2024
diff --git a/bincapz.go b/bincapz.go
@@ -70,6 +70,7 @@ func main() {
 	errFirstHitFlag := flag.Bool("err-first-hit", false, "exit with error if scan source has matching capabilities")
 	ociFlag := flag.Bool("oci", false, "Scan an OCI image")
 	omitEmptyFlag := flag.Bool("omit-empty", false, "Omit files that contain no matches")
+	frequencyUpgradeFlag := flag.Bool("frequency-upgrade", true, "increase file risk score based on frequency heuristics")
 	profileFlag := flag.Bool("profile", false, "Generate profile and trace files")
 	statsFlag := flag.Bool("stats", false, "Show statistics about the scan")
 	thirdPartyFlag := flag.Bool("third-party", true, "Include third-party rules, which may have licensing restrictions")
@@ -178,12 +179,13 @@ func main() {
 		return
 	}
 
-	bc := action.Config{
+	bc := bincapz.Config{
 		IgnoreSelf:       *ignoreSelfFlag,
 		IgnoreTags:       ignoreTags,
 		IncludeDataFiles: includeDataFiles,
 		MinFileRisk:      minFileRisk,
 		MinRisk:          minRisk,
+		FrequencyUpgrade: *frequencyUpgradeFlag,
 		OCI:              *ociFlag,
 		OmitEmpty:        *omitEmptyFlag,
 		Renderer:         renderer,

diff --git a/pkg/action/action.go b/pkg/action/action.go
@@ -2,27 +2,3 @@
 // SPDX-License-Identifier: Apache-2.0
 
 package action
-
-import (
-	"io"
-
-	"github.com/chainguard-dev/bincapz/pkg/render"
-	"github.com/hillu/go-yara/v4"
-)
-
-type Config struct {
-	IgnoreSelf       bool
-	IgnoreTags       []string
-	IncludeDataFiles bool
-	MinFileRisk      int
-	MinRisk          int
-	OCI              bool
-	OmitEmpty        bool
-	Output           io.Writer
-	Renderer         render.Renderer
-	Rules            *yara.Rules
-	ScanPaths        []string
-	Stats            bool
-	ErrFirstMiss     bool
-	ErrFirstHit      bool
-}
diff --git a/pkg/action/archive_test.go b/pkg/action/archive_test.go
@@ -10,6 +10,7 @@ import (
 	"strings"
 	"testing"
 
+	"github.com/chainguard-dev/bincapz/pkg/bincapz"
 	"github.com/chainguard-dev/bincapz/pkg/compile"
 	"github.com/chainguard-dev/bincapz/pkg/render"
 	"github.com/chainguard-dev/bincapz/rules"
@@ -228,7 +229,7 @@ func TestScanArchive(t *testing.T) {
 	if err != nil {
 		t.Fatalf("render: %v", err)
 	}
-	bc := Config{
+	bc := bincapz.Config{
 		IgnoreSelf: false,
 		IgnoreTags: []string{"harmless"},
 		Renderer:   simple,

diff --git a/pkg/action/diff.go b/pkg/action/diff.go
@@ -16,7 +16,7 @@ import (
 	"github.com/chainguard-dev/clog"
 )
 
-func relFileReport(ctx context.Context, c Config, fromPath string) (map[string]*bincapz.FileReport, error) {
+func relFileReport(ctx context.Context, c bincapz.Config, fromPath string) (map[string]*bincapz.FileReport, error) {
 	fromConfig := c
 	fromConfig.Renderer = nil
 	fromConfig.ScanPaths = []string{fromPath}
@@ -40,7 +40,7 @@ func relFileReport(ctx context.Context, c Config, fromPath string) (map[string]*
 	return fromRelPath, nil
 }
 
-func Diff(ctx context.Context, c Config) (*bincapz.Report, error) {
+func Diff(ctx context.Context, c bincapz.Config) (*bincapz.Report, error) {
 	if len(c.ScanPaths) != 2 {
 		return nil, fmt.Errorf("diff mode requires 2 paths, you passed in %d path(s)", len(c.ScanPaths))
 	}
@@ -68,7 +68,7 @@ func Diff(ctx context.Context, c Config) (*bincapz.Report, error) {
 	return &bincapz.Report{Diff: d}, err
 }
 
-func processSrc(ctx context.Context, c Config, src, dest map[string]*bincapz.FileReport, d *bincapz.DiffReport) {
+func processSrc(ctx context.Context, c bincapz.Config, src, dest map[string]*bincapz.FileReport, d *bincapz.DiffReport) {
 	// things that appear in the source
 	for relPath, fr := range src {
 		tr, exists := dest[relPath]
@@ -80,7 +80,7 @@ func processSrc(ctx context.Context, c Config, src, dest map[string]*bincapz.Fil
 	}
 }
 
-func handleFile(ctx context.Context, c Config, fr, tr *bincapz.FileReport, relPath string, d *bincapz.DiffReport) {
+func handleFile(ctx context.Context, c bincapz.Config, fr, tr *bincapz.FileReport, relPath string, d *bincapz.DiffReport) {
 	// We've now established that file exists in both source & destination
 	if fr.RiskScore < c.MinFileRisk && tr.RiskScore < c.MinFileRisk {
 		clog.FromContext(ctx).Info("diff does not meet min trigger level", slog.Any("path", tr.Path))
@@ -120,7 +120,7 @@ func behaviorExists(b *bincapz.Behavior, behaviors []*bincapz.Behavior) bool {
 	return false
 }
 
-func processDest(ctx context.Context, c Config, from, to map[string]*bincapz.FileReport, d *bincapz.DiffReport) {
+func processDest(ctx context.Context, c bincapz.Config, from, to map[string]*bincapz.FileReport, d *bincapz.DiffReport) {
 	// things that exist in the destination
 	for relPath, tr := range to {
 		fr, exists := from[relPath]
@@ -133,7 +133,7 @@ func processDest(ctx context.Context, c Config, from, to map[string]*bincapz.Fil
 	}
 }
 
-func fileDestination(ctx context.Context, c Config, fr, tr *bincapz.FileReport, relPath string, d *bincapz.DiffReport) {
+func fileDestination(ctx context.Context, c bincapz.Config, fr, tr *bincapz.FileReport, relPath string, d *bincapz.DiffReport) {
 	// We've now established that this file exists in both source and destination
 	if fr.RiskScore < c.MinFileRisk && tr.RiskScore < c.MinFileRisk {
 		clog.FromContext(ctx).Info("diff does not meet min trigger level", slog.Any("path", tr.Path))
@@ -158,7 +158,7 @@ func fileDestination(ctx context.Context, c Config, fr, tr *bincapz.FileReport,
 	}
 }
 
-func inferMoves(ctx context.Context, c Config, d *bincapz.DiffReport) {
+func inferMoves(ctx context.Context, c bincapz.Config, d *bincapz.DiffReport) {
 	// Walk over the added/removed paths and infer moves based on the
 	// levenshtein distance of the file names.  If the distance is a 90+% match,
 	// then treat it as a move.
@@ -184,7 +184,7 @@ func inferMoves(ctx context.Context, c Config, d *bincapz.DiffReport) {
 	}
 }
 
-func fileMove(ctx context.Context, c Config, fr, tr *bincapz.FileReport, rpath, apath string, score float64, d *bincapz.DiffReport) {
+func fileMove(ctx context.Context, c bincapz.Config, fr, tr *bincapz.FileReport, rpath, apath string, score float64, d *bincapz.DiffReport) {
 	if fr.RiskScore < c.MinFileRisk && tr.RiskScore < c.MinFileRisk {
 		clog.FromContext(ctx).Info("diff does not meet min trigger level", slog.Any("path", tr.Path))
 		return

diff --git a/pkg/action/oci_test.go b/pkg/action/oci_test.go
@@ -7,6 +7,7 @@ import (
 	"regexp"
 	"testing"
 
+	"github.com/chainguard-dev/bincapz/pkg/bincapz"
 	"github.com/chainguard-dev/bincapz/pkg/compile"
 	"github.com/chainguard-dev/bincapz/pkg/render"
 	"github.com/chainguard-dev/bincapz/rules"
@@ -45,7 +46,7 @@ func TestOCI(t *testing.T) {
 		t.Fatalf("oci: %v", err)
 	}
 
-	bc := Config{
+	bc := bincapz.Config{
 		IgnoreSelf: false,
 		IgnoreTags: []string{"harmless"},
 		Renderer:   simple,

diff --git a/pkg/action/scan.go b/pkg/action/scan.go
@@ -69,7 +69,7 @@ func formatPath(path string) string {
 }
 
 // scanSinglePath YARA scans a single path and converts it to a fileReport.
-func scanSinglePath(ctx context.Context, c Config, yrs *yara.Rules, path string, absPath string, archiveRoot string) (*bincapz.FileReport, error) {
+func scanSinglePath(ctx context.Context, c bincapz.Config, yrs *yara.Rules, path string, absPath string, archiveRoot string) (*bincapz.FileReport, error) {
 	logger := clog.FromContext(ctx)
 	var mrs yara.MatchRules
 	logger = logger.With("path", path)
@@ -87,7 +87,7 @@ func scanSinglePath(ctx context.Context, c Config, yrs *yara.Rules, path string,
 		return &bincapz.FileReport{Path: path, Error: fmt.Sprintf("scanfile: %v", err)}, nil
 	}
 
-	fr, err := report.Generate(ctx, path, mrs, c.IgnoreTags, c.MinRisk, c.IgnoreSelf)
+	fr, err := report.Generate(ctx, path, mrs, c)
 	if err != nil {
 		return nil, err
 	}
@@ -148,7 +148,7 @@ func errIfHitOrMiss(frs map[string]*bincapz.FileReport, kind string, scanPath st
 }
 
 // recursiveScan recursively YARA scans the configured paths - handling archives and OCI images.
-func recursiveScan(ctx context.Context, c Config) (*bincapz.Report, error) {
+func recursiveScan(ctx context.Context, c bincapz.Config) (*bincapz.Report, error) {
 	logger := clog.FromContext(ctx)
 	logger.Debug("recursive scan", slog.Any("config", c))
 	r := &bincapz.Report{
@@ -255,7 +255,7 @@ func recursiveScan(ctx context.Context, c Config) (*bincapz.Report, error) {
 }
 
 // processArchive extracts and scans a single archive file.
-func processArchive(ctx context.Context, c Config, yrs *yara.Rules, archivePath string, logger *clog.Logger) (map[string]*bincapz.FileReport, error) {
+func processArchive(ctx context.Context, c bincapz.Config, yrs *yara.Rules, archivePath string, logger *clog.Logger) (map[string]*bincapz.FileReport, error) {
 	logger = logger.With("archivePath", archivePath)
 
 	var err error
@@ -288,7 +288,7 @@ func processArchive(ctx context.Context, c Config, yrs *yara.Rules, archivePath
 }
 
 // processFile scans a single output file, rendering live output if available.
-func processFile(ctx context.Context, c Config, yrs *yara.Rules, path string, scanPath string, archiveRoot string, logger *clog.Logger) (*bincapz.FileReport, error) {
+func processFile(ctx context.Context, c bincapz.Config, yrs *yara.Rules, path string, scanPath string, archiveRoot string, logger *clog.Logger) (*bincapz.FileReport, error) {
 	logger = logger.With("path", path)
 
 	fr, err := scanSinglePath(ctx, c, yrs, path, scanPath, archiveRoot)
@@ -321,7 +321,7 @@ func processFile(ctx context.Context, c Config, yrs *yara.Rules, path string, sc
 }
 
 // Scan YARA scans a data source, applying output filters if necessary.
-func Scan(ctx context.Context, c Config) (*bincapz.Report, error) {
+func Scan(ctx context.Context, c bincapz.Config) (*bincapz.Report, error) {
 	r, err := recursiveScan(ctx, c)
 	if err != nil {
 		return r, err

diff --git a/pkg/bincapz/bincapz.go b/pkg/bincapz/bincapz.go
@@ -3,6 +3,37 @@
 
 package bincapz
 
+import (
+	"context"
+	"io"
+
+	"github.com/hillu/go-yara/v4"
+)
+
+// Renderer is the common interface for report renderers: File is handed a single FileReport and Full a complete Report; both take a context and return an error.
+type Renderer interface {
+	File(context.Context, *FileReport) error
+	Full(context.Context, *Report) error
+}
+
+type Config struct { // Config carries the options passed to the action package (Scan, Diff) and to report.Generate.
+	IgnoreSelf       bool        // read by report.Generate as ignoreSelf; exact effect is not visible in this diff
+	IgnoreTags       []string    // report.Generate collects these tags into its ignore set
+	IncludeDataFiles bool        // presumably controls whether data files are scanned; confirm against the scanner
+	FrequencyUpgrade bool        // when true, report.Generate may raise an overall risk score of 3 to 4 via upgradeRisk
+	MinFileRisk      int         // diff mode skips a file pair when both RiskScore values are below this
+	MinRisk          int         // read by report.Generate as minScore
+	OCI              bool        // set from the -oci flag ("Scan an OCI image")
+	OmitEmpty        bool        // set from the -omit-empty flag ("Omit files that contain no matches")
+	Output           io.Writer   // presumably the destination for output; confirm against callers
+	Renderer         Renderer    // renderer for reports; relFileReport sets it to nil on its private copy
+	Rules            *yara.Rules // YARA rule set; presumably the compiled rules used for scanning (confirm)
+	ScanPaths        []string    // paths to scan; Diff requires exactly two
+	Stats            bool        // presumably set from the -stats flag ("Show statistics about the scan"); confirm
+	ErrFirstMiss     bool        // NOTE(review): semantics not visible in this diff; confirm against errIfHitOrMiss
+	ErrFirstHit      bool        // presumably set from the -err-first-hit flag; confirm against errIfHitOrMiss
+}
+
 type Behavior struct {
 	Description string `json:",omitempty" yaml:",omitempty"`
 	// MatchStrings are all strings found relating to this behavior
@@ -29,6 +60,7 @@ type Behavior struct {
 type FileReport struct {
 	Path   string
 	SHA256 string
+	Size   int64
 	// compiler -> x
 	Error             string            `json:",omitempty" yaml:",omitempty"`
 	Skipped           string            `json:",omitempty" yaml:",omitempty"`

diff --git a/pkg/render/render.go b/pkg/render/render.go
@@ -4,21 +4,14 @@
 package render
 
 import (
-	"context"
 	"fmt"
 	"io"
 
 	"github.com/chainguard-dev/bincapz/pkg/bincapz"
 )
 
-// Renderer is a common interface for Renderers.
-type Renderer interface {
-	File(context.Context, *bincapz.FileReport) error
-	Full(context.Context, *bincapz.Report) error
-}
-
 // New returns a new Renderer.
-func New(kind string, w io.Writer) (Renderer, error) {
+func New(kind string, w io.Writer) (bincapz.Renderer, error) {
 	switch kind {
 	case "", "auto", "terminal":
 		return NewTerminal(w), nil

diff --git a/pkg/report/report.go b/pkg/report/report.go
@@ -253,19 +253,27 @@ func matchStrings(ruleName string, ms []yara.MatchString) []string {
 	return longestUnique(raw)
 }
 
-func pathChecksum(path string) (string, error) {
+func sizeAndChecksum(path string) (int64, string, error) {
+	s, err := os.Stat(path)
+	if err != nil {
+		return -1, "", err
+	}
+
+	size := s.Size()
+
 	f, err := os.Open(path)
 	if err != nil {
-		return fmt.Sprintf("err-%v", err), nil
+		return size, "", err
 	}
+
 	defer f.Close()
 
 	h := sha256.New()
 	if _, err := io.Copy(h, f); err != nil {
-		return "", err
+		return size, "", err
 	}
 
-	return fmt.Sprintf("%x", h.Sum(nil)), nil
+	return size, fmt.Sprintf("%x", h.Sum(nil)), nil
 }
 
 // fixURL fixes badly formed URLs.
@@ -286,20 +294,25 @@ func mungeDescription(s string) string {
 }
 
 //nolint:cyclop // ignore complexity of 44
-func Generate(ctx context.Context, path string, mrs yara.MatchRules, ignoreTags []string, minScore int, ignoreSelf bool) (bincapz.FileReport, error) {
+func Generate(ctx context.Context, path string, mrs yara.MatchRules, c bincapz.Config) (bincapz.FileReport, error) {
+	ignoreTags := c.IgnoreTags
+	minScore := c.MinRisk
+	ignoreSelf := c.IgnoreSelf
+
 	ignore := map[string]bool{}
 	for _, t := range ignoreTags {
 		ignore[t] = true
 	}
 
-	ptCheck, err := pathChecksum(path)
+	size, checksum, err := sizeAndChecksum(path)
 	if err != nil {
 		return bincapz.FileReport{}, err
 	}
 
 	fr := bincapz.FileReport{
 		Path:      path,
-		SHA256:    ptCheck,
+		SHA256:    checksum,
+		Size:      size,
 		Meta:      map[string]string{},
 		Behaviors: []*bincapz.Behavior{},
 	}
@@ -449,7 +462,7 @@ func Generate(ctx context.Context, path string, mrs yara.MatchRules, ignoreTags
 	}
 
 	// If something has a lot of high, it's probably critical
-	if riskCounts[3] >= 4 {
+	if c.FrequencyUpgrade && upgradeRisk(ctx, overallRiskScore, riskCounts, size) {
 		overallRiskScore = 4
 	}
 
@@ -465,6 +478,47 @@ func Generate(ctx context.Context, path string, mrs yara.MatchRules, ignoreTags
 	return fr, nil
 }
 
+// upgradeRisk reports whether a file with overall risk score 3 should be upgraded, based on how many risk-3 findings (riskCounts[3]) it has relative to its size in bytes.
+func upgradeRisk(ctx context.Context, riskScore int, riskCounts map[int]int, size int64) bool {
+	if riskScore != 3 { // only a file whose overall score is exactly 3 can be upgraded
+		return false
+	}
+	highCount := riskCounts[3]
+	sizeMB := size / 1024 / 1024 // whole MiB; integer division truncates, so files under 1 MiB yield 0
+	upgrade := false
+
+	// small scripts, tiny ELF binaries (under 1024 bytes): more than 1 risk-3 finding
+	if size < 1024 && highCount > 1 {
+		upgrade = true
+	}
+
+	// include most UPX binaries (under 2 MiB): more than 2 risk-3 findings
+	if sizeMB < 2 && highCount > 2 {
+		upgrade = true
+	}
+
+	if sizeMB < 10 && highCount > 3 { // under 10 MiB: more than 3 risk-3 findings
+		upgrade = true
+	}
+
+	// bloated go binaries (under 20 MiB): more than 4 risk-3 findings
+	if sizeMB < 20 && highCount > 4 {
+		upgrade = true
+	}
+
+	if highCount > 6 { // any size: more than 6 risk-3 findings
+		upgrade = true
+	}
+
+	if !upgrade { // no threshold matched; return before the debug log below
+		return false
+	}
+
+	clog.DebugContextf(ctx, "upgrading risk: high=%d, size=%d", highCount, size)
+
+	return upgrade // always true at this point
+}
+
 // all returns a single boolean based on a slice of booleans.
 func all(conditions ...bool) bool {
 	for _, condition := range conditions {