Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Playground PR for verification caching exploration #3457

Draft
wants to merge 15 commits into
base: main
Choose a base branch
from
15 changes: 12 additions & 3 deletions pkg/detectors/detectors.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,9 @@ type Result struct {
// DetectorName is the name of the Detector. Used for custom detectors.
DetectorName string
// DecoderType is the type of Decoder.
DecoderType detectorspb.DecoderType
Verified bool
DecoderType detectorspb.DecoderType
Verified bool
VerificationFromCache bool
// Raw contains the raw secret identifier data. Prefer IDs over secrets since it is used for deduping after hashing.
Raw []byte
// RawV2 contains the raw secret identifier that is a combination of both the ID and the secret.
Expand All @@ -113,7 +114,15 @@ type Result struct {
AnalysisInfo map[string]string
}

// SetVerificationError is the only way to set a verification error. Any sensitive values should be passed-in as secrets to be redacted.
// CopyVerificationInfo clones verification info (status and error) from another Result struct. This is used when
// loading verification info from a verification cache. (A method is necessary because verification errors are not
// exported, to prevent the accidental storage of sensitive information in them.)
func (r *Result) CopyVerificationInfo(from *Result) {
r.Verified = from.Verified
r.verificationError = from.verificationError
}

// SetVerificationError is the only way to set a new verification error. Any sensitive values should be passed-in as secrets to be redacted.
func (r *Result) SetVerificationError(err error, secrets ...string) {
if err != nil {
r.verificationError = redactSecrets(err, secrets...)
Expand Down
17 changes: 16 additions & 1 deletion pkg/engine/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ import (
"github.com/adrg/strutil"
"github.com/adrg/strutil/metrics"
lru "github.com/hashicorp/golang-lru/v2"
"github.com/trufflesecurity/trufflehog/v3/pkg/cache"
"github.com/trufflesecurity/trufflehog/v3/pkg/verificationcaching"
"google.golang.org/protobuf/proto"

"github.com/trufflesecurity/trufflehog/v3/pkg/common"
Expand Down Expand Up @@ -155,6 +157,10 @@ type Engine struct {
concurrency int
decoders []decoders.Decoder
detectors []detectors.Detector
// verificationCache must be thread-safe
verificationCache cache.Cache[*detectors.Result]
// getVerificationCacheKey must be thread-safe
getVerificationCacheKey func(result *detectors.Result) string
// Any detectors configured to override sources' verification flags
detectorVerificationOverrides map[config.DetectorID]bool

Expand Down Expand Up @@ -219,6 +225,8 @@ func NewEngine(ctx context.Context, cfg *Config) (*Engine, error) {
concurrency: cfg.Concurrency,
decoders: cfg.Decoders,
detectors: cfg.Detectors,
verificationCache: nil,
getVerificationCacheKey: func(result *detectors.Result) string { panic("cache should be unused") },
dispatcher: cfg.Dispatcher,
verify: cfg.Verify,
filterUnverified: cfg.FilterUnverified,
Expand Down Expand Up @@ -1052,7 +1060,14 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
t := time.AfterFunc(detectionTimeout+1*time.Second, func() {
ctx.Logger().Error(nil, "a detector ignored the context timeout")
})
results, err := data.detector.Detector.FromData(ctx, data.chunk.Verify, matchBytes)
results, err := verificationcaching.FromDataCached(
ctx,
e.verificationCache,
e.getVerificationCacheKey,
data.detector.Detector,
data.chunk.Verify,
data.chunk.SecretID != 0,
matchBytes)
t.Stop()
cancel()
if err != nil {
Expand Down
80 changes: 80 additions & 0 deletions pkg/verificationcaching/verificationcaching.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package verificationcaching

import (
"github.com/trufflesecurity/trufflehog/v3/pkg/cache"
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
)

// FromDataCached executes detection on chunk data in a way that uses a provided verification cache to deduplicate
// verification requests when possible.
//
// If the provided cache is nil, this function simply returns the result of the provided detector's FromData method.
//
// If verify is false, this function returns the result of the provided detector's FromData method. In this case, the
// cache is only updated if forceCacheUpdate is true.
//
// If verify is true, and forceCacheUpdate is false, this function first executes the provided detector's FromData
// method with verification disabled. Then, the cache is queried for each result. If they are all present in the cache,
// the cached values are returned. Otherwise, the provided detector's FromData method is invoked (again) with
// verification enabled, and the results are used to update the cache before being returned.
//
// If verify is true, and forceCacheUpdate is also true, the provided detector's FromData method is invoked, and the
// results are used to update the cache before being returned.
func FromDataCached(
ctx context.Context,
verificationCache cache.Cache[*detectors.Result],
getCacheKey func(result *detectors.Result) string,
detector detectors.Detector,
verify bool,
forceCacheUpdate bool,
data []byte,
) ([]detectors.Result, error) {

if verificationCache == nil {
return detector.FromData(ctx, verify, data)
}

if !forceCacheUpdate {
withoutRemoteVerification, err := detector.FromData(ctx, false, data)
if err != nil {
return nil, err
}

if !verify {
return withoutRemoteVerification, nil
}

isEverythingCached := false
for _, r := range withoutRemoteVerification {
if cacheHit, ok := verificationCache.Get(getCacheKey(&r)); ok {
r.CopyVerificationInfo(cacheHit)
} else {
isEverythingCached = false
break
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was playing around converting this to slices.ContainsFunc, but then I realized maybe it's not optimal to break here? If we run into something that isn't yet cached, everything after it won't be updated from cache. Maybe that's fine though?

(also dunno what I think about the Func functions in slices; I guess the main use case is to not dupe logic in loops, but that doesn't apply here)

Copy link
Collaborator Author

@rosecodym rosecodym Oct 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this is just kinda unfortunate. If we find something that's not cached, we have to rerun FromData, which will return all the results for this chunk - whether they're cached or not. So there's no reason to read from the cache. (Did I successfully explain that?)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So there's no reason to read from the cache. (Did I successfully explain that?)

Essentially, this is a stopgap solution to handle duplicate chunks (e.g., scanning orgs + members that have multiple forks of kubernetes/kubernetes.)

Proper caching on a granular level will require splitting FromData and Verify into discrete functions.

// pseudo-code
cache := cache.New()

for _, chunk := range chunks {
    for _, d := range detectors {
        results := d.FromData(ctx, chunk)
        
        for result := range results {
            if val, ok := cache.Get(detector.Type, result.Raw); ok {
                result.Update(val)
            } else {
                detector.Verify(result)
                cache.Set(detector.Type, result.Raw)
            }
        }
    }
}

}
}

if isEverythingCached {
return withoutRemoteVerification, nil
}
}

withRemoteVerification, err := detector.FromData(ctx, verify, data)
if err != nil {
return nil, err
}

for _, r := range withRemoteVerification {
copyForCaching := r
// Do not persist raw secret values in a long-lived cache
copyForCaching.Raw = nil
copyForCaching.RawV2 = nil
// Decoder type will be set later, so clear it out now to minimize the chance of accidentally cloning it
copyForCaching.DecoderType = detectorspb.DecoderType_UNKNOWN
verificationCache.Set(getCacheKey(&r), &copyForCaching)
}

return withRemoteVerification, nil
}
Loading