diff --git a/go.mod b/go.mod index e4e6141d3753..f327ca02bbfc 100644 --- a/go.mod +++ b/go.mod @@ -156,6 +156,7 @@ require ( github.com/apache/arrow/go/v14 v14.0.2 // indirect github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect github.com/atotto/clipboard v0.1.4 // indirect + github.com/avast/apkparser v0.0.0-20240729092610-90591e0804ae // indirect github.com/aws/smithy-go v1.20.1 // indirect github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect github.com/aymerick/douceur v0.2.0 // indirect @@ -181,6 +182,7 @@ require ( github.com/couchbase/goprotostellar v1.0.2 // indirect github.com/couchbaselabs/gocbconnstr/v2 v2.0.0-20240607131231-fb385523de28 // indirect github.com/cpuguy83/dockercfg v0.3.2 // indirect + github.com/csnewman/dextk v0.3.0 // indirect github.com/cyphar/filepath-securejoin v0.2.4 // indirect github.com/danieljoos/wincred v1.1.2 // indirect github.com/davecgh/go-spew v1.1.1 // indirect diff --git a/go.sum b/go.sum index 4659bbac7d83..439480caada7 100644 --- a/go.sum +++ b/go.sum @@ -173,6 +173,8 @@ github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3d github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw= github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z4= github.com/atotto/clipboard v0.1.4/go.mod h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI= +github.com/avast/apkparser v0.0.0-20240729092610-90591e0804ae h1:rDNramK9mnAbvUBJyIRZnzHchM45cXexHIX9pS9da4Q= +github.com/avast/apkparser v0.0.0-20240729092610-90591e0804ae/go.mod h1:GNvprXNmXaDjpHmN3RFxz5QdK5VXTUvmQludCbjoBy4= github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU= github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= github.com/aws/smithy-go v1.20.1 h1:4SZlSlMr36UEqC7XOyRVb27XMeZubNcBNN+9IgEPIQw= @@ -277,6 +279,8 @@ github.com/creack/pty v1.1.18 
h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= github.com/crewjam/rfc5424 v0.1.0 h1:MSeXJm22oKovLzWj44AHwaItjIMUMugYGkEzfa831H8= github.com/crewjam/rfc5424 v0.1.0/go.mod h1:RCi9M3xHVOeerf6ULZzqv2xOGRO/zYaVUeRyPnBW3gQ= +github.com/csnewman/dextk v0.3.0 h1:gigNZlZRNfCuARV7depunRlafEAzGhyvgBQo1FT3/0M= +github.com/csnewman/dextk v0.3.0/go.mod h1:FcDoI3258ea0KPQogyv4iazQRGcLFNOW+I4pHBUfNO0= github.com/cyphar/filepath-securejoin v0.2.4 h1:Ugdm7cg7i6ZK6x3xDF1oEu1nfkyfH53EtKeQYTC3kyg= github.com/cyphar/filepath-securejoin v0.2.4/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= github.com/danieljoos/wincred v1.1.2 h1:QLdCxFs1/Yl4zduvBdcHB8goaYk9RARS2SgLLRuAyr0= diff --git a/hack/snifftest/main.go b/hack/snifftest/main.go index 8f76c7a361ea..0068eb339ff7 100644 --- a/hack/snifftest/main.go +++ b/hack/snifftest/main.go @@ -17,7 +17,7 @@ import ( "github.com/trufflesecurity/trufflehog/v3/pkg/context" "github.com/trufflesecurity/trufflehog/v3/pkg/decoders" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" - "github.com/trufflesecurity/trufflehog/v3/pkg/engine" + "github.com/trufflesecurity/trufflehog/v3/pkg/engine/defaults" "github.com/trufflesecurity/trufflehog/v3/pkg/log" "github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb" "github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb" @@ -247,7 +247,7 @@ func main() { func getAllScanners() map[string]detectors.Detector { allScanners := map[string]detectors.Detector{} - for _, s := range engine.DefaultDetectors() { + for _, s := range defaults.DefaultDetectors() { secretType := reflect.Indirect(reflect.ValueOf(s)).Type().PkgPath() path := strings.Split(secretType, "/")[len(strings.Split(secretType, "/"))-1] allScanners[path] = s diff --git a/main.go b/main.go index 008f60b72fde..abd635f07e5a 100644 --- a/main.go +++ b/main.go @@ -28,6 +28,7 @@ import ( "github.com/trufflesecurity/trufflehog/v3/pkg/config" 
"github.com/trufflesecurity/trufflehog/v3/pkg/context" "github.com/trufflesecurity/trufflehog/v3/pkg/engine" + "github.com/trufflesecurity/trufflehog/v3/pkg/engine/defaults" "github.com/trufflesecurity/trufflehog/v3/pkg/feature" "github.com/trufflesecurity/trufflehog/v3/pkg/handlers" "github.com/trufflesecurity/trufflehog/v3/pkg/log" @@ -409,6 +410,9 @@ func run(state overseer.State) { feature.UserAgentSuffix.Store(*userAgentSuffix) } + // OSS Default APK handling on + feature.EnableAPKHandler.Store(true) + conf := &config.Config{} if *configFilename != "" { var err error @@ -461,7 +465,7 @@ func run(state overseer.State) { // default detectors, which can be further filtered by the // user. The filters are applied by the engine and are only // subtractive. - Detectors: append(engine.DefaultDetectors(), conf.Detectors...), + Detectors: append(defaults.DefaultDetectors(), conf.Detectors...), Verify: !*noVerification, IncludeDetectors: *includeDetectors, ExcludeDetectors: *excludeDetectors, diff --git a/pkg/engine/defaults.go b/pkg/engine/defaults/defaults.go similarity index 97% rename from pkg/engine/defaults.go rename to pkg/engine/defaults/defaults.go index 1cb8776784df..1bc9266beb3a 100644 --- a/pkg/engine/defaults.go +++ b/pkg/engine/defaults/defaults.go @@ -1,6 +1,11 @@ -package engine +package defaults import ( + "bytes" + "strings" + "sync" + + ahocorasick "github.com/BobuSumisu/aho-corasick" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/abbysale" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/abuseipdb" @@ -811,8 +816,8 @@ import ( "github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb" ) -func DefaultDetectors() []detectors.Detector { - detectorList := []detectors.Detector{ +func buildDetectorList() []detectors.Detector { + return []detectors.Detector{ &abbysale.Scanner{}, // &abstract.Scanner{}, &abuseipdb.Scanner{}, @@ -1647,6 +1652,10 @@ func DefaultDetectors() 
[]detectors.Detector { &zonkafeedback.Scanner{}, &zulipchat.Scanner{}, } +} + +func DefaultDetectors() []detectors.Detector { + detectorList := buildDetectorList() // Automatically initialize all detectors that implement // EndpointCustomizer and/or CloudProvider interfaces. @@ -1675,3 +1684,60 @@ func DefaultDetectorTypesImplementing[T any]() map[detectorspb.DetectorType]stru } return out } + +func defaultDetectorKeywords() []string { + allDetectors := buildDetectorList() + + // Remove keywords that cause lots of false positives. + var exclusions = []string{ + "AKIA", "SG.", "pat", "token", "gh", "github", "sql", "database", "http", "key", "api-", "sdk-", "float", "-us", "gh", "pat", "token", "sid", "http", "private", "key", "segment", "close", "protocols", "verifier", "box", "privacy", "dm", "sl.", "vf", "flat", + } + + var keywords []string + exclusionSet := make(map[string]struct{}) + for _, excl := range exclusions { + exclusionSet[strings.ToLower(excl)] = struct{}{} + } + + // Aggregate all keywords from detectors. + for _, detector := range allDetectors { + for _, kw := range detector.Keywords() { + kwLower := strings.ToLower(kw) + if _, excluded := exclusionSet[kwLower]; !excluded { + keywords = append(keywords, kwLower) + } + } + } + return keywords +} + +// DefaultDetectorKeywordMatcher encapsulates the Aho-Corasick trie for keyword matching. +type DefaultDetectorKeywordMatcher struct { + mu sync.RWMutex + trie *ahocorasick.Trie +} + +// NewDefaultDetectorKeywordMatcher creates a new DefaultDetectorKeywordMatcher. +func NewDefaultDetectorKeywordMatcher() *DefaultDetectorKeywordMatcher { + keywords := defaultDetectorKeywords() + return &DefaultDetectorKeywordMatcher{trie: ahocorasick.NewTrieBuilder().AddStrings(keywords).Build()} +} + +// FindKeywords scans the input text and returns a slice of matched keywords. 
+func (km *DefaultDetectorKeywordMatcher) FindKeywords(text []byte) []string { + km.mu.RLock() + defer km.mu.RUnlock() + + matches := km.trie.Match(bytes.ToLower(text)) + found := make([]string, 0, len(matches)) + seen := make(map[string]struct{}) // To avoid duplicate entries + + for _, match := range matches { + keyword := match.MatchString() + if _, exists := seen[keyword]; !exists { + found = append(found, keyword) + seen[keyword] = struct{}{} + } + } + return found +} diff --git a/pkg/engine/defaults_test.go b/pkg/engine/defaults/defaults_test.go similarity index 99% rename from pkg/engine/defaults_test.go rename to pkg/engine/defaults/defaults_test.go index 6131b211f42b..cfb4c1c908d2 100644 --- a/pkg/engine/defaults_test.go +++ b/pkg/engine/defaults/defaults_test.go @@ -1,4 +1,4 @@ -package engine +package defaults import ( "testing" diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 4d07cbe4a692..9ec11dccc181 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -21,6 +21,7 @@ import ( "github.com/trufflesecurity/trufflehog/v3/pkg/decoders" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" "github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick" + "github.com/trufflesecurity/trufflehog/v3/pkg/engine/defaults" "github.com/trufflesecurity/trufflehog/v3/pkg/giturl" "github.com/trufflesecurity/trufflehog/v3/pkg/output" "github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb" @@ -345,7 +346,7 @@ func (e *Engine) setDefaults(ctx context.Context) { // Only use the default detectors if none are provided. if len(e.detectors) == 0 { - e.detectors = DefaultDetectors() + e.detectors = defaults.DefaultDetectors() } if e.dispatcher == nil { @@ -398,7 +399,7 @@ func parseCustomVerifierEndpoints(endpoints map[string]string) (map[config.Detec return nil, fmt.Errorf("invalid verifier detector configuration id %v: %w", id, err) } // Extra check for endpoint customization. 
- isEndpointCustomizer := DefaultDetectorTypesImplementing[detectors.EndpointCustomizer]() + isEndpointCustomizer := defaults.DefaultDetectorTypesImplementing[detectors.EndpointCustomizer]() for id := range customVerifierEndpoints { if _, ok := isEndpointCustomizer[id.ID]; !ok { return nil, fmt.Errorf("endpoint provided but detector does not support endpoint customization: %w", err) @@ -435,7 +436,7 @@ func getWithDetectorID[T any](d detectors.Detector, data map[config.DetectorID]T // verifyDetectorsAreVersioner checks all keys in a provided map to verify the // provided type is actually a Versioner. func verifyDetectorsAreVersioner[T any](data map[config.DetectorID]T) (config.DetectorID, error) { - isVersioner := DefaultDetectorTypesImplementing[detectors.Versioner]() + isVersioner := defaults.DefaultDetectorTypesImplementing[detectors.Versioner]() for id := range data { if id.Version == 0 { // Version not provided. @@ -564,7 +565,7 @@ func (e *Engine) GetDetectorsMetrics() map[string]time.Duration { e.metrics.mu.RLock() defer e.metrics.mu.RUnlock() - result := make(map[string]time.Duration, len(DefaultDetectors())) + result := make(map[string]time.Duration, len(defaults.DefaultDetectors())) for detectorName, durations := range e.DetectorAvgTime() { var total time.Duration for _, d := range durations { diff --git a/pkg/engine/engine_test.go b/pkg/engine/engine_test.go index 9be786d93742..5901e67d2d78 100644 --- a/pkg/engine/engine_test.go +++ b/pkg/engine/engine_test.go @@ -22,6 +22,7 @@ import ( "github.com/trufflesecurity/trufflehog/v3/pkg/decoders" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" "github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick" + "github.com/trufflesecurity/trufflehog/v3/pkg/engine/defaults" "github.com/trufflesecurity/trufflehog/v3/pkg/pb/custom_detectorspb" "github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb" "github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb" @@ -259,7 +260,7 @@ func 
TestEngine_DuplicateSecrets(t *testing.T) { conf := Config{ Concurrency: 1, Decoders: decoders.DefaultDecoders(), - Detectors: DefaultDetectors(), + Detectors: defaults.DefaultDetectors(), Verify: false, SourceManager: sourceManager, Dispatcher: NewPrinterDispatcher(new(discardPrinter)), @@ -360,7 +361,7 @@ even more`, conf := Config{ Concurrency: 1, Decoders: decoders.DefaultDecoders(), - Detectors: DefaultDetectors(), + Detectors: defaults.DefaultDetectors(), Verify: false, SourceManager: sourceManager, Dispatcher: lineCapturer, @@ -891,12 +892,12 @@ func TestLikelyDuplicate(t *testing.T) { // Initialize detectors // (not actually calling detector FromData or anything, just using detector struct for key creation) detectorA := ahocorasick.DetectorMatch{ - Key: ahocorasick.CreateDetectorKey(DefaultDetectors()[0]), - Detector: DefaultDetectors()[0], + Key: ahocorasick.CreateDetectorKey(defaults.DefaultDetectors()[0]), + Detector: defaults.DefaultDetectors()[0], } detectorB := ahocorasick.DetectorMatch{ - Key: ahocorasick.CreateDetectorKey(DefaultDetectors()[1]), - Detector: DefaultDetectors()[1], + Key: ahocorasick.CreateDetectorKey(defaults.DefaultDetectors()[1]), + Detector: defaults.DefaultDetectors()[1], } // Define test cases @@ -1037,7 +1038,7 @@ func TestFilterResults_CustomCleaner(t *testing.T) { } func BenchmarkPopulateMatchingDetectors(b *testing.B) { - allDetectors := DefaultDetectors() + allDetectors := defaults.DefaultDetectors() ac := ahocorasick.NewAhoCorasickCore(allDetectors) // Generate sample data with keywords from detectors. 
@@ -1164,7 +1165,7 @@ func TestEngineInitializesCloudProviderDetectors(t *testing.T) { ctx := context.Background() conf := Config{ Concurrency: 1, - Detectors: DefaultDetectors(), + Detectors: defaults.DefaultDetectors(), Verify: false, SourceManager: sources.NewManager(), Dispatcher: NewPrinterDispatcher(new(discardPrinter)), diff --git a/pkg/engine/gcs_test.go b/pkg/engine/gcs_test.go index e634819810f4..d73b0bf5e84c 100644 --- a/pkg/engine/gcs_test.go +++ b/pkg/engine/gcs_test.go @@ -8,6 +8,7 @@ import ( "github.com/trufflesecurity/trufflehog/v3/pkg/context" "github.com/trufflesecurity/trufflehog/v3/pkg/decoders" + "github.com/trufflesecurity/trufflehog/v3/pkg/engine/defaults" "github.com/trufflesecurity/trufflehog/v3/pkg/sources" ) @@ -72,7 +73,7 @@ func TestScanGCS(t *testing.T) { conf := Config{ Concurrency: 1, Decoders: decoders.DefaultDecoders(), - Detectors: DefaultDetectors(), + Detectors: defaults.DefaultDetectors(), Verify: false, SourceManager: sourceManager, Dispatcher: NewPrinterDispatcher(new(discardPrinter)), diff --git a/pkg/engine/git_test.go b/pkg/engine/git_test.go index a20ae7da627c..6bac1af992ab 100644 --- a/pkg/engine/git_test.go +++ b/pkg/engine/git_test.go @@ -10,6 +10,7 @@ import ( "github.com/trufflesecurity/trufflehog/v3/pkg/context" "github.com/trufflesecurity/trufflehog/v3/pkg/decoders" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" + "github.com/trufflesecurity/trufflehog/v3/pkg/engine/defaults" "github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb" "github.com/trufflesecurity/trufflehog/v3/pkg/sources" "github.com/trufflesecurity/trufflehog/v3/pkg/sources/git" @@ -73,7 +74,7 @@ func TestGitEngine(t *testing.T) { conf := Config{ Concurrency: 1, Decoders: decoders.DefaultDecoders(), - Detectors: DefaultDetectors(), + Detectors: defaults.DefaultDetectors(), Verify: true, SourceManager: sourceManager, Dispatcher: NewPrinterDispatcher(new(discardPrinter)), @@ -135,7 +136,7 @@ func BenchmarkGitEngine(b *testing.B) { 
conf := Config{ Concurrency: runtime.NumCPU(), Decoders: decoders.DefaultDecoders(), - Detectors: DefaultDetectors(), + Detectors: defaults.DefaultDetectors(), Verify: false, SourceManager: sourceManager, Dispatcher: NewPrinterDispatcher(new(discardPrinter)), diff --git a/pkg/engine/postman_test.go b/pkg/engine/postman_test.go index 37830a9b92e8..448e59b6a07a 100644 --- a/pkg/engine/postman_test.go +++ b/pkg/engine/postman_test.go @@ -7,6 +7,7 @@ import ( "github.com/trufflesecurity/trufflehog/v3/pkg/context" "github.com/trufflesecurity/trufflehog/v3/pkg/decoders" + "github.com/trufflesecurity/trufflehog/v3/pkg/engine/defaults" "github.com/trufflesecurity/trufflehog/v3/pkg/sources" ) @@ -55,7 +56,7 @@ func TestPostmanEngine(t *testing.T) { conf := Config{ Concurrency: 1, Decoders: decoders.DefaultDecoders(), - Detectors: DefaultDetectors(), + Detectors: defaults.DefaultDetectors(), Verify: false, SourceManager: sourceManager, Dispatcher: NewPrinterDispatcher(new(discardPrinter)), diff --git a/pkg/feature/feature.go b/pkg/feature/feature.go index 3d1f3321e1cb..db8036f5d386 100644 --- a/pkg/feature/feature.go +++ b/pkg/feature/feature.go @@ -6,6 +6,7 @@ var ( ForceSkipBinaries atomic.Bool ForceSkipArchives atomic.Bool SkipAdditionalRefs atomic.Bool + EnableAPKHandler atomic.Bool UserAgentSuffix AtomicString ) diff --git a/pkg/handlers/apk.go b/pkg/handlers/apk.go new file mode 100644 index 000000000000..c83665522189 --- /dev/null +++ b/pkg/handlers/apk.go @@ -0,0 +1,405 @@ +package handlers + +import ( + "archive/zip" + "bytes" + "encoding/xml" + "errors" + "fmt" + "io" + "path/filepath" + "regexp" + "strings" + "time" + + dextk "github.com/csnewman/dextk" + + "github.com/avast/apkparser" + logContext "github.com/trufflesecurity/trufflehog/v3/pkg/context" + "github.com/trufflesecurity/trufflehog/v3/pkg/engine/defaults" + "github.com/trufflesecurity/trufflehog/v3/pkg/iobuf" +) + +// General Note: There are tools that can fully decompile an apk (e.g. 
jadx, apktool, etc.) +// However, none of these are in golang + they take awhile to run + +// they will decompile files that most likely don't contain secrets. So instead, we have a +// lightweight version that will search for secrets in the most common files that contain them. +// And run in a fraction of the time (ex: 15 seconds vs. 5 minutes) + +// ToDo: Scan nested APKs (aka XAPK files). ATM the archive.go file will skip over them. +// ToDo: Provide file location information to secret output. + +var ( + stringInstructionType = "const-string" + targetInstructionTypes = []string{stringInstructionType, "iput-object", "sput-object", "const-class", "invoke-virtual", "invoke-super", "invoke-direct", "invoke-static", "invoke-interface"} + // Note: We're only looking at a subset of instructions. + // If expanding, update precompiled REGEX below. + // - const-string: loads a string into a register (value) + // - iput-object: stores a string into a field (key) + // - the rest have to do with function, methods, objects and classes. + reIPutRegex = regexp.MustCompile(`iput-object obj=\d+ field=com/[a-zA-Z0-9/_]+:([a-zA-Z0-9_]+):`) + reSPutRegex = regexp.MustCompile(`sput-object field=com/[a-zA-Z0-9/_]+:([a-zA-Z0-9_]+):`) + reConstRegex = regexp.MustCompile(`const-string(?:/jumbo)? dst=\d+ value='([^']*)'`) + reConstClassRegex = regexp.MustCompile(`const-class dst=\d+ value='[a-zA-Z0-9/_$]+/([a-zA-Z0-9]+)(?:\$|;)`) + reInvokeRegex = regexp.MustCompile(`invoke-(?:virtual|super|direct|static|interface)(?:/range)? method=[a-zA-Z0-9/._$]+/([a-zA-Z0-9_$]+:[a-zA-Z0-9_<]+)`) + reInstructions = []*regexp.Regexp{ + reIPutRegex, + reSPutRegex, + reConstRegex, + reConstClassRegex, + reInvokeRegex, + } +) + +// apkHandler handles apk archive formats. +type apkHandler struct { + keywordMatcher *defaults.DefaultDetectorKeywordMatcher + *defaultHandler +} + +// newapkHandler creates an apkHandler. 
+func newAPKHandler() *apkHandler { + return &apkHandler{ + defaultHandler: newDefaultHandler(apkHandlerType), + keywordMatcher: defaults.NewDefaultDetectorKeywordMatcher(), + } +} + +// HandleFile processes apk formatted files. +func (h *apkHandler) HandleFile(ctx logContext.Context, input fileReader) (chan []byte, error) { + apkChan := make(chan []byte, defaultBufferSize) + + go func() { + ctx, cancel := logContext.WithTimeout(ctx, maxTimeout) + defer cancel() + defer close(apkChan) + + // Update the metrics for the file processing. + start := time.Now() + var err error + defer func() { + h.measureLatencyAndHandleErrors(start, err) + h.metrics.incFilesProcessed() + }() + + // Defer a panic recovery to handle any panics that occur during the APK processing. + defer func() { + if r := recover(); r != nil { + // Return the panic as an error. + if e, ok := r.(error); ok { + err = e + } else { + err = fmt.Errorf("panic occurred: %v", r) + } + ctx.Logger().Error(err, "Panic occurred when reading apk archive") + } + }() + + if err = h.processAPK(ctx, input, apkChan); err != nil { + ctx.Logger().Error(err, "error processing apk content") + } + }() + return apkChan, nil +} + +// processAPK processes the apk file and sends the extracted data to the provided channel. 
+func (h *apkHandler) processAPK(ctx logContext.Context, input fileReader, apkChan chan []byte) error { + + // Create a ZIP reader from the input fileReader + zipReader, err := createZipReader(input) + if err != nil { + return err + } + + // Extract the resources.arsc file into a ResourceTable (needed for XML decoding) + resTable, err := parseResTable(zipReader) + if err != nil { + return err + } + + // Process the ResourceTable file for secrets + if err := h.processResources(ctx, resTable, apkChan); err != nil { + ctx.Logger().Error(err, "failed to process resources.arsc") + } + + // Process all files for secrets + for _, file := range zipReader.File { + if err := h.processFile(ctx, file, resTable, apkChan); err != nil { + ctx.Logger().V(2).Info(fmt.Sprintf("failed to process file: %s", file.Name), "error", err) + } + } + return nil +} + +// processResources processes the resources.arsc file and sends the extracted data to the provided channel. +func (h *apkHandler) processResources(ctx logContext.Context, resTable *apkparser.ResourceTable, apkChan chan []byte) error { + if resTable == nil { + return errors.New("ResourceTable is nil") + } + rscStrRdr, err := extractStringsFromResTable(resTable) + if err != nil { + return fmt.Errorf("failed to parse strings from resources.arsc: %w", err) + } + return h.handleAPKFileContent(ctx, rscStrRdr, "resources.arsc", apkChan) +} + +// processFile processes the file and sends the extracted data to the provided channel. 
+func (h *apkHandler) processFile(ctx logContext.Context, file *zip.File, resTable *apkparser.ResourceTable, apkChan chan []byte) error { + // check if the file is empty + if file.UncompressedSize64 == 0 { + return nil + } + + // Open the file from the zip archive + rdr, err := openFile(file) + if err != nil { + return fmt.Errorf("failed to read file %s: %w", file.Name, err) + } + defer rdr.Close() + + var contentReader io.Reader + // Decode the file based on its extension + switch strings.ToLower(filepath.Ext(file.Name)) { + case ".xml": + contentReader, err = decodeXML(rdr, resTable) + if err != nil { + return fmt.Errorf("failed to decode xml file %s: %w", file.Name, err) + } + case ".dex": + contentReader, err = h.processDexFile(ctx, iobuf.NewBufferedReaderSeeker(rdr)) + if err != nil { + return fmt.Errorf("failed to decode dex file %s: %w", file.Name, err) + } + default: + contentReader = rdr + } + return h.handleAPKFileContent(ctx, contentReader, file.Name, apkChan) +} + +// handleAPKFileContent sends the extracted data to the provided channel via the handleNonArchiveContent function. +func (h *apkHandler) handleAPKFileContent(ctx logContext.Context, rdr io.Reader, fileName string, apkChan chan []byte) error { + mimeReader, err := newMimeTypeReader(rdr) + if err != nil { + return fmt.Errorf("failed to create mimeTypeReader for file %s: %w", fileName, err) + } + ctx = logContext.WithValues( + ctx, + "filename", fileName, + ) + return h.handleNonArchiveContent(ctx, mimeReader, apkChan) +} + +// createZipReader creates a new ZIP reader from the input fileReader. +func createZipReader(input fileReader) (*zip.Reader, error) { + size, err := input.Size() + if err != nil { + return nil, err + } + zipReader, err := zip.NewReader(input, size) + if err != nil { + return nil, err + } + return zipReader, err +} + +// parseResTable parses the resources.arsc file and returns the ResourceTable. 
+func parseResTable(zipReader *zip.Reader) (*apkparser.ResourceTable, error) { + for _, file := range zipReader.File { + if file.Name == "resources.arsc" { + rdr, err := openFile(file) + if err != nil { + return nil, err + } + + resTable, err := apkparser.ParseResourceTable(rdr) + rdr.Close() + if err != nil { + return nil, err + } + return resTable, nil + } + } + return nil, errors.New("resources.arsc file not found in the APK archive") +} + +// openFile opens the file from the zip archive and returns the data as an io.ReadCloser +// Note: responsibility of calling function to close the reader +func openFile(file *zip.File) (io.ReadCloser, error) { + rc, err := file.Open() + if err != nil { + return nil, err + } + return rc, nil +} + +// extractStringsFromResTable extracts the strings from the resources table +// Note: This is a hacky way to get the strings from the resources table +// APK strings are typically (always?) stored in the 0x7f000000-0x7fffffff range +// https://chromium.googlesource.com/chromium/src/+/master/build/android/docs/life_of_a_resource.md +func extractStringsFromResTable(resTable *apkparser.ResourceTable) (io.Reader, error) { + var resourceStrings bytes.Buffer + inStrings := false + for i := 0x7f000000; i <= 0x7fffffff; i++ { + entry, _ := resTable.GetResourceEntry(uint32(i)) + if entry == nil { + continue + } + if entry.ResourceType == "string" { + inStrings = true + val, err := entry.GetValue().String() + if err != nil { + return nil, err + } + // Write directly to the buffer + resourceStrings.WriteString(entry.Key) + resourceStrings.WriteString(": ") + resourceStrings.WriteString(val) + resourceStrings.WriteString("\n") + } + // Exit the loop if we've finished processing the strings + if inStrings && entry.ResourceType != "string" { + break + } + } + return &resourceStrings, nil +} + +// processDexFile decodes the dex file and returns the relevant instructions +func (h *apkHandler) processDexFile(ctx logContext.Context, rdr io.ReaderAt) 
(io.Reader, error) { + dexReader, err := dextk.Read(rdr, dextk.WithReadCache(16)) + if err != nil { + return nil, err + } + + // Get relevant instruction data from the dex file + var dexOutput bytes.Buffer + ci := dexReader.ClassIter() + for ci.HasNext() { + node, err := ci.Next() + if err != nil { + ctx.Logger().Error(err, "failed to process a dex class") + break + } + h.processDexClass(ctx, dexReader, node, &dexOutput) + } + + return &dexOutput, nil +} + +// processDexClass processes a single class node's methods +func (h *apkHandler) processDexClass(ctx logContext.Context, dexReader *dextk.Reader, node dextk.ClassNode, dexOutput *bytes.Buffer) { + + var classOutput bytes.Buffer + methodValues := make(map[string]struct{}) + + // Process Direct Methods + processDexMethod(ctx, dexReader, node.DirectMethods, &classOutput, methodValues) + // Process Virtual Methods + processDexMethod(ctx, dexReader, node.VirtualMethods, &classOutput, methodValues) + + // Write the classOutput to the dexOutput + dexOutput.Write(classOutput.Bytes()) + + // Check if classOutput contains any of the default keywords + foundKeywords := h.keywordMatcher.FindKeywords(classOutput.Bytes()) + + // For each found keyword, create a keyword:value pair and append to dexOutput + for str := range methodValues { + for _, keyword := range foundKeywords { + dexOutput.WriteString(keyword + ":" + str + "\n") + } + } +} + +// processDexMethod iterates over a slice of methods, processes each method, +// handles errors, and writes the output to dexOutput. 
+func processDexMethod(ctx logContext.Context, dexReader *dextk.Reader, methods []dextk.MethodNode, classOutput *bytes.Buffer, methodValues map[string]struct{}) { + for _, method := range methods { + s, err := parseDexInstructions(dexReader, method, methodValues) + if err != nil { + ctx.Logger().V(2).Info("failed to process dex method", "error", err) + continue + } + classOutput.Write(s.Bytes()) + } +} + +// parseDexInstructions processes a dex method and returns the string representation of the instruction +func parseDexInstructions(r *dextk.Reader, m dextk.MethodNode, methodValues map[string]struct{}) (*bytes.Buffer, error) { + var instrBuf bytes.Buffer + + if m.CodeOff == 0 { + return &instrBuf, nil + } + + c, err := r.ReadCodeAndParse(m.CodeOff) + if err != nil { + return &instrBuf, err + } + + // Iterate over the instructions and extract the relevant values + for _, o := range c.Ops { + oStr := o.String() + + instructionType := getInstructionType(oStr) + if instructionType == "" { + continue + } + + val := formatAndFilterInstruction(oStr) + if val != "" { + instrBuf.WriteString(val + "\n") + if instructionType == stringInstructionType { + methodValues[val] = struct{}{} + } + } + } + return &instrBuf, nil +} + +// getInstructionType checks for specific target instructions +func getInstructionType(instruction string) string { + for _, t := range targetInstructionTypes { + if strings.HasPrefix(instruction, t) { + return t + } + } + return "" +} + +// formatAndFilterInstruction looks for a match to our regex and returns it +// Note: This is critical for ensuring secret + keyword are in close proximity. +// If we expand the instructions we're looking at, this function will need to be updated. 
+func formatAndFilterInstruction(line string) string {
+	for _, re := range reInstructions {
+		if matches := re.FindStringSubmatch(line); len(matches) > 1 {
+			return matches[1]
+		}
+	}
+	return ""
+}
+
+// decodeXML runs apkparser.ParseXml over rdr with resTable and returns the XML it encodes into an in-memory buffer.
+// If the parser reports apkparser.ErrPlainTextManifest, the input is rewound and returned unchanged instead.
+func decodeXML(rdr io.ReadCloser, resTable *apkparser.ResourceTable) (io.Reader, error) {
+	// Convert rdr to BufferedReadSeeker to support rewinding
+	bufRdr := iobuf.NewBufferedReaderSeeker(rdr)
+
+	// Create a buffer to store the formatted XML data
+	// Note: in the future, consider a custom writer that spills to disk if the buffer gets too large
+	var buf bytes.Buffer
+	enc := xml.NewEncoder(&buf)
+
+	// Parse the XML data using the apkparser library + resource table
+	if err := apkparser.ParseXml(bufRdr, enc, resTable); err != nil {
+		// If the error is due to plaintext XML, return the plaintext XML
+		if errors.Is(err, apkparser.ErrPlainTextManifest) {
+			if _, err := bufRdr.Seek(0, io.SeekStart); err != nil {
+				return bufRdr, fmt.Errorf("error resetting reader after XML parsing error: %w", err)
+			}
+			return bufRdr, nil
+		}
+		return nil, err
+	}
+	return &buf, nil
+}
diff --git a/pkg/handlers/apk_test.go b/pkg/handlers/apk_test.go
new file mode 100644
index 000000000000..0790d451118c
--- /dev/null
+++ b/pkg/handlers/apk_test.go
@@ -0,0 +1,114 @@
+package handlers
+
+import (
+	"io"
+	"net/http"
+	"regexp"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+
+	"github.com/trufflesecurity/trufflehog/v3/pkg/context"
+	logContext "github.com/trufflesecurity/trufflehog/v3/pkg/context"
+)
+
+func TestAPKHandler(t *testing.T) {
+	tests := map[string]struct {
+		archiveURL      string
+		expectedChunks  int
+		expectedSecrets int
+		matchString     string
+		expectErr       bool
+	}{
+		"apk_with_3_leaked_keys": {
+			"https://github.com/joeleonjr/leakyAPK/raw/refs/heads/main/aws_leak.apk",
+			942,
+			// Note: the secret count is 4 instead of 3 b/c we're not actually running the secret detection engine,
+			// we're just looking for a string match. There is one extra string match in the APK (but only 3 detected secrets).
+			4,
+			"AKIA2UC3BSXMLSCLTUUS",
+			false,
+		},
+	}
+
+	for name, testCase := range tests {
+		t.Run(name, func(t *testing.T) {
+			resp, err := http.Get(testCase.archiveURL)
+			if err != nil {
+				t.Fatalf("error downloading test APK: %s", err)
+			}
+			defer resp.Body.Close()
+			assert.Equal(t, http.StatusOK, resp.StatusCode)
+
+			handler := newAPKHandler()
+			newReader, err := newFileReader(resp.Body)
+			if err != nil {
+				t.Fatalf("error creating reusable reader: %s", err)
+			}
+			defer newReader.Close()
+
+			archiveChan, err := handler.HandleFile(logContext.Background(), newReader)
+			// Stop on any error: after a failed HandleFile the channel may be nil, and ranging over it would block forever.
+			if testCase.expectErr || err != nil {
+				assert.Equal(t, testCase.expectErr, err != nil, "unexpected error state: %v", err)
+				return
+			}
+
+			chunkCount, secretCount := 0, 0
+			re := regexp.MustCompile(testCase.matchString)
+			for chunk := range archiveChan {
+				chunkCount++
+				if re.Match(chunk) {
+					secretCount++
+				}
+			}
+
+			assert.True(t, secretCount > 0)
+			assert.Equal(t, testCase.expectedChunks, chunkCount)
+			assert.Equal(t, testCase.expectedSecrets, secretCount)
+		})
+	}
+}
+
+func TestOpenInvalidAPK(t *testing.T) {
+	reader := strings.NewReader("invalid apk")
+
+	ctx := logContext.AddLogger(context.Background())
+	handler := apkHandler{}
+
+	rdr, err := newFileReader(io.NopCloser(reader))
+	assert.NoError(t, err)
+	defer rdr.Close()
+
+	archiveChan := make(chan []byte)
+
+	err = handler.processAPK(ctx, rdr, archiveChan)
+	assert.ErrorContains(t, err, "zip: not a valid zip file")
+}
+
+func TestOpenValidZipInvalidAPK(t *testing.T) {
+	// Grabbed from archive_test.go
+	validZipURL := "https://raw.githubusercontent.com/bill-rich/bad-secrets/master/aws-canary-creds.zip"
+
+	resp, err := http.Get(validZipURL)
+	if err != nil {
+		t.Fatalf("error downloading test zip: %s", err)
+	}
+	defer resp.Body.Close()
+	assert.Equal(t, http.StatusOK, resp.StatusCode)
+
+	handler := newAPKHandler()
+
+	newReader, err := newFileReader(resp.Body)
+	if err != nil {
+		t.Fatalf("error creating reusable reader: %s", err)
+	}
+	defer newReader.Close()
+
+	archiveChan := make(chan []byte)
+	ctx := logContext.AddLogger(context.Background())
+
+	err = handler.processAPK(ctx, newReader, archiveChan)
+	assert.ErrorContains(t, err, "resources.arsc file not found")
+}
diff --git a/pkg/handlers/handlers.go b/pkg/handlers/handlers.go
index 1b91a5ec2978..3f49cf62538a 100644
--- a/pkg/handlers/handlers.go
+++ b/pkg/handlers/handlers.go
@@ -1,17 +1,21 @@
 package handlers
 
 import (
+	"archive/zip"
 	"bufio"
 	"errors"
 	"fmt"
 	"io"
+	"path/filepath"
 	"time"
 
 	"github.com/gabriel-vasile/mimetype"
 	"github.com/mholt/archiver/v4"
 
 	logContext "github.com/trufflesecurity/trufflehog/v3/pkg/context"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/feature"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/iobuf"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
 )
 
@@ -38,6 +42,14 @@ type fileReader struct {
 	*iobuf.BufferedReadSeeker
 }
 
+type readerConfig struct{ fileExtension string }
+
+type readerOption func(*readerConfig)
+
+func withFileExtension(ext string) readerOption {
+	return func(c *readerConfig) { c.fileExtension = ext }
+}
+
 var ErrEmptyReader = errors.New("reader is empty")
 
 // mimeTypeReader wraps an io.Reader with MIME type information.
@@ -80,8 +92,15 @@ func newMimeTypeReader(r io.Reader) (mimeTypeReader, error) {
 }
 
 // newFileReader creates a fileReader from an io.Reader, optionally using BufferedFileWriter for certain formats.
-func newFileReader(r io.Reader) (fileReader, error) { - var fReader fileReader +func newFileReader(r io.Reader, options ...readerOption) (fileReader, error) { + var ( + fReader fileReader + cfg readerConfig + ) + + for _, opt := range options { + opt(&cfg) + } fReader.BufferedReadSeeker = iobuf.NewBufferedReaderSeeker(r) @@ -96,6 +115,17 @@ func newFileReader(r io.Reader) (fileReader, error) { return fReader, fmt.Errorf("error resetting reader after MIME detection: %w", err) } + // Check for APK files + if shouldHandleAsAPK(cfg, fReader) { + isAPK, err := isAPKFile(&fReader) + if err != nil { + return fReader, fmt.Errorf("error checking for APK: %w", err) + } + if isAPK { + return handleAPKFile(&fReader) + } + } + // If a MIME type is known to not be an archive type, we might as well return here rather than // paying the I/O penalty of an archiver.Identify() call that won't identify anything. if _, ok := skipArchiverMimeTypes[mimeType(mime.String())]; ok { @@ -157,7 +187,9 @@ const ( archiveHandlerType handlerType = "archive" arHandlerType handlerType = "ar" rpmHandlerType handlerType = "rpm" + apkHandlerType handlerType = "apk" defaultHandlerType handlerType = "default" + apkExt = ".apk" ) type mimeType string @@ -192,6 +224,9 @@ const ( pyScriptMime mimeType = "application/x-script.python" tclTextMime mimeType = "text/x-tcl" tclMime mimeType = "application/x-tcl" + apkMime mimeType = "application/vnd.android.package-archive" + zipMime mimeType = "application/zip" + jarMime mimeType = "application/java-archive" ) // skipArchiverMimeTypes is a set of MIME types that should bypass archiver library processing because they are either @@ -226,6 +261,7 @@ var skipArchiverMimeTypes = map[mimeType]struct{}{ pyScriptMime: {}, tclTextMime: {}, tclMime: {}, + apkMime: {}, } // selectHandler dynamically selects and configures a FileHandler based on the provided |mimetype| type and archive flag. 
@@ -233,6 +269,7 @@ var skipArchiverMimeTypes = map[mimeType]struct{}{ // This method uses specialized handlers for specific file types: // - arHandler is used for Unix archives and Debian packages ('arMime', 'unixArMime', and 'debMime'). // - rpmHandler is used for RPM and CPIO archives ('rpmMime' and 'cpioMime'). +// - apkHandler is used for APK archives ('apkMime'). // - archiveHandler is used for common archive formats supported by the archiver library (.zip, .tar, .gz, etc.). // - defaultHandler is used for non-archive files. // The selected handler is then returned, ready to handle the file according to its specific format and requirements. @@ -242,6 +279,8 @@ func selectHandler(mimeT mimeType, isGenericArchive bool) FileHandler { return newARHandler() case rpmMime, cpioMime: return newRPMHandler() + case apkMime: + return newAPKHandler() default: if isGenericArchive { return newArchiveHandler() @@ -275,7 +314,8 @@ func HandleFile( return fmt.Errorf("reader is nil") } - rdr, err := newFileReader(reader) + readerOption := withFileExtension(getFileExtension(chunkSkel)) + rdr, err := newFileReader(reader, readerOption) if err != nil { if errors.Is(err, ErrEmptyReader) { ctx.Logger().V(5).Info("empty reader, skipping file") @@ -346,3 +386,131 @@ func handleChunks( } } } + +// getFileExtension extracts the file extension from the chunk's SourceMetadata. +// It considers all sources defined in the MetaData message. +// Note: Probably should add this as a method to the source_metadatapb object. 
+// then it'd just be chunkSkel.SourceMetadata.GetFileExtension()
+func getFileExtension(chunkSkel *sources.Chunk) string {
+	if chunkSkel == nil || chunkSkel.SourceMetadata == nil {
+		return ""
+	}
+
+	var fileName string
+	// Inspect the SourceMetadata to determine the source type
+	switch metadata := chunkSkel.SourceMetadata.Data.(type) {
+	case *source_metadatapb.MetaData_Artifactory:
+		fileName = metadata.Artifactory.Path
+	case *source_metadatapb.MetaData_Azure:
+		fileName = metadata.Azure.File
+	case *source_metadatapb.MetaData_AzureRepos:
+		fileName = metadata.AzureRepos.File
+	case *source_metadatapb.MetaData_Bitbucket:
+		fileName = metadata.Bitbucket.File
+	case *source_metadatapb.MetaData_Buildkite:
+		fileName = metadata.Buildkite.Link
+	case *source_metadatapb.MetaData_Circleci:
+		fileName = metadata.Circleci.Link
+	case *source_metadatapb.MetaData_Confluence:
+		fileName = metadata.Confluence.File
+	case *source_metadatapb.MetaData_Docker:
+		fileName = metadata.Docker.File
+	case *source_metadatapb.MetaData_Ecr:
+		fileName = metadata.Ecr.File
+	case *source_metadatapb.MetaData_Filesystem:
+		fileName = metadata.Filesystem.File
+	case *source_metadatapb.MetaData_Git:
+		fileName = metadata.Git.File
+	case *source_metadatapb.MetaData_Github:
+		fileName = metadata.Github.File
+	case *source_metadatapb.MetaData_Gitlab:
+		fileName = metadata.Gitlab.File
+	case *source_metadatapb.MetaData_Gcs:
+		fileName = metadata.Gcs.Filename
+	case *source_metadatapb.MetaData_GoogleDrive:
+		fileName = metadata.GoogleDrive.File
+	case *source_metadatapb.MetaData_Huggingface:
+		fileName = metadata.Huggingface.File
+	case *source_metadatapb.MetaData_Jira:
+		fileName = metadata.Jira.Link
+	case *source_metadatapb.MetaData_Jenkins:
+		fileName = metadata.Jenkins.Link
+	case *source_metadatapb.MetaData_Npm:
+		fileName = metadata.Npm.File
+	case *source_metadatapb.MetaData_Pypi:
+		fileName = metadata.Pypi.File
+	case *source_metadatapb.MetaData_S3:
+		fileName = metadata.S3.File
+	case *source_metadatapb.MetaData_Slack:
+		fileName = metadata.Slack.File
+	case *source_metadatapb.MetaData_Sharepoint:
+		fileName = metadata.Sharepoint.Link
+	case *source_metadatapb.MetaData_Gerrit:
+		fileName = metadata.Gerrit.File
+	case *source_metadatapb.MetaData_Test:
+		fileName = metadata.Test.File
+	case *source_metadatapb.MetaData_Teams:
+		fileName = metadata.Teams.File
+	case *source_metadatapb.MetaData_TravisCI:
+		fileName = metadata.TravisCI.Link
+	// Add other sources if they have a file or equivalent field
+	// Skipping Syslog, Forager, Postman, Vector, Webhook and Elasticsearch
+	default:
+		return ""
+	}
+
+	return filepath.Ext(fileName)
+}
+
+// shouldHandleAsAPK checks if the file should be handled as an APK based on config and MIME type.
+// Note: We can't extend the mimetype package with an APK detection function b/c it would require adjusting settings
+// so that all files are fully read into a byte slice for detection (mimetype.SetLimit(0)), which would bloat memory.
+// Instead we call the isAPKFile function in here after ensuring it's a zip/jar file and has an .apk extension.
+func shouldHandleAsAPK(cfg readerConfig, fReader fileReader) bool {
+	return feature.EnableAPKHandler.Load() &&
+		cfg.fileExtension == apkExt &&
+		(fReader.mime.String() == string(zipMime) || fReader.mime.String() == string(jarMime))
+}
+
+// isAPKFile reports whether the zip archive in r has both an AndroidManifest.xml and a classes.dex entry.
+// It returns an error if the reader's size cannot be determined or the zip directory cannot be read.
+func isAPKFile(r *fileReader) (bool, error) {
+	size, err := r.Size()
+	if err != nil {
+		return false, fmt.Errorf("error getting reader size: %w", err)
+	}
+	zipReader, err := zip.NewReader(r, size)
+	if err != nil {
+		return false, fmt.Errorf("error creating zip reader: %w", err)
+	}
+
+	var hasManifest, hasClasses bool
+	for _, file := range zipReader.File {
+		switch file.Name {
+		case "AndroidManifest.xml":
+			hasManifest = true
+		case "classes.dex":
+			hasClasses = true
+		}
+		if hasManifest && hasClasses {
+			return true, nil
+		}
+	}
+	return false, nil
+}
+
+// handleAPKFile configures the MIME type for an APK and resets the reader.
+func handleAPKFile(fReader *fileReader) (fileReader, error) {
+	// Register the APK MIME type under zip only if it is not known yet: Extend adds a new node on every call,
+	// so calling it for each APK would keep growing the package-global MIME tree.
+	if mimetype.Lookup(string(apkMime)) == nil {
+		mimetype.Lookup("application/zip").Extend(func(r []byte, l uint32) bool { return false }, string(apkMime), ".apk")
+	}
+	fReader.mime = mimetype.Lookup(string(apkMime))
+
+	// Reset reader for further handling
+	if _, err := fReader.Seek(0, io.SeekStart); err != nil {
+		return *fReader, fmt.Errorf("error resetting reader after APK detection: %w", err)
+	}
+	return *fReader, nil
+}