From f481626b44530abdf1b07246ecbf526a1ae40066 Mon Sep 17 00:00:00 2001 From: alexn-dd Date: Wed, 16 Oct 2024 23:53:49 -0700 Subject: [PATCH] Make Windows crash dump checks asynchronous. (#30156) --- .../system/wincrashdetect/probe/crashparse.go | 9 +- .../wincrashdetect/probe/crashparse_test.go | 9 +- .../wincrashdetect/probe/win_crash_types.go | 18 +- .../wincrashdetect/probe/wincrash_testutil.go | 26 +++ .../wincrashdetect/probe/wincrashprobe.go | 112 ++++++++++-- .../wincrashdetect_windows_test.go | 169 +++++++++++++++++- pkg/util/crashreport/crashreport.go | 9 +- ...dowscrashreportasync-1b2c77f9ebeafdd5.yaml | 14 ++ 8 files changed, 338 insertions(+), 28 deletions(-) create mode 100644 pkg/collector/corechecks/system/wincrashdetect/probe/wincrash_testutil.go create mode 100644 releasenotes/notes/windowscrashreportasync-1b2c77f9ebeafdd5.yaml diff --git a/pkg/collector/corechecks/system/wincrashdetect/probe/crashparse.go b/pkg/collector/corechecks/system/wincrashdetect/probe/crashparse.go index c4fb188d2e5d2..38fe8119ba9cd 100644 --- a/pkg/collector/corechecks/system/wincrashdetect/probe/crashparse.go +++ b/pkg/collector/corechecks/system/wincrashdetect/probe/crashparse.go @@ -23,6 +23,7 @@ import ( // allow us to change for testing var readfn = doReadCrashDump +var parseCrashDump = parseWinCrashDump type logCallbackContext struct { loglines []string @@ -107,14 +108,14 @@ func doReadCrashDump(filename string, ctx *logCallbackContext, exterr *uint32) e return nil } -func parseCrashDump(wcs *WinCrashStatus) { +func parseWinCrashDump(wcs *WinCrashStatus) { var ctx logCallbackContext var extendedError uint32 err := readfn(wcs.FileName, &ctx, &extendedError) if err != nil { - wcs.Success = false + wcs.StatusCode = WinCrashStatusCodeFailed wcs.ErrString = fmt.Sprintf("Failed to load crash dump file %v %x", err, extendedError) log.Errorf("Failed to open crash dump %s: %v %x", wcs.FileName, err, extendedError) return @@ -122,7 +123,7 @@ func parseCrashDump(wcs *WinCrashStatus) { if len(ctx.loglines) < 2 { wcs.ErrString = fmt.Sprintf("Invalid crash dump file %s", wcs.FileName) - wcs.Success = false + wcs.StatusCode = WinCrashStatusCodeFailed return } @@ -190,5 +191,5 @@ func parseCrashDump(wcs *WinCrashStatus) { wcs.Offender = callsite break } - wcs.Success = true + wcs.StatusCode = WinCrashStatusCodeSuccess } diff --git a/pkg/collector/corechecks/system/wincrashdetect/probe/crashparse_test.go b/pkg/collector/corechecks/system/wincrashdetect/probe/crashparse_test.go index 161b3b6068119..2efd8d5f21781 100644 --- a/pkg/collector/corechecks/system/wincrashdetect/probe/crashparse_test.go +++ b/pkg/collector/corechecks/system/wincrashdetect/probe/crashparse_test.go @@ -51,12 +51,11 @@ func TestCrashParser(t *testing.T) { FileName: "testdata/crashsample1.txt", } // first read in the sample data - - readfn = testCrashReader + OverrideCrashDumpReader(testCrashReader) parseCrashDump(wcs) - assert.True(t, wcs.Success) + assert.Equal(t, WinCrashStatusCodeSuccess, wcs.StatusCode) assert.Empty(t, wcs.ErrString) assert.Equal(t, "Mon Jun 26 20:44:49.742 2023 (UTC - 7:00)", wcs.DateString) before, _, _ := strings.Cut(wcs.Offender, "+") @@ -72,11 +71,11 @@ func TestCrashParserWithLineSplits(t *testing.T) { } // first read in the sample data - readfn = testCrashReaderWithLineSplits + OverrideCrashDumpReader(testCrashReaderWithLineSplits) parseCrashDump(wcs) - assert.True(t, wcs.Success) + assert.Equal(t, WinCrashStatusCodeSuccess, wcs.StatusCode) assert.Empty(t, wcs.ErrString) assert.Equal(t, "Mon Jun 26 20:44:49.742 2023 (UTC - 7:00)", wcs.DateString) before, _, _ := strings.Cut(wcs.Offender, "+") diff --git a/pkg/collector/corechecks/system/wincrashdetect/probe/win_crash_types.go b/pkg/collector/corechecks/system/wincrashdetect/probe/win_crash_types.go index 5fac5c9853b17..a1a36691be08a 100644 --- a/pkg/collector/corechecks/system/wincrashdetect/probe/win_crash_types.go +++ b/pkg/collector/corechecks/system/wincrashdetect/probe/win_crash_types.go @@ -29,10 +29,26 @@ const ( DumpTypeAutomatic = int(7) // automatic ) +const ( + // WinCrashStatusCodeUnknown indicates an invalid or corrupted code. + WinCrashStatusCodeUnknown = int(-1) + + // WinCrashStatusCodeSuccess indicates that crash dump processing succeeded + // or no crash dump was found. + WinCrashStatusCodeSuccess = int(0) + + // WinCrashStatusCodeBusy indicates that crash dump processing is still busy + // and no result is yet available. + WinCrashStatusCodeBusy = int(1) + + // WinCrashStatusCodeFailed indicates that crash dump processing failed or had an error. + WinCrashStatusCodeFailed = int(2) +) + // WinCrashStatus defines all of the information returned from the system // probe to the caller type WinCrashStatus struct { - Success bool `json:"success"` + StatusCode int `json:"statuscode"` ErrString string `json:"errstring"` FileName string `json:"filename"` Type int `json:"dumptype"` diff --git a/pkg/collector/corechecks/system/wincrashdetect/probe/wincrash_testutil.go b/pkg/collector/corechecks/system/wincrashdetect/probe/wincrash_testutil.go new file mode 100644 index 0000000000000..c2f52b0071589 --- /dev/null +++ b/pkg/collector/corechecks/system/wincrashdetect/probe/wincrash_testutil.go @@ -0,0 +1,26 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2024-present Datadog, Inc. + +//go:build test && windows + +package probe + +type readCrashDumpType func(filename string, ctx *logCallbackContext, _ *uint32) error +type parseCrashDumpType func(wcs *WinCrashStatus) + +// SetCachedSettings sets the settings used for tests without reading the Registry. +func (p *WinCrashProbe) SetCachedSettings(wcs *WinCrashStatus) { + p.status = wcs +} + +// OverrideCrashDumpReader relpaces the crash dump reading function for tests. +func OverrideCrashDumpReader(customCrashReader readCrashDumpType) { + readfn = customCrashReader +} + +// OverrideCrashDumpParser relpaces the crash dump parsing function for tests. +func OverrideCrashDumpParser(customParseCrashDump parseCrashDumpType) { + parseCrashDump = customParseCrashDump +} diff --git a/pkg/collector/corechecks/system/wincrashdetect/probe/wincrashprobe.go b/pkg/collector/corechecks/system/wincrashdetect/probe/wincrashprobe.go index 1d75c514142df..533cfcc2c431b 100644 --- a/pkg/collector/corechecks/system/wincrashdetect/probe/wincrashprobe.go +++ b/pkg/collector/corechecks/system/wincrashdetect/probe/wincrashprobe.go @@ -11,38 +11,126 @@ import ( "fmt" "os" "path/filepath" + "sync" sysconfigtypes "github.com/DataDog/datadog-agent/cmd/system-probe/config/types" "github.com/DataDog/datadog-agent/pkg/util/winutil" "golang.org/x/sys/windows/registry" ) +type probeState uint32 + +const ( + // Idle indicates that the probe is waiting for a request + idle probeState = iota + + // Busy indicates that the probe is currently processing a crash dump + busy + + // Completed indicates that the probe finished processing a crash dump. + completed + + // Failed indicates that the probe failed to process a crash dump. + failed +) + // WinCrashProbe has no stored state. type WinCrashProbe struct { + state probeState + status *WinCrashStatus + mu sync.Mutex } // NewWinCrashProbe returns an initialized WinCrashProbe func NewWinCrashProbe(_ *sysconfigtypes.Config) (*WinCrashProbe, error) { - return &WinCrashProbe{}, nil + return &WinCrashProbe{ + state: idle, + status: nil, + }, nil +} + +// Handles crash dump parsing in a separate thread since this may take very long. +func (p *WinCrashProbe) parseCrashDumpAsync() { + if p.status == nil { + p.state = failed + return + } + + parseCrashDump(p.status) + + p.mu.Lock() + defer p.mu.Unlock() + p.state = completed } // Get returns the current crash, if any func (p *WinCrashProbe) Get() *WinCrashStatus { wcs := &WinCrashStatus{} - err := wcs.getCurrentCrashSettings() - if err != nil { - wcs.ErrString = err.Error() - wcs.Success = false - return wcs - } + // Nothing in this method should take long. + p.mu.Lock() + defer p.mu.Unlock() + + switch p.state { + case idle: + if p.status == nil { + // This is a new request. + err := wcs.getCurrentCrashSettings() + if err != nil { + wcs.ErrString = err.Error() + wcs.StatusCode = WinCrashStatusCodeFailed + } + } else { + // Use cached settings, set by tests. + // Make a copy to avoid side-effect modifications. + *wcs = *(p.status) + } - if len(wcs.FileName) == 0 { - // no filename means no crash dump - wcs.Success = true // we succeeded - return wcs + // Transition to the next state. + if wcs.StatusCode == WinCrashStatusCodeFailed { + // Only try once and cache the failure. + p.status = wcs + p.state = failed + } else if len(wcs.FileName) == 0 { + // No filename means no crash dump + p.status = wcs + p.state = completed + wcs.StatusCode = WinCrashStatusCodeSuccess + } else { + // Kick off the crash dump processing asynchronously. + // The crash dump may be very large and we should not block for a response. + p.state = busy + wcs.StatusCode = WinCrashStatusCodeBusy + + // Make a new copy of the wcs for async processing while returning "Busy" + // for the current response. + p.status = &WinCrashStatus{ + FileName: wcs.FileName, + Type: wcs.Type, + } + + go p.parseCrashDumpAsync() + } + + case busy: + // The crash dump processing is not done yet. Reply busy. + if p.status != nil { + wcs.FileName = p.status.FileName + wcs.Type = p.status.Type + } + wcs.StatusCode = WinCrashStatusCodeBusy + + case failed: + fallthrough + case completed: + // The crash dump processing was done, return the result. + if p.status != nil { + // This result is cached for all subsequent queries. + wcs = p.status + } else { + wcs.StatusCode = WinCrashStatusCodeFailed + } } - parseCrashDump(wcs) return wcs } diff --git a/pkg/collector/corechecks/system/wincrashdetect/wincrashdetect_windows_test.go b/pkg/collector/corechecks/system/wincrashdetect/wincrashdetect_windows_test.go index ffb9ce3647dbc..ff1cdb0ced1c8 100644 --- a/pkg/collector/corechecks/system/wincrashdetect/wincrashdetect_windows_test.go +++ b/pkg/collector/corechecks/system/wincrashdetect/wincrashdetect_windows_test.go @@ -10,11 +10,10 @@ package wincrashdetect import ( "net" "net/http" - - //"strings" + "sync" "testing" + "time" - //"github.com/stretchr/testify/require" "github.com/stretchr/testify/assert" "github.com/DataDog/datadog-agent/cmd/system-probe/utils" @@ -108,7 +107,7 @@ func TestWinCrashReporting(t *testing.T) { // set the return value handled in the check handler above p = &probe.WinCrashStatus{ - Success: true, + StatusCode: probe.WinCrashStatusCodeSuccess, } check := newCheck() @@ -128,7 +127,7 @@ func TestWinCrashReporting(t *testing.T) { testSetup(t) defer testCleanup() p = &probe.WinCrashStatus{ - Success: true, + StatusCode: probe.WinCrashStatusCodeSuccess, FileName: `c:\windows\memory.dmp`, Type: probe.DumpTypeAutomatic, DateString: `Fri Jun 30 15:33:05.086 2023 (UTC - 7:00)`, @@ -201,3 +200,163 @@ func TestWinCrashReporting(t *testing.T) { mock.AssertNumberOfCalls(t, "Commit", 2) }) } + +func TestCrashReportingStates(t *testing.T) { + var crashStatus *probe.WinCrashStatus + + listener, closefunc := createSystemProbeListener() + defer closefunc() + + pkgconfigsetup.InitSystemProbeConfig(pkgconfigsetup.SystemProbe()) + + mux := http.NewServeMux() + server := http.Server{ + Handler: mux, + } + defer server.Close() + + cp, err := probe.NewWinCrashProbe(nil) + assert.NotNil(t, cp) + assert.Nil(t, err) + + wg := sync.WaitGroup{} + + // This will artificially delay the "parsing" to ensure the first check gets a "busy" status. + delayedCrashDumpParser := func(wcs *probe.WinCrashStatus) { + time.Sleep(4 * time.Second) + + assert.Equal(t, `c:\windows\memory.dmp`, wcs.FileName) + assert.Equal(t, probe.DumpTypeAutomatic, wcs.Type) + + wcs.StatusCode = probe.WinCrashStatusCodeSuccess + wcs.ErrString = crashStatus.ErrString + wcs.DateString = crashStatus.DateString + wcs.Offender = crashStatus.Offender + wcs.BugCheck = crashStatus.BugCheck + + // Signal that the artificial delay is done. + wg.Done() + } + + // This ensures that no crash dump parsing should happen. + noCrashDumpParser := func(_ *probe.WinCrashStatus) { + assert.FailNow(t, "Should not parse") + } + + mux.Handle("/windows_crash_detection/check", http.HandlerFunc(func(rw http.ResponseWriter, _ *http.Request) { + results := cp.Get() + utils.WriteAsJSON(rw, results) + })) + mux.Handle("/debug/stats", http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) { + })) + go server.Serve(listener) + + t.Run("test reporting a crash with a busy intermediate state", func(t *testing.T) { + testSetup(t) + defer testCleanup() + + check := newCheck() + crashCheck := check.(*WinCrashDetect) + mock := mocksender.NewMockSender(crashCheck.ID()) + err := crashCheck.Configure(mock.GetSenderManager(), 0, nil, nil, "") + assert.NoError(t, err) + + crashStatus = &probe.WinCrashStatus{ + StatusCode: probe.WinCrashStatusCodeSuccess, + FileName: `c:\windows\memory.dmp`, + Type: probe.DumpTypeAutomatic, + ErrString: "", + DateString: `Fri Jun 30 15:33:05.086 2023 (UTC - 7:00)`, + Offender: `somedriver.sys`, + BugCheck: "0x00000007", + } + + // Test the 2-check response from crash reporting. + cp.SetCachedSettings(crashStatus) + probe.OverrideCrashDumpParser(delayedCrashDumpParser) + + // First run should be "busy" and not return an event yet. + wg.Add(1) + err = crashCheck.Run() + assert.Nil(t, err) + mock.AssertNumberOfCalls(t, "Gauge", 0) + mock.AssertNumberOfCalls(t, "Rate", 0) + mock.AssertNumberOfCalls(t, "Event", 0) + mock.AssertNumberOfCalls(t, "Commit", 0) + + // Wait for the artificial delay to finish, plus a small time buffer. + wg.Wait() + time.Sleep(4 * time.Second) + + expected := event.Event{ + Priority: event.PriorityNormal, + SourceTypeName: CheckName, + EventType: CheckName, + AlertType: event.AlertTypeError, + Title: formatTitle(crashStatus), + Text: formatText(crashStatus), + } + + mock.On("Event", expected).Return().Times(1) + mock.On("Commit").Return().Times(1) + + // The result should be available now. + err = crashCheck.Run() + assert.Nil(t, err) + mock.AssertNumberOfCalls(t, "Gauge", 0) + mock.AssertNumberOfCalls(t, "Rate", 0) + mock.AssertNumberOfCalls(t, "Event", 1) + mock.AssertNumberOfCalls(t, "Commit", 1) + }) + + t.Run("test that no crash is reported", func(t *testing.T) { + testSetup(t) + defer testCleanup() + + check := newCheck() + crashCheck := check.(*WinCrashDetect) + mock := mocksender.NewMockSender(crashCheck.ID()) + err := crashCheck.Configure(mock.GetSenderManager(), 0, nil, nil, "") + assert.NoError(t, err) + + noCrashStatus := &probe.WinCrashStatus{ + StatusCode: probe.WinCrashStatusCodeSuccess, + FileName: "", + } + + // Test finding no crashes. The response should be immediate. + cp.SetCachedSettings(noCrashStatus) + probe.OverrideCrashDumpParser(noCrashDumpParser) + err = crashCheck.Run() + assert.Nil(t, err) + mock.AssertNumberOfCalls(t, "Gauge", 0) + mock.AssertNumberOfCalls(t, "Rate", 0) + mock.AssertNumberOfCalls(t, "Event", 0) + mock.AssertNumberOfCalls(t, "Commit", 0) + }) + + t.Run("test failure on reading crash settings", func(t *testing.T) { + testSetup(t) + defer testCleanup() + + check := newCheck() + crashCheck := check.(*WinCrashDetect) + mock := mocksender.NewMockSender(crashCheck.ID()) + err := crashCheck.Configure(mock.GetSenderManager(), 0, nil, nil, "") + assert.NoError(t, err) + + failedStatus := &probe.WinCrashStatus{ + StatusCode: probe.WinCrashStatusCodeFailed, + ErrString: "Mocked failure", + } + + // Test having a failure reading setings. The response should be immediate. + cp.SetCachedSettings(failedStatus) + probe.OverrideCrashDumpParser(noCrashDumpParser) + err = crashCheck.Run() + assert.NotNil(t, err) + mock.AssertNumberOfCalls(t, "Rate", 0) + mock.AssertNumberOfCalls(t, "Event", 0) + mock.AssertNumberOfCalls(t, "Commit", 0) + }) +} diff --git a/pkg/util/crashreport/crashreport.go b/pkg/util/crashreport/crashreport.go index 0bab9ca167051..8c04cdbb6c11c 100644 --- a/pkg/util/crashreport/crashreport.go +++ b/pkg/util/crashreport/crashreport.go @@ -115,6 +115,13 @@ func (wcr *WinCrashReporter) CheckForCrash() (*probe.WinCrashStatus, error) { if !ok { return nil, fmt.Errorf("Raw data has incorrect type") } + + // Crash dump processing is not done yet, nothing to send at the moment. Try later. + if crash.StatusCode == probe.WinCrashStatusCodeBusy { + log.Infof("Crash dump processing is busy") + return nil, nil + } + /* * originally did this with a sync.once. The problem is the check is run prior to the * system probe being successfully started. This is OK; we just need to detect the BSOD @@ -124,7 +131,7 @@ func (wcr *WinCrashReporter) CheckForCrash() (*probe.WinCrashStatus, error) { * we don't need to run any more */ wcr.hasRunOnce = true - if !crash.Success { + if crash.StatusCode == probe.WinCrashStatusCodeFailed { return nil, fmt.Errorf("Error getting crash data %s", crash.ErrString) } diff --git a/releasenotes/notes/windowscrashreportasync-1b2c77f9ebeafdd5.yaml b/releasenotes/notes/windowscrashreportasync-1b2c77f9ebeafdd5.yaml new file mode 100644 index 0000000000000..16552ffd1708b --- /dev/null +++ b/releasenotes/notes/windowscrashreportasync-1b2c77f9ebeafdd5.yaml @@ -0,0 +1,14 @@ +# Each section from every release note are combined when the +# CHANGELOG.rst is rendered. So the text needs to be worded so that +# it does not depend on any information only available in another +# section. This may mean repeating some details, but each section +# must be readable independently of the other. +# +# Each section note must be formatted as reStructuredText. +--- +enhancements: + - | + On Windows, the endpoint /windows_crash_detection/check has been modified to report crashes in + an asynchronous manner, to allow processing of large crash dumps without blocking or timing out. + The first check will return a busy status and continue to do so until the processing is completed. +