From 33d115593a2c033c6e9f5db90a4b3aeffe201ecf Mon Sep 17 00:00:00 2001 From: Jack Phillips Date: Mon, 25 Nov 2024 14:02:15 -0500 Subject: [PATCH] clean exit test all services (#31334) --- .gitlab/e2e/e2e.yml | 22 ++-- cmd/security-agent/main_windows.go | 6 + .../subcommands/start/command.go | 7 +- pkg/util/winutil/servicemain/servicemain.go | 17 ++- .../fixtures/datadog-pa-disabled.yaml | 14 +++ .../fixtures/datadog-ta-disabled.yaml | 12 ++ .../fixtures/security-agent-disabled.yaml | 3 + .../windows/service-test/startstop_test.go | 117 ++++++++++++++---- 8 files changed, 146 insertions(+), 52 deletions(-) create mode 100644 test/new-e2e/tests/windows/service-test/fixtures/datadog-pa-disabled.yaml create mode 100644 test/new-e2e/tests/windows/service-test/fixtures/datadog-ta-disabled.yaml create mode 100644 test/new-e2e/tests/windows/service-test/fixtures/security-agent-disabled.yaml diff --git a/.gitlab/e2e/e2e.yml b/.gitlab/e2e/e2e.yml index b48c9d5c4d41c..aa3c21938b4ce 100644 --- a/.gitlab/e2e/e2e.yml +++ b/.gitlab/e2e/e2e.yml @@ -190,21 +190,13 @@ new-e2e-windows-service-test: variables: TARGETS: ./tests/windows/service-test TEAM: windows-agent - EXTRA_PARAMS: --run TestServiceBehavior - -# Temporary job for hunting a crash -new-e2e-windows-service-test-nofim: - extends: .new_e2e_template - needs: - - !reference [.needs_new_e2e_template] - - deploy_windows_testing-a7 - rules: - - !reference [.on_windows_service_or_e2e_changes] - - !reference [.manual] - variables: - TARGETS: ./tests/windows/service-test - TEAM: windows-agent - EXTRA_PARAMS: --run TestNoFIMServiceBehavior + parallel: + matrix: + - EXTRA_PARAMS: --run TestServiceBehaviorAgentCommand + - EXTRA_PARAMS: --run TestServiceBehaviorPowerShell + - EXTRA_PARAMS: --run TestServiceBehaviorWhenDisabledSystemProbe + - EXTRA_PARAMS: --run TestServiceBehaviorWhenDisabledProcessAgent + - EXTRA_PARAMS: --run TestServiceBehaviorWhenDisabledTraceAgent new-e2e-language-detection: extends: .new_e2e_template_needs_deb_x64 diff --git a/cmd/security-agent/main_windows.go b/cmd/security-agent/main_windows.go index d2e6ab66b4441..6aa47f99d56b4 100644 --- a/cmd/security-agent/main_windows.go +++ b/cmd/security-agent/main_windows.go @@ -10,6 +10,8 @@ package main import ( "context" + "errors" + "fmt" "os" "path" @@ -94,6 +96,10 @@ func (s *service) Run(svcctx context.Context) error { err := start.RunAgent(log, config, telemetry, statusComponent, settings, wmeta) if err != nil { + if errors.Is(err, start.ErrAllComponentsDisabled) { + // If all components are disabled, we should exit cleanly + return fmt.Errorf("%w: %w", servicemain.ErrCleanStopAfterInit, err) + } return err } diff --git a/cmd/security-agent/subcommands/start/command.go b/cmd/security-agent/subcommands/start/command.go index 84b4309024edd..6dec7ce712a4e 100644 --- a/cmd/security-agent/subcommands/start/command.go +++ b/cmd/security-agent/subcommands/start/command.go @@ -211,7 +211,7 @@ func start(log log.Component, config config.Component, _ secrets.Component, _ st defer StopAgent(log) err := RunAgent(log, config, telemetry, statusComponent, settings, wmeta) - if errors.Is(err, errAllComponentsDisabled) || errors.Is(err, errNoAPIKeyConfigured) { + if errors.Is(err, ErrAllComponentsDisabled) || errors.Is(err, errNoAPIKeyConfigured) { return nil } if err != nil { @@ -257,7 +257,8 @@ var ( expvarServer *http.Server ) -var errAllComponentsDisabled = errors.New("all security-agent component are disabled") +// ErrAllComponentsDisabled is returned when all components are disabled +var ErrAllComponentsDisabled = errors.New("all security-agent component are disabled") var errNoAPIKeyConfigured = errors.New("no API key configured") // RunAgent initialized resources and starts API server @@ -274,7 +275,7 @@ func RunAgent(log log.Component, config config.Component, telemetry telemetry.Co // to startup because of an error. Only applies on Debian 7. time.Sleep(5 * time.Second) - return errAllComponentsDisabled + return ErrAllComponentsDisabled } if !config.IsSet("api_key") { diff --git a/pkg/util/winutil/servicemain/servicemain.go b/pkg/util/winutil/servicemain/servicemain.go index 1660e121ab05e..dfc63c2c871c4 100644 --- a/pkg/util/winutil/servicemain/servicemain.go +++ b/pkg/util/winutil/servicemain/servicemain.go @@ -296,22 +296,21 @@ func (s *controlHandler) Execute(args []string, r <-chan svc.ChangeRequest, chan defer close(done) // Run the actual agent/service err = s.service.Run(ctx) - if err != nil { - s.eventlog(messagestrings.MSG_SERVICE_FAILED, err.Error()) - } }() select { case <-done: - if err != nil { + case <-cleanExitCtx.Done(): + err = errors.New("service did not cleanly shutdown in a timely manner, hard stopping service") + } + if err != nil { + if errors.Is(err, ErrCleanStopAfterInit) { + s.eventlog(messagestrings.MSG_AGENT_CLEAN_STOP_AFTER_INIT, err.Error()) + } else { + s.eventlog(messagestrings.MSG_SERVICE_FAILED, err.Error()) // since exitGate is meant to avoid an error, if we are returning // with an error then we can skip the exitGate. return } - case <-cleanExitCtx.Done(): - s.eventlog(messagestrings.MSG_SERVICE_FAILED, "service did not cleanly shutdown in a timely manner, hard stopping service") - // since exitGate is meant to avoid an error, if we are returning - // with an error then we can skip the exitGate. - return } } diff --git a/test/new-e2e/tests/windows/service-test/fixtures/datadog-pa-disabled.yaml b/test/new-e2e/tests/windows/service-test/fixtures/datadog-pa-disabled.yaml new file mode 100644 index 0000000000000..a932271b658ee --- /dev/null +++ b/test/new-e2e/tests/windows/service-test/fixtures/datadog-pa-disabled.yaml @@ -0,0 +1,14 @@ +# disable logs agent +logs_enabled: false + +# disable process agent +process_config: + process_collection: + enabled: false + container_collection: + enabled: false + enabled: "disabled" + process_discovery: + enabled: false + +# trace-agent is running by default diff --git a/test/new-e2e/tests/windows/service-test/fixtures/datadog-ta-disabled.yaml b/test/new-e2e/tests/windows/service-test/fixtures/datadog-ta-disabled.yaml new file mode 100644 index 0000000000000..1ccc2ae05a839 --- /dev/null +++ b/test/new-e2e/tests/windows/service-test/fixtures/datadog-ta-disabled.yaml @@ -0,0 +1,12 @@ +# enable logs agent +logs_enabled: true + +# enable process agent +process_config: + # live process + process_collection: + enabled: true + +# trace-agent is running by default +apm_config: + enabled: false \ No newline at end of file diff --git a/test/new-e2e/tests/windows/service-test/fixtures/security-agent-disabled.yaml b/test/new-e2e/tests/windows/service-test/fixtures/security-agent-disabled.yaml new file mode 100644 index 0000000000000..88b59095e28c7 --- /dev/null +++ b/test/new-e2e/tests/windows/service-test/fixtures/security-agent-disabled.yaml @@ -0,0 +1,3 @@ +# enable security agent +runtime_security_config: + enabled: false diff --git a/test/new-e2e/tests/windows/service-test/startstop_test.go b/test/new-e2e/tests/windows/service-test/startstop_test.go index 1e600ca0927a5..837766c78f5e7 100644 --- a/test/new-e2e/tests/windows/service-test/startstop_test.go +++ b/test/new-e2e/tests/windows/service-test/startstop_test.go @@ -33,6 +33,12 @@ import ( //go:embed fixtures/datadog.yaml var agentConfig string +//go:embed fixtures/datadog-pa-disabled.yaml +var agentConfigPADisabled string + +//go:embed fixtures/datadog-ta-disabled.yaml +var agentConfigTADisabled string + //go:embed fixtures/system-probe.yaml var systemProbeConfig string @@ -45,22 +51,25 @@ var systemProbeDisabled string //go:embed fixtures/security-agent.yaml var securityAgentConfig string +//go:embed fixtures/security-agent-disabled.yaml +var securityAgentConfigDisabled string + // TestServiceBehaviorAgentCommandNoFIM tests the service behavior when controlled by Agent commands func TestNoFIMServiceBehaviorAgentCommand(t *testing.T) { s := &agentServiceCommandSuite{} - run(t, s, systemProbeNoFIMConfig) + run(t, s, systemProbeNoFIMConfig, agentConfig, securityAgentConfig) } // TestServiceBehaviorPowerShellNoFIM tests the service behavior when controlled by PowerShell commands func TestNoFIMServiceBehaviorPowerShell(t *testing.T) { s := &powerShellServiceCommandSuite{} - run(t, s, systemProbeNoFIMConfig) + run(t, s, systemProbeNoFIMConfig, agentConfig, securityAgentConfig) } // TestServiceBehaviorAgentCommand tests the service behavior when controlled by Agent commands func TestServiceBehaviorAgentCommand(t *testing.T) { s := &agentServiceCommandSuite{} - run(t, s, systemProbeConfig) + run(t, s, systemProbeConfig, agentConfig, securityAgentConfig) } type agentServiceCommandSuite struct { @@ -96,7 +105,7 @@ func (s *agentServiceCommandSuite) SetupSuite() { // TestServiceBehaviorAgentCommand tests the service behavior when controlled by PowerShell commands func TestServiceBehaviorPowerShell(t *testing.T) { s := &powerShellServiceCommandSuite{} - run(t, s, systemProbeConfig) + run(t, s, systemProbeConfig, agentConfig, securityAgentConfig) } type powerShellServiceCommandSuite struct { @@ -222,14 +231,54 @@ func (s *powerShellServiceCommandSuite) TestHardExitEventLogEntry() { }, 1*time.Minute, 1*time.Second, "should have hard exit messages in the event log") } +type agentServiceDisabledSuite struct { + baseStartStopSuite + disabledServices []string +} + // TestServiceBehaviorWhenDisabled tests the service behavior when disabled in the configuration -func TestServiceBehaviorWhenDisabled(t *testing.T) { - s := &agentServiceDisabledSuite{} - run(t, s, systemProbeDisabled) +func TestServiceBehaviorWhenDisabledSystemProbe(t *testing.T) { + s := &agentServiceDisabledSystemProbeSuite{} + s.disabledServices = []string{ + "datadog-security-agent", + "datadog-system-probe", + "ddnpm", + "ddprocmon", + } + run(t, s, systemProbeDisabled, agentConfig, securityAgentConfigDisabled) } -type agentServiceDisabledSuite struct { - baseStartStopSuite +type agentServiceDisabledSystemProbeSuite struct { + agentServiceDisabledSuite +} + +// TestServiceBehaviorWhenDisabledProcessAgent tests the service behavior when disabled in the configuration +func TestServiceBehaviorWhenDisabledProcessAgent(t *testing.T) { + s := &agentServiceDisabledProcessAgentSuite{} + s.disabledServices = []string{ + "datadog-process-agent", + "datadog-security-agent", + "datadog-system-probe", + "ddnpm", + "ddprocmon", + } + run(t, s, systemProbeDisabled, agentConfigPADisabled, securityAgentConfigDisabled) +} + +type agentServiceDisabledProcessAgentSuite struct { + agentServiceDisabledSuite +} + +func TestServiceBehaviorWhenDisabledTraceAgent(t *testing.T) { + s := &agentServiceDisabledTraceAgentSuite{} + s.disabledServices = []string{ + "datadog-trace-agent", + } + run(t, s, systemProbeConfig, agentConfigTADisabled, securityAgentConfig) +} + +type agentServiceDisabledTraceAgentSuite struct { + agentServiceDisabledSuite } func (s *agentServiceDisabledSuite) SetupSuite() { @@ -237,14 +286,22 @@ func (s *agentServiceDisabledSuite) SetupSuite() { // set up the expected services before calling the base setup s.runningUserServices = func() []string { - return []string{ - "datadogagent", - "datadog-trace-agent", - "datadog-process-agent", + runningServices := []string{} + for _, service := range s.getInstalledUserServices() { + if !slices.Contains(s.disabledServices, service) { + runningServices = append(runningServices, service) + } } + return runningServices } s.runningServices = func() []string { - return s.runningUserServices() + runningServices := []string{} + for _, service := range s.getInstalledServices() { + if !slices.Contains(s.disabledServices, service) { + runningServices = append(runningServices, service) + } + } + return runningServices } s.startAgentCommand = func(host *components.RemoteHost) error { @@ -268,15 +325,21 @@ func (s *agentServiceDisabledSuite) SetupSuite() { } func (s *agentServiceDisabledSuite) TestStartingDisabledService() { + kernel := s.getInstalledKernelServices() // check that the system probe is not running - s.assertServiceState("Stopped", "datadog-system-probe") + for _, service := range s.disabledServices { + s.assertServiceState("Stopped", service) - // try and start it and verify that it does correctly outputs to event log - err := windowsCommon.StartService(s.Env().RemoteHost, "datadog-system-probe") - s.Require().NoError(err, "should start datadog-system-probe") + // verify that we only try user services + if !slices.Contains(kernel, service) { + // try and start it and verify that it does correctly outputs to event log + err := windowsCommon.StartService(s.Env().RemoteHost, service) + s.Require().NoError(err, fmt.Sprintf("should start %s", service)) - //verify that service returns to stopped state - s.assertServiceState("Stopped", "datadog-system-probe") + // verify that service returns to stopped state + s.assertServiceState("Stopped", service) + } + } // Verify there are not errors in the event log entries, err := s.getAgentEventLogErrorsAndWarnings() @@ -284,7 +347,7 @@ func (s *agentServiceDisabledSuite) TestStartingDisabledService() { s.Require().Empty(entries, "should not have errors or warnings from agents in the event log") } -func run[Env any](t *testing.T, s e2e.Suite[Env], systemProbeConfig string) { +func run[Env any](t *testing.T, s e2e.Suite[Env], systemProbeConfig string, agentConfig string, securityAgentConfig string) { opts := []e2e.SuiteOption{e2e.WithProvisioner(awsHostWindows.ProvisionerNoFakeIntake( awsHostWindows.WithAgentOptions( agentparams.WithAgentConfig(agentConfig), @@ -583,13 +646,17 @@ func (s *baseStartStopSuite) getInstalledUserServices() []string { } } -// expectedInstalledServices returns the list of services that should be installed by the agent -func (s *baseStartStopSuite) getInstalledServices() []string { - user := s.getInstalledUserServices() - kernel := []string{ +func (s *baseStartStopSuite) getInstalledKernelServices() []string { + return []string{ "ddnpm", "ddprocmon", } +} + +// expectedInstalledServices returns the list of services that should be installed by the agent +func (s *baseStartStopSuite) getInstalledServices() []string { + user := s.getInstalledUserServices() + kernel := s.getInstalledKernelServices() return append(user, kernel...) }