From 8990be35990b9833ad98d9bc254667560b7a9951 Mon Sep 17 00:00:00 2001 From: Paulo Janotti Date: Fri, 15 Mar 2024 10:05:12 -0700 Subject: [PATCH] Respect telemetry configuration when running as a Windows service (#9726) **Description:** Fixes #5300 With this change the service telemetry section is respected by the collector when running as a Windows service. Log lever can be used to control the verbosity of the events logged and the logger can be redirected to a file by specifying an output path on the service telemetry config. By default `stdout` and `stderr` are redirected to the event log when running as a Windows service to keep the current behavior. The code change itself was made with a focus of not breaking the public APIs and not reading the config more than once. That said it is probably something to be refactored when the public APIs can be touched again. **Link to tracking Issue:** #5300 **Testing:** The test is an integration test that depends on the actual executable. It checks for event publication and file output. --- ...ows-service-respects-telemetry-config.yaml | 25 +++ .github/workflows/build-and-test-windows.yaml | 7 +- otelcol/collector.go | 6 +- otelcol/collector_windows.go | 30 ++- otelcol/collector_windows_service_test.go | 185 ++++++++++++++++++ otelcol/testdata/otel-log-to-file.yaml | 30 +++ 6 files changed, 272 insertions(+), 11 deletions(-) create mode 100644 .chloggen/windows-service-respects-telemetry-config.yaml create mode 100644 otelcol/collector_windows_service_test.go create mode 100644 otelcol/testdata/otel-log-to-file.yaml diff --git a/.chloggen/windows-service-respects-telemetry-config.yaml b/.chloggen/windows-service-respects-telemetry-config.yaml new file mode 100644 index 00000000000..312765f92d2 --- /dev/null +++ b/.chloggen/windows-service-respects-telemetry-config.yaml @@ -0,0 +1,25 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: 'bug_fix' + +# The name of the component, or a single word describing the area of concern, (e.g. otlpreceiver) +component: otelcol + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Respect telemetry configuration when running as a Windows service + +# One or more tracking issues or pull requests related to the change +issues: [5300] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: + +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [user] diff --git a/.github/workflows/build-and-test-windows.yaml b/.github/workflows/build-and-test-windows.yaml index c15649edeb7..8f70d45ae82 100644 --- a/.github/workflows/build-and-test-windows.yaml +++ b/.github/workflows/build-and-test-windows.yaml @@ -61,15 +61,16 @@ jobs: - name: Install otelcorecol as a service run: | - New-Service -Name "otelcorecol" -BinaryPathName "${PWD}\bin\otelcorecol_windows_amd64 --config ${PWD}\examples\local\otel-config.yaml" + New-Service -Name "otelcorecol" -StartupType "Manual" -BinaryPathName "${PWD}\bin\otelcorecol_windows_amd64 --config ${PWD}\examples\local\otel-config.yaml" eventcreate.exe /t information /id 1 /l application /d "Creating event provider for 'otelcorecol'" /so otelcorecol - name: Test otelcorecol service + working-directory: ${{ github.workspace }}/otelcol run: | - (Start-Service otelcorecol -PassThru).WaitForStatus('Running', '00:00:30') - (Stop-Service otelcorecol -PassThru).WaitForStatus('Stopped', '00:00:30') + go test -timeout 90s -run ^TestCollectorAsService$ -v -tags=win32service - name: Remove otelcorecol service + if: always() run: | Remove-Service otelcorecol Remove-Item HKLM:\SYSTEM\CurrentControlSet\Services\EventLog\Application\otelcorecol diff --git a/otelcol/collector.go b/otelcol/collector.go index 3fd5cb2ca31..a1500f5b81c 100644 --- a/otelcol/collector.go +++ b/otelcol/collector.go @@ -99,8 +99,9 @@ type Collector struct { configProvider ConfigProvider - service *service.Service - state *atomic.Int32 + serviceConfig *service.Config + service *service.Service + state *atomic.Int32 // shutdownChan is used to terminate the collector. shutdownChan chan struct{} @@ -182,6 +183,7 @@ func (col *Collector) setupConfigurationComponents(ctx context.Context) error { return fmt.Errorf("invalid configuration: %w", err) } + col.serviceConfig = &cfg.Service col.service, err = service.New(ctx, service.Settings{ BuildInfo: col.set.BuildInfo, CollectorConf: conf, diff --git a/otelcol/collector_windows.go b/otelcol/collector_windows.go index f443d930b17..cc0a3611628 100644 --- a/otelcol/collector_windows.go +++ b/otelcol/collector_windows.go @@ -18,6 +18,7 @@ import ( "golang.org/x/sys/windows/svc/eventlog" "go.opentelemetry.io/collector/featuregate" + "go.opentelemetry.io/collector/service" ) type windowsService struct { @@ -76,11 +77,6 @@ func (s *windowsService) Execute(args []string, requests <-chan svc.ChangeReques } func (s *windowsService) start(elog *eventlog.Log, colErrorChannel chan error) error { - // Append to new slice instead of the already existing s.settings.LoggingOptions slice to not change that. - s.settings.LoggingOptions = append( - []zap.Option{zap.WrapCore(withWindowsCore(elog))}, - s.settings.LoggingOptions..., - ) // Parse all the flags manually. if err := s.flags.Parse(os.Args[1:]); err != nil { return err @@ -96,6 +92,18 @@ func (s *windowsService) start(elog *eventlog.Log, colErrorChannel chan error) e return err } + // The logging options need to be in place before the collector Run method is called + // since the telemetry creates the logger at the time of the Run method call. + // However, the zap.WrapCore function needs to read the serviceConfig to determine + // if the Windows Event Log should be used, however, the serviceConfig is also + // only read at the time of the Run method call. To work around this, we pass the + // serviceConfig as a pointer to the logging options, and then read its value + // when the zap.Logger is created by the telemetry. + s.col.set.LoggingOptions = append( + s.col.set.LoggingOptions, + zap.WrapCore(withWindowsCore(elog, &s.col.serviceConfig)), + ) + // col.Run blocks until receiving a SIGTERM signal, so needs to be started // asynchronously, but it will exit early if an error occurs on startup go func() { @@ -192,8 +200,18 @@ func (w windowsEventLogCore) Sync() error { return w.core.Sync() } -func withWindowsCore(elog *eventlog.Log) func(zapcore.Core) zapcore.Core { +func withWindowsCore(elog *eventlog.Log, serviceConfig **service.Config) func(zapcore.Core) zapcore.Core { return func(core zapcore.Core) zapcore.Core { + if serviceConfig != nil { + for _, output := range (*serviceConfig).Telemetry.Logs.OutputPaths { + if output != "stdout" && output != "stderr" { + // A log file was specified in the configuration, so we should not use the Windows Event Log + return core + } + } + } + + // Use the Windows Event Log encoderConfig := zap.NewProductionEncoderConfig() encoderConfig.LineEnding = "\r\n" return windowsEventLogCore{core, elog, zapcore.NewConsoleEncoder(encoderConfig)} diff --git a/otelcol/collector_windows_service_test.go b/otelcol/collector_windows_service_test.go new file mode 100644 index 00000000000..999fa87bbbc --- /dev/null +++ b/otelcol/collector_windows_service_test.go @@ -0,0 +1,185 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//go:build windows && win32service + +package otelcol + +import ( + "encoding/xml" + "fmt" + "os" + "os/exec" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/require" + "golang.org/x/sys/windows/svc" + "golang.org/x/sys/windows/svc/mgr" +) + +const ( + collectorServiceName = "otelcorecol" +) + +// Test the collector as a Windows service. +// The test assumes that the service and respective event source are already created. +// The test also must be executed with administrative privileges. +func TestCollectorAsService(t *testing.T) { + collector_executable, err := filepath.Abs(filepath.Join("..", "bin", "otelcorecol_windows_amd64")) + require.NoError(t, err) + _, err = os.Stat(collector_executable) + require.NoError(t, err) + + scm, err := mgr.Connect() + require.NoError(t, err) + defer scm.Disconnect() + + service, err := scm.OpenService(collectorServiceName) + require.NoError(t, err) + defer service.Close() + + tests := []struct { + name string + configFile string + expectStartFailure bool + customSetup func(*testing.T) + customValidation func(*testing.T) + }{ + { + name: "Default", + configFile: filepath.Join("..", "examples", "local", "otel-config.yaml"), + }, + { + name: "ConfigFileNotFound", + configFile: filepath.Join(".", "non", "existent", "otel-config.yaml"), + expectStartFailure: true, + }, + { + name: "LogToFile", + configFile: filepath.Join(".", "testdata", "otel-log-to-file.yaml"), + customSetup: func(t *testing.T) { + // Create the folder and clean the log file if it exists + programDataPath := os.Getenv("ProgramData") + logsPath := filepath.Join(programDataPath, "OpenTelemetry", "Collector", "Logs") + err := os.MkdirAll(logsPath, os.ModePerm) + require.NoError(t, err) + + logFilePath := filepath.Join(logsPath, "otelcol.log") + err = os.Remove(logFilePath) + if err != nil && !os.IsNotExist(err) { + require.NoError(t, err) + } + }, + customValidation: func(t *testing.T) { + // Check that the log file was created + programDataPath := os.Getenv("ProgramData") + logsPath := filepath.Join(programDataPath, "OpenTelemetry", "Collector", "Logs") + logFilePath := filepath.Join(logsPath, "otelcol.log") + fileinfo, err := os.Stat(logFilePath) + require.NoError(t, err) + require.NotEmpty(t, fileinfo.Size()) + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + serviceConfig, err := service.Config() + require.NoError(t, err) + + // Setup the command line to launch the collector as a service + fullConfigPath, err := filepath.Abs(tt.configFile) + require.NoError(t, err) + + serviceConfig.BinaryPathName = fmt.Sprintf("\"%s\" --config \"%s\"", collector_executable, fullConfigPath) + err = service.UpdateConfig(serviceConfig) + require.NoError(t, err) + + if tt.customSetup != nil { + tt.customSetup(t) + } + + startTime := time.Now() + + err = service.Start() + require.NoError(t, err) + + expectedState := svc.Running + if tt.expectStartFailure { + expectedState = svc.Stopped + } else { + defer func() { + _, err = service.Control(svc.Stop) + require.NoError(t, err) + + require.Eventually(t, func() bool { + status, _ := service.Query() + return status.State == svc.Stopped + }, 10*time.Second, 500*time.Millisecond) + }() + } + + // Wait for the service to reach the expected state + require.Eventually(t, func() bool { + status, _ := service.Query() + return status.State == expectedState + }, 10*time.Second, 500*time.Millisecond) + + if tt.customValidation != nil { + tt.customValidation(t) + } else { + // Read the events from the otelcorecol source and check that they were emitted after the service + // command started. This is a simple validation that the messages are being logged on the + // Windows event log. + cmd := exec.Command("wevtutil.exe", "qe", "Application", "/c:1", "/rd:true", "/f:RenderedXml", "/q:*[System[Provider[@Name='otelcorecol']]]") + out, err := cmd.CombinedOutput() + require.NoError(t, err) + + var e Event + require.NoError(t, xml.Unmarshal([]byte(out), &e)) + + eventTime, err := time.Parse("2006-01-02T15:04:05.9999999Z07:00", e.System.TimeCreated.SystemTime) + require.NoError(t, err) + + require.True(t, eventTime.After(startTime.In(time.UTC))) + } + }) + } +} + +// Helper types to read the XML events from the event log using wevtutil +type Event struct { + XMLName xml.Name `xml:"Event"` + System System `xml:"System"` + Data string `xml:"EventData>Data"` +} + +type System struct { + Provider Provider `xml:"Provider"` + EventID int `xml:"EventID"` + Version int `xml:"Version"` + Level int `xml:"Level"` + Task int `xml:"Task"` + Opcode int `xml:"Opcode"` + Keywords string `xml:"Keywords"` + TimeCreated TimeCreated `xml:"TimeCreated"` + EventRecordID int `xml:"EventRecordID"` + Execution Execution `xml:"Execution"` + Channel string `xml:"Channel"` + Computer string `xml:"Computer"` +} + +type Provider struct { + Name string `xml:"Name,attr"` +} + +type TimeCreated struct { + SystemTime string `xml:"SystemTime,attr"` +} + +type Execution struct { + ProcessID string `xml:"ProcessID,attr"` + ThreadID string `xml:"ThreadID,attr"` +} diff --git a/otelcol/testdata/otel-log-to-file.yaml b/otelcol/testdata/otel-log-to-file.yaml new file mode 100644 index 00000000000..e618aca22fc --- /dev/null +++ b/otelcol/testdata/otel-log-to-file.yaml @@ -0,0 +1,30 @@ +extensions: + zpages: + +receivers: + otlp: + protocols: + grpc: + http: + +exporters: + debug: + verbosity: detailed + +service: + telemetry: + logs: + level: warn + output_paths: + # The folder need to be created prior to starting the collector + - ${ProgramData}\OpenTelemetry\Collector\Logs\otelcol.log + + pipelines: + traces: + receivers: [otlp] + exporters: [debug] + metrics: + receivers: [otlp] + exporters: [debug] + + extensions: [zpages]