diff --git a/changelog/fragments/1700520158-dynamic-timeout-for-diags.yaml b/changelog/fragments/1700520158-dynamic-timeout-for-diags.yaml new file mode 100644 index 00000000000..d56af718c48 --- /dev/null +++ b/changelog/fragments/1700520158-dynamic-timeout-for-diags.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: enhancement + +# Change summary; an 80ish characters long description of the change. +summary: Use shorter timeouts for diagnostic requests unless CPU diagnostics are requested + +# Long description; in case the summary is not enough to describe the change +# this field accommodates a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +description: Use different timeout values for diagnostics requests, depending on whether CPU diagnostics are requested + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: diagnostics + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR.
+#pr: https://github.com/owner/repo/1234 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +issue: https://github.com/elastic/elastic-agent/issues/3197 diff --git a/pkg/component/runtime/manager.go b/pkg/component/runtime/manager.go index 81fde1090b0..a375ddf816a 100644 --- a/pkg/component/runtime/manager.go +++ b/pkg/component/runtime/manager.go @@ -43,8 +43,11 @@ const ( // maxCheckinMisses is the maximum number of check-in misses a component can miss before it is killed // and restarted. maxCheckinMisses = 3 - // diagnosticTimeout is the maximum amount of time to wait for a diagnostic response from a unit. - diagnosticTimeout = time.Minute + // diagnosticTimeoutCPU is the maximum amount of time to wait for a diagnostic response from a unit while collecting CPU profiles + diagnosticTimeoutCPU = time.Minute + + // diagnosticTimeout is the maximum amount of time to wait for a diagnostic response from a unit + diagnosticTimeout = time.Second * 20 // stopCheckRetryPeriod is a idle time between checks for component stopped state stopCheckRetryPeriod = 200 * time.Millisecond @@ -944,7 +947,15 @@ func (m *Manager) getListenAddr() string { // performDiagAction creates a diagnostic ActionRequest and executes it against the runtime that's mapped to the specified component. // if the specified actionLevel is ActionRequest_COMPONENT, the unit field is ignored. func (m *Manager) performDiagAction(ctx context.Context, comp component.Component, unit component.Unit, actionLevel proto.ActionRequest_Level, params client.DiagnosticParams) ([]*proto.ActionDiagnosticUnitResult, error) { - ctx, cancel := context.WithTimeout(ctx, diagnosticTimeout) + // if we're gathering CPU diagnostics, request a longer timeout; CPU diag collection requires the diagnostic hook to sit and gather a CPU profile. 
+ finalDiagnosticTime := diagnosticTimeout + for _, tag := range params.AdditionalMetrics { + if tag == "CPU" { + finalDiagnosticTime = diagnosticTimeoutCPU + break + } + } + ctx, cancel := context.WithTimeout(ctx, finalDiagnosticTime) defer cancel() id, err := uuid.NewV4() @@ -966,7 +977,7 @@ func (m *Manager) performDiagAction(ctx context.Context, comp component.Componen } if len(params.AdditionalMetrics) > 0 { - m.logger.Debugf("Performing diagnostic action with params: %v", params.AdditionalMetrics) + m.logger.Debugf("Performing diagnostic action with params: %v; will wait %s", params.AdditionalMetrics, finalDiagnosticTime) } marshalParams, err := json.Marshal(params) if err != nil { @@ -989,7 +1000,7 @@ func (m *Manager) performDiagAction(ctx context.Context, comp component.Componen // the only way this can return an error is a context Done(), be sure to make that explicit. if err != nil { if errors.Is(context.DeadlineExceeded, err) { - return nil, fmt.Errorf("diagnostic action timed out, deadline is %s: %w", diagnosticTimeout, err) + return nil, fmt.Errorf("diagnostic action timed out, deadline is %s: %w", finalDiagnosticTime, err) } return nil, fmt.Errorf("error running performAction: %w", err) }