Skip to content

Commit

Permalink
Use different timeout values for CPU and non-CPU diagnostic requests (e…
Browse files Browse the repository at this point in the history
…lastic#3794)

* use different timeout values for diagnostics

* update readme, add break

---------

Co-authored-by: Pierre HILBERT <[email protected]>
  • Loading branch information
fearful-symmetry and pierrehilbert authored Nov 27, 2023
1 parent 80c29c0 commit 3afa073
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 5 deletions.
32 changes: 32 additions & 0 deletions changelog/fragments/1700520158-dynamic-timeout-for-diags.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: enhancement

# Change summary; a 80ish characters long description of the change.
summary: Use shorter timeouts for diagnostic requests unless CPU diagnostics are requested

# Long description; in case the summary is not enough to describe the change
# this field accommodate a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
description: Use different timeout values for diagnostics requests, depending on if CPU diagnostics are requested

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: diagnostics

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
#pr: https://github.com/owner/repo/1234

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
issue: https://github.com/elastic/elastic-agent/issues/3197
21 changes: 16 additions & 5 deletions pkg/component/runtime/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,11 @@ const (
// maxCheckinMisses is the maximum number of check-in misses a component can miss before it is killed
// and restarted.
maxCheckinMisses = 3
// diagnosticTimeout is the maximum amount of time to wait for a diagnostic response from a unit.
diagnosticTimeout = time.Minute
// diagnosticTimeoutCPU is the maximum amount of time to wait for a diagnostic response from a unit while collecting CPU profiles
diagnosticTimeoutCPU = time.Minute

// diagnosticTimeout is the maximum amount of time to wait for a diagnostic response from a unit
diagnosticTimeout = time.Second * 20

// stopCheckRetryPeriod is a idle time between checks for component stopped state
stopCheckRetryPeriod = 200 * time.Millisecond
Expand Down Expand Up @@ -944,7 +947,15 @@ func (m *Manager) getListenAddr() string {
// performDiagAction creates a diagnostic ActionRequest and executes it against the runtime that's mapped to the specified component.
// if the specified actionLevel is ActionRequest_COMPONENT, the unit field is ignored.
func (m *Manager) performDiagAction(ctx context.Context, comp component.Component, unit component.Unit, actionLevel proto.ActionRequest_Level, params client.DiagnosticParams) ([]*proto.ActionDiagnosticUnitResult, error) {
ctx, cancel := context.WithTimeout(ctx, diagnosticTimeout)
// if we're gathering CPU diagnostics, request a longer timeout; CPU diag collection requires the diagnostic hook to sit and gather a CPU profile.
finalDiagnosticTime := diagnosticTimeout
for _, tag := range params.AdditionalMetrics {
if tag == "CPU" {
finalDiagnosticTime = diagnosticTimeoutCPU
break
}
}
ctx, cancel := context.WithTimeout(ctx, finalDiagnosticTime)
defer cancel()

id, err := uuid.NewV4()
Expand All @@ -966,7 +977,7 @@ func (m *Manager) performDiagAction(ctx context.Context, comp component.Componen
}

if len(params.AdditionalMetrics) > 0 {
m.logger.Debugf("Performing diagnostic action with params: %v", params.AdditionalMetrics)
m.logger.Debugf("Performing diagnostic action with params: %v; will wait %s", params.AdditionalMetrics, finalDiagnosticTime)
}
marshalParams, err := json.Marshal(params)
if err != nil {
Expand All @@ -989,7 +1000,7 @@ func (m *Manager) performDiagAction(ctx context.Context, comp component.Componen
// the only way this can return an error is a context Done(), be sure to make that explicit.
if err != nil {
if errors.Is(context.DeadlineExceeded, err) {
return nil, fmt.Errorf("diagnostic action timed out, deadline is %s: %w", diagnosticTimeout, err)
return nil, fmt.Errorf("diagnostic action timed out, deadline is %s: %w", finalDiagnosticTime, err)
}
return nil, fmt.Errorf("error running performAction: %w", err)
}
Expand Down

0 comments on commit 3afa073

Please sign in to comment.