Skip to content

Commit

Permalink
[NETPATH-297] Update timeout to be per hop instead of per path (#29092)
Browse files Browse the repository at this point in the history
  • Loading branch information
ken-schneider authored Sep 7, 2024
1 parent d003e99 commit d44450f
Show file tree
Hide file tree
Showing 10 changed files with 117 additions and 27 deletions.
16 changes: 8 additions & 8 deletions cmd/agent/dist/conf.d/network_path.d/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ init_config:
#
# min_collection_interval: 60

## @param timeout - integer - optional - default: 10000
## Specifies how much time the full traceroute should take
## in milliseconds
## @param timeout - integer - optional - default: 1000
## Specifies how much time in milliseconds the traceroute should
## wait for a response from each hop before timing out.
#
# timeout: 10000
# timeout: 1000

# Network Path integration is used to monitor individual endpoints.
# Supported platforms are Linux and Windows. macOS is not supported yet.
Expand All @@ -36,11 +36,11 @@ instances:
#
# max_ttl: <PORT>

## @param timeout - integer - optional - default: 10000
## Specifies how much time the full traceroute should take
## in milliseconds
## @param timeout - integer - optional - default: 1000
## Specifies how much time in milliseconds the traceroute should
## wait for a response from each hop before timing out.
#
# timeout: 10000
# timeout: 1000

## @param min_collection_interval - number - optional - default: 60
## Specifies how frequently we should probe the endpoint.
Expand Down
4 changes: 2 additions & 2 deletions cmd/system-probe/modules/traceroute.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ func (t *traceroute) RegisterGRPC(_ grpc.ServiceRegistrar) error {
func (t *traceroute) Close() {}

func logTracerouteRequests(cfg tracerouteutil.Config, client string, runCount uint64, start time.Time) {
args := []interface{}{cfg.DestHostname, client, cfg.DestPort, cfg.MaxTTL, cfg.Timeout, runCount, time.Since(start)}
msg := "Got request on /traceroute/%s?client_id=%s&port=%d&maxTTL=%d&timeout=%d (count: %d): retrieved traceroute in %s"
args := []interface{}{cfg.DestHostname, client, cfg.DestPort, cfg.MaxTTL, cfg.Timeout, cfg.Protocol, runCount, time.Since(start)}
msg := "Got request on /traceroute/%s?client_id=%s&port=%d&maxTTL=%d&timeout=%d&protocol=%s (count: %d): retrieved traceroute in %s"
switch {
case runCount <= 5, runCount%20 == 0:
log.Infof(msg, args...)
Expand Down
16 changes: 14 additions & 2 deletions pkg/collector/corechecks/networkpath/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,17 @@ const (
defaultCheckInterval time.Duration = 1 * time.Minute
)

// Number is a type that is used to make a generic version
// of the firstNonZero function
type Number interface {
~int | ~int64 | ~uint8
}

// InitConfig is used to deserialize integration init config
type InitConfig struct {
MinCollectionInterval int64 `yaml:"min_collection_interval"`
TimeoutMs int64 `yaml:"timeout"`
MaxTTL uint8 `yaml:"max_ttl"`
}

// InstanceConfig is used to deserialize integration instance config
Expand Down Expand Up @@ -83,7 +90,6 @@ func NewCheckConfig(rawInstance integration.Data, rawInitConfig integration.Data
c.DestPort = instance.DestPort
c.SourceService = instance.SourceService
c.DestinationService = instance.DestinationService
c.MaxTTL = instance.MaxTTL
c.Protocol = payload.Protocol(strings.ToUpper(instance.Protocol))

c.MinCollectionInterval = firstNonZero(
Expand All @@ -104,13 +110,19 @@ func NewCheckConfig(rawInstance integration.Data, rawInitConfig integration.Data
return nil, fmt.Errorf("timeout must be > 0")
}

c.MaxTTL = firstNonZero(
instance.MaxTTL,
initConfig.MaxTTL,
setup.DefaultNetworkPathMaxTTL,
)

c.Tags = instance.Tags
c.Namespace = coreconfig.Datadog().GetString("network_devices.namespace")

return c, nil
}

func firstNonZero(values ...time.Duration) time.Duration {
func firstNonZero[T Number](values ...T) T {
for _, value := range values {
if value != 0 {
return value
Expand Down
67 changes: 67 additions & 0 deletions pkg/collector/corechecks/networkpath/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ hostname: 1.2.3.4
MinCollectionInterval: time.Duration(60) * time.Second,
Namespace: "my-namespace",
Timeout: setup.DefaultNetworkPathTimeout * time.Millisecond,
MaxTTL: setup.DefaultNetworkPathMaxTTL,
},
},
{
Expand Down Expand Up @@ -71,6 +72,7 @@ min_collection_interval: 10
MinCollectionInterval: time.Duration(42) * time.Second,
Namespace: "my-namespace",
Timeout: setup.DefaultNetworkPathTimeout * time.Millisecond,
MaxTTL: setup.DefaultNetworkPathMaxTTL,
},
},
{
Expand All @@ -86,6 +88,7 @@ min_collection_interval: 10
MinCollectionInterval: time.Duration(10) * time.Second,
Namespace: "my-namespace",
Timeout: setup.DefaultNetworkPathTimeout * time.Millisecond,
MaxTTL: setup.DefaultNetworkPathMaxTTL,
},
},
{
Expand All @@ -98,6 +101,7 @@ hostname: 1.2.3.4
MinCollectionInterval: time.Duration(1) * time.Minute,
Namespace: "my-namespace",
Timeout: setup.DefaultNetworkPathTimeout * time.Millisecond,
MaxTTL: setup.DefaultNetworkPathMaxTTL,
},
},
{
Expand All @@ -115,6 +119,7 @@ destination_service: service-b
MinCollectionInterval: time.Duration(60) * time.Second,
Namespace: "my-namespace",
Timeout: setup.DefaultNetworkPathTimeout * time.Millisecond,
MaxTTL: setup.DefaultNetworkPathMaxTTL,
},
},
{
Expand All @@ -130,6 +135,7 @@ protocol: udp
Namespace: "my-namespace",
Protocol: payload.ProtocolUDP,
Timeout: setup.DefaultNetworkPathTimeout * time.Millisecond,
MaxTTL: setup.DefaultNetworkPathMaxTTL,
},
},
{
Expand All @@ -145,6 +151,7 @@ protocol: UDP
Namespace: "my-namespace",
Protocol: payload.ProtocolUDP,
Timeout: setup.DefaultNetworkPathTimeout * time.Millisecond,
MaxTTL: setup.DefaultNetworkPathMaxTTL,
},
},
{
Expand All @@ -160,6 +167,7 @@ protocol: TCP
Namespace: "my-namespace",
Protocol: payload.ProtocolTCP,
Timeout: setup.DefaultNetworkPathTimeout * time.Millisecond,
MaxTTL: setup.DefaultNetworkPathMaxTTL,
},
},
{
Expand All @@ -177,6 +185,7 @@ min_collection_interval: 10
MinCollectionInterval: time.Duration(42) * time.Second,
Namespace: "my-namespace",
Timeout: 50000 * time.Millisecond,
MaxTTL: setup.DefaultNetworkPathMaxTTL,
},
},
{
Expand All @@ -195,6 +204,7 @@ timeout: 70000
MinCollectionInterval: time.Duration(42) * time.Second,
Namespace: "my-namespace",
Timeout: 50000 * time.Millisecond,
MaxTTL: setup.DefaultNetworkPathMaxTTL,
},
},
{
Expand All @@ -212,6 +222,7 @@ timeout: 70000
MinCollectionInterval: time.Duration(42) * time.Second,
Namespace: "my-namespace",
Timeout: 70000 * time.Millisecond,
MaxTTL: setup.DefaultNetworkPathMaxTTL,
},
},
{
Expand All @@ -228,6 +239,7 @@ min_collection_interval: 10
MinCollectionInterval: time.Duration(42) * time.Second,
Namespace: "my-namespace",
Timeout: setup.DefaultNetworkPathTimeout * time.Millisecond,
MaxTTL: setup.DefaultNetworkPathMaxTTL,
},
},
{
Expand All @@ -242,6 +254,61 @@ timeout: -1
`),
expectedError: "timeout must be > 0",
},
{
name: "maxTTL from instance config",
rawInstance: []byte(`
hostname: 1.2.3.4
max_ttl: 50
min_collection_interval: 42
`),
rawInitConfig: []byte(`
min_collection_interval: 10
`),
expectedConfig: &CheckConfig{
DestHostname: "1.2.3.4",
MinCollectionInterval: time.Duration(42) * time.Second,
Namespace: "my-namespace",
Timeout: setup.DefaultNetworkPathTimeout * time.Millisecond,
MaxTTL: 50,
},
},
{
name: "maxTTL from instance config preferred over init config",
rawInstance: []byte(`
hostname: 1.2.3.4
max_ttl: 50
min_collection_interval: 42
`),
rawInitConfig: []byte(`
min_collection_interval: 10
max_ttl: 64
`),
expectedConfig: &CheckConfig{
DestHostname: "1.2.3.4",
MinCollectionInterval: time.Duration(42) * time.Second,
Namespace: "my-namespace",
Timeout: setup.DefaultNetworkPathTimeout * time.Millisecond,
MaxTTL: 50,
},
},
{
name: "maxTTL from init config",
rawInstance: []byte(`
hostname: 1.2.3.4
min_collection_interval: 42
`),
rawInitConfig: []byte(`
min_collection_interval: 10
max_ttl: 64
`),
expectedConfig: &CheckConfig{
DestHostname: "1.2.3.4",
MinCollectionInterval: time.Duration(42) * time.Second,
Namespace: "my-namespace",
Timeout: setup.DefaultNetworkPathTimeout * time.Millisecond,
MaxTTL: 64,
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
Expand Down
7 changes: 5 additions & 2 deletions pkg/config/setup/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,10 @@ const (
DefaultMaxMessageSizeBytes = 256 * 1000

// DefaultNetworkPathTimeout defines the default timeout for a network path test
DefaultNetworkPathTimeout = 10000
DefaultNetworkPathTimeout = 1000

// DefaultNetworkPathMaxTTL defines the default maximum TTL for traceroute tests
DefaultNetworkPathMaxTTL = 30
)

// datadog is the global configuration object
Expand Down Expand Up @@ -437,7 +440,7 @@ func InitConfig(config pkgconfigmodel.Config) {
config.BindEnvAndSetDefault("network_path.connections_monitoring.enabled", false)
config.BindEnvAndSetDefault("network_path.collector.workers", 4)
config.BindEnvAndSetDefault("network_path.collector.timeout", DefaultNetworkPathTimeout)
config.BindEnvAndSetDefault("network_path.collector.max_ttl", 30)
config.BindEnvAndSetDefault("network_path.collector.max_ttl", DefaultNetworkPathMaxTTL)
config.BindEnvAndSetDefault("network_path.collector.input_chan_size", 1000)
config.BindEnvAndSetDefault("network_path.collector.processing_chan_size", 1000)
config.BindEnvAndSetDefault("network_path.collector.pathtest_contexts_limit", 10000)
Expand Down
2 changes: 1 addition & 1 deletion pkg/config/setup/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -665,7 +665,7 @@ func TestNetworkPathDefaults(t *testing.T) {

assert.Equal(t, false, config.GetBool("network_path.connections_monitoring.enabled"))
assert.Equal(t, 4, config.GetInt("network_path.collector.workers"))
assert.Equal(t, 10000, config.GetInt("network_path.collector.timeout"))
assert.Equal(t, 1000, config.GetInt("network_path.collector.timeout"))
assert.Equal(t, 30, config.GetInt("network_path.collector.max_ttl"))
assert.Equal(t, 1000, config.GetInt("network_path.collector.input_chan_size"))
assert.Equal(t, 1000, config.GetInt("network_path.collector.processing_chan_size"))
Expand Down
6 changes: 2 additions & 4 deletions pkg/networkpath/traceroute/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,6 @@ const (
DefaultNumPaths = 1
// DefaultMinTTL defines the default minimum TTL
DefaultMinTTL = 1
// DefaultMaxTTL defines the default maximum TTL
DefaultMaxTTL = 30
// DefaultDelay defines the default delay
DefaultDelay = 50 //msec
// DefaultOutputFormat defines the default output format
Expand Down Expand Up @@ -117,12 +115,12 @@ func (r *Runner) RunTraceroute(ctx context.Context, cfg Config) (payload.Network

maxTTL := cfg.MaxTTL
if maxTTL == 0 {
maxTTL = DefaultMaxTTL
maxTTL = setup.DefaultNetworkPathMaxTTL
}

var timeout time.Duration
if cfg.Timeout == 0 {
timeout = setup.DefaultNetworkPathTimeout * time.Millisecond
timeout = setup.DefaultNetworkPathTimeout * time.Duration(maxTTL) * time.Millisecond
} else {
timeout = cfg.Timeout
}
Expand Down
8 changes: 1 addition & 7 deletions pkg/networkpath/traceroute/tcp/tcpv4.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,15 +103,9 @@ func (t *TCPv4) TracerouteSequential() (*Results, error) {
// hops should be of length # of hops
hops := make([]*Hop, 0, t.MaxTTL-t.MinTTL)

// TODO: better logic around timeout for sequential is needed
// right now we're just hacking around the existing
// need to convert uint8 to int for proper conversion to
// time.Duration
timeout := t.Timeout / time.Duration(int(t.MaxTTL-t.MinTTL))

for i := int(t.MinTTL); i <= int(t.MaxTTL); i++ {
seqNumber := rand.Uint32()
hop, err := t.sendAndReceive(rawIcmpConn, rawTCPConn, i, seqNumber, timeout)
hop, err := t.sendAndReceive(rawIcmpConn, rawTCPConn, i, seqNumber, t.Timeout)
if err != nil {
return nil, fmt.Errorf("failed to run traceroute: %w", err)
}
Expand Down
4 changes: 3 additions & 1 deletion pkg/process/net/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,9 @@ func (r *RemoteSysProbeUtil) GetPing(clientID string, host string, count int, in

// GetTraceroute returns the results of a traceroute to a host
func (r *RemoteSysProbeUtil) GetTraceroute(clientID string, host string, port uint16, protocol nppayload.Protocol, maxTTL uint8, timeout time.Duration) ([]byte, error) {
ctx, cancel := context.WithTimeout(context.Background(), timeout+10*time.Second) // allow extra time for the system probe communication overhead
httpTimeout := timeout*time.Duration(maxTTL) + 10*time.Second // allow extra time for the system probe communication overhead, calculate full timeout for TCP traceroute
log.Tracef("Network Path traceroute HTTP request timeout: %s", httpTimeout)
ctx, cancel := context.WithTimeout(context.Background(), httpTimeout)
defer cancel()

req, err := http.NewRequestWithContext(ctx, "GET", fmt.Sprintf("%s/%s?client_id=%s&port=%d&max_ttl=%d&timeout=%d&protocol=%s", tracerouteURL, host, clientID, port, maxTTL, timeout, protocol), nil)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Each section from every release note are combined when the
# CHANGELOG.rst is rendered. So the text needs to be worded so that
# it does not depend on any information only available in another
# section. This may mean repeating some details, but each section
# must be readable independently of the other.
#
# Each section note must be formatted as reStructuredText.
---
upgrade:
- |
Changes behavior of the timeout for Network Path. Previously, the timeout
signified the total time to wait for a full traceroute to complete. Now,
the timeout signifies the time to wait for each hop in the traceroute.
Additionally, the default timeout has been changed to 1000ms.

0 comments on commit d44450f

Please sign in to comment.