From 711eec1f54e6849a735acb7579246e724412a457 Mon Sep 17 00:00:00 2001 From: Mark Wolfe Date: Tue, 4 Jun 2024 10:55:52 +1000 Subject: [PATCH 1/5] This change enables configuration of http connection pooling To provide customers with more options to tune for their situation this introduces a new option to reduce or disable connection reuse. In some cases disabling connection reuse may avoid read timeouts or pausing for connections a server, or CDN may deprioritize. I have also updated the documentation for the expanded timeout option to highlight it is used for more than just request timeout. --- README.md | 50 +++++++++++++++++++++++++----------------- collector/collector.go | 9 +++++++- lambda/main.go | 11 +++++++++- main.go | 19 ++++++++-------- 4 files changed, 58 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index d1d63ff..d8dcdf7 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,12 @@ It requires a `provided.al2` environment and respects the following env vars: `Key=Value,Other=Value` containing the Cloudwatch dimensions to index metrics under. +To adjust timeouts, and connection pooling in the HTTP client use the following env vars: + +- `BUILDKITE_AGENT_METRICS_TIMEOUT` : Timeout, in seconds, TLS handshake and idle connections, for HTTP requests, to Buildkite API (default 15). +- `BUILDKITE_AGENT_METRICS_MAX_IDLE_CONNS` : Maximum number of idle (keep-alive) HTTP connections + for Buildkite Agent API. Zero means no limit, -1 disables pooling (default 0). + Additionally, one of the following groups of environment variables must be set in order to define how the Lambda function should obtain the required Buildkite Agent API token: @@ -140,43 +146,47 @@ docker run --rm buildkite-agent-metrics -token abc123 -interval 30s -queue my-qu $ buildkite-agent-metrics --help Usage of buildkite-agent-metrics: -backend string - Specify the backend to use: cloudwatch, statsd, prometheus, stackdriver (default "cloudwatch") + Specify the backend to use: cloudwatch, newrelic, prometheus, stackdriver, statsd (default "cloudwatch") -cloudwatch-dimensions string - Cloudwatch dimensions to index metrics under, in the form of Key=Value, Other=Value + Cloudwatch dimensions to index metrics under, in the form of Key=Value, Other=Value -cloudwatch-region string - AWS Region to connect to, defaults to $AWS_REGION or us-east-1 + AWS Region to connect to, defaults to $AWS_REGION or us-east-1 -debug - Show debug output + Show debug output -debug-http - Show full http traces + Show full http traces -dry-run - Whether to only print metrics + Whether to only print metrics -endpoint string - A custom Buildkite Agent API endpoint (default "https://agent.buildkite.com/v3") + A custom Buildkite Agent API endpoint (default "https://agent.buildkite.com/v3") -interval duration - Update metrics every interval, rather than once + Update metrics every interval, rather than once + -max-idle-conns int + Maximum number of idle (keep-alive) HTTP connections for Buildkite Agent API. Zero means no limit, -1 disables pooling. -newrelic-app-name string - New Relic application name for metric events + New Relic application name for metric events -newrelic-license-key string - New Relic license key for publishing events + New Relic license key for publishing events -prometheus-addr string - Prometheus metrics transport bind address (default ":8080") + Prometheus metrics transport bind address (default ":8080") -prometheus-path string - Prometheus metrics transport path (default "/metrics") + Prometheus metrics transport path (default "/metrics") -queue value - Specific queues to process + Specific queues to process -quiet - Only print errors + Only print errors -stackdriver-projectid string - Specify Stackdriver Project ID + Specify Stackdriver Project ID -statsd-host string - Specify the StatsD server (default "127.0.0.1:8125") + Specify the StatsD server (default "127.0.0.1:8125") -statsd-tags - Whether your StatsD server supports tagging like Datadog - -token string - A Buildkite Agent Registration Token + Whether your StatsD server supports tagging like Datadog + -timeout int + Timeout, in seconds, for HTTP requests to Buildkite API (default 15) + -token value + Buildkite Agent registration tokens. At least one is required. Multiple cluster tokens can be used to gather metrics for multiple clusters. -version - Show the version + Show the version ``` ### Backends diff --git a/collector/collector.go b/collector/collector.go index 03362ab..2e5241d 100644 --- a/collector/collector.go +++ b/collector/collector.go @@ -407,18 +407,25 @@ func traceHTTPRequest(req *http.Request) *http.Request { return req } -func NewHTTPClient(timeout int) *http.Client { +func NewHTTPClient(timeout, maxIdleConns int) *http.Client { connectionTimeout := time.Duration(timeout) * time.Second return &http.Client{ Timeout: connectionTimeout, Transport: &http.Transport{ + MaxIdleConns: maxIdleConns, + IdleConnTimeout: connectionTimeout, + ResponseHeaderTimeout: connectionTimeout, + DisableKeepAlives: false, Dial: (&net.Dialer{ Timeout: connectionTimeout, KeepAlive: connectionTimeout, }).Dial, TLSHandshakeTimeout: connectionTimeout, }, + CheckRedirect: func(req *http.Request, via []*http.Request) error { + return http.ErrUseLastResponse + }, } } diff --git a/lambda/main.go b/lambda/main.go index 431fb14..746c07b 100644 --- a/lambda/main.go +++ b/lambda/main.go @@ -56,6 +56,7 @@ func Handler(ctx context.Context, evt json.RawMessage) (string, error) { quietString := os.Getenv("BUILDKITE_QUIET") quiet := quietString == "1" || quietString == "true" timeout := os.Getenv("BUILDKITE_AGENT_METRICS_TIMEOUT") + maxIdleConns := os.Getenv("BUILDKITE_AGENT_METRICS_MAX_IDLE_CONNS") debugEnvVar := os.Getenv("BUILDKITE_AGENT_METRICS_DEBUG") debug := debugEnvVar == "1" || debugEnvVar == "true" @@ -98,13 +99,21 @@ func Handler(ctx context.Context, evt json.RawMessage) (string, error) { timeout = "15" } + if maxIdleConns == "" { + maxIdleConns = "0" // default to unlimited + } + configuredTimeout, err := strconv.Atoi(timeout) + if err != nil { + return "", err + } + configuredMaxIdleConns, err := strconv.Atoi(maxIdleConns) if err != nil { return "", err } - httpClient := collector.NewHTTPClient(configuredTimeout) + httpClient := collector.NewHTTPClient(configuredTimeout, configuredMaxIdleConns) userAgent := fmt.Sprintf("buildkite-agent-metrics/%s buildkite-agent-metrics-lambda", version.Version) diff --git a/main.go b/main.go index 1363907..0982842 100644 --- a/main.go +++ b/main.go @@ -20,14 +20,15 @@ var metricsBackend backend.Backend func main() { var ( - interval = flag.Duration("interval", 0, "Update metrics every interval, rather than once") - showVersion = flag.Bool("version", false, "Show the version") - quiet = flag.Bool("quiet", false, "Only print errors") - debug = flag.Bool("debug", false, "Show debug output") - debugHttp = flag.Bool("debug-http", false, "Show full http traces") - dryRun = flag.Bool("dry-run", false, "Whether to only print metrics") - endpoint = flag.String("endpoint", "https://agent.buildkite.com/v3", "A custom Buildkite Agent API endpoint") - timeout = flag.Int("timeout", 15, "Timeout, in seconds, for HTTP requests to Buildkite API") + interval = flag.Duration("interval", 0, "Update metrics every interval, rather than once") + showVersion = flag.Bool("version", false, "Show the version") + quiet = flag.Bool("quiet", false, "Only print errors") + debug = flag.Bool("debug", false, "Show debug output") + debugHttp = flag.Bool("debug-http", false, "Show full http traces") + dryRun = flag.Bool("dry-run", false, "Whether to only print metrics") + endpoint = flag.String("endpoint", "https://agent.buildkite.com/v3", "A custom Buildkite Agent API endpoint") + timeout = flag.Int("timeout", 15, "Timeout, in seconds, TLS handshake and idle connections, for HTTP requests, to Buildkite API") + maxIdleConns = flag.Int("max-idle-conns", 0, "Maximum number of idle (keep-alive) HTTP connections for Buildkite Agent API. Zero means no limit, -1 disables pooling.") // backend config backendOpt = flag.String("backend", "cloudwatch", "Specify the backend to use: cloudwatch, newrelic, prometheus, stackdriver, statsd") @@ -141,7 +142,7 @@ func main() { } } - httpClient := collector.NewHTTPClient(*timeout) + httpClient := collector.NewHTTPClient(*timeout, *maxIdleConns) collectors := make([]*collector.Collector, 0, len(tokens)) for _, token := range tokens { From c3e9a572c51c9a13e8065e43572616d52a940f46 Mon Sep 17 00:00:00 2001 From: Mark Wolfe Date: Wed, 5 Jun 2024 13:21:28 +1000 Subject: [PATCH 2/5] Updated documentation and defaults --- README.md | 4 ++-- lambda/main.go | 2 +- main.go | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d8dcdf7..462bb76 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ To adjust timeouts, and connection pooling in the HTTP client use the following - `BUILDKITE_AGENT_METRICS_TIMEOUT` : Timeout, in seconds, TLS handshake and idle connections, for HTTP requests, to Buildkite API (default 15). - `BUILDKITE_AGENT_METRICS_MAX_IDLE_CONNS` : Maximum number of idle (keep-alive) HTTP connections - for Buildkite Agent API. Zero means no limit, -1 disables pooling (default 0). + for Buildkite Agent API. Zero means no limit, -1 disables pooling (default 100). Additionally, one of the following groups of environment variables must be set in order to define how the Lambda function should obtain the required Buildkite @@ -162,7 +162,7 @@ Usage of buildkite-agent-metrics: -interval duration Update metrics every interval, rather than once -max-idle-conns int - Maximum number of idle (keep-alive) HTTP connections for Buildkite Agent API. Zero means no limit, -1 disables pooling. + Maximum number of idle (keep-alive) HTTP connections for Buildkite Agent API. Zero means no limit, -1 disables connection reuse. (default 100) -newrelic-app-name string New Relic application name for metric events -newrelic-license-key string diff --git a/lambda/main.go b/lambda/main.go index 746c07b..7e2a2dd 100644 --- a/lambda/main.go +++ b/lambda/main.go @@ -100,7 +100,7 @@ func Handler(ctx context.Context, evt json.RawMessage) (string, error) { } if maxIdleConns == "" { - maxIdleConns = "0" // default to unlimited + maxIdleConns = "100" // Default to 100 in line with http.DefaultTransport } configuredTimeout, err := strconv.Atoi(timeout) diff --git a/main.go b/main.go index 0982842..cc4da90 100644 --- a/main.go +++ b/main.go @@ -28,7 +28,7 @@ func main() { dryRun = flag.Bool("dry-run", false, "Whether to only print metrics") endpoint = flag.String("endpoint", "https://agent.buildkite.com/v3", "A custom Buildkite Agent API endpoint") timeout = flag.Int("timeout", 15, "Timeout, in seconds, TLS handshake and idle connections, for HTTP requests, to Buildkite API") - maxIdleConns = flag.Int("max-idle-conns", 0, "Maximum number of idle (keep-alive) HTTP connections for Buildkite Agent API. Zero means no limit, -1 disables pooling.") + maxIdleConns = flag.Int("max-idle-conns", 100, "Maximum number of idle (keep-alive) HTTP connections for Buildkite Agent API. Zero means no limit, -1 disables connection reuse.") // backend config backendOpt = flag.String("backend", "cloudwatch", "Specify the backend to use: cloudwatch, newrelic, prometheus, stackdriver, statsd") From 5e1c6df72dc82fa3eb2921a4f1ed02cf9eeb2524 Mon Sep 17 00:00:00 2001 From: Mark Wolfe Date: Wed, 5 Jun 2024 14:11:19 +1000 Subject: [PATCH 3/5] Improve the default values for int based environment vars --- lambda/main.go | 20 ++++++++++---------- lambda/main_test.go | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 10 deletions(-) create mode 100644 lambda/main_test.go diff --git a/lambda/main.go b/lambda/main.go index 7e2a2dd..3922163 100644 --- a/lambda/main.go +++ b/lambda/main.go @@ -95,20 +95,12 @@ func Handler(ctx context.Context, evt json.RawMessage) (string, error) { queues = strings.Split(queue, ",") } - if timeout == "" { - timeout = "15" - } - - if maxIdleConns == "" { - maxIdleConns = "100" // Default to 100 in line with http.DefaultTransport - } - - configuredTimeout, err := strconv.Atoi(timeout) + configuredTimeout, err := toIntWithDefault(timeout, 15) if err != nil { return "", err } - configuredMaxIdleConns, err := strconv.Atoi(maxIdleConns) + configuredMaxIdleConns, err := toIntWithDefault(maxIdleConns, 100) // Default to 100 in line with http.DefaultTransport if err != nil { return "", err } @@ -283,3 +275,11 @@ func checkMutuallyExclusiveEnvVars(varNames ...string) error { return fmt.Errorf("the environment variables [%s] are mutually exclusive", strings.Join(foundVars, ",")) } } + +func toIntWithDefault(val string, defaultVal int) (int, error) { + if val == "" { + return defaultVal, nil + } + + return strconv.Atoi(val) +} diff --git a/lambda/main_test.go b/lambda/main_test.go new file mode 100644 index 0000000..2e8f4ea --- /dev/null +++ b/lambda/main_test.go @@ -0,0 +1,43 @@ +package main + +import "testing" + +func Test_toIntWithDefault(t *testing.T) { + type args struct { + value string + defaults int + } + tests := []struct { + name string + args args + want int + wantErr bool + }{ + { + name: "empty", + args: args{value: "", defaults: 10}, + want: 10, + }, + { + name: "invalid", + args: args{value: "invalid", defaults: 10}, + wantErr: true, + }, + { + name: "valid", + args: args{value: "20", defaults: 10}, + want: 20, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got, err := toIntWithDefault(tt.args.value, tt.args.defaults); (err != nil) != tt.wantErr { + t.Errorf("toIntWithDefault() error = %v, wantErr %v", err, tt.wantErr) + } else { + if got != tt.want { + t.Errorf("toIntWithDefault() = %v, want %v", got, tt.want) + } + } + }) + } +} From d5242a3778cf894a5563b7414e092be0edd13eaf Mon Sep 17 00:00:00 2001 From: Mark Wolfe Date: Wed, 5 Jun 2024 15:31:23 +1000 Subject: [PATCH 4/5] Update lambda/main_test.go Co-authored-by: Josh Deprez --- lambda/main_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lambda/main_test.go b/lambda/main_test.go index 2e8f4ea..27bc46e 100644 --- a/lambda/main_test.go +++ b/lambda/main_test.go @@ -32,10 +32,10 @@ func Test_toIntWithDefault(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { if got, err := toIntWithDefault(tt.args.value, tt.args.defaults); (err != nil) != tt.wantErr { - t.Errorf("toIntWithDefault() error = %v, wantErr %v", err, tt.wantErr) + t.Errorf("toIntWithDefault(%q, %d) error = %v, wantErr %v", tt.args.value, tt.args.defaults, err, tt.wantErr) } else { if got != tt.want { - t.Errorf("toIntWithDefault() = %v, want %v", got, tt.want) + t.Errorf("toIntWithDefault(%q, %d) = %v, want %v", tt.args.value, tt.args.defaults, got, tt.want) } } }) From 0e08a5d5c02335bb23ce5b7b15cd5558cda98dd0 Mon Sep 17 00:00:00 2001 From: Mark Wolfe Date: Wed, 5 Jun 2024 15:36:29 +1000 Subject: [PATCH 5/5] Updated test to github.com/cweill/gotests/gotests to simplify --- lambda/main_test.go | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/lambda/main_test.go b/lambda/main_test.go index 27bc46e..70dbce0 100644 --- a/lambda/main_test.go +++ b/lambda/main_test.go @@ -4,8 +4,8 @@ import "testing" func Test_toIntWithDefault(t *testing.T) { type args struct { - value string - defaults int + val string + defaultVal int } tests := []struct { name string @@ -15,28 +15,29 @@ func Test_toIntWithDefault(t *testing.T) { }{ { name: "empty", - args: args{value: "", defaults: 10}, + args: args{val: "", defaultVal: 10}, want: 10, }, { name: "invalid", - args: args{value: "invalid", defaults: 10}, + args: args{val: "invalid", defaultVal: 10}, wantErr: true, }, { name: "valid", - args: args{value: "20", defaults: 10}, + args: args{val: "20", defaultVal: 10}, want: 20, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got, err := toIntWithDefault(tt.args.value, tt.args.defaults); (err != nil) != tt.wantErr { - t.Errorf("toIntWithDefault(%q, %d) error = %v, wantErr %v", tt.args.value, tt.args.defaults, err, tt.wantErr) - } else { - if got != tt.want { - t.Errorf("toIntWithDefault(%q, %d) = %v, want %v", tt.args.value, tt.args.defaults, got, tt.want) - } + got, err := toIntWithDefault(tt.args.val, tt.args.defaultVal) + if (err != nil) != tt.wantErr { + t.Errorf("toIntWithDefault(%q, %d) error = %v, wantErr %v", tt.args.val, tt.args.defaultVal, err, tt.wantErr) + return + } + if got != tt.want { + t.Errorf("toIntWithDefault(%q, %d) = %v, want %v", tt.args.val, tt.args.defaultVal, got, tt.want) } }) }