diff --git a/README.md b/README.md index d1d63ff..462bb76 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,12 @@ It requires a `provided.al2` environment and respects the following env vars: `Key=Value,Other=Value` containing the Cloudwatch dimensions to index metrics under. +To adjust timeouts, and connection pooling in the HTTP client use the following env vars: + +- `BUILDKITE_AGENT_METRICS_TIMEOUT` : Timeout, in seconds, TLS handshake and idle connections, for HTTP requests, to Buildkite API (default 15). +- `BUILDKITE_AGENT_METRICS_MAX_IDLE_CONNS` : Maximum number of idle (keep-alive) HTTP connections + for Buildkite Agent API. Zero means no limit, -1 disables pooling (default 100). + Additionally, one of the following groups of environment variables must be set in order to define how the Lambda function should obtain the required Buildkite Agent API token: @@ -140,43 +146,47 @@ docker run --rm buildkite-agent-metrics -token abc123 -interval 30s -queue my-qu $ buildkite-agent-metrics --help Usage of buildkite-agent-metrics: -backend string - Specify the backend to use: cloudwatch, statsd, prometheus, stackdriver (default "cloudwatch") + Specify the backend to use: cloudwatch, newrelic, prometheus, stackdriver, statsd (default "cloudwatch") -cloudwatch-dimensions string - Cloudwatch dimensions to index metrics under, in the form of Key=Value, Other=Value + Cloudwatch dimensions to index metrics under, in the form of Key=Value, Other=Value -cloudwatch-region string - AWS Region to connect to, defaults to $AWS_REGION or us-east-1 + AWS Region to connect to, defaults to $AWS_REGION or us-east-1 -debug - Show debug output + Show debug output -debug-http - Show full http traces + Show full http traces -dry-run - Whether to only print metrics + Whether to only print metrics -endpoint string - A custom Buildkite Agent API endpoint (default "https://agent.buildkite.com/v3") + A custom Buildkite Agent API endpoint (default "https://agent.buildkite.com/v3") -interval duration - Update metrics every interval, rather than once + Update metrics every interval, rather than once + -max-idle-conns int + Maximum number of idle (keep-alive) HTTP connections for Buildkite Agent API. Zero means no limit, -1 disables connection reuse. (default 100) -newrelic-app-name string - New Relic application name for metric events + New Relic application name for metric events -newrelic-license-key string - New Relic license key for publishing events + New Relic license key for publishing events -prometheus-addr string - Prometheus metrics transport bind address (default ":8080") + Prometheus metrics transport bind address (default ":8080") -prometheus-path string - Prometheus metrics transport path (default "/metrics") + Prometheus metrics transport path (default "/metrics") -queue value - Specific queues to process + Specific queues to process -quiet - Only print errors + Only print errors -stackdriver-projectid string - Specify Stackdriver Project ID + Specify Stackdriver Project ID -statsd-host string - Specify the StatsD server (default "127.0.0.1:8125") + Specify the StatsD server (default "127.0.0.1:8125") -statsd-tags - Whether your StatsD server supports tagging like Datadog - -token string - A Buildkite Agent Registration Token + Whether your StatsD server supports tagging like Datadog + -timeout int + Timeout, in seconds, for HTTP requests to Buildkite API (default 15) + -token value + Buildkite Agent registration tokens. At least one is required. Multiple cluster tokens can be used to gather metrics for multiple clusters. -version - Show the version + Show the version ``` ### Backends diff --git a/collector/collector.go b/collector/collector.go index 03362ab..2e5241d 100644 --- a/collector/collector.go +++ b/collector/collector.go @@ -407,18 +407,25 @@ func traceHTTPRequest(req *http.Request) *http.Request { return req } -func NewHTTPClient(timeout int) *http.Client { +func NewHTTPClient(timeout, maxIdleConns int) *http.Client { connectionTimeout := time.Duration(timeout) * time.Second return &http.Client{ Timeout: connectionTimeout, Transport: &http.Transport{ + MaxIdleConns: maxIdleConns, + IdleConnTimeout: connectionTimeout, + ResponseHeaderTimeout: connectionTimeout, + DisableKeepAlives: false, Dial: (&net.Dialer{ Timeout: connectionTimeout, KeepAlive: connectionTimeout, }).Dial, TLSHandshakeTimeout: connectionTimeout, }, + CheckRedirect: func(req *http.Request, via []*http.Request) error { + return http.ErrUseLastResponse + }, } } diff --git a/lambda/main.go b/lambda/main.go index 431fb14..3922163 100644 --- a/lambda/main.go +++ b/lambda/main.go @@ -56,6 +56,7 @@ func Handler(ctx context.Context, evt json.RawMessage) (string, error) { quietString := os.Getenv("BUILDKITE_QUIET") quiet := quietString == "1" || quietString == "true" timeout := os.Getenv("BUILDKITE_AGENT_METRICS_TIMEOUT") + maxIdleConns := os.Getenv("BUILDKITE_AGENT_METRICS_MAX_IDLE_CONNS") debugEnvVar := os.Getenv("BUILDKITE_AGENT_METRICS_DEBUG") debug := debugEnvVar == "1" || debugEnvVar == "true" @@ -94,17 +95,17 @@ func Handler(ctx context.Context, evt json.RawMessage) (string, error) { queues = strings.Split(queue, ",") } - if timeout == "" { - timeout = "15" + configuredTimeout, err := toIntWithDefault(timeout, 15) + if err != nil { + return "", err } - configuredTimeout, err := strconv.Atoi(timeout) - + configuredMaxIdleConns, err := toIntWithDefault(maxIdleConns, 100) // Default to 100 in line with http.DefaultTransport if err != nil { return "", err } - httpClient := collector.NewHTTPClient(configuredTimeout) + httpClient := collector.NewHTTPClient(configuredTimeout, configuredMaxIdleConns) userAgent := fmt.Sprintf("buildkite-agent-metrics/%s buildkite-agent-metrics-lambda", version.Version) @@ -274,3 +275,11 @@ func checkMutuallyExclusiveEnvVars(varNames ...string) error { return fmt.Errorf("the environment variables [%s] are mutually exclusive", strings.Join(foundVars, ",")) } } + +func toIntWithDefault(val string, defaultVal int) (int, error) { + if val == "" { + return defaultVal, nil + } + + return strconv.Atoi(val) +} diff --git a/lambda/main_test.go b/lambda/main_test.go new file mode 100644 index 0000000..70dbce0 --- /dev/null +++ b/lambda/main_test.go @@ -0,0 +1,44 @@ +package main + +import "testing" + +func Test_toIntWithDefault(t *testing.T) { + type args struct { + val string + defaultVal int + } + tests := []struct { + name string + args args + want int + wantErr bool + }{ + { + name: "empty", + args: args{val: "", defaultVal: 10}, + want: 10, + }, + { + name: "invalid", + args: args{val: "invalid", defaultVal: 10}, + wantErr: true, + }, + { + name: "valid", + args: args{val: "20", defaultVal: 10}, + want: 20, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := toIntWithDefault(tt.args.val, tt.args.defaultVal) + if (err != nil) != tt.wantErr { + t.Errorf("toIntWithDefault(%q, %d) error = %v, wantErr %v", tt.args.val, tt.args.defaultVal, err, tt.wantErr) + return + } + if got != tt.want { + t.Errorf("toIntWithDefault(%q, %d) = %v, want %v", tt.args.val, tt.args.defaultVal, got, tt.want) + } + }) + } +} diff --git a/main.go b/main.go index 1363907..cc4da90 100644 --- a/main.go +++ b/main.go @@ -20,14 +20,15 @@ var metricsBackend backend.Backend func main() { var ( - interval = flag.Duration("interval", 0, "Update metrics every interval, rather than once") - showVersion = flag.Bool("version", false, "Show the version") - quiet = flag.Bool("quiet", false, "Only print errors") - debug = flag.Bool("debug", false, "Show debug output") - debugHttp = flag.Bool("debug-http", false, "Show full http traces") - dryRun = flag.Bool("dry-run", false, "Whether to only print metrics") - endpoint = flag.String("endpoint", "https://agent.buildkite.com/v3", "A custom Buildkite Agent API endpoint") - timeout = flag.Int("timeout", 15, "Timeout, in seconds, for HTTP requests to Buildkite API") + interval = flag.Duration("interval", 0, "Update metrics every interval, rather than once") + showVersion = flag.Bool("version", false, "Show the version") + quiet = flag.Bool("quiet", false, "Only print errors") + debug = flag.Bool("debug", false, "Show debug output") + debugHttp = flag.Bool("debug-http", false, "Show full http traces") + dryRun = flag.Bool("dry-run", false, "Whether to only print metrics") + endpoint = flag.String("endpoint", "https://agent.buildkite.com/v3", "A custom Buildkite Agent API endpoint") + timeout = flag.Int("timeout", 15, "Timeout, in seconds, TLS handshake and idle connections, for HTTP requests, to Buildkite API") + maxIdleConns = flag.Int("max-idle-conns", 100, "Maximum number of idle (keep-alive) HTTP connections for Buildkite Agent API. Zero means no limit, -1 disables connection reuse.") // backend config backendOpt = flag.String("backend", "cloudwatch", "Specify the backend to use: cloudwatch, newrelic, prometheus, stackdriver, statsd") @@ -141,7 +142,7 @@ func main() { } } - httpClient := collector.NewHTTPClient(*timeout) + httpClient := collector.NewHTTPClient(*timeout, *maxIdleConns) collectors := make([]*collector.Collector, 0, len(tokens)) for _, token := range tokens {