Split labels/series API endpoints in query frontend (thanos-io#3276)

* split metadata endpoints
  Signed-off-by: Ben Ye <[email protected]>
* add metadata codec implementation
  Signed-off-by: Ben Ye <[email protected]>
* cleanup go mod
  Signed-off-by: Ben Ye <[email protected]>
* add back fix for thanos-io#3240
  Signed-off-by: Ben Ye <[email protected]>
* check set key exist
  Signed-off-by: Ben Ye <[email protected]>
* update
  Signed-off-by: Ben Ye <[email protected]>
* add default metadata range flag
  Signed-off-by: Ben Ye <[email protected]>
* fix linting issues
  Signed-off-by: Ben Ye <[email protected]>
* refactor flags and add test cases
  Signed-off-by: Ben Ye <[email protected]>
* fix go lint issue
  Signed-off-by: Ben Ye <[email protected]>
* add roundtrip tripperware tests for labels and series requests
  Signed-off-by: Ben Ye <[email protected]>
* fix all unit tests and e2e tests
  Signed-off-by: Ben Ye <[email protected]>
* fix lint issues
  Signed-off-by: Ben Ye <[email protected]>
* add nolint unparam
  Signed-off-by: Ben Ye <[email protected]>
* update flags and changelog
  Signed-off-by: Ben Ye <[email protected]>
* switch response format to protobuf
  Signed-off-by: Ben Ye <[email protected]>
OGKevin · Oct 15, 2020 · 143ee10 · 143ee10
1 parent 212dbf6
commit 143ee10
Show file tree

Hide file tree

Showing 23 changed files with 2,602 additions and 288 deletions.
diff --git a/.golangci.yml b/.golangci.yml
@@ -53,6 +53,8 @@ linters-settings:
     exclude: ./.errcheck_excludes.txt
   misspell:
     locale: US
+  goconst:
+    min-occurrences: 5
 
 issues:
   exclude-rules:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
 ### Fixed
 - [#3261](https://github.com/thanos-io/thanos/pull/3261) Thanos Store: Use segment files specified in meta.json file, if present. If not present, Store does the LIST operation as before.
 - [#3257](https://github.com/thanos-io/thanos/pull/3257) Ruler: Prevent Ruler from crashing when using default DNS to lookup hosts that results in "No such hosts" errors.
+- [#3276](https://github.com/thanos-io/thanos/pull/3276) Query Frontend: Support query splitting and retry for labels and series requests.
 
 ## [v0.16.0](https://github.com/thanos-io/thanos/releases) - Release in progress
 

diff --git a/cmd/thanos/query_frontend.go b/cmd/thanos/query_frontend.go
@@ -42,33 +42,59 @@ func registerQueryFrontend(app *extkingpin.App) {
 	cmd := app.Command(comp.String(), "query frontend")
 	cfg := &queryFrontendConfig{
 		Config: queryfrontend.Config{
-			CortexFrontendConfig:     &cortexfrontend.Config{},
-			CortexLimits:             &cortexvalidation.Limits{},
-			CortexResultsCacheConfig: &queryrange.ResultsCacheConfig{},
+			CortexFrontendConfig: &cortexfrontend.Config{},
+			QueryRangeConfig: queryfrontend.QueryRangeConfig{
+				Limits:             &cortexvalidation.Limits{},
+				ResultsCacheConfig: &queryrange.ResultsCacheConfig{},
+			},
+			LabelsConfig: queryfrontend.LabelsConfig{
+				Limits:             &cortexvalidation.Limits{},
+				ResultsCacheConfig: &queryrange.ResultsCacheConfig{},
+			},
 		},
 	}
 
 	cfg.http.registerFlag(cmd)
 
-	cmd.Flag("query-range.split-interval", "Split queries by an interval and execute in parallel, it should be greater than 0 when response-cache-config is configured.").
-		Default("24h").DurationVar(&cfg.SplitQueriesByInterval)
+	// Query range tripperware flags.
+	cmd.Flag("query-range.split-interval", "Split query range requests by an interval and execute in parallel, it should be greater than 0 when query-range.response-cache-config is configured.").
+		Default("24h").DurationVar(&cfg.QueryRangeConfig.SplitQueriesByInterval)
 
-	cmd.Flag("query-range.max-retries-per-request", "Maximum number of retries for a single request; beyond this, the downstream error is returned.").
-		Default("5").IntVar(&cfg.MaxRetries)
+	cmd.Flag("query-range.max-retries-per-request", "Maximum number of retries for a single query range request; beyond this, the downstream error is returned.").
+		Default("5").IntVar(&cfg.QueryRangeConfig.MaxRetries)
 
 	cmd.Flag("query-range.max-query-length", "Limit the query time range (end - start time) in the query-frontend, 0 disables it.").
-		Default("0").DurationVar(&cfg.CortexLimits.MaxQueryLength)
+		Default("0").DurationVar(&cfg.QueryRangeConfig.Limits.MaxQueryLength)
 
-	cmd.Flag("query-range.max-query-parallelism", "Maximum number of queries will be scheduled in parallel by the Frontend.").
-		Default("14").IntVar(&cfg.CortexLimits.MaxQueryParallelism)
+	cmd.Flag("query-range.max-query-parallelism", "Maximum number of query range requests will be scheduled in parallel by the Frontend.").
+		Default("14").IntVar(&cfg.QueryRangeConfig.Limits.MaxQueryParallelism)
 
-	cmd.Flag("query-range.response-cache-max-freshness", "Most recent allowed cacheable result, to prevent caching very recent results that might still be in flux.").
-		Default("1m").DurationVar(&cfg.CortexLimits.MaxCacheFreshness)
+	cmd.Flag("query-range.response-cache-max-freshness", "Most recent allowed cacheable result for query range requests, to prevent caching very recent results that might still be in flux.").
+		Default("1m").DurationVar(&cfg.QueryRangeConfig.Limits.MaxCacheFreshness)
 
-	cmd.Flag("query-range.partial-response", "Enable partial response for queries if no partial_response param is specified. --no-query-range.partial-response for disabling.").
-		Default("true").BoolVar(&cfg.PartialResponseStrategy)
+	cmd.Flag("query-range.partial-response", "Enable partial response for query range requests if no partial_response param is specified. --no-query-range.partial-response for disabling.").
+		Default("true").BoolVar(&cfg.QueryRangeConfig.PartialResponseStrategy)
 
-	cfg.CachePathOrContent = *extflag.RegisterPathOrContent(cmd, "query-range.response-cache-config", "YAML file that contains response cache configuration.", false)
+	cfg.QueryRangeConfig.CachePathOrContent = *extflag.RegisterPathOrContent(cmd, "query-range.response-cache-config", "YAML file that contains response cache configuration.", false)
+
+	// Labels tripperware flags.
+	cmd.Flag("labels.split-interval", "Split labels requests by an interval and execute in parallel, it should be greater than 0 when labels.response-cache-config is configured.").
+		Default("24h").DurationVar(&cfg.LabelsConfig.SplitQueriesByInterval)
+
+	cmd.Flag("labels.max-retries-per-request", "Maximum number of retries for a single label/series API request; beyond this, the downstream error is returned.").
+		Default("5").IntVar(&cfg.LabelsConfig.MaxRetries)
+
+	cmd.Flag("labels.max-query-parallelism", "Maximum number of labels requests will be scheduled in parallel by the Frontend.").
+		Default("14").IntVar(&cfg.LabelsConfig.Limits.MaxQueryParallelism)
+
+	cmd.Flag("labels.response-cache-max-freshness", "Most recent allowed cacheable result for labels requests, to prevent caching very recent results that might still be in flux.").
+		Default("1m").DurationVar(&cfg.LabelsConfig.Limits.MaxCacheFreshness)
+
+	cmd.Flag("labels.partial-response", "Enable partial response for labels requests if no partial_response param is specified. --no-labels.partial-response for disabling.").
+		Default("true").BoolVar(&cfg.LabelsConfig.PartialResponseStrategy)
+
+	cmd.Flag("labels.default-time-range", "The default metadata time range duration for retrieving labels through Labels and Series API when the range parameters are not specified.").
+		Default("24h").DurationVar(&cfg.DefaultTimeRange)
 
 	cmd.Flag("cache-compression-type", "Use compression in results cache. Supported values are: 'snappy' and '' (disable compression).").
 		Default("").StringVar(&cfg.CacheCompression)
@@ -97,20 +123,16 @@ func runQueryFrontend(
 	cfg *queryFrontendConfig,
 	comp component.Component,
 ) error {
-	cacheConfContentYaml, err := cfg.CachePathOrContent.Content()
+	queryRangeCacheConfContentYaml, err := cfg.QueryRangeConfig.CachePathOrContent.Content()
 	if err != nil {
 		return err
 	}
-	if len(cacheConfContentYaml) > 0 {
-		cacheConfig, err := queryfrontend.NewCacheConfig(logger, cacheConfContentYaml)
+	if len(queryRangeCacheConfContentYaml) > 0 {
+		cacheConfig, err := queryfrontend.NewCacheConfig(logger, queryRangeCacheConfContentYaml)
 		if err != nil {
-			return errors.Wrap(err, "initializing the query frontend config")
-		}
-		if cfg.CortexResultsCacheConfig.CacheConfig.Memcache.Expiration == 0 {
-			level.Warn(logger).Log("msg", "memcached cache valid time set to 0, so using a default of 24 hours expiration time")
-			cfg.CortexResultsCacheConfig.CacheConfig.Memcache.Expiration = 24 * time.Hour
+			return errors.Wrap(err, "initializing the query range cache config")
 		}
-		cfg.CortexResultsCacheConfig = &queryrange.ResultsCacheConfig{
+		cfg.QueryRangeConfig.ResultsCacheConfig = &queryrange.ResultsCacheConfig{
 			Compression: cfg.CacheCompression,
 			CacheConfig: *cacheConfig,
 		}
@@ -128,7 +150,7 @@ func runQueryFrontend(
 
 	tripperWare, err := queryfrontend.NewTripperware(cfg.Config, reg, logger)
 	if err != nil {
-		return errors.Wrap(err, "setup query range middlewares")
+		return errors.Wrap(err, "setup tripperwares")
 	}
 
 	fe.Wrap(tripperWare)

diff --git a/docs/components/query-frontend.md b/docs/components/query-frontend.md
@@ -107,71 +107,99 @@ usage: thanos query-frontend [<flags>]
 query frontend
 
 Flags:
-  -h, --help                  Show context-sensitive help (also try --help-long
-                              and --help-man).
-      --version               Show application version.
-      --log.level=info        Log filtering level.
-      --log.format=logfmt     Log format to use. Possible options: logfmt or
-                              json.
+  -h, --help                     Show context-sensitive help (also try
+                                 --help-long and --help-man).
+      --version                  Show application version.
+      --log.level=info           Log filtering level.
+      --log.format=logfmt        Log format to use. Possible options: logfmt or
+                                 json.
       --tracing.config-file=<file-path>
-                              Path to YAML file with tracing configuration. See
-                              format details:
-                              https://thanos.io/tip/thanos/tracing.md/#configuration
+                                 Path to YAML file with tracing configuration.
+                                 See format details:
+                                 https://thanos.io/tip/thanos/tracing.md/#configuration
       --tracing.config=<content>
-                              Alternative to 'tracing.config-file' flag (lower
-                              priority). Content of YAML file with tracing
-                              configuration. See format details:
-                              https://thanos.io/tip/thanos/tracing.md/#configuration
+                                 Alternative to 'tracing.config-file' flag
+                                 (lower priority). Content of YAML file with
+                                 tracing configuration. See format details:
+                                 https://thanos.io/tip/thanos/tracing.md/#configuration
       --http-address="0.0.0.0:10902"
-                              Listen host:port for HTTP endpoints.
-      --http-grace-period=2m  Time to wait after an interrupt received for HTTP
-                              Server.
+                                 Listen host:port for HTTP endpoints.
+      --http-grace-period=2m     Time to wait after an interrupt received for
+                                 HTTP Server.
       --query-range.split-interval=24h
-                              Split queries by an interval and execute in
-                              parallel, it should be greater than 0 when
-                              response-cache-config is configured.
+                                 Split query range requests by an interval and
+                                 execute in parallel, it should be greater than
+                                 0 when query-range.response-cache-config is
+                                 configured.
       --query-range.max-retries-per-request=5
-                              Maximum number of retries for a single request;
-                              beyond this, the downstream error is returned.
+                                 Maximum number of retries for a single query
+                                 range request; beyond this, the downstream
+                                 error is returned.
       --query-range.max-query-length=0
-                              Limit the query time range (end - start time) in
-                              the query-frontend, 0 disables it.
+                                 Limit the query time range (end - start time)
+                                 in the query-frontend, 0 disables it.
       --query-range.max-query-parallelism=14
-                              Maximum number of queries will be scheduled in
-                              parallel by the Frontend.
+                                 Maximum number of query range requests will be
+                                 scheduled in parallel by the Frontend.
       --query-range.response-cache-max-freshness=1m
-                              Most recent allowed cacheable result, to prevent
-                              caching very recent results that might still be in
-                              flux.
+                                 Most recent allowed cacheable result for query
+                                 range requests, to prevent caching very recent
+                                 results that might still be in flux.
       --query-range.partial-response
-                              Enable partial response for queries if no
-                              partial_response param is specified.
-                              --no-query-range.partial-response for disabling.
+                                 Enable partial response for query range
+                                 requests if no partial_response param is
+                                 specified. --no-query-range.partial-response
+                                 for disabling.
       --query-range.response-cache-config-file=<file-path>
-                              Path to YAML file that contains response cache
-                              configuration.
+                                 Path to YAML file that contains response cache
+                                 configuration.
       --query-range.response-cache-config=<content>
-                              Alternative to
-                              'query-range.response-cache-config-file' flag
-                              (lower priority). Content of YAML file that
-                              contains response cache configuration.
+                                 Alternative to
+                                 'query-range.response-cache-config-file' flag
+                                 (lower priority). Content of YAML file that
+                                 contains response cache configuration.
+      --labels.split-interval=24h
+                                 Split labels requests by an interval and
+                                 execute in parallel, it should be greater than
+                                 0 when labels.response-cache-config is
+                                 configured.
+      --labels.max-retries-per-request=5
+                                 Maximum number of retries for a single
+                                 label/series API request; beyond this, the
+                                 downstream error is returned.
+      --labels.max-query-parallelism=14
+                                 Maximum number of labels requests will be
+                                 scheduled in parallel by the Frontend.
+      --labels.response-cache-max-freshness=1m
+                                 Most recent allowed cacheable result for labels
+                                 requests, to prevent caching very recent
+                                 results that might still be in flux.
+      --labels.partial-response  Enable partial response for labels requests if
+                                 no partial_response param is specified.
+                                 --no-labels.partial-response for disabling.
+      --labels.default-time-range=24h
+                                 The default metadata time range duration for
+                                 retrieving labels through Labels and Series API
+                                 when the range parameters are not specified.
       --cache-compression-type=""
-                              Use compression in results cache. Supported values
-                              are: 'snappy' and '' (disable compression).
+                                 Use compression in results cache. Supported
+                                 values are: 'snappy' and '' (disable
+                                 compression).
       --query-frontend.downstream-url="http://localhost:9090"
-                              URL of downstream Prometheus Query compatible API.
+                                 URL of downstream Prometheus Query compatible
+                                 API.
       --query-frontend.compress-responses
-                              Compress HTTP responses.
+                                 Compress HTTP responses.
       --query-frontend.log-queries-longer-than=0
-                              Log queries that are slower than the specified
-                              duration. Set to 0 to disable. Set to < 0 to
-                              enable on all queries.
+                                 Log queries that are slower than the specified
+                                 duration. Set to 0 to disable. Set to < 0 to
+                                 enable on all queries.
       --log.request.decision=LogFinishCall
-                              Request Logging for logging the start and end of
-                              requests. LogFinishCall is enabled by default.
-                              LogFinishCall : Logs the finish call of the
-                              requests. LogStartAndFinishCall : Logs the start
-                              and finish call of the requests. NoLogCall :
-                              Disable request logging.
+                                 Request Logging for logging the start and end
+                                 of requests. LogFinishCall is enabled by
+                                 default. LogFinishCall : Logs the finish call
+                                 of the requests. LogStartAndFinishCall : Logs
+                                 the start and finish call of the requests.
+                                 NoLogCall : Disable request logging.
 
 ```
diff --git a/go.mod b/go.mod
@@ -44,11 +44,11 @@ require (
 	github.com/opentracing/opentracing-go v1.2.0
 	github.com/pkg/errors v0.9.1
 	github.com/pmezard/go-difflib v1.0.0
-	github.com/prometheus/alertmanager v0.21.0
+	github.com/prometheus/alertmanager v0.21.1-0.20200911160112-1fdff6b3f939
 	github.com/prometheus/client_golang v1.7.1
 	github.com/prometheus/client_model v0.2.0
-	github.com/prometheus/common v0.13.0
-	github.com/prometheus/prometheus v1.8.2-0.20200819132913-cb830b0a9c78
+	github.com/prometheus/common v0.14.0
+	github.com/prometheus/prometheus v1.8.2-0.20200923143134-7e2db3d092f3
 	github.com/uber/jaeger-client-go v2.25.0+incompatible
 	github.com/uber/jaeger-lib v2.2.0+incompatible
 	github.com/weaveworks/common v0.0.0-20200914083218-61ffdd448099