From b74717a9650345cc2448d9e2d715635242c9c815 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 12 Mar 2019 15:30:22 +0100 Subject: [PATCH 01/43] *: Add initial remote-write-receive component (#811) --- cmd/thanos/flags.go | 26 +- cmd/thanos/main.go | 1 + cmd/thanos/receive.go | 231 ++++++++++++++ docs/components/query.md | 12 +- docs/components/rule.md | 12 +- docs/components/sidecar.md | 12 +- docs/components/store.md | 12 +- go.mod | 2 + go.sum | 1 + pkg/component/component.go | 3 + pkg/receive/handler.go | 196 ++++++++++++ pkg/receive/writer.go | 57 ++++ pkg/store/prompb/remote.pb.go | 552 +++++++++++++++++++++++++++++++--- pkg/store/prompb/remote.proto | 12 + pkg/store/storepb/rpc.pb.go | 117 +++---- pkg/store/storepb/rpc.proto | 1 + scripts/quickstart.sh | 60 +++- test/e2e/query_test.go | 65 +++- test/e2e/spinup_test.go | 41 ++- 19 files changed, 1261 insertions(+), 152 deletions(-) create mode 100644 cmd/thanos/receive.go create mode 100644 pkg/receive/handler.go create mode 100644 pkg/receive/writer.go diff --git a/cmd/thanos/flags.go b/cmd/thanos/flags.go index cd3edd8271..db370ed97c 100644 --- a/cmd/thanos/flags.go +++ b/cmd/thanos/flags.go @@ -17,25 +17,37 @@ import ( "gopkg.in/alecthomas/kingpin.v2" ) -func regCommonServerFlags(cmd *kingpin.CmdClause) ( +func regGRPCFlags(cmd *kingpin.CmdClause) ( grpcBindAddr *string, - httpBindAddr *string, grpcTLSSrvCert *string, grpcTLSSrvKey *string, grpcTLSSrvClientCA *string, - peerFunc func(log.Logger, *prometheus.Registry, bool, string, bool) (cluster.Peer, error)) { - +) { grpcBindAddr = cmd.Flag("grpc-address", "Listen ip:port address for gRPC endpoints (StoreAPI). Make sure this address is routable from other components if you use gossip, 'grpc-advertise-address' is empty and you require cross-node connection."). Default("0.0.0.0:10901").String() - grpcAdvertiseAddr := cmd.Flag("grpc-advertise-address", "Explicit (external) host:port address to advertise for gRPC StoreAPI in gossip cluster. If empty, 'grpc-address' will be used."). - String() - grpcTLSSrvCert = cmd.Flag("grpc-server-tls-cert", "TLS Certificate for gRPC server, leave blank to disable TLS").Default("").String() grpcTLSSrvKey = cmd.Flag("grpc-server-tls-key", "TLS Key for the gRPC server, leave blank to disable TLS").Default("").String() grpcTLSSrvClientCA = cmd.Flag("grpc-server-tls-client-ca", "TLS CA to verify clients against. If no client CA is specified, there is no client verification on server side. (tls.NoClientCert)").Default("").String() + return grpcBindAddr, + grpcTLSSrvCert, + grpcTLSSrvKey, + grpcTLSSrvClientCA +} + +func regCommonServerFlags(cmd *kingpin.CmdClause) ( + grpcBindAddr *string, + httpBindAddr *string, + grpcTLSSrvCert *string, + grpcTLSSrvKey *string, + grpcTLSSrvClientCA *string, + peerFunc func(log.Logger, *prometheus.Registry, bool, string, bool) (cluster.Peer, error)) { + httpBindAddr = regHTTPAddrFlag(cmd) + grpcBindAddr, grpcTLSSrvCert, grpcTLSSrvKey, grpcTLSSrvClientCA = regGRPCFlags(cmd) + grpcAdvertiseAddr := cmd.Flag("grpc-advertise-address", "Explicit (external) host:port address to advertise for gRPC StoreAPI in gossip cluster. If empty, 'grpc-address' will be used."). + String() clusterBindAddr := cmd.Flag("cluster.address", "Listen ip:port address for gossip cluster."). 
Default("0.0.0.0:10900").String() diff --git a/cmd/thanos/main.go b/cmd/thanos/main.go index 1be93b6b54..2dbbad88a3 100644 --- a/cmd/thanos/main.go +++ b/cmd/thanos/main.go @@ -78,6 +78,7 @@ func main() { registerCompact(cmds, app, "compact") registerBucket(cmds, app, "bucket") registerDownsample(cmds, app, "downsample") + registerReceive(cmds, app, "receive") cmd, err := app.Parse(os.Args[1:]) if err != nil { diff --git a/cmd/thanos/receive.go b/cmd/thanos/receive.go new file mode 100644 index 0000000000..63a3ba9c36 --- /dev/null +++ b/cmd/thanos/receive.go @@ -0,0 +1,231 @@ +package main + +import ( + "context" + "fmt" + "net" + "sync" + "time" + + "github.com/go-kit/kit/log" + "github.com/go-kit/kit/log/level" + "github.com/improbable-eng/thanos/pkg/component" + "github.com/improbable-eng/thanos/pkg/receive" + "github.com/improbable-eng/thanos/pkg/runutil" + "github.com/improbable-eng/thanos/pkg/store" + "github.com/improbable-eng/thanos/pkg/store/storepb" + "github.com/oklog/run" + opentracing "github.com/opentracing/opentracing-go" + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/storage/tsdb" + "google.golang.org/grpc" + kingpin "gopkg.in/alecthomas/kingpin.v2" +) + +func registerReceive(m map[string]setupFunc, app *kingpin.Application, name string) { + cmd := app.Command(name, "Accept Prometheus remote write API requests and write to local tsdb (EXPERIMENTAL, this may change drastically without notice)") + + grpcBindAddr, cert, key, clientCA := regGRPCFlags(cmd) + httpMetricsBindAddr := regHTTPAddrFlag(cmd) + + remoteWriteAddress := cmd.Flag("remote-write.address", "Address to listen on for remote write requests."). + Default("0.0.0.0:19291").String() + + dataDir := cmd.Flag("tsdb.path", "Data directory of TSDB."). + Default("./data").String() + + m[name] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ bool) error { + return runReceive( + g, + logger, + reg, + tracer, + *grpcBindAddr, + *cert, + *key, + *clientCA, + *httpMetricsBindAddr, + *remoteWriteAddress, + *dataDir, + ) + } +} + +func runReceive( + g *run.Group, + logger log.Logger, + reg *prometheus.Registry, + tracer opentracing.Tracer, + grpcBindAddr string, + cert string, + key string, + clientCA string, + httpMetricsBindAddr string, + remoteWriteAddress string, + dataDir string, +) error { + logger = log.With(logger, "component", "receive") + level.Warn(logger).Log("msg", "setting up receive; the Thanos receive component is EXPERIMENTAL, it may break significantly without notice") + + tsdbCfg := &tsdb.Options{ + Retention: model.Duration(time.Hour * 24 * 15), + NoLockfile: true, + MinBlockDuration: model.Duration(time.Hour * 2), + MaxBlockDuration: model.Duration(time.Hour * 2), + } + + localStorage := &tsdb.ReadyStorage{} + receiver := receive.NewWriter(log.With(logger, "component", "receive-writer"), localStorage) + webHandler := receive.NewHandler(log.With(logger, "component", "receive-handler"), &receive.Options{ + Receiver: receiver, + ListenAddress: remoteWriteAddress, + Registry: reg, + ReadyStorage: localStorage, + }) + + // Start all components while we wait for TSDB to open but only load + // initial config and mark ourselves as ready after it completed. + dbOpen := make(chan struct{}) + + // sync.Once is used to make sure we can close the channel at different execution stages(SIGTERM or when the config is loaded). 
+ type closeOnce struct { + C chan struct{} + once sync.Once + Close func() + } + // Wait until the server is ready to handle reloading. + reloadReady := &closeOnce{ + C: make(chan struct{}), + } + reloadReady.Close = func() { + reloadReady.once.Do(func() { + close(reloadReady.C) + }) + } + + level.Debug(logger).Log("msg", "setting up endpoint readiness") + { + // Initial configuration loading. + cancel := make(chan struct{}) + g.Add( + func() error { + select { + case <-dbOpen: + break + case <-cancel: + reloadReady.Close() + return nil + } + + reloadReady.Close() + + webHandler.Ready() + level.Info(logger).Log("msg", "server is ready to receive web requests.") + <-cancel + return nil + }, + func(err error) { + close(cancel) + }, + ) + } + + level.Debug(logger).Log("msg", "setting up tsdb") + { + // TSDB. + cancel := make(chan struct{}) + g.Add( + func() error { + level.Info(logger).Log("msg", "starting TSDB ...") + db, err := tsdb.Open( + dataDir, + log.With(logger, "component", "tsdb"), + reg, + tsdbCfg, + ) + if err != nil { + return fmt.Errorf("opening storage failed: %s", err) + } + level.Info(logger).Log("msg", "tsdb started") + + startTimeMargin := int64(2 * time.Duration(tsdbCfg.MinBlockDuration).Seconds() * 1000) + localStorage.Set(db, startTimeMargin) + close(dbOpen) + <-cancel + return nil + }, + func(err error) { + if err := localStorage.Close(); err != nil { + level.Error(logger).Log("msg", "error stopping storage", "err", err) + } + close(cancel) + }, + ) + } + + level.Debug(logger).Log("msg", "setting up metric http listen-group") + if err := metricHTTPListenGroup(g, logger, reg, httpMetricsBindAddr); err != nil { + return err + } + + level.Debug(logger).Log("msg", "setting up grpc server") + { + var ( + s *grpc.Server + l net.Listener + err error + ) + g.Add(func() error { + select { + case <-dbOpen: + break + } + + l, err = net.Listen("tcp", grpcBindAddr) + if err != nil { + return errors.Wrap(err, "listen API address") + } + + db := localStorage.Get() + tsdbStore := store.NewTSDBStore(log.With(logger, "component", "thanos-tsdb-store"), reg, db, component.Receive, nil) + + opts, err := defaultGRPCServerOpts(logger, reg, tracer, cert, key, clientCA) + if err != nil { + return errors.Wrap(err, "setup gRPC server") + } + s = grpc.NewServer(opts...) + storepb.RegisterStoreServer(s, tsdbStore) + + level.Info(logger).Log("msg", "listening for StoreAPI gRPC", "address", grpcBindAddr) + return errors.Wrap(s.Serve(l), "serve gRPC") + }, func(error) { + if s != nil { + s.Stop() + } + if l != nil { + runutil.CloseWithLogOnErr(logger, l, "store gRPC listener") + } + }) + } + + level.Debug(logger).Log("msg", "setting up receive http handler") + { + ctx, cancel := context.WithCancel(context.Background()) + g.Add( + func() error { + if err := webHandler.Run(ctx); err != nil { + return fmt.Errorf("error starting web server: %s", err) + } + return nil + }, + func(err error) { + cancel() + }, + ) + } + level.Info(logger).Log("msg", "starting receiver") + + return nil +} diff --git a/docs/components/query.md b/docs/components/query.md index 9638dacd34..f79960c10b 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -148,16 +148,14 @@ Flags: If 0 no trace will be sent periodically, unless forced by baggage item. See `pkg/tracing/tracing.go` for details. + --http-address="0.0.0.0:10902" + Listen host:port for HTTP endpoints. --grpc-address="0.0.0.0:10901" Listen ip:port address for gRPC endpoints (StoreAPI). 
Make sure this address is routable from other components if you use gossip, 'grpc-advertise-address' is empty and you require cross-node connection. - --grpc-advertise-address=GRPC-ADVERTISE-ADDRESS - Explicit (external) host:port address to - advertise for gRPC StoreAPI in gossip cluster. - If empty, 'grpc-address' will be used. --grpc-server-tls-cert="" TLS Certificate for gRPC server, leave blank to disable TLS --grpc-server-tls-key="" TLS Key for the gRPC server, leave blank to @@ -166,8 +164,10 @@ Flags: TLS CA to verify clients against. If no client CA is specified, there is no client verification on server side. (tls.NoClientCert) - --http-address="0.0.0.0:10902" - Listen host:port for HTTP endpoints. + --grpc-advertise-address=GRPC-ADVERTISE-ADDRESS + Explicit (external) host:port address to + advertise for gRPC StoreAPI in gossip cluster. + If empty, 'grpc-address' will be used. --cluster.address="0.0.0.0:10900" Listen ip:port address for gossip cluster. --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS diff --git a/docs/components/rule.md b/docs/components/rule.md index 521496a707..be35f0d6e6 100644 --- a/docs/components/rule.md +++ b/docs/components/rule.md @@ -55,16 +55,14 @@ Flags: If 0 no trace will be sent periodically, unless forced by baggage item. See `pkg/tracing/tracing.go` for details. + --http-address="0.0.0.0:10902" + Listen host:port for HTTP endpoints. --grpc-address="0.0.0.0:10901" Listen ip:port address for gRPC endpoints (StoreAPI). Make sure this address is routable from other components if you use gossip, 'grpc-advertise-address' is empty and you require cross-node connection. - --grpc-advertise-address=GRPC-ADVERTISE-ADDRESS - Explicit (external) host:port address to - advertise for gRPC StoreAPI in gossip cluster. - If empty, 'grpc-address' will be used. --grpc-server-tls-cert="" TLS Certificate for gRPC server, leave blank to disable TLS --grpc-server-tls-key="" TLS Key for the gRPC server, leave blank to @@ -73,8 +71,10 @@ Flags: TLS CA to verify clients against. If no client CA is specified, there is no client verification on server side. (tls.NoClientCert) - --http-address="0.0.0.0:10902" - Listen host:port for HTTP endpoints. + --grpc-advertise-address=GRPC-ADVERTISE-ADDRESS + Explicit (external) host:port address to + advertise for gRPC StoreAPI in gossip cluster. + If empty, 'grpc-address' will be used. --cluster.address="0.0.0.0:10900" Listen ip:port address for gossip cluster. --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS diff --git a/docs/components/sidecar.md b/docs/components/sidecar.md index f5a8d47232..fdb1955205 100644 --- a/docs/components/sidecar.md +++ b/docs/components/sidecar.md @@ -59,16 +59,14 @@ Flags: If 0 no trace will be sent periodically, unless forced by baggage item. See `pkg/tracing/tracing.go` for details. + --http-address="0.0.0.0:10902" + Listen host:port for HTTP endpoints. --grpc-address="0.0.0.0:10901" Listen ip:port address for gRPC endpoints (StoreAPI). Make sure this address is routable from other components if you use gossip, 'grpc-advertise-address' is empty and you require cross-node connection. - --grpc-advertise-address=GRPC-ADVERTISE-ADDRESS - Explicit (external) host:port address to - advertise for gRPC StoreAPI in gossip cluster. - If empty, 'grpc-address' will be used. --grpc-server-tls-cert="" TLS Certificate for gRPC server, leave blank to disable TLS --grpc-server-tls-key="" TLS Key for the gRPC server, leave blank to @@ -77,8 +75,10 @@ Flags: TLS CA to verify clients against. 
If no client CA is specified, there is no client verification on server side. (tls.NoClientCert) - --http-address="0.0.0.0:10902" - Listen host:port for HTTP endpoints. + --grpc-advertise-address=GRPC-ADVERTISE-ADDRESS + Explicit (external) host:port address to + advertise for gRPC StoreAPI in gossip cluster. + If empty, 'grpc-address' will be used. --cluster.address="0.0.0.0:10900" Listen ip:port address for gossip cluster. --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS diff --git a/docs/components/store.md b/docs/components/store.md index e32aa11d16..840795ac18 100644 --- a/docs/components/store.md +++ b/docs/components/store.md @@ -44,16 +44,14 @@ Flags: If 0 no trace will be sent periodically, unless forced by baggage item. See `pkg/tracing/tracing.go` for details. + --http-address="0.0.0.0:10902" + Listen host:port for HTTP endpoints. --grpc-address="0.0.0.0:10901" Listen ip:port address for gRPC endpoints (StoreAPI). Make sure this address is routable from other components if you use gossip, 'grpc-advertise-address' is empty and you require cross-node connection. - --grpc-advertise-address=GRPC-ADVERTISE-ADDRESS - Explicit (external) host:port address to - advertise for gRPC StoreAPI in gossip cluster. - If empty, 'grpc-address' will be used. --grpc-server-tls-cert="" TLS Certificate for gRPC server, leave blank to disable TLS --grpc-server-tls-key="" TLS Key for the gRPC server, leave blank to @@ -62,8 +60,10 @@ Flags: TLS CA to verify clients against. If no client CA is specified, there is no client verification on server side. (tls.NoClientCert) - --http-address="0.0.0.0:10902" - Listen host:port for HTTP endpoints. + --grpc-advertise-address=GRPC-ADVERTISE-ADDRESS + Explicit (external) host:port address to + advertise for gRPC StoreAPI in gossip cluster. + If empty, 'grpc-address' will be used. --cluster.address="0.0.0.0:10900" Listen ip:port address for gossip cluster. 
--cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS diff --git a/go.mod b/go.mod index a591df38e4..9932f12049 100644 --- a/go.mod +++ b/go.mod @@ -24,9 +24,11 @@ require ( github.com/miekg/dns v1.0.8 // indirect github.com/minio/minio-go v0.0.0-20190131015406-c8a261de75c1 github.com/mozillazg/go-cos v0.11.0 + github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223 github.com/oklog/run v1.0.0 github.com/oklog/ulid v1.3.1 github.com/olekukonko/tablewriter v0.0.1 + github.com/opentracing-contrib/go-stdlib v0.0.0-20170113013457-1de4cc2120e7 github.com/opentracing/basictracer-go v1.0.0 github.com/opentracing/opentracing-go v1.0.2 github.com/pkg/errors v0.8.1 diff --git a/go.sum b/go.sum index cd5e15a797..645506c5e0 100644 --- a/go.sum +++ b/go.sum @@ -204,6 +204,7 @@ github.com/onsi/ginkgo v1.6.0 h1:Ix8l273rp3QzYgXSR+c8d1fTG7UPgYkOSELPhiY/YGw= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/gomega v1.4.1 h1:PZSj/UFNaVp3KxrzHOcS7oyuWA7LoOY/77yCTEFu21U= github.com/onsi/gomega v1.4.1/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA= +github.com/opentracing-contrib/go-stdlib v0.0.0-20170113013457-1de4cc2120e7 h1:8KbikWulLUcMM96hBxjgoo6gTmCkG6HYSDohv/WygYU= github.com/opentracing-contrib/go-stdlib v0.0.0-20170113013457-1de4cc2120e7/go.mod h1:PLldrQSroqzH70Xl+1DQcGnefIbqsKR7UDaiux3zV+w= github.com/opentracing/basictracer-go v1.0.0 h1:YyUAhaEfjoWXclZVJ9sGoNct7j4TVk7lZWlQw5UXuoo= github.com/opentracing/basictracer-go v1.0.0/go.mod h1:QfBfYuafItcjQuMwinw9GhYKwFXS9KnPs5lxoYwgW74= diff --git a/pkg/component/component.go b/pkg/component/component.go index 368de1f60c..690afa1ba5 100644 --- a/pkg/component/component.go +++ b/pkg/component/component.go @@ -71,6 +71,8 @@ func FromProto(storeType storepb.StoreType) StoreAPI { return Sidecar case storepb.StoreType_STORE: return Store + case storepb.StoreType_RECEIVE: + return Receive default: return nil } @@ -84,4 +86,5 @@ var ( Rule = sourceStoreAPI{component: component{name: "rule"}} Sidecar = sourceStoreAPI{component: component{name: "sidecar"}} Store = sourceStoreAPI{component: component{name: "store"}} + Receive = sourceStoreAPI{component: component{name: "receive"}} ) diff --git a/pkg/receive/handler.go b/pkg/receive/handler.go new file mode 100644 index 0000000000..2ca1f14ce1 --- /dev/null +++ b/pkg/receive/handler.go @@ -0,0 +1,196 @@ +package receive + +import ( + "context" + "fmt" + "io/ioutil" + stdlog "log" + "net" + "net/http" + "sync/atomic" + + "github.com/go-kit/kit/log" + "github.com/go-kit/kit/log/level" + "github.com/gogo/protobuf/proto" + "github.com/golang/snappy" + "github.com/improbable-eng/thanos/pkg/runutil" + "github.com/improbable-eng/thanos/pkg/store/prompb" + conntrack "github.com/mwitkow/go-conntrack" + "github.com/oklog/run" + "github.com/opentracing-contrib/go-stdlib/nethttp" + opentracing "github.com/opentracing/opentracing-go" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/prometheus/common/route" + promtsdb "github.com/prometheus/prometheus/storage/tsdb" +) + +var ( + requestDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "thanos_http_request_duration_seconds", + Help: "Histogram of latencies for HTTP requests.", + Buckets: []float64{.1, .2, .4, 1, 3, 8, 20, 60, 120}, + }, + []string{"handler"}, + ) + responseSize = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "thanos_http_response_size_bytes", + Help: "Histogram of response size for 
HTTP requests.", + Buckets: prometheus.ExponentialBuckets(100, 10, 8), + }, + []string{"handler"}, + ) +) + +// Options for the web Handler. +type Options struct { + Receiver *Writer + ListenAddress string + Registry prometheus.Registerer + ReadyStorage *promtsdb.ReadyStorage +} + +// Handler serves a Prometheus remote write receiving HTTP endpoint. +type Handler struct { + readyStorage *promtsdb.ReadyStorage + logger log.Logger + receiver *Writer + router *route.Router + options *Options + quitCh chan struct{} + + ready uint32 // ready is uint32 rather than boolean to be able to use atomic functions. +} + +func instrumentHandler(handlerName string, handler http.HandlerFunc) http.HandlerFunc { + return promhttp.InstrumentHandlerDuration( + requestDuration.MustCurryWith(prometheus.Labels{"handler": handlerName}), + promhttp.InstrumentHandlerResponseSize( + responseSize.MustCurryWith(prometheus.Labels{"handler": handlerName}), + handler, + ), + ) +} + +func NewHandler(logger log.Logger, o *Options) *Handler { + router := route.New().WithInstrumentation(instrumentHandler) + if logger == nil { + logger = log.NewNopLogger() + } + + h := &Handler{ + logger: logger, + router: router, + readyStorage: o.ReadyStorage, + receiver: o.Receiver, + options: o, + quitCh: make(chan struct{}), + } + + readyf := h.testReady + router.Post("/api/v1/receive", readyf(h.receive)) + + return h +} + +// Ready sets Handler to be ready. +func (h *Handler) Ready() { + atomic.StoreUint32(&h.ready, 1) +} + +// Verifies whether the server is ready or not. +func (h *Handler) isReady() bool { + ready := atomic.LoadUint32(&h.ready) + return ready > 0 +} + +// Checks if server is ready, calls f if it is, returns 503 if it is not. +func (h *Handler) testReady(f http.HandlerFunc) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + if h.isReady() { + f(w, r) + return + } + + w.WriteHeader(http.StatusServiceUnavailable) + _, err := fmt.Fprintf(w, "Service Unavailable") + if err != nil { + h.logger.Log("msg", "failed to write to response body", "err", err) + } + } +} + +// Quit returns the receive-only quit channel. +func (h *Handler) Quit() <-chan struct{} { + return h.quitCh +} + +// Checks if server is ready, calls f if it is, returns 503 if it is not. +func (h *Handler) testReadyHandler(f http.Handler) http.HandlerFunc { + return h.testReady(f.ServeHTTP) +} + +// Run serves the HTTP endpoints. +func (h *Handler) Run(ctx context.Context) error { + level.Info(h.logger).Log("msg", "Start listening for connections", "address", h.options.ListenAddress) + + listener, err := net.Listen("tcp", h.options.ListenAddress) + if err != nil { + return err + } + + // Monitor incoming connections with conntrack. 
+ listener = conntrack.NewListener(listener, + conntrack.TrackWithName("http"), + conntrack.TrackWithTracing()) + + operationName := nethttp.OperationNameFunc(func(r *http.Request) string { + return fmt.Sprintf("%s %s", r.Method, r.URL.Path) + }) + mux := http.NewServeMux() + mux.Handle("/", h.router) + + errlog := stdlog.New(log.NewStdlibAdapter(level.Error(h.logger)), "", 0) + + httpSrv := &http.Server{ + Handler: nethttp.Middleware(opentracing.GlobalTracer(), mux, operationName), + ErrorLog: errlog, + } + + var g run.Group + g.Add(func() error { + return httpSrv.Serve(listener) + }, func(error) { + runutil.CloseWithLogOnErr(h.logger, listener, "receive HTTP listener") + }) + + return g.Run() +} + +func (h *Handler) receive(w http.ResponseWriter, req *http.Request) { + compressed, err := ioutil.ReadAll(req.Body) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + reqBuf, err := snappy.Decode(nil, compressed) + if err != nil { + level.Error(h.logger).Log("msg", "snappy decode error", "err", err) + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + var wreq prompb.WriteRequest + if err := proto.Unmarshal(reqBuf, &wreq); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + if err := h.receiver.Receive(&wreq); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } +} diff --git a/pkg/receive/writer.go b/pkg/receive/writer.go new file mode 100644 index 0000000000..a4e6c1d7d2 --- /dev/null +++ b/pkg/receive/writer.go @@ -0,0 +1,57 @@ +package receive + +import ( + "github.com/go-kit/kit/log" + "github.com/improbable-eng/thanos/pkg/store/prompb" + "github.com/pkg/errors" + + "github.com/prometheus/prometheus/pkg/labels" + "github.com/prometheus/prometheus/storage" +) + +// Appendable returns an Appender. 
+type Appendable interface { + Appender() (storage.Appender, error) +} + +type Writer struct { + logger log.Logger + append Appendable +} + +func NewWriter(logger log.Logger, app Appendable) *Writer { + return &Writer{ + logger: logger, + append: app, + } +} + +func (r *Writer) Receive(wreq *prompb.WriteRequest) error { + app, err := r.append.Appender() + if err != nil { + return errors.Wrap(err, "failed to get appender") + } + + for _, t := range wreq.Timeseries { + lset := make(labels.Labels, len(t.Labels)) + for j := range t.Labels { + lset[j] = labels.Label{ + Name: t.Labels[j].Name, + Value: t.Labels[j].Value, + } + } + + for _, s := range t.Samples { + _, err = app.Add(lset, s.Timestamp, s.Value) + if err != nil { + return errors.Wrap(err, "failed to non-fast add") + } + } + } + + if err := app.Commit(); err != nil { + return errors.Wrap(err, "failed to commit") + } + + return nil +} diff --git a/pkg/store/prompb/remote.pb.go b/pkg/store/prompb/remote.pb.go index 590290c658..06a0952c98 100644 --- a/pkg/store/prompb/remote.pb.go +++ b/pkg/store/prompb/remote.pb.go @@ -49,9 +49,49 @@ func (x LabelMatcher_Type) String() string { return proto.EnumName(LabelMatcher_Type_name, int32(x)) } func (LabelMatcher_Type) EnumDescriptor() ([]byte, []int) { - return fileDescriptor_remote_5645ea049238b205, []int{7, 0} + return fileDescriptor_remote_930be8df34ca631b, []int{8, 0} } +type WriteRequest struct { + Timeseries []TimeSeries `protobuf:"bytes,1,rep,name=timeseries" json:"timeseries"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *WriteRequest) Reset() { *m = WriteRequest{} } +func (m *WriteRequest) String() string { return proto.CompactTextString(m) } +func (*WriteRequest) ProtoMessage() {} +func (*WriteRequest) Descriptor() ([]byte, []int) { + return fileDescriptor_remote_930be8df34ca631b, []int{0} +} +func (m *WriteRequest) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *WriteRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_WriteRequest.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalTo(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (dst *WriteRequest) XXX_Merge(src proto.Message) { + xxx_messageInfo_WriteRequest.Merge(dst, src) +} +func (m *WriteRequest) XXX_Size() int { + return m.Size() +} +func (m *WriteRequest) XXX_DiscardUnknown() { + xxx_messageInfo_WriteRequest.DiscardUnknown(m) +} + +var xxx_messageInfo_WriteRequest proto.InternalMessageInfo + type ReadRequest struct { Queries []Query `protobuf:"bytes,1,rep,name=queries" json:"queries"` XXX_NoUnkeyedLiteral struct{} `json:"-"` @@ -63,7 +103,7 @@ func (m *ReadRequest) Reset() { *m = ReadRequest{} } func (m *ReadRequest) String() string { return proto.CompactTextString(m) } func (*ReadRequest) ProtoMessage() {} func (*ReadRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_remote_5645ea049238b205, []int{0} + return fileDescriptor_remote_930be8df34ca631b, []int{1} } func (m *ReadRequest) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -104,7 +144,7 @@ func (m *ReadResponse) Reset() { *m = ReadResponse{} } func (m *ReadResponse) String() string { return proto.CompactTextString(m) } func (*ReadResponse) ProtoMessage() {} func (*ReadResponse) Descriptor() ([]byte, []int) { - return fileDescriptor_remote_5645ea049238b205, []int{1} + return fileDescriptor_remote_930be8df34ca631b, []int{2} } func 
(m *ReadResponse) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -137,6 +177,7 @@ type Query struct { StartTimestampMs int64 `protobuf:"varint,1,opt,name=start_timestamp_ms,json=startTimestampMs,proto3" json:"start_timestamp_ms,omitempty"` EndTimestampMs int64 `protobuf:"varint,2,opt,name=end_timestamp_ms,json=endTimestampMs,proto3" json:"end_timestamp_ms,omitempty"` Matchers []LabelMatcher `protobuf:"bytes,3,rep,name=matchers" json:"matchers"` + Hints *ReadHints `protobuf:"bytes,4,opt,name=hints" json:"hints,omitempty"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` @@ -146,7 +187,7 @@ func (m *Query) Reset() { *m = Query{} } func (m *Query) String() string { return proto.CompactTextString(m) } func (*Query) ProtoMessage() {} func (*Query) Descriptor() ([]byte, []int) { - return fileDescriptor_remote_5645ea049238b205, []int{2} + return fileDescriptor_remote_930be8df34ca631b, []int{3} } func (m *Query) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -186,7 +227,7 @@ func (m *QueryResult) Reset() { *m = QueryResult{} } func (m *QueryResult) String() string { return proto.CompactTextString(m) } func (*QueryResult) ProtoMessage() {} func (*QueryResult) Descriptor() ([]byte, []int) { - return fileDescriptor_remote_5645ea049238b205, []int{3} + return fileDescriptor_remote_930be8df34ca631b, []int{4} } func (m *QueryResult) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -227,7 +268,7 @@ func (m *Sample) Reset() { *m = Sample{} } func (m *Sample) String() string { return proto.CompactTextString(m) } func (*Sample) ProtoMessage() {} func (*Sample) Descriptor() ([]byte, []int) { - return fileDescriptor_remote_5645ea049238b205, []int{4} + return fileDescriptor_remote_930be8df34ca631b, []int{5} } func (m *Sample) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -268,7 +309,7 @@ func (m *TimeSeries) Reset() { *m = TimeSeries{} } func (m *TimeSeries) String() string { return proto.CompactTextString(m) } func (*TimeSeries) ProtoMessage() {} func (*TimeSeries) Descriptor() ([]byte, []int) { - return fileDescriptor_remote_5645ea049238b205, []int{5} + return fileDescriptor_remote_930be8df34ca631b, []int{6} } func (m *TimeSeries) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -309,7 +350,7 @@ func (m *Label) Reset() { *m = Label{} } func (m *Label) String() string { return proto.CompactTextString(m) } func (*Label) ProtoMessage() {} func (*Label) Descriptor() ([]byte, []int) { - return fileDescriptor_remote_5645ea049238b205, []int{6} + return fileDescriptor_remote_930be8df34ca631b, []int{7} } func (m *Label) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -352,7 +393,7 @@ func (m *LabelMatcher) Reset() { *m = LabelMatcher{} } func (m *LabelMatcher) String() string { return proto.CompactTextString(m) } func (*LabelMatcher) ProtoMessage() {} func (*LabelMatcher) Descriptor() ([]byte, []int) { - return fileDescriptor_remote_5645ea049238b205, []int{7} + return fileDescriptor_remote_930be8df34ca631b, []int{8} } func (m *LabelMatcher) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -381,7 +422,51 @@ func (m *LabelMatcher) XXX_DiscardUnknown() { var xxx_messageInfo_LabelMatcher proto.InternalMessageInfo +type ReadHints struct { + StepMs int64 `protobuf:"varint,1,opt,name=step_ms,json=stepMs,proto3" json:"step_ms,omitempty"` + Func string `protobuf:"bytes,2,opt,name=func,proto3" json:"func,omitempty"` + StartMs int64 `protobuf:"varint,3,opt,name=start_ms,json=startMs,proto3" 
json:"start_ms,omitempty"` + EndMs int64 `protobuf:"varint,4,opt,name=end_ms,json=endMs,proto3" json:"end_ms,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *ReadHints) Reset() { *m = ReadHints{} } +func (m *ReadHints) String() string { return proto.CompactTextString(m) } +func (*ReadHints) ProtoMessage() {} +func (*ReadHints) Descriptor() ([]byte, []int) { + return fileDescriptor_remote_930be8df34ca631b, []int{9} +} +func (m *ReadHints) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *ReadHints) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_ReadHints.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalTo(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (dst *ReadHints) XXX_Merge(src proto.Message) { + xxx_messageInfo_ReadHints.Merge(dst, src) +} +func (m *ReadHints) XXX_Size() int { + return m.Size() +} +func (m *ReadHints) XXX_DiscardUnknown() { + xxx_messageInfo_ReadHints.DiscardUnknown(m) +} + +var xxx_messageInfo_ReadHints proto.InternalMessageInfo + func init() { + proto.RegisterType((*WriteRequest)(nil), "prometheus.WriteRequest") proto.RegisterType((*ReadRequest)(nil), "prometheus.ReadRequest") proto.RegisterType((*ReadResponse)(nil), "prometheus.ReadResponse") proto.RegisterType((*Query)(nil), "prometheus.Query") @@ -390,8 +475,42 @@ func init() { proto.RegisterType((*TimeSeries)(nil), "prometheus.TimeSeries") proto.RegisterType((*Label)(nil), "prometheus.Label") proto.RegisterType((*LabelMatcher)(nil), "prometheus.LabelMatcher") + proto.RegisterType((*ReadHints)(nil), "prometheus.ReadHints") proto.RegisterEnum("prometheus.LabelMatcher_Type", LabelMatcher_Type_name, LabelMatcher_Type_value) } +func (m *WriteRequest) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalTo(dAtA) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *WriteRequest) MarshalTo(dAtA []byte) (int, error) { + var i int + _ = i + var l int + _ = l + if len(m.Timeseries) > 0 { + for _, msg := range m.Timeseries { + dAtA[i] = 0xa + i++ + i = encodeVarintRemote(dAtA, i, uint64(msg.Size())) + n, err := msg.MarshalTo(dAtA[i:]) + if err != nil { + return 0, err + } + i += n + } + } + if m.XXX_unrecognized != nil { + i += copy(dAtA[i:], m.XXX_unrecognized) + } + return i, nil +} + func (m *ReadRequest) Marshal() (dAtA []byte, err error) { size := m.Size() dAtA = make([]byte, size) @@ -495,6 +614,16 @@ func (m *Query) MarshalTo(dAtA []byte) (int, error) { i += n } } + if m.Hints != nil { + dAtA[i] = 0x22 + i++ + i = encodeVarintRemote(dAtA, i, uint64(m.Hints.Size())) + n1, err := m.Hints.MarshalTo(dAtA[i:]) + if err != nil { + return 0, err + } + i += n1 + } if m.XXX_unrecognized != nil { i += copy(dAtA[i:], m.XXX_unrecognized) } @@ -682,6 +811,48 @@ func (m *LabelMatcher) MarshalTo(dAtA []byte) (int, error) { return i, nil } +func (m *ReadHints) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalTo(dAtA) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *ReadHints) MarshalTo(dAtA []byte) (int, error) { + var i int + _ = i + var l int + _ = l + if m.StepMs != 0 { + dAtA[i] = 0x8 + i++ + i = encodeVarintRemote(dAtA, i, uint64(m.StepMs)) + } + if len(m.Func) > 0 { + dAtA[i] = 0x12 + i++ + i = encodeVarintRemote(dAtA, i, uint64(len(m.Func))) + i 
+= copy(dAtA[i:], m.Func) + } + if m.StartMs != 0 { + dAtA[i] = 0x18 + i++ + i = encodeVarintRemote(dAtA, i, uint64(m.StartMs)) + } + if m.EndMs != 0 { + dAtA[i] = 0x20 + i++ + i = encodeVarintRemote(dAtA, i, uint64(m.EndMs)) + } + if m.XXX_unrecognized != nil { + i += copy(dAtA[i:], m.XXX_unrecognized) + } + return i, nil +} + func encodeVarintRemote(dAtA []byte, offset int, v uint64) int { for v >= 1<<7 { dAtA[offset] = uint8(v&0x7f | 0x80) @@ -691,6 +862,21 @@ func encodeVarintRemote(dAtA []byte, offset int, v uint64) int { dAtA[offset] = uint8(v) return offset + 1 } +func (m *WriteRequest) Size() (n int) { + var l int + _ = l + if len(m.Timeseries) > 0 { + for _, e := range m.Timeseries { + l = e.Size() + n += 1 + l + sovRemote(uint64(l)) + } + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + func (m *ReadRequest) Size() (n int) { var l int _ = l @@ -736,6 +922,10 @@ func (m *Query) Size() (n int) { n += 1 + l + sovRemote(uint64(l)) } } + if m.Hints != nil { + l = m.Hints.Size() + n += 1 + l + sovRemote(uint64(l)) + } if m.XXX_unrecognized != nil { n += len(m.XXX_unrecognized) } @@ -830,6 +1020,28 @@ func (m *LabelMatcher) Size() (n int) { return n } +func (m *ReadHints) Size() (n int) { + var l int + _ = l + if m.StepMs != 0 { + n += 1 + sovRemote(uint64(m.StepMs)) + } + l = len(m.Func) + if l > 0 { + n += 1 + l + sovRemote(uint64(l)) + } + if m.StartMs != 0 { + n += 1 + sovRemote(uint64(m.StartMs)) + } + if m.EndMs != 0 { + n += 1 + sovRemote(uint64(m.EndMs)) + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + func sovRemote(x uint64) (n int) { for { n++ @@ -843,6 +1055,88 @@ func sovRemote(x uint64) (n int) { func sozRemote(x uint64) (n int) { return sovRemote(uint64((x << 1) ^ uint64((int64(x) >> 63)))) } +func (m *WriteRequest) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowRemote + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: WriteRequest: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: WriteRequest: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Timeseries", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowRemote + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthRemote + } + postIndex := iNdEx + msglen + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Timeseries = append(m.Timeseries, TimeSeries{}) + if err := m.Timeseries[len(m.Timeseries)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipRemote(dAtA[iNdEx:]) + if err != nil { + return err + } + if skippy < 0 { + return ErrInvalidLengthRemote + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
+ iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} func (m *ReadRequest) Unmarshal(dAtA []byte) error { l := len(dAtA) iNdEx := 0 @@ -1105,6 +1399,39 @@ func (m *Query) Unmarshal(dAtA []byte) error { return err } iNdEx = postIndex + case 4: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Hints", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowRemote + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthRemote + } + postIndex := iNdEx + msglen + if postIndex > l { + return io.ErrUnexpectedEOF + } + if m.Hints == nil { + m.Hints = &ReadHints{} + } + if err := m.Hints.Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex default: iNdEx = preIndex skippy, err := skipRemote(dAtA[iNdEx:]) @@ -1640,6 +1967,143 @@ func (m *LabelMatcher) Unmarshal(dAtA []byte) error { } return nil } +func (m *ReadHints) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowRemote + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: ReadHints: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: ReadHints: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field StepMs", wireType) + } + m.StepMs = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowRemote + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.StepMs |= (int64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + case 2: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Func", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowRemote + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthRemote + } + postIndex := iNdEx + intStringLen + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Func = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + case 3: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field StartMs", wireType) + } + m.StartMs = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowRemote + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.StartMs |= (int64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + case 4: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field EndMs", wireType) + } + m.EndMs = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowRemote + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.EndMs |= (int64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + default: + iNdEx = preIndex + skippy, err := 
skipRemote(dAtA[iNdEx:]) + if err != nil { + return err + } + if skippy < 0 { + return ErrInvalidLengthRemote + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} func skipRemote(dAtA []byte) (n int, err error) { l := len(dAtA) iNdEx := 0 @@ -1745,36 +2209,42 @@ var ( ErrIntOverflowRemote = fmt.Errorf("proto: integer overflow") ) -func init() { proto.RegisterFile("remote.proto", fileDescriptor_remote_5645ea049238b205) } - -var fileDescriptor_remote_5645ea049238b205 = []byte{ - // 448 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x74, 0x93, 0xc1, 0x8a, 0x13, 0x41, - 0x10, 0x86, 0xd3, 0x33, 0xc9, 0xc4, 0xad, 0x84, 0x65, 0x2c, 0x16, 0x0d, 0xa2, 0x51, 0xe6, 0x94, - 0x83, 0x64, 0x49, 0x3c, 0x08, 0xb2, 0x07, 0x59, 0x08, 0x1e, 0x74, 0x85, 0xf4, 0xee, 0xc9, 0xcb, - 0x32, 0x31, 0xc5, 0xee, 0xc2, 0x4c, 0x66, 0xd2, 0xdd, 0x23, 0xe4, 0x41, 0x3c, 0xf9, 0x42, 0x39, - 0xfa, 0x04, 0xa2, 0x79, 0x12, 0xe9, 0xea, 0x99, 0xa4, 0xc5, 0xdd, 0x5b, 0x77, 0xd5, 0x57, 0x7f, - 0xfd, 0x55, 0x4d, 0x43, 0x5f, 0x51, 0x5e, 0x18, 0x1a, 0x97, 0xaa, 0x30, 0x05, 0x42, 0xa9, 0x8a, - 0x9c, 0xcc, 0x2d, 0x55, 0xfa, 0xd9, 0xc9, 0x4d, 0x71, 0x53, 0x70, 0xf8, 0xd4, 0x9e, 0x1c, 0x91, - 0xbc, 0x87, 0x9e, 0xa4, 0x74, 0x29, 0x69, 0x5d, 0x91, 0x36, 0x38, 0x81, 0xee, 0xba, 0x22, 0x75, - 0x47, 0x7a, 0x20, 0x5e, 0x85, 0xa3, 0xde, 0xf4, 0xf1, 0xf8, 0x20, 0x31, 0x9e, 0x57, 0xa4, 0x36, - 0xe7, 0xed, 0xed, 0xaf, 0x97, 0x2d, 0xd9, 0x70, 0xc9, 0x07, 0xe8, 0x3b, 0x05, 0x5d, 0x16, 0x2b, - 0x4d, 0xf8, 0x16, 0xba, 0x8a, 0x74, 0x95, 0x99, 0x46, 0xe2, 0xe9, 0x7f, 0x12, 0x92, 0xf3, 0x8d, - 0x50, 0x4d, 0x27, 0x3f, 0x04, 0x74, 0x38, 0x8d, 0xaf, 0x01, 0xb5, 0x49, 0x95, 0xb9, 0x36, 0x77, - 0x39, 0x69, 0x93, 0xe6, 0xe5, 0x75, 0x6e, 0xd5, 0xc4, 0x28, 0x94, 0x31, 0x67, 0xae, 0x9a, 0xc4, - 0x85, 0xc6, 0x11, 0xc4, 0xb4, 0x5a, 0xfe, 0xcb, 0x06, 0xcc, 0x1e, 0xd3, 0x6a, 0xe9, 0x93, 0xef, - 0xe0, 0x51, 0x9e, 0x9a, 0xaf, 0xb7, 0xa4, 0xf4, 0x20, 0x64, 0x6f, 0x03, 0xdf, 0xdb, 0xa7, 0x74, - 0x41, 0xd9, 0x85, 0x03, 0x6a, 0x73, 0x7b, 0x3e, 0xf9, 0x08, 0x3d, 0xcf, 0x3b, 0x9e, 0x01, 0x70, - 0x43, 0x7f, 0x57, 0x4f, 0x7c, 0x31, 0xdb, 0xf7, 0x92, 0xb3, 0xb5, 0x94, 0xc7, 0x27, 0x67, 0x10, - 0x5d, 0xa6, 0x79, 0x99, 0x11, 0x9e, 0x40, 0xe7, 0x5b, 0x9a, 0x55, 0xc4, 0xd3, 0x09, 0xe9, 0x2e, - 0xf8, 0x1c, 0x8e, 0xf6, 0xe3, 0xd4, 0xb3, 0x1c, 0x02, 0xc9, 0x1a, 0xe0, 0xa0, 0x8e, 0xa7, 0x10, - 0x65, 0xd6, 0xf8, 0xbd, 0x2f, 0xc6, 0x23, 0xd5, 0x06, 0x6a, 0x0c, 0xa7, 0xd0, 0xd5, 0xdc, 0xdc, - 0xae, 0xc9, 0x56, 0xa0, 0x5f, 0xe1, 0x7c, 0x35, 0x6f, 0x53, 0x83, 0xc9, 0x04, 0x3a, 0x2c, 0x85, - 0x08, 0xed, 0x55, 0x9a, 0x3b, 0xbb, 0x47, 0x92, 0xcf, 0x87, 0x19, 0x02, 0x0e, 0xba, 0x4b, 0xf2, - 0x5d, 0x40, 0xdf, 0xdf, 0x28, 0x4e, 0xa0, 0x6d, 0x36, 0xa5, 0x2b, 0x3d, 0x9e, 0xbe, 0x78, 0x68, - 0xf3, 0xe3, 0xab, 0x4d, 0x49, 0x92, 0xd1, 0x7d, 0xb7, 0xe0, 0xbe, 0x6e, 0xa1, 0xdf, 0x6d, 0x04, - 0x6d, 0x5b, 0x87, 0x11, 0x04, 0xb3, 0x79, 0xdc, 0xc2, 0x2e, 0x84, 0x9f, 0x67, 0xf3, 0x58, 0xd8, - 0x80, 0x9c, 0xc5, 0x01, 0x07, 0xe4, 0x2c, 0x0e, 0xcf, 0x07, 0xdb, 0x3f, 0xc3, 0xd6, 0x76, 0x37, - 0x14, 0x3f, 0x77, 0x43, 0xf1, 0x7b, 0x37, 0x14, 0x5f, 0x22, 0xeb, 0xa4, 0x5c, 0x2c, 0x22, 0xfe, - 0x12, 0x6f, 0xfe, 0x06, 0x00, 0x00, 0xff, 0xff, 0x64, 0x87, 0x06, 0x4f, 0x44, 0x03, 0x00, 0x00, +func init() { proto.RegisterFile("remote.proto", fileDescriptor_remote_930be8df34ca631b) } + 
+var fileDescriptor_remote_930be8df34ca631b = []byte{ + // 535 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xa4, 0x53, 0x51, 0x8b, 0xd3, 0x40, + 0x10, 0xbe, 0x34, 0x6d, 0x72, 0x9d, 0x96, 0x23, 0x0e, 0x77, 0x5e, 0x15, 0xad, 0x47, 0x9e, 0x0a, + 0x4a, 0x8f, 0xd6, 0x07, 0x41, 0xee, 0x41, 0x0e, 0x8a, 0x82, 0x57, 0xa1, 0x7b, 0x05, 0xc1, 0x97, + 0x23, 0xbd, 0x8e, 0xd7, 0x4a, 0x36, 0x49, 0xb3, 0x1b, 0xa1, 0x3f, 0xc4, 0xff, 0xd4, 0x47, 0x7f, + 0x81, 0x68, 0x7f, 0x89, 0xec, 0x6e, 0xd2, 0xae, 0x78, 0x3e, 0xf9, 0x96, 0x99, 0xf9, 0xe6, 0x9b, + 0xef, 0xdb, 0x99, 0x40, 0x3b, 0x27, 0x9e, 0x4a, 0xea, 0x67, 0x79, 0x2a, 0x53, 0x84, 0x2c, 0x4f, + 0x39, 0xc9, 0x05, 0x15, 0xe2, 0xf1, 0xf1, 0x5d, 0x7a, 0x97, 0xea, 0xf4, 0xb9, 0xfa, 0x32, 0x88, + 0xf0, 0x0a, 0xda, 0x1f, 0xf3, 0xa5, 0x24, 0x46, 0xab, 0x82, 0x84, 0xc4, 0x0b, 0x00, 0xb9, 0xe4, + 0x24, 0x28, 0x5f, 0x92, 0xe8, 0x38, 0x67, 0x6e, 0xaf, 0x35, 0x7c, 0xd8, 0xdf, 0xd3, 0xf4, 0xa7, + 0x4b, 0x4e, 0xd7, 0xba, 0x7a, 0x59, 0xdf, 0xfc, 0x78, 0x76, 0xc0, 0x2c, 0x7c, 0xf8, 0x06, 0x5a, + 0x8c, 0xa2, 0x79, 0x45, 0x36, 0x00, 0x7f, 0x55, 0xd8, 0x4c, 0x0f, 0x6c, 0xa6, 0x49, 0x41, 0xf9, + 0xba, 0x24, 0xa9, 0x70, 0xe1, 0x5b, 0x68, 0x1b, 0x06, 0x91, 0xa5, 0x89, 0x20, 0x7c, 0x05, 0x7e, + 0x4e, 0xa2, 0x88, 0x65, 0x45, 0x71, 0xfa, 0x17, 0x05, 0xd3, 0xf5, 0x8a, 0xa8, 0x44, 0x87, 0x1b, + 0x07, 0x1a, 0xba, 0x8c, 0x2f, 0x00, 0x85, 0x8c, 0x72, 0x79, 0xa3, 0x85, 0xca, 0x88, 0x67, 0x37, + 0x5c, 0xb1, 0x39, 0x3d, 0x97, 0x05, 0xba, 0x32, 0xad, 0x0a, 0x63, 0x81, 0x3d, 0x08, 0x28, 0x99, + 0xff, 0x89, 0xad, 0x69, 0xec, 0x11, 0x25, 0x73, 0x1b, 0xf9, 0x1a, 0x0e, 0x79, 0x24, 0x6f, 0x17, + 0x94, 0x8b, 0x8e, 0xab, 0xb5, 0x75, 0x6c, 0x6d, 0x57, 0xd1, 0x8c, 0xe2, 0xb1, 0x01, 0x94, 0xe2, + 0x76, 0x78, 0x7c, 0x0e, 0x8d, 0xc5, 0x32, 0x91, 0xa2, 0x53, 0x3f, 0x73, 0x7a, 0xad, 0xe1, 0x89, + 0xdd, 0xa8, 0xfc, 0xbf, 0x53, 0x45, 0x66, 0x30, 0xe1, 0x7b, 0x68, 0x59, 0x46, 0xff, 0x73, 0x45, + 0x17, 0xe0, 0x5d, 0x47, 0x3c, 0x8b, 0x09, 0x8f, 0xa1, 0xf1, 0x35, 0x8a, 0x0b, 0xd2, 0x4f, 0xe1, + 0x30, 0x13, 0xe0, 0x13, 0x68, 0xee, 0xbc, 0x97, 0xc6, 0xf7, 0x89, 0x70, 0x05, 0xb0, 0x67, 0xc7, + 0x73, 0xf0, 0x62, 0xe5, 0xf2, 0xde, 0xf5, 0x6a, 0xff, 0xa5, 0x80, 0x12, 0x86, 0x43, 0xf0, 0x85, + 0x1e, 0xae, 0xde, 0x54, 0x75, 0xa0, 0xdd, 0x61, 0x74, 0x55, 0x8b, 0x2c, 0x81, 0xe1, 0x00, 0x1a, + 0x9a, 0x0a, 0x11, 0xea, 0x49, 0xc4, 0x8d, 0xdc, 0x26, 0xd3, 0xdf, 0x7b, 0x0f, 0x35, 0x9d, 0x34, + 0x41, 0xf8, 0xcd, 0x81, 0xb6, 0xfd, 0xfc, 0x38, 0x80, 0xba, 0x5c, 0x67, 0xa6, 0xf5, 0x68, 0xf8, + 0xf4, 0x5f, 0x6b, 0xea, 0x4f, 0xd7, 0x19, 0x31, 0x0d, 0xdd, 0x4d, 0xab, 0xdd, 0x37, 0xcd, 0xb5, + 0xa7, 0xf5, 0xa0, 0xae, 0xfa, 0xd0, 0x83, 0xda, 0x68, 0x12, 0x1c, 0xa0, 0x0f, 0xee, 0x87, 0xd1, + 0x24, 0x70, 0x54, 0x82, 0x8d, 0x82, 0x9a, 0x4e, 0xb0, 0x51, 0xe0, 0x86, 0x5f, 0xa0, 0xb9, 0x5b, + 0x2e, 0x9e, 0x82, 0x2f, 0x24, 0x59, 0xb7, 0xe8, 0xa9, 0x70, 0x2c, 0xd4, 0xe4, 0xcf, 0x45, 0x72, + 0x5b, 0x4d, 0x56, 0xdf, 0xf8, 0x08, 0x0e, 0xcd, 0x0d, 0x73, 0xa1, 0x87, 0xbb, 0xcc, 0xd7, 0xf1, + 0x58, 0xe0, 0x09, 0x78, 0xea, 0x60, 0xb9, 0xb9, 0x25, 0x97, 0x35, 0x28, 0x99, 0x8f, 0xc5, 0x65, + 0x67, 0xf3, 0xab, 0x7b, 0xb0, 0xd9, 0x76, 0x9d, 0xef, 0xdb, 0xae, 0xf3, 0x73, 0xdb, 0x75, 0x3e, + 0x79, 0xca, 0x75, 0x36, 0x9b, 0x79, 0xfa, 0xcf, 0x7f, 0xf9, 0x3b, 0x00, 0x00, 0xff, 0xff, 0xe5, + 0xcf, 0xa9, 0xcb, 0x2b, 0x04, 0x00, 0x00, } diff --git a/pkg/store/prompb/remote.proto b/pkg/store/prompb/remote.proto index ec8f325398..2f7cf3fc0e 100644 --- a/pkg/store/prompb/remote.proto +++ 
b/pkg/store/prompb/remote.proto @@ -28,6 +28,10 @@ option (gogoproto.goproto_getters_all) = false; option go_package = "prompb"; +message WriteRequest { + repeated prometheus.TimeSeries timeseries = 1 [(gogoproto.nullable) = false]; +} + message ReadRequest { repeated Query queries = 1 [(gogoproto.nullable) = false]; } @@ -41,6 +45,7 @@ message Query { int64 start_timestamp_ms = 1; int64 end_timestamp_ms = 2; repeated LabelMatcher matchers = 3 [(gogoproto.nullable) = false]; + prometheus.ReadHints hints = 4; } message QueryResult { @@ -73,4 +78,11 @@ message LabelMatcher { Type type = 1; string name = 2; string value = 3; +} + +message ReadHints { + int64 step_ms = 1; // Query step size in milliseconds. + string func = 2; // String representation of surrounding function or aggregation. + int64 start_ms = 3; // Start time in milliseconds. + int64 end_ms = 4; // End time in milliseconds. } \ No newline at end of file diff --git a/pkg/store/storepb/rpc.pb.go b/pkg/store/storepb/rpc.pb.go index 78dec4b95f..e9ddda8711 100644 --- a/pkg/store/storepb/rpc.pb.go +++ b/pkg/store/storepb/rpc.pb.go @@ -32,6 +32,7 @@ const ( StoreType_RULE StoreType = 2 StoreType_SIDECAR StoreType = 3 StoreType_STORE StoreType = 4 + StoreType_RECEIVE StoreType = 5 ) var StoreType_name = map[int32]string{ @@ -40,6 +41,7 @@ var StoreType_name = map[int32]string{ 2: "RULE", 3: "SIDECAR", 4: "STORE", + 5: "RECEIVE", } var StoreType_value = map[string]int32{ "UNKNOWN": 0, @@ -47,13 +49,14 @@ var StoreType_value = map[string]int32{ "RULE": 2, "SIDECAR": 3, "STORE": 4, + "RECEIVE": 5, } func (x StoreType) String() string { return proto.EnumName(StoreType_name, int32(x)) } func (StoreType) EnumDescriptor() ([]byte, []int) { - return fileDescriptor_rpc_e304c8713328de35, []int{0} + return fileDescriptor_rpc_b2f04ff11750c7dd, []int{0} } type Aggr int32 @@ -88,7 +91,7 @@ func (x Aggr) String() string { return proto.EnumName(Aggr_name, int32(x)) } func (Aggr) EnumDescriptor() ([]byte, []int) { - return fileDescriptor_rpc_e304c8713328de35, []int{1} + return fileDescriptor_rpc_b2f04ff11750c7dd, []int{1} } type InfoRequest struct { @@ -101,7 +104,7 @@ func (m *InfoRequest) Reset() { *m = InfoRequest{} } func (m *InfoRequest) String() string { return proto.CompactTextString(m) } func (*InfoRequest) ProtoMessage() {} func (*InfoRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_rpc_e304c8713328de35, []int{0} + return fileDescriptor_rpc_b2f04ff11750c7dd, []int{0} } func (m *InfoRequest) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -144,7 +147,7 @@ func (m *InfoResponse) Reset() { *m = InfoResponse{} } func (m *InfoResponse) String() string { return proto.CompactTextString(m) } func (*InfoResponse) ProtoMessage() {} func (*InfoResponse) Descriptor() ([]byte, []int) { - return fileDescriptor_rpc_e304c8713328de35, []int{1} + return fileDescriptor_rpc_b2f04ff11750c7dd, []int{1} } func (m *InfoResponse) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -189,7 +192,7 @@ func (m *SeriesRequest) Reset() { *m = SeriesRequest{} } func (m *SeriesRequest) String() string { return proto.CompactTextString(m) } func (*SeriesRequest) ProtoMessage() {} func (*SeriesRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_rpc_e304c8713328de35, []int{2} + return fileDescriptor_rpc_b2f04ff11750c7dd, []int{2} } func (m *SeriesRequest) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -232,7 +235,7 @@ func (m *SeriesResponse) Reset() { *m = SeriesResponse{} } func (m *SeriesResponse) String() string { return 
proto.CompactTextString(m) } func (*SeriesResponse) ProtoMessage() {} func (*SeriesResponse) Descriptor() ([]byte, []int) { - return fileDescriptor_rpc_e304c8713328de35, []int{3} + return fileDescriptor_rpc_b2f04ff11750c7dd, []int{3} } func (m *SeriesResponse) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -379,7 +382,7 @@ func (m *LabelNamesRequest) Reset() { *m = LabelNamesRequest{} } func (m *LabelNamesRequest) String() string { return proto.CompactTextString(m) } func (*LabelNamesRequest) ProtoMessage() {} func (*LabelNamesRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_rpc_e304c8713328de35, []int{4} + return fileDescriptor_rpc_b2f04ff11750c7dd, []int{4} } func (m *LabelNamesRequest) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -420,7 +423,7 @@ func (m *LabelNamesResponse) Reset() { *m = LabelNamesResponse{} } func (m *LabelNamesResponse) String() string { return proto.CompactTextString(m) } func (*LabelNamesResponse) ProtoMessage() {} func (*LabelNamesResponse) Descriptor() ([]byte, []int) { - return fileDescriptor_rpc_e304c8713328de35, []int{5} + return fileDescriptor_rpc_b2f04ff11750c7dd, []int{5} } func (m *LabelNamesResponse) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -461,7 +464,7 @@ func (m *LabelValuesRequest) Reset() { *m = LabelValuesRequest{} } func (m *LabelValuesRequest) String() string { return proto.CompactTextString(m) } func (*LabelValuesRequest) ProtoMessage() {} func (*LabelValuesRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_rpc_e304c8713328de35, []int{6} + return fileDescriptor_rpc_b2f04ff11750c7dd, []int{6} } func (m *LabelValuesRequest) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -502,7 +505,7 @@ func (m *LabelValuesResponse) Reset() { *m = LabelValuesResponse{} } func (m *LabelValuesResponse) String() string { return proto.CompactTextString(m) } func (*LabelValuesResponse) ProtoMessage() {} func (*LabelValuesResponse) Descriptor() ([]byte, []int) { - return fileDescriptor_rpc_e304c8713328de35, []int{7} + return fileDescriptor_rpc_b2f04ff11750c7dd, []int{7} } func (m *LabelValuesResponse) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -2323,51 +2326,51 @@ var ( ErrIntOverflowRpc = fmt.Errorf("proto: integer overflow") ) -func init() { proto.RegisterFile("rpc.proto", fileDescriptor_rpc_e304c8713328de35) } - -var fileDescriptor_rpc_e304c8713328de35 = []byte{ - // 675 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x7c, 0x54, 0xcd, 0x6e, 0xd3, 0x40, - 0x10, 0x8e, 0x7f, 0xe2, 0xc4, 0x93, 0x36, 0x72, 0xb7, 0x69, 0x71, 0x8c, 0x14, 0xa2, 0x9c, 0xa2, - 0x82, 0x5a, 0x08, 0x12, 0x12, 0xdc, 0x92, 0x36, 0x55, 0x23, 0xda, 0x44, 0x6c, 0x12, 0x0a, 0x5c, - 0x8a, 0xd3, 0x6e, 0x5d, 0x4b, 0x8e, 0x6d, 0xbc, 0x0e, 0x6d, 0xaf, 0xbc, 0x06, 0x37, 0x9e, 0xa6, - 0x47, 0x9e, 0x00, 0x41, 0x9f, 0x04, 0xed, 0x7a, 0x9d, 0xc4, 0xa8, 0xe4, 0xb6, 0xf3, 0x7d, 0xe3, - 0x99, 0x6f, 0x67, 0x3e, 0x2f, 0xe8, 0x51, 0x78, 0xbe, 0x1b, 0x46, 0x41, 0x1c, 0x20, 0x2d, 0xbe, - 0xb2, 0xfd, 0x80, 0x5a, 0xa5, 0xf8, 0x36, 0x24, 0x34, 0x01, 0xad, 0x8a, 0x13, 0x38, 0x01, 0x3f, - 0xee, 0xb1, 0x53, 0x82, 0x36, 0xd6, 0xa1, 0xd4, 0xf3, 0x2f, 0x03, 0x4c, 0xbe, 0xcc, 0x08, 0x8d, - 0x1b, 0x3f, 0x24, 0x58, 0x4b, 0x62, 0x1a, 0x06, 0x3e, 0x25, 0xe8, 0x29, 0x68, 0x9e, 0x3d, 0x21, - 0x1e, 0x35, 0xa5, 0xba, 0xd2, 0x2c, 0xb5, 0xd6, 0x77, 0x93, 0xda, 0xbb, 0xc7, 0x0c, 0xed, 0xa8, - 0x77, 0xbf, 0x9e, 0xe4, 0xb0, 0x48, 0x41, 0x55, 0x28, 0x4e, 0x5d, 0xff, 0x2c, 0x76, 0xa7, 0xc4, - 0x94, 
0xeb, 0x52, 0x53, 0xc1, 0x85, 0xa9, 0xeb, 0x8f, 0xdc, 0x29, 0xe1, 0x94, 0x7d, 0x93, 0x50, - 0x8a, 0xa0, 0xec, 0x1b, 0x4e, 0xed, 0x81, 0x4e, 0xe3, 0x20, 0x22, 0xa3, 0xdb, 0x90, 0x98, 0x6a, - 0x5d, 0x6a, 0x96, 0x5b, 0x1b, 0x69, 0x97, 0x61, 0x4a, 0xe0, 0x45, 0x4e, 0xe3, 0xbb, 0x0c, 0xeb, - 0x43, 0x12, 0xb9, 0x84, 0x0a, 0xd9, 0x99, 0xc6, 0xd2, 0xff, 0x1b, 0xcb, 0xd9, 0xc6, 0xaf, 0x18, - 0x15, 0x9f, 0x5f, 0x91, 0x88, 0x9a, 0x0a, 0xbf, 0x5d, 0x25, 0x73, 0xbb, 0x93, 0x84, 0x14, 0x97, - 0x9c, 0xe7, 0xa2, 0x16, 0x6c, 0xb1, 0x92, 0x11, 0xa1, 0x81, 0x37, 0x8b, 0xdd, 0xc0, 0x3f, 0xbb, - 0x76, 0xfd, 0x8b, 0xe0, 0x9a, 0x8b, 0x57, 0xf0, 0xe6, 0xd4, 0xbe, 0xc1, 0x73, 0xee, 0x94, 0x53, - 0xe8, 0x19, 0x80, 0xed, 0x38, 0x11, 0x71, 0xec, 0x98, 0x50, 0x33, 0x5f, 0x57, 0x9a, 0xe5, 0xd6, - 0x5a, 0xda, 0xad, 0xed, 0x38, 0x11, 0x5e, 0xe2, 0xd1, 0x1b, 0xa8, 0x86, 0x76, 0x14, 0xbb, 0xb6, - 0xc7, 0xba, 0xf0, 0x4d, 0x9c, 0x5d, 0xb8, 0xd4, 0x9e, 0x78, 0xe4, 0xc2, 0xd4, 0xea, 0x52, 0xb3, - 0x88, 0x1f, 0x89, 0x84, 0x74, 0x53, 0x07, 0x82, 0x6e, 0x7c, 0x86, 0x72, 0x3a, 0x1c, 0xb1, 0xc3, - 0x26, 0x68, 0x94, 0x23, 0x7c, 0x36, 0xa5, 0x56, 0x79, 0x3e, 0x5d, 0x8e, 0x1e, 0xe5, 0xb0, 0xe0, - 0x91, 0x05, 0x85, 0x6b, 0x3b, 0xf2, 0x5d, 0xdf, 0xe1, 0xb3, 0xd2, 0x8f, 0x72, 0x38, 0x05, 0x3a, - 0x45, 0xd0, 0x22, 0x42, 0x67, 0x5e, 0xdc, 0x18, 0xc0, 0x06, 0x9f, 0x4f, 0xdf, 0x9e, 0x2e, 0x56, - 0xb0, 0x52, 0xb2, 0xb4, 0x5a, 0xf2, 0x21, 0xa0, 0xe5, 0x82, 0x42, 0x76, 0x05, 0xf2, 0x3e, 0x03, - 0xb8, 0xf3, 0x74, 0x9c, 0x04, 0xc8, 0x82, 0xa2, 0x50, 0x44, 0x4d, 0x99, 0x13, 0xf3, 0xb8, 0x71, - 0x29, 0xea, 0xbc, 0xb7, 0xbd, 0xd9, 0x42, 0x59, 0x05, 0xf2, 0xdc, 0x9f, 0x5c, 0x85, 0x8e, 0x93, - 0x60, 0xb5, 0x5e, 0x79, 0xb5, 0xde, 0x1e, 0x6c, 0x66, 0xfa, 0x08, 0xc1, 0xdb, 0xa0, 0x7d, 0xe5, - 0x88, 0x50, 0x2c, 0xa2, 0x55, 0x92, 0x77, 0xba, 0xa0, 0xcf, 0x3d, 0x8e, 0x4a, 0x50, 0x18, 0xf7, - 0xdf, 0xf6, 0x07, 0xa7, 0x7d, 0x23, 0x87, 0x74, 0xc8, 0xbf, 0x1b, 0x77, 0xf1, 0x47, 0x43, 0x42, - 0x45, 0x50, 0xf1, 0xf8, 0xb8, 0x6b, 0xc8, 0x2c, 0x63, 0xd8, 0x3b, 0xe8, 0xee, 0xb7, 0xb1, 0xa1, - 0xb0, 0x8c, 0xe1, 0x68, 0x80, 0xbb, 0x86, 0xba, 0xd3, 0x01, 0x95, 0x99, 0x08, 0x15, 0x40, 0xc1, - 0xed, 0xd3, 0xe4, 0xeb, 0xfd, 0xc1, 0xb8, 0x3f, 0x32, 0x24, 0x86, 0x0d, 0xc7, 0x27, 0x86, 0xcc, - 0x0e, 0x27, 0xbd, 0xbe, 0xa1, 0xf0, 0x43, 0xfb, 0x83, 0xa1, 0xb2, 0x72, 0x3c, 0xab, 0x8b, 0x8d, - 0x7c, 0xeb, 0x9b, 0x0c, 0x79, 0xae, 0x05, 0xbd, 0x00, 0x95, 0x3d, 0x02, 0x68, 0x33, 0x35, 0xca, - 0xd2, 0x13, 0x61, 0x55, 0xb2, 0xa0, 0xb8, 0xfb, 0x6b, 0xd0, 0x12, 0x37, 0xa1, 0xad, 0xac, 0xbb, - 0xd2, 0xcf, 0xb6, 0xff, 0x85, 0x93, 0x0f, 0x9f, 0x4b, 0x68, 0x1f, 0x60, 0xb1, 0x7d, 0x54, 0xcd, - 0xfc, 0x82, 0xcb, 0x16, 0xb3, 0xac, 0x87, 0x28, 0xd1, 0xff, 0x10, 0x4a, 0x4b, 0x2b, 0x41, 0xd9, - 0xd4, 0x8c, 0x1f, 0xac, 0xc7, 0x0f, 0x72, 0x49, 0x9d, 0x4e, 0xf5, 0xee, 0x4f, 0x2d, 0x77, 0x77, - 0x5f, 0x93, 0x7e, 0xde, 0xd7, 0xa4, 0xdf, 0xf7, 0x35, 0xe9, 0x53, 0x81, 0x3f, 0x3c, 0xe1, 0x64, - 0xa2, 0xf1, 0x17, 0xf3, 0xe5, 0xdf, 0x00, 0x00, 0x00, 0xff, 0xff, 0x17, 0xfe, 0xbd, 0x57, 0x69, - 0x05, 0x00, 0x00, +func init() { proto.RegisterFile("rpc.proto", fileDescriptor_rpc_b2f04ff11750c7dd) } + +var fileDescriptor_rpc_b2f04ff11750c7dd = []byte{ + // 683 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x7c, 0x54, 0xc1, 0x6e, 0xda, 0x4c, + 0x10, 0xc6, 0x36, 0x36, 0x78, 0x48, 0x90, 0xb3, 0x21, 0xf9, 0x8d, 0x7f, 0x89, 0x22, 0x4e, 0x28, + 0xad, 0x92, 0x96, 0x4a, 0x95, 0xda, 0x1b, 0x10, 0x47, 0x41, 0x4d, 0x40, 0x5d, 0x20, 0x69, 0x7b, + 0x49, 0x4d, 0xb2, 
0x71, 0x2c, 0x19, 0xdb, 0xf5, 0x9a, 0x26, 0xb9, 0xf6, 0x35, 0x7a, 0xeb, 0xd3, + 0xe4, 0xd8, 0x27, 0xa8, 0x5a, 0x9e, 0xa4, 0xf2, 0x7a, 0x0d, 0xb8, 0x4a, 0xb9, 0xed, 0x7c, 0xdf, + 0x78, 0xe6, 0xdb, 0x99, 0xcf, 0x0b, 0x6a, 0x18, 0x5c, 0xee, 0x07, 0xa1, 0x1f, 0xf9, 0x48, 0x89, + 0x6e, 0x2c, 0xcf, 0xa7, 0x46, 0x29, 0xba, 0x0f, 0x08, 0x4d, 0x40, 0xa3, 0x62, 0xfb, 0xb6, 0xcf, + 0x8e, 0x07, 0xf1, 0x29, 0x41, 0x1b, 0x9b, 0x50, 0xea, 0x79, 0xd7, 0x3e, 0x26, 0x9f, 0x67, 0x84, + 0x46, 0x8d, 0xef, 0x02, 0x6c, 0x24, 0x31, 0x0d, 0x7c, 0x8f, 0x12, 0xf4, 0x14, 0x14, 0xd7, 0x9a, + 0x10, 0x97, 0xea, 0x42, 0x5d, 0x6a, 0x96, 0x5a, 0x9b, 0xfb, 0x49, 0xed, 0xfd, 0x93, 0x18, 0xed, + 0xe4, 0x1f, 0x7e, 0x3e, 0xc9, 0x61, 0x9e, 0x82, 0xaa, 0x50, 0x9c, 0x3a, 0xde, 0x45, 0xe4, 0x4c, + 0x89, 0x2e, 0xd6, 0x85, 0xa6, 0x84, 0x0b, 0x53, 0xc7, 0x1b, 0x39, 0x53, 0xc2, 0x28, 0xeb, 0x2e, + 0xa1, 0x24, 0x4e, 0x59, 0x77, 0x8c, 0x3a, 0x00, 0x95, 0x46, 0x7e, 0x48, 0x46, 0xf7, 0x01, 0xd1, + 0xf3, 0x75, 0xa1, 0x59, 0x6e, 0x6d, 0xa5, 0x5d, 0x86, 0x29, 0x81, 0x97, 0x39, 0x8d, 0x6f, 0x22, + 0x6c, 0x0e, 0x49, 0xe8, 0x10, 0xca, 0x65, 0x67, 0x1a, 0x0b, 0xff, 0x6e, 0x2c, 0x66, 0x1b, 0xbf, + 0x8a, 0xa9, 0xe8, 0xf2, 0x86, 0x84, 0x54, 0x97, 0xd8, 0xed, 0x2a, 0x99, 0xdb, 0x9d, 0x26, 0x24, + 0xbf, 0xe4, 0x22, 0x17, 0xb5, 0x60, 0x27, 0x2e, 0x19, 0x12, 0xea, 0xbb, 0xb3, 0xc8, 0xf1, 0xbd, + 0x8b, 0x5b, 0xc7, 0xbb, 0xf2, 0x6f, 0x99, 0x78, 0x09, 0x6f, 0x4f, 0xad, 0x3b, 0xbc, 0xe0, 0xce, + 0x19, 0x85, 0x9e, 0x01, 0x58, 0xb6, 0x1d, 0x12, 0xdb, 0x8a, 0x08, 0xd5, 0xe5, 0xba, 0xd4, 0x2c, + 0xb7, 0x36, 0xd2, 0x6e, 0x6d, 0xdb, 0x0e, 0xf1, 0x0a, 0x8f, 0xde, 0x40, 0x35, 0xb0, 0xc2, 0xc8, + 0xb1, 0xdc, 0xb8, 0x0b, 0xdb, 0xc4, 0xc5, 0x95, 0x43, 0xad, 0x89, 0x4b, 0xae, 0x74, 0xa5, 0x2e, + 0x34, 0x8b, 0xf8, 0x3f, 0x9e, 0x90, 0x6e, 0xea, 0x90, 0xd3, 0x8d, 0x4f, 0x50, 0x4e, 0x87, 0xc3, + 0x77, 0xd8, 0x04, 0x85, 0x32, 0x84, 0xcd, 0xa6, 0xd4, 0x2a, 0x2f, 0xa6, 0xcb, 0xd0, 0xe3, 0x1c, + 0xe6, 0x3c, 0x32, 0xa0, 0x70, 0x6b, 0x85, 0x9e, 0xe3, 0xd9, 0x6c, 0x56, 0xea, 0x71, 0x0e, 0xa7, + 0x40, 0xa7, 0x08, 0x4a, 0x48, 0xe8, 0xcc, 0x8d, 0x1a, 0x03, 0xd8, 0x62, 0xf3, 0xe9, 0x5b, 0xd3, + 0xe5, 0x0a, 0xd6, 0x4a, 0x16, 0xd6, 0x4b, 0x3e, 0x02, 0xb4, 0x5a, 0x90, 0xcb, 0xae, 0x80, 0xec, + 0xc5, 0x00, 0x73, 0x9e, 0x8a, 0x93, 0x00, 0x19, 0x50, 0xe4, 0x8a, 0xa8, 0x2e, 0x32, 0x62, 0x11, + 0x37, 0xae, 0x79, 0x9d, 0x33, 0xcb, 0x9d, 0x2d, 0x95, 0x55, 0x40, 0x66, 0xfe, 0x64, 0x2a, 0x54, + 0x9c, 0x04, 0xeb, 0xf5, 0x8a, 0xeb, 0xf5, 0xf6, 0x60, 0x3b, 0xd3, 0x87, 0x0b, 0xde, 0x05, 0xe5, + 0x0b, 0x43, 0xb8, 0x62, 0x1e, 0xad, 0x93, 0xbc, 0x87, 0x41, 0x5d, 0x78, 0x1c, 0x95, 0xa0, 0x30, + 0xee, 0xbf, 0xed, 0x0f, 0xce, 0xfb, 0x5a, 0x0e, 0xa9, 0x20, 0xbf, 0x1b, 0x9b, 0xf8, 0x83, 0x26, + 0xa0, 0x22, 0xe4, 0xf1, 0xf8, 0xc4, 0xd4, 0xc4, 0x38, 0x63, 0xd8, 0x3b, 0x34, 0xbb, 0x6d, 0xac, + 0x49, 0x71, 0xc6, 0x70, 0x34, 0xc0, 0xa6, 0x96, 0x8f, 0x71, 0x6c, 0x76, 0xcd, 0xde, 0x99, 0xa9, + 0xc9, 0x7b, 0x1d, 0xc8, 0xc7, 0x8e, 0x42, 0x05, 0x90, 0x70, 0xfb, 0x3c, 0x29, 0xd5, 0x1d, 0x8c, + 0xfb, 0x23, 0x4d, 0x88, 0xb1, 0xe1, 0xf8, 0x54, 0x13, 0xe3, 0xc3, 0x69, 0xaf, 0xaf, 0x49, 0xec, + 0xd0, 0x7e, 0x9f, 0xd4, 0x60, 0x59, 0x26, 0xd6, 0xe4, 0xd6, 0x57, 0x11, 0x64, 0x26, 0x0c, 0xbd, + 0x80, 0x7c, 0xfc, 0x22, 0xa0, 0xed, 0xd4, 0x35, 0x2b, 0xef, 0x85, 0x51, 0xc9, 0x82, 0x7c, 0x10, + 0xaf, 0x41, 0x49, 0xac, 0x85, 0x76, 0xb2, 0x56, 0x4b, 0x3f, 0xdb, 0xfd, 0x1b, 0x4e, 0x3e, 0x7c, + 0x2e, 0xa0, 0x2e, 0xc0, 0xd2, 0x0a, 0xa8, 0x9a, 0xf9, 0x1f, 0x57, 0xfd, 0x66, 0x18, 0x8f, 0x51, + 0xbc, 0xff, 0x11, 0x94, 0x56, 0xf6, 0x83, 
0xb2, 0xa9, 0x19, 0x73, 0x18, 0xff, 0x3f, 0xca, 0x25, + 0x75, 0x3a, 0xd5, 0x87, 0xdf, 0xb5, 0xdc, 0xc3, 0xbc, 0x26, 0xfc, 0x98, 0xd7, 0x84, 0x5f, 0xf3, + 0x9a, 0xf0, 0xb1, 0xc0, 0x5e, 0xa1, 0x60, 0x32, 0x51, 0xd8, 0xf3, 0xf9, 0xf2, 0x4f, 0x00, 0x00, + 0x00, 0xff, 0xff, 0x33, 0x33, 0x9b, 0x3a, 0x76, 0x05, 0x00, 0x00, } diff --git a/pkg/store/storepb/rpc.proto b/pkg/store/storepb/rpc.proto index 899ddf2aef..2c264f1c15 100644 --- a/pkg/store/storepb/rpc.proto +++ b/pkg/store/storepb/rpc.proto @@ -41,6 +41,7 @@ enum StoreType { RULE = 2; SIDECAR = 3; STORE = 4; + RECEIVE = 5; } message InfoResponse { diff --git a/scripts/quickstart.sh b/scripts/quickstart.sh index 29d6a68059..334d73f2da 100755 --- a/scripts/quickstart.sh +++ b/scripts/quickstart.sh @@ -43,6 +43,8 @@ config: EOF fi +STORES="" + # Start three Prometheus servers monitoring themselves. for i in `seq 1 3` do @@ -89,6 +91,12 @@ done sleep 0.5 +OBJSTORECFG="" +if [ -n "${MINIO_ENABLED}" ] +then +OBJSTORECFG="--objstore.config-file data/bucket.yml" +fi + # Start one sidecar for each Prometheus server. for i in `seq 1 3` do @@ -98,10 +106,10 @@ do --http-address 0.0.0.0:1919${i} \ --prometheus.url http://localhost:909${i} \ --tsdb.path data/prom${i} \ - --objstore.config-file data/bucket.yml \ - --cluster.address 0.0.0.0:1939${i} \ - --cluster.advertise-address 127.0.0.1:1939${i} \ - --cluster.peers 127.0.0.1:19391 & + ${OBJSTORECFG} \ + --cluster.disable & + + STORES="${STORES} --store 127.0.0.1:1909${i}" sleep 0.25 done @@ -116,10 +124,41 @@ then --grpc-address 0.0.0.0:19691 \ --http-address 0.0.0.0:19791 \ --data-dir data/store \ - --objstore.config-file data/bucket.yml \ - --cluster.address 0.0.0.0:19891 \ - --cluster.advertise-address 127.0.0.1:19891 \ - --cluster.peers 127.0.0.1:19391 & + ${OBJSTORECFG} \ + --cluster.disable & + + STORES="${STORES} --store 127.0.0.1:19691" +fi + +sleep 0.5 + +if [ -n "${REMOTE_WRITE_ENABLED}" ] +then + ./thanos receive \ + --debug.name receive \ + --log.level debug \ + --tsdb.path "./data/remote-write-receive-data" \ + --grpc-address 0.0.0.0:19891 \ + --http-address 0.0.0.0:19691 \ + --remote-write.address 0.0.0.0:19291 & + + mkdir -p "data/local-prometheus-data/" + cat < data/local-prometheus-data/prometheus.yml +# When the Thanos remote-write-receive component is started, +# this is an example configuration of a Prometheus server that +# would scrape a local node-exporter and replicate its data to +# the remote write endpoint. 
+scrape_configs: + - job_name: node + scrape_interval: 1s + static_configs: + - targets: ['localhost:9100'] +remote_write: +- url: http://localhost:19291/api/v1/receive +EOF + ./prometheus --config.file data/local-prometheus-data/prometheus.yml --storage.tsdb.path "data/local-prometheus-data/" & + + STORES="${STORES} --store 127.0.0.1:19891" fi sleep 0.5 @@ -131,9 +170,8 @@ do --debug.name query-${i} \ --grpc-address 0.0.0.0:1999${i} \ --http-address 0.0.0.0:1949${i} \ - --cluster.address 0.0.0.0:1959${i} \ - --cluster.advertise-address 127.0.0.1:1959${i} \ - --cluster.peers 127.0.0.1:19391 & + ${STORES} \ + --cluster.disable & done wait diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go index b9a345ef71..bc1c458857 100644 --- a/test/e2e/query_test.go +++ b/test/e2e/query_test.go @@ -4,9 +4,11 @@ import ( "context" "fmt" "net/url" + "os" "testing" "time" + "github.com/go-kit/kit/log" "github.com/improbable-eng/thanos/pkg/promclient" "github.com/improbable-eng/thanos/pkg/runutil" "github.com/improbable-eng/thanos/pkg/testutil" @@ -20,7 +22,8 @@ type testConfig struct { } var ( - firstPromPort = promHTTPPort(1) + firstPromPort = promHTTPPort(1) + remoteWriteEndpoint = fmt.Sprintf("http://%s/api/v1/receive", remoteWriteReceiveHTTP(1)) queryGossipSuite = newSpinupSuite(). Add(scraper(1, defaultPromConfig("prom-"+firstPromPort, 0), true)). @@ -33,15 +36,17 @@ var ( Add(scraper(1, defaultPromConfig("prom-"+firstPromPort, 0), false)). Add(scraper(2, defaultPromConfig("prom-ha", 0), false)). Add(scraper(3, defaultPromConfig("prom-ha", 1), false)). - Add(querierWithStoreFlags(1, "replica", sidecarGRPC(1), sidecarGRPC(2), sidecarGRPC(3)), ""). - Add(querierWithStoreFlags(2, "replica", sidecarGRPC(1), sidecarGRPC(2), sidecarGRPC(3)), "") + Add(querierWithStoreFlags(1, "replica", sidecarGRPC(1), sidecarGRPC(2), sidecarGRPC(3), remoteWriteReceiveGRPC(1)), ""). + Add(querierWithStoreFlags(2, "replica", sidecarGRPC(1), sidecarGRPC(2), sidecarGRPC(3), remoteWriteReceiveGRPC(1)), ""). + Add(receiver(1, defaultPromRemoteWriteConfig(remoteWriteEndpoint)), "") queryFileSDSuite = newSpinupSuite(). Add(scraper(1, defaultPromConfig("prom-"+firstPromPort, 0), false)). Add(scraper(2, defaultPromConfig("prom-ha", 0), false)). Add(scraper(3, defaultPromConfig("prom-ha", 1), false)). - Add(querierWithFileSD(1, "replica", sidecarGRPC(1), sidecarGRPC(2), sidecarGRPC(3)), ""). - Add(querierWithFileSD(2, "replica", sidecarGRPC(1), sidecarGRPC(2), sidecarGRPC(3)), "") + Add(querierWithFileSD(1, "replica", sidecarGRPC(1), sidecarGRPC(2), sidecarGRPC(3), remoteWriteReceiveGRPC(1)), ""). + Add(querierWithFileSD(2, "replica", sidecarGRPC(1), sidecarGRPC(2), sidecarGRPC(3), remoteWriteReceiveGRPC(1)), ""). + Add(receiver(1, defaultPromRemoteWriteConfig(remoteWriteEndpoint)), "") ) func TestQuery(t *testing.T) { @@ -84,12 +89,16 @@ func testQuerySimple(t *testing.T, conf testConfig) { var res model.Vector + w := log.NewSyncWriter(os.Stderr) + l := log.NewLogfmtLogger(w) + l = log.With(l, "conf-name", conf.name) + // Try query without deduplication. 
- testutil.Ok(t, runutil.Retry(time.Second, ctx.Done(), func() error { + testutil.Ok(t, runutil.RetryWithLog(l, time.Second, ctx.Done(), func() error { select { case <-exit: cancel() - return nil + return errors.Errorf("exiting test, possibly due to timeout") default: } @@ -98,8 +107,12 @@ func testQuerySimple(t *testing.T, conf testConfig) { if err != nil { return err } - if len(res) != 3 { - return errors.Errorf("unexpected result size %d", len(res)) + expectedRes := 4 + if conf.name == "gossip" { + expectedRes = 3 + } + if len(res) != expectedRes { + return errors.Errorf("unexpected result size %d, expected %d", len(res), expectedRes) } return nil })) @@ -127,6 +140,14 @@ func testQuerySimple(t *testing.T, conf testConfig) { "replica": model.LabelValue("1"), }, res[2].Metric) + if conf.name != "gossip" { + testutil.Equals(t, model.Metric{ + "__name__": "up", + "instance": model.LabelValue("localhost:9100"), + "job": "node", + }, res[3].Metric) + } + // Try query with deduplication. testutil.Ok(t, runutil.Retry(time.Second, ctx.Done(), func() error { select { @@ -141,8 +162,12 @@ func testQuerySimple(t *testing.T, conf testConfig) { if err != nil { return err } - if len(res) != 2 { - return errors.Errorf("unexpected result size for query with deduplication %d", len(res)) + expectedRes := 3 + if conf.name == "gossip" { + expectedRes = 2 + } + if len(res) != expectedRes { + return errors.Errorf("unexpected result size %d, expected %d", len(res), expectedRes) } return nil @@ -160,6 +185,13 @@ func testQuerySimple(t *testing.T, conf testConfig) { "job": "prometheus", "prometheus": "prom-ha", }, res[1].Metric) + if conf.name != "gossip" { + testutil.Equals(t, model.Metric{ + "__name__": "up", + "instance": model.LabelValue("localhost:9100"), + "job": "node", + }, res[2].Metric) + } } func urlParse(t *testing.T, addr string) *url.URL { @@ -183,3 +215,14 @@ scrape_configs: - "localhost:%s" `, name, replicas, firstPromPort) } + +func defaultPromRemoteWriteConfig(remoteWriteEndpoint string) string { + return fmt.Sprintf(` +scrape_configs: +- job_name: 'node' + static_configs: + - targets: ['localhost:9100'] +remote_write: +- url: "%s" +`, remoteWriteEndpoint) +} diff --git a/test/e2e/spinup_test.go b/test/e2e/spinup_test.go index c2ac1e401a..cdf8dd18d6 100644 --- a/test/e2e/spinup_test.go +++ b/test/e2e/spinup_test.go @@ -25,7 +25,8 @@ var ( promHTTPPort = func(i int) string { return fmt.Sprintf("%d", 9090+i) } // We keep this one with localhost, to have perfect match with what Prometheus will expose in up metric. 
- promHTTP = func(i int) string { return fmt.Sprintf("localhost:%s", promHTTPPort(i)) } + promHTTP = func(i int) string { return fmt.Sprintf("localhost:%s", promHTTPPort(i)) } + promRemoteWriteHTTP = func(i int) string { return fmt.Sprintf("localhost:%s", promHTTPPort(100+i)) } sidecarGRPC = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19090+i) } sidecarHTTP = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19190+i) } @@ -39,6 +40,10 @@ var ( rulerHTTP = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19890+i) } rulerCluster = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19990+i) } + remoteWriteReceiveHTTP = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 18690+i) } + remoteWriteReceiveGRPC = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 18790+i) } + remoteWriteReceiveMetricHTTP = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 18890+i) } + storeGatewayGRPC = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 20090+i) } storeGatewayHTTP = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 20190+i) } @@ -115,6 +120,40 @@ func scraper(i int, config string, gossip bool) (cmdScheduleFunc, string) { }, gossipAddress } +func receiver(i int, config string) cmdScheduleFunc { + return func(workDir string, clusterPeerFlags []string) ([]*exec.Cmd, error) { + promDir := fmt.Sprintf("%s/data/remote-write-prom%d", workDir, i) + if err := os.MkdirAll(promDir, 0777); err != nil { + return nil, errors.Wrap(err, "create prom dir failed") + } + + if err := ioutil.WriteFile(promDir+"/prometheus.yml", []byte(config), 0666); err != nil { + return nil, errors.Wrap(err, "creating prom config failed") + } + + var cmds []*exec.Cmd + cmds = append(cmds, exec.Command(testutil.PrometheusBinary(), + "--config.file", promDir+"/prometheus.yml", + "--storage.tsdb.path", promDir, + "--log.level", "info", + "--web.listen-address", promRemoteWriteHTTP(i), + )) + args := []string{ + "receive", + "--debug.name", fmt.Sprintf("remote-write-receive-%d", i), + "--grpc-address", remoteWriteReceiveGRPC(i), + "--http-address", remoteWriteReceiveMetricHTTP(i), + "--remote-write.address", remoteWriteReceiveHTTP(i), + "--tsdb.path", promDir, + "--log.level", "debug", + } + + cmds = append(cmds, exec.Command("thanos", args...)) + + return cmds, nil + } +} + func querier(i int, replicaLabel string, staticStores ...string) cmdScheduleFunc { return func(_ string, clusterPeerFlags []string) ([]*exec.Cmd, error) { args := append(defaultQuerierFlags(i, replicaLabel), From bc3aaab8af9130a0992b5929839a8a4ee9a876c8 Mon Sep 17 00:00:00 2001 From: Martin Dickson Date: Tue, 12 Mar 2019 16:58:03 +0000 Subject: [PATCH 02/43] Compact: skip compaction for blocks with no samples (#904) * skip compaction for blocks with no samples * update to actually delete the empty input blocks, and to correctly handle from bucket compactor * warn on error deleting empty block * use ULID instead of error for emptyBlockSentinel * don't use a global variable * full stop at end of comment * use boolean to indicate whether there is more compaction work * rename variables * fix test --- pkg/compact/compact.go | 124 +++++++++++++++++++------------- pkg/compact/compact_e2e_test.go | 8 +-- 2 files changed, 78 insertions(+), 54 deletions(-) diff --git a/pkg/compact/compact.go b/pkg/compact/compact.go index 5eea403d01..ca05adfb4a 100644 --- a/pkg/compact/compact.go +++ b/pkg/compact/compact.go @@ -513,23 +513,23 @@ func (cg *Group) Resolution() int64 { // Compact plans and runs a single compaction against 
the group. The compacted result // is uploaded into the bucket the blocks were retrieved from. -func (cg *Group) Compact(ctx context.Context, dir string, comp tsdb.Compactor) (ulid.ULID, error) { +func (cg *Group) Compact(ctx context.Context, dir string, comp tsdb.Compactor) (bool, ulid.ULID, error) { subDir := filepath.Join(dir, cg.Key()) if err := os.RemoveAll(subDir); err != nil { - return ulid.ULID{}, errors.Wrap(err, "clean compaction group dir") + return false, ulid.ULID{}, errors.Wrap(err, "clean compaction group dir") } if err := os.MkdirAll(subDir, 0777); err != nil { - return ulid.ULID{}, errors.Wrap(err, "create compaction group dir") + return false, ulid.ULID{}, errors.Wrap(err, "create compaction group dir") } - compID, err := cg.compact(ctx, subDir, comp) + shouldRerun, compID, err := cg.compact(ctx, subDir, comp) if err != nil { cg.compactionFailures.Inc() } cg.compactions.Inc() - return compID, err + return shouldRerun, compID, err } // Issue347Error is a type wrapper for errors that should invoke repair process for broken block. @@ -688,13 +688,13 @@ func RepairIssue347(ctx context.Context, logger log.Logger, bkt objstore.Bucket, return nil } -func (cg *Group) compact(ctx context.Context, dir string, comp tsdb.Compactor) (compID ulid.ULID, err error) { +func (cg *Group) compact(ctx context.Context, dir string, comp tsdb.Compactor) (shouldRerun bool, compID ulid.ULID, err error) { cg.mtx.Lock() defer cg.mtx.Unlock() // Check for overlapped blocks. if err := cg.areBlocksOverlapping(nil); err != nil { - return compID, halt(errors.Wrap(err, "pre compaction overlap check")) + return false, ulid.ULID{}, halt(errors.Wrap(err, "pre compaction overlap check")) } // Planning a compaction works purely based on the meta.json files in our future group's dir. @@ -702,21 +702,21 @@ func (cg *Group) compact(ctx context.Context, dir string, comp tsdb.Compactor) ( for _, meta := range cg.blocks { bdir := filepath.Join(dir, meta.ULID.String()) if err := os.MkdirAll(bdir, 0777); err != nil { - return compID, errors.Wrap(err, "create planning block dir") + return false, ulid.ULID{}, errors.Wrap(err, "create planning block dir") } if err := metadata.Write(cg.logger, bdir, meta); err != nil { - return compID, errors.Wrap(err, "write planning meta file") + return false, ulid.ULID{}, errors.Wrap(err, "write planning meta file") } } // Plan against the written meta.json files. plan, err := comp.Plan(dir) if err != nil { - return compID, errors.Wrap(err, "plan compaction") + return false, ulid.ULID{}, errors.Wrap(err, "plan compaction") } if len(plan) == 0 { // Nothing to do. - return compID, nil + return false, ulid.ULID{}, nil } // Due to #183 we verify that none of the blocks in the plan have overlapping sources. @@ -729,45 +729,45 @@ func (cg *Group) compact(ctx context.Context, dir string, comp tsdb.Compactor) ( for _, pdir := range plan { meta, err := metadata.Read(pdir) if err != nil { - return compID, errors.Wrapf(err, "read meta from %s", pdir) + return false, ulid.ULID{}, errors.Wrapf(err, "read meta from %s", pdir) } if cg.Key() != GroupKey(*meta) { - return compID, halt(errors.Wrapf(err, "compact planned compaction for mixed groups. group: %s, planned block's group: %s", cg.Key(), GroupKey(*meta))) + return false, ulid.ULID{}, halt(errors.Wrapf(err, "compact planned compaction for mixed groups. 
group: %s, planned block's group: %s", cg.Key(), GroupKey(*meta))) } for _, s := range meta.Compaction.Sources { if _, ok := uniqueSources[s]; ok { - return compID, halt(errors.Errorf("overlapping sources detected for plan %v", plan)) + return false, ulid.ULID{}, halt(errors.Errorf("overlapping sources detected for plan %v", plan)) } uniqueSources[s] = struct{}{} } id, err := ulid.Parse(filepath.Base(pdir)) if err != nil { - return compID, errors.Wrapf(err, "plan dir %s", pdir) + return false, ulid.ULID{}, errors.Wrapf(err, "plan dir %s", pdir) } if meta.ULID.Compare(id) != 0 { - return compID, errors.Errorf("mismatch between meta %s and dir %s", meta.ULID, id) + return false, ulid.ULID{}, errors.Errorf("mismatch between meta %s and dir %s", meta.ULID, id) } if err := block.Download(ctx, cg.logger, cg.bkt, id, pdir); err != nil { - return compID, retry(errors.Wrapf(err, "download block %s", id)) + return false, ulid.ULID{}, retry(errors.Wrapf(err, "download block %s", id)) } // Ensure all input blocks are valid. stats, err := block.GatherIndexIssueStats(cg.logger, filepath.Join(pdir, block.IndexFilename), meta.MinTime, meta.MaxTime) if err != nil { - return compID, errors.Wrapf(err, "gather index issues for block %s", pdir) + return false, ulid.ULID{}, errors.Wrapf(err, "gather index issues for block %s", pdir) } if err := stats.CriticalErr(); err != nil { - return compID, halt(errors.Wrapf(err, "block with not healthy index found %s; Compaction level %v; Labels: %v", pdir, meta.Compaction.Level, meta.Thanos.Labels)) + return false, ulid.ULID{}, halt(errors.Wrapf(err, "block with not healthy index found %s; Compaction level %v; Labels: %v", pdir, meta.Compaction.Level, meta.Thanos.Labels)) } if err := stats.Issue347OutsideChunksErr(); err != nil { - return compID, issue347Error(errors.Wrapf(err, "invalid, but reparable block %s", pdir), meta.ULID) + return false, ulid.ULID{}, issue347Error(errors.Wrapf(err, "invalid, but reparable block %s", pdir), meta.ULID) } } level.Debug(cg.logger).Log("msg", "downloaded and verified blocks", @@ -777,7 +777,25 @@ func (cg *Group) compact(ctx context.Context, dir string, comp tsdb.Compactor) ( compID, err = comp.Compact(dir, plan, nil) if err != nil { - return compID, halt(errors.Wrapf(err, "compact blocks %v", plan)) + return false, ulid.ULID{}, halt(errors.Wrapf(err, "compact blocks %v", plan)) + } + if compID == (ulid.ULID{}) { + // Prometheus compactor found that the compacted block would have no samples. 
+ level.Info(cg.logger).Log("msg", "compacted block would have no samples, deleting source blocks", "blocks", fmt.Sprintf("%v", plan)) + for _, block := range plan { + meta, err := metadata.Read(block) + if err != nil { + level.Warn(cg.logger).Log("msg", "failed to read meta for block", "block", block) + continue + } + if meta.Stats.NumSamples == 0 { + if err := cg.deleteBlock(block); err != nil { + level.Warn(cg.logger).Log("msg", "failed to delete empty block found during compaction", "block", block) + } + } + } + // Even though this block was empty, there may be more work to do + return true, ulid.ULID{}, nil } level.Debug(cg.logger).Log("msg", "compacted blocks", "blocks", fmt.Sprintf("%v", plan), "duration", time.Since(begin)) @@ -790,27 +808,27 @@ func (cg *Group) compact(ctx context.Context, dir string, comp tsdb.Compactor) ( Source: metadata.CompactorSource, }, nil) if err != nil { - return compID, errors.Wrapf(err, "failed to finalize the block %s", bdir) + return false, ulid.ULID{}, errors.Wrapf(err, "failed to finalize the block %s", bdir) } if err = os.Remove(filepath.Join(bdir, "tombstones")); err != nil { - return compID, errors.Wrap(err, "remove tombstones") + return false, ulid.ULID{}, errors.Wrap(err, "remove tombstones") } // Ensure the output block is valid. if err := block.VerifyIndex(cg.logger, filepath.Join(bdir, block.IndexFilename), newMeta.MinTime, newMeta.MaxTime); err != nil { - return compID, halt(errors.Wrapf(err, "invalid result block %s", bdir)) + return false, ulid.ULID{}, halt(errors.Wrapf(err, "invalid result block %s", bdir)) } // Ensure the output block is not overlapping with anything else. if err := cg.areBlocksOverlapping(newMeta, plan...); err != nil { - return compID, halt(errors.Wrapf(err, "resulted compacted block %s overlaps with something", bdir)) + return false, ulid.ULID{}, halt(errors.Wrapf(err, "resulted compacted block %s overlaps with something", bdir)) } begin = time.Now() if err := block.Upload(ctx, cg.logger, cg.bkt, bdir); err != nil { - return compID, retry(errors.Wrapf(err, "upload of %s failed", compID)) + return false, ulid.ULID{}, retry(errors.Wrapf(err, "upload of %s failed", compID)) } level.Debug(cg.logger).Log("msg", "uploaded block", "result_block", compID, "duration", time.Since(begin)) @@ -818,27 +836,33 @@ func (cg *Group) compact(ctx context.Context, dir string, comp tsdb.Compactor) ( // into the next planning cycle. // Eventually the block we just uploaded should get synced into the group again (including sync-delay). for _, b := range plan { - id, err := ulid.Parse(filepath.Base(b)) - if err != nil { - return compID, errors.Wrapf(err, "plan dir %s", b) + if err := cg.deleteBlock(b); err != nil { + return false, ulid.ULID{}, retry(errors.Wrapf(err, "delete old block from bucket")) } + cg.groupGarbageCollectedBlocks.Inc() + } - if err := os.RemoveAll(b); err != nil { - return compID, errors.Wrapf(err, "remove old block dir %s", id) - } + return true, compID, nil +} - // Spawn a new context so we always delete a block in full on shutdown. 
- delCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) - level.Info(cg.logger).Log("msg", "deleting compacted block", "old_block", id, "result_block", compID) - err = block.Delete(delCtx, cg.bkt, id) - cancel() - if err != nil { - return compID, retry(errors.Wrapf(err, "delete old block %s from bucket ", id)) - } - cg.groupGarbageCollectedBlocks.Inc() +func (cg *Group) deleteBlock(b string) error { + id, err := ulid.Parse(filepath.Base(b)) + if err != nil { + return errors.Wrapf(err, "plan dir %s", b) + } + + if err := os.RemoveAll(b); err != nil { + return errors.Wrapf(err, "remove old block dir %s", id) } - return compID, nil + // Spawn a new context so we always delete a block in full on shutdown. + delCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + level.Info(cg.logger).Log("msg", "deleting compacted block", "old_block", id) + if err := block.Delete(delCtx, cg.bkt, id); err != nil { + return errors.Wrapf(err, "delete block %s from bucket", id) + } + return nil } // BucketCompactor compacts blocks in a bucket. @@ -882,31 +906,31 @@ func (c *BucketCompactor) Compact(ctx context.Context) error { return errors.Wrap(err, "garbage") } + level.Info(c.logger).Log("msg", "start of compaction") + groups, err := c.sy.Groups() if err != nil { return errors.Wrap(err, "build compaction groups") } - done := true + finishedAllGroups := true for _, g := range groups { - id, err := g.Compact(ctx, c.compactDir, c.comp) + shouldRerunGroup, _, err := g.Compact(ctx, c.compactDir, c.comp) if err == nil { - // If the returned ID has a zero value, the group had no blocks to be compacted. - // We keep going through the outer loop until no group has any work left. - if id != (ulid.ULID{}) { - done = false + if shouldRerunGroup { + finishedAllGroups = false } continue } if IsIssue347Error(err) { if err := RepairIssue347(ctx, c.logger, c.bkt, err); err == nil { - done = false + finishedAllGroups = false continue } } return errors.Wrap(err, "compaction") } - if done { + if finishedAllGroups { break } } diff --git a/pkg/compact/compact_e2e_test.go b/pkg/compact/compact_e2e_test.go index a2073ad633..6f2194c41f 100644 --- a/pkg/compact/compact_e2e_test.go +++ b/pkg/compact/compact_e2e_test.go @@ -253,18 +253,18 @@ func TestGroup_Compact_e2e(t *testing.T) { comp, err := tsdb.NewLeveledCompactor(nil, log.NewLogfmtLogger(os.Stderr), []int64{1000, 3000}, nil) testutil.Ok(t, err) - id, err := g.Compact(ctx, dir, comp) + shouldRerun, id, err := g.Compact(ctx, dir, comp) testutil.Ok(t, err) - testutil.Assert(t, id == ulid.ULID{}, "group should be empty, but somehow compaction took place") + testutil.Assert(t, !shouldRerun, "group should be empty, but compactor did a compaction and told us to rerun") // Add all metas that would be gathered by syncMetas. for _, m := range metas { testutil.Ok(t, g.Add(m)) } - id, err = g.Compact(ctx, dir, comp) + shouldRerun, id, err = g.Compact(ctx, dir, comp) testutil.Ok(t, err) - testutil.Assert(t, id != ulid.ULID{}, "no compaction took place") + testutil.Assert(t, shouldRerun, "there should be compactible data, but the compactor reported there was not") resDir := filepath.Join(dir, id.String()) testutil.Ok(t, block.Download(ctx, log.NewNopLogger(), bkt, id, resDir)) From 910d43865ee4ea44a4346284048c21d0d32230f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Tue, 12 Mar 2019 16:58:14 +0000 Subject: [PATCH 03/43] Updated changelog to make sure the important change with index-cache-size is known. 
(#913) Signed-off-by: Bartek Plotka --- .circleci/config.yml | 4 ++-- CHANGELOG.md | 9 ++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3fc9bfc2bf..9fd45ea5d6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -59,7 +59,7 @@ jobs: publish_master: docker: # Available from https://hub.docker.com/r/circleci/golang/ - - image: circleci/golang:1.10 + - image: circleci/golang:1.12 working_directory: /go/src/github.com/improbable-eng/thanos steps: - checkout @@ -76,7 +76,7 @@ jobs: publish_release: docker: # Available from https://hub.docker.com/r/circleci/golang/ - - image: circleci/golang:1.10 + - image: circleci/golang:1.12 working_directory: /go/src/github.com/improbable-eng/thanos steps: - checkout diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f8a52ddd4..0a3f3b5c9f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,11 +11,18 @@ We use *breaking* word for marking changes that are not backward compatible (rel ## Unreleased +### Added +- [#811](https://github.com/improbable-eng/thanos/pull/811) Remote write receiver + ## [v0.3.2](https://github.com/improbable-eng/thanos/releases/tag/v0.3.2) - 2019.03.04 ### Added - [#851](https://github.com/improbable-eng/thanos/pull/851) New read API endpoint for api/v1/rules and api/v1/alerts. -- [#873](https://github.com/improbable-eng/thanos/pull/873) Store: fix set index cache LRU. +- [#873](https://github.com/improbable-eng/thanos/pull/873) Store: fix set index cache LRU + +:warning: **WARING** :warning: #873 fix fixes actual handling of `index-cache-size`. Handling of limit for this cache was +broken so it was unbounded all the time. From this release actual value matters and is extremely low by default. To "revert" +the old behaviour (no boundary), use a large enough value. ### Fixed - [#833](https://github.com/improbable-eng/thanos/issues/833) Store Gateway matcher regression for intersecting with empty posting. From 2b8669265ebcb3b60bbf4dadb887a504a4bfa56e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Wed, 13 Mar 2019 18:01:41 +0000 Subject: [PATCH 04/43] Updated maintainers' page. 
(#914) Signed-off-by: Bartek Plotka --- MAINTAINERS.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS.md b/MAINTAINERS.md index bd603ba280..a931dc3a4e 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -1,9 +1,11 @@ # Core Maintainers of this repository -| Name | Email | GitHub | -|---------------------|-----------------------|------------------------------------------| -| Bartłomiej Płotka | bartek@improbable.io | [@bwplotka](https://github.com/bwplotka) | -| Dominic Green | dom@improbable.io | [@domgreen](https://github.com/domgreen) | +| Name | Email | Slack | GitHub | +|-----------------------|------------------------|--------------------------|------------------------------------------------------------| +| Bartłomiej Płotka | bwplotka@gmail.com | `@bwplotka` | [@bwplotka](https://github.com/bwplotka) | +| Dominic Green | dom@improbable.io | `@domgreen` | [@domgreen](https://github.com/domgreen) | +| Frederic Branczyk | fbranczyk@gmail.com | `@brancz` | [@brancz](https://github.com/brancz) | +| Giedrius Statkevičius | giedriuswork@gmail.com | `@Giedrius Statkevičius` | [@GiedriusS](https://github.com/GiedriusS) | ## Storage plugins maintainers From 7fab732bbbac44f40769641e2facd48aca69abd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Giedrius=20Statkevi=C4=8Dius?= Date: Thu, 14 Mar 2019 15:23:13 +0200 Subject: [PATCH 05/43] objstore: do not set last successful time metric before 1st upload (#921) Convert lastSuccessfullUploadTime into a GaugeVec with one dimension: bucket, which makes client_golang *not* make that metric unless at least one value has appeared. This is better for users since Grafana will show that last upload has been 49+ years ago instead of N/A when no uploads had happened before. Fixes #886. 
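The client_golang behaviour this relies on: a plain Gauge is exported with its zero value as soon as it is registered (a dashboard renders that as the Unix epoch, i.e. "49+ years ago"), whereas a GaugeVec exports no series at all until a child is created via WithLabelValues and set. After this change the metric is therefore absent until the first successful upload, rather than reporting a 1970 timestamp. A minimal sketch of that difference, using hypothetical metric names rather than code from this patch:

    package main

    import (
        "fmt"

        "github.com/prometheus/client_golang/prometheus"
    )

    func main() {
        reg := prometheus.NewRegistry()

        // Plain gauge: exported immediately after registration, with value 0.
        plain := prometheus.NewGauge(prometheus.GaugeOpts{
            Name: "example_last_upload_time_plain",
            Help: "Exported as 0 (the epoch) right away.",
        })
        // Gauge vector: no series exported until WithLabelValues(...).Set(...) is called.
        vec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
            Name: "example_last_upload_time_vec",
            Help: "Absent until the first value is set.",
        }, []string{"bucket"})
        reg.MustRegister(plain, vec)

        mfs, _ := reg.Gather()
        for _, mf := range mfs {
            fmt.Println(mf.GetName(), "series:", len(mf.GetMetric()))
        }
        // Prints only the plain gauge. The vector family appears once something like
        // vec.WithLabelValues("my-bucket").Set(float64(time.Now().Unix())) has run.
    }
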
--- CHANGELOG.md | 3 +++ pkg/objstore/objstore.go | 12 ++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a3f3b5c9f..5dd5fdd7a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,9 @@ We use *breaking* word for marking changes that are not backward compatible (rel ### Added - [#811](https://github.com/improbable-eng/thanos/pull/811) Remote write receiver +### Fixed +- [#921](https://github.com/improbable-eng/thanos/pull/921) `thanos_objstore_bucket_last_successful_upload_time` now does not appear when no blocks have been uploaded so far + ## [v0.3.2](https://github.com/improbable-eng/thanos/releases/tag/v0.3.2) - 2019.03.04 ### Added diff --git a/pkg/objstore/objstore.go b/pkg/objstore/objstore.go index 2790b29c04..4b241e972a 100644 --- a/pkg/objstore/objstore.go +++ b/pkg/objstore/objstore.go @@ -194,10 +194,10 @@ func BucketWithMetrics(name string, b Bucket, r prometheus.Registerer) Bucket { ConstLabels: prometheus.Labels{"bucket": name}, Buckets: []float64{0.005, 0.01, 0.02, 0.04, 0.08, 0.15, 0.3, 0.6, 1, 1.5, 2.5, 5, 10, 20, 30}, }, []string{"operation"}), - lastSuccessfullUploadTime: prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "thanos_objstore_bucket_last_successful_upload_time", - Help: "Second timestamp of the last successful upload to the bucket.", - ConstLabels: prometheus.Labels{"bucket": name}}), + lastSuccessfullUploadTime: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "thanos_objstore_bucket_last_successful_upload_time", + Help: "Second timestamp of the last successful upload to the bucket.", + }, []string{"bucket"}), } if r != nil { r.MustRegister(bkt.ops, bkt.opsFailures, bkt.opsDuration, bkt.lastSuccessfullUploadTime) @@ -211,7 +211,7 @@ type metricBucket struct { ops *prometheus.CounterVec opsFailures *prometheus.CounterVec opsDuration *prometheus.HistogramVec - lastSuccessfullUploadTime prometheus.Gauge + lastSuccessfullUploadTime *prometheus.GaugeVec } func (b *metricBucket) Iter(ctx context.Context, dir string, f func(name string) error) error { @@ -287,7 +287,7 @@ func (b *metricBucket) Upload(ctx context.Context, name string, r io.Reader) err b.opsFailures.WithLabelValues(op).Inc() } else { //TODO: Use SetToCurrentTime() once we update the Prometheus client_golang - b.lastSuccessfullUploadTime.Set(float64(time.Now().UnixNano()) / 1e9) + b.lastSuccessfullUploadTime.WithLabelValues(b.bkt.Name()).Set(float64(time.Now().UnixNano()) / 1e9) } b.ops.WithLabelValues(op).Inc() b.opsDuration.WithLabelValues(op).Observe(time.Since(start).Seconds()) From 51ff2678532981b3e124016f0d9a071b04f0ea90 Mon Sep 17 00:00:00 2001 From: Povilas Versockas Date: Fri, 15 Mar 2019 12:48:44 +0200 Subject: [PATCH 06/43] Simplify Thanos Query Get Stores (#926) --- cmd/thanos/query.go | 4 +--- pkg/store/proxy.go | 19 ++++--------------- pkg/store/proxy_test.go | 27 +++++---------------------- 3 files changed, 10 insertions(+), 40 deletions(-) diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index a69ac0df1a..df86dbad0c 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -304,9 +304,7 @@ func runQuery( }, dialOpts, ) - proxy = store.NewProxyStore(logger, func(context.Context) ([]store.Client, error) { - return stores.Get(), nil - }, component.Query, selectorLset) + proxy = store.NewProxyStore(logger, stores.Get, component.Query, selectorLset) queryableCreator = query.NewQueryableCreator(logger, proxy, replicaLabel) engine = promql.NewEngine( promql.EngineOpts{ diff --git a/pkg/store/proxy.go 
b/pkg/store/proxy.go index e9f3cfdc74..556bf77da6 100644 --- a/pkg/store/proxy.go +++ b/pkg/store/proxy.go @@ -37,7 +37,7 @@ type Client interface { // ProxyStore implements the store API that proxies request to all given underlying stores. type ProxyStore struct { logger log.Logger - stores func(context.Context) ([]Client, error) + stores func() []Client component component.StoreAPI selectorLabels labels.Labels } @@ -46,7 +46,7 @@ type ProxyStore struct { // Note that there is no deduplication support. Deduplication should be done on the highest level (just before PromQL) func NewProxyStore( logger log.Logger, - stores func(context.Context) ([]Client, error), + stores func() []Client, component component.StoreAPI, selectorLabels labels.Labels, ) *ProxyStore { @@ -109,13 +109,6 @@ func (s *ProxyStore) Series(r *storepb.SeriesRequest, srv storepb.Store_SeriesSe return nil } - stores, err := s.stores(srv.Context()) - if err != nil { - err = errors.Wrap(err, "failed to get store APIs") - level.Error(s.logger).Log("err", err) - return status.Errorf(codes.Unknown, err.Error()) - } - var ( g, gctx = errgroup.WithContext(srv.Context()) @@ -144,7 +137,7 @@ func (s *ProxyStore) Series(r *storepb.SeriesRequest, srv storepb.Store_SeriesSe closeFn() }() - for _, st := range stores { + for _, st := range s.stores() { // We might be able to skip the store if its meta information indicates // it cannot have series matching our query. // NOTE: all matchers are validated in labelsMatches method so we explicitly ignore error. @@ -337,11 +330,7 @@ func (s *ProxyStore) LabelValues(ctx context.Context, r *storepb.LabelValuesRequ g, gctx = errgroup.WithContext(ctx) ) - stores, err := s.stores(ctx) - if err != nil { - return nil, status.Errorf(codes.Unknown, err.Error()) - } - for _, st := range stores { + for _, st := range s.stores() { store := st g.Go(func() error { resp, err := store.LabelValues(gctx, &storepb.LabelValuesRequest{ diff --git a/pkg/store/proxy_test.go b/pkg/store/proxy_test.go index db1819cebb..fa3e7e110a 100644 --- a/pkg/store/proxy_test.go +++ b/pkg/store/proxy_test.go @@ -42,23 +42,6 @@ func (c *testClient) String() string { return "test" } -func TestProxyStore_Series_StoresFetchFail(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - - q := NewProxyStore(nil, - func(_ context.Context) ([]Client, error) { return nil, errors.New("Fail") }, - component.Query, - nil, - ) - - s := newStoreSeriesServer(context.Background()) - testutil.NotOk(t, q.Series(&storepb.SeriesRequest{ - MinTime: 1, - MaxTime: 300, - Matchers: []storepb.LabelMatcher{{Name: "a", Value: "a", Type: storepb.LabelMatcher_EQ}}, - }, s)) -} - func TestProxyStore_Info(t *testing.T) { defer leaktest.CheckTimeout(t, 10*time.Second)() @@ -66,7 +49,7 @@ func TestProxyStore_Info(t *testing.T) { defer cancel() q := NewProxyStore(nil, - func(context.Context) ([]Client, error) { return nil, nil }, + func() []Client { return nil }, component.Query, nil, ) @@ -419,7 +402,7 @@ func TestProxyStore_Series(t *testing.T) { } { if ok := t.Run(tc.title, func(t *testing.T) { q := NewProxyStore(nil, - func(_ context.Context) ([]Client, error) { return tc.storeAPIs, nil }, // what if err? 
+ func() []Client { return tc.storeAPIs }, component.Query, tc.selectorLabels, ) @@ -460,7 +443,7 @@ func TestProxyStore_Series_RequestParamsProxied(t *testing.T) { }, } q := NewProxyStore(nil, - func(context.Context) ([]Client, error) { return cls, nil }, + func() []Client { return cls }, component.Query, nil, ) @@ -518,7 +501,7 @@ func TestProxyStore_Series_RegressionFillResponseChannel(t *testing.T) { } q := NewProxyStore(nil, - func(context.Context) ([]Client, error) { return cls, nil }, + func() []Client { return cls }, component.Query, tlabels.FromStrings("fed", "a"), ) @@ -555,7 +538,7 @@ func TestProxyStore_LabelValues(t *testing.T) { }}, } q := NewProxyStore(nil, - func(context.Context) ([]Client, error) { return cls, nil }, + func() []Client { return cls }, component.Query, nil, ) From e56881354bc717f17a0bca5bb5b75d33c725fe80 Mon Sep 17 00:00:00 2001 From: Natalie Fioretti Date: Fri, 15 Mar 2019 12:23:08 +0100 Subject: [PATCH 07/43] fixed typo (#927) fixed a typo that obviously explained something wrong --- docs/getting_started.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting_started.md b/docs/getting_started.md index 57423f56bc..9c1f933e6a 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -235,7 +235,7 @@ thanos store \ --grpc-address 0.0.0.0:19090 # GRPC endpoint for StoreAPI ``` -The store gateway occupies small amounts of disk space for caching basic information about data in the object storage. This will rarely exceed more than a few gigabytes and is used to improve restart times. It is not useful but not required to preserve it across restarts. +The store gateway occupies small amounts of disk space for caching basic information about data in the object storage. This will rarely exceed more than a few gigabytes and is used to improve restart times. It is useful but not required to preserve it across restarts. * _[Example Kubernetes manifest](../tutorials/kubernetes-demo/manifests/thanos-store-gateway.yaml)_ From 62e0a98d5ffc050e4be1cd13e82df1a7bd919c95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Fri, 15 Mar 2019 13:54:00 +0000 Subject: [PATCH 08/43] Update design with detail about meta.json (#929) --- docs/design.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/design.md b/docs/design.md index 7ee1ca0a64..1b85081cd2 100644 --- a/docs/design.md +++ b/docs/design.md @@ -43,7 +43,8 @@ A blocks top-level directory is a ULID (like UUID but lexicographically sortable Those block files can be backed up to an object storage and later be queried by another component (see below). -All data is uploaded as it is created by the Prometheus server/storage engine. The `meta.json` file may be extended by a `thanos` section, to which Thanos-specific metadata can be added. Currently this is limited to the "external labels" the producer of the block has assigned. This later helps in filtering blocks for querying without accessing their data files. +All data is uploaded as it is created by the Prometheus server/storage engine. The `meta.json` file may be extended by a `thanos` section, to which Thanos-specific metadata can be added. Currently this it includes the "external labels" the producer of the block has assigned. This later helps in filtering blocks for querying without accessing their data files. +The meta.json is updated during upload time on sidecars. 
``` From 166d54e69aa43038c512e7c0c6e61cb00adea2d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Tue, 19 Mar 2019 08:25:28 +0000 Subject: [PATCH 09/43] Added safeguard against 847. Make sure debug stats are always printed, even on error. (#889) Signed-off-by: Bartek Plotka --- pkg/store/bucket.go | 56 ++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/pkg/store/bucket.go b/pkg/store/bucket.go index 775210e3a1..6c38ee81a7 100644 --- a/pkg/store/bucket.go +++ b/pkg/store/bucket.go @@ -664,7 +664,7 @@ func debugFoundBlockSetOverview(logger log.Logger, mint, maxt int64, lset labels // TODO(bwplotka): It buffers all chunks in memory and only then streams to client. // 1. Either count chunk sizes and error out too big query. // 2. Stream posting -> series -> chunk all together. -func (s *BucketStore) Series(req *storepb.SeriesRequest, srv storepb.Store_SeriesServer) error { +func (s *BucketStore) Series(req *storepb.SeriesRequest, srv storepb.Store_SeriesServer) (err error) { matchers, err := translateMatchers(req.Matchers) if err != nil { return status.Error(codes.InvalidArgument, err.Error()) @@ -731,6 +731,25 @@ func (s *BucketStore) Series(req *storepb.SeriesRequest, srv storepb.Store_Serie s.mtx.RUnlock() + defer func() { + s.metrics.seriesDataTouched.WithLabelValues("postings").Observe(float64(stats.postingsTouched)) + s.metrics.seriesDataFetched.WithLabelValues("postings").Observe(float64(stats.postingsFetched)) + s.metrics.seriesDataSizeTouched.WithLabelValues("postings").Observe(float64(stats.postingsTouchedSizeSum)) + s.metrics.seriesDataSizeFetched.WithLabelValues("postings").Observe(float64(stats.postingsFetchedSizeSum)) + s.metrics.seriesDataTouched.WithLabelValues("series").Observe(float64(stats.seriesTouched)) + s.metrics.seriesDataFetched.WithLabelValues("series").Observe(float64(stats.seriesFetched)) + s.metrics.seriesDataSizeTouched.WithLabelValues("series").Observe(float64(stats.seriesTouchedSizeSum)) + s.metrics.seriesDataSizeFetched.WithLabelValues("series").Observe(float64(stats.seriesFetchedSizeSum)) + s.metrics.seriesDataTouched.WithLabelValues("chunks").Observe(float64(stats.chunksTouched)) + s.metrics.seriesDataFetched.WithLabelValues("chunks").Observe(float64(stats.chunksFetched)) + s.metrics.seriesDataSizeTouched.WithLabelValues("chunks").Observe(float64(stats.chunksTouchedSizeSum)) + s.metrics.seriesDataSizeFetched.WithLabelValues("chunks").Observe(float64(stats.chunksFetchedSizeSum)) + s.metrics.resultSeriesCount.Observe(float64(stats.mergedSeriesCount)) + + level.Debug(s.logger).Log("msg", "stats query processed", + "stats", fmt.Sprintf("%+v", stats), "err", err) + }() + // Concurrently get data from all blocks. 
{ span, _ := tracing.StartSpan(srv.Context(), "bucket_store_preload_all") @@ -775,24 +794,6 @@ func (s *BucketStore) Series(req *storepb.SeriesRequest, srv storepb.Store_Serie stats.mergeDuration = time.Since(begin) s.metrics.seriesMergeDuration.Observe(stats.mergeDuration.Seconds()) } - - s.metrics.seriesDataTouched.WithLabelValues("postings").Observe(float64(stats.postingsTouched)) - s.metrics.seriesDataFetched.WithLabelValues("postings").Observe(float64(stats.postingsFetched)) - s.metrics.seriesDataSizeTouched.WithLabelValues("postings").Observe(float64(stats.postingsTouchedSizeSum)) - s.metrics.seriesDataSizeFetched.WithLabelValues("postings").Observe(float64(stats.postingsFetchedSizeSum)) - s.metrics.seriesDataTouched.WithLabelValues("series").Observe(float64(stats.seriesTouched)) - s.metrics.seriesDataFetched.WithLabelValues("series").Observe(float64(stats.seriesFetched)) - s.metrics.seriesDataSizeTouched.WithLabelValues("series").Observe(float64(stats.seriesTouchedSizeSum)) - s.metrics.seriesDataSizeFetched.WithLabelValues("series").Observe(float64(stats.seriesFetchedSizeSum)) - s.metrics.seriesDataTouched.WithLabelValues("chunks").Observe(float64(stats.chunksTouched)) - s.metrics.seriesDataFetched.WithLabelValues("chunks").Observe(float64(stats.chunksFetched)) - s.metrics.seriesDataSizeTouched.WithLabelValues("chunks").Observe(float64(stats.chunksTouchedSizeSum)) - s.metrics.seriesDataSizeFetched.WithLabelValues("chunks").Observe(float64(stats.chunksFetchedSizeSum)) - s.metrics.resultSeriesCount.Observe(float64(stats.mergedSeriesCount)) - - level.Debug(s.logger).Log("msg", "series query processed", - "stats", fmt.Sprintf("%+v", stats)) - return nil } @@ -1239,6 +1240,13 @@ func (p *postingGroup) Postings() index.Postings { return index.EmptyPostings() } + for i, posting := range p.postings { + if posting == nil { + // This should not happen. Debug for https://github.com/improbable-eng/thanos/issues/874. + return index.ErrPostings(errors.Errorf("at least one of %d postings is nil for %s. It was never fetched.", i, p.keys[i])) + } + } + return p.aggregate(p.postings) } @@ -1327,6 +1335,7 @@ func (r *bucketIndexReader) fetchPostings(groups []*postingGroup) error { continue } + r.stats.postingsToFetch++ ptrs = append(ptrs, postingPtr{ptr: ptr, groupID: i, keyID: j}) } } @@ -1342,13 +1351,13 @@ func (r *bucketIndexReader) fetchPostings(groups []*postingGroup) error { }) var g run.Group - for _, p := range parts { + for _, part := range parts { ctx, cancel := context.WithCancel(r.ctx) - i, j := p.elemRng[0], p.elemRng[1] + i, j := part.elemRng[0], part.elemRng[1] - start := int64(p.start) + start := int64(part.start) // We assume index does not have any ptrs that has 0 length. - length := int64(p.end) - start + length := int64(part.end) - start // Fetch from object storage concurrently and update stats and posting list. g.Add(func() error { @@ -1697,6 +1706,7 @@ type queryStats struct { postingsTouched int postingsTouchedSizeSum int + postingsToFetch int postingsFetched int postingsFetchedSizeSum int postingsFetchCount int From cd2061d479c6395c119bc385cd16e0e9ad884ce5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Tue, 19 Mar 2019 11:52:32 +0000 Subject: [PATCH 10/43] sidecar: Handle intermediate restarts of sidecar gracefully. 
(#941) Signed-off-by: Bartek Plotka --- pkg/block/block.go | 1 + pkg/shipper/shipper.go | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pkg/block/block.go b/pkg/block/block.go index cdc52a3e08..008e1798d4 100644 --- a/pkg/block/block.go +++ b/pkg/block/block.go @@ -118,6 +118,7 @@ func Delete(ctx context.Context, bucket objstore.Bucket, id ulid.ULID) error { } // DownloadMeta downloads only meta file from bucket by block ID. +// TODO(bwplotka): Differentiate between network error & partial upload. func DownloadMeta(ctx context.Context, logger log.Logger, bkt objstore.Bucket, id ulid.ULID) (metadata.Meta, error) { rc, err := bkt.Get(ctx, path.Join(id.String(), MetaFilename)) if err != nil { diff --git a/pkg/shipper/shipper.go b/pkg/shipper/shipper.go index a5208d84f1..c6f526f593 100644 --- a/pkg/shipper/shipper.go +++ b/pkg/shipper/shipper.go @@ -223,7 +223,6 @@ func newLazyOverlapChecker(logger log.Logger, bucket objstore.Bucket, labels fun } func (c *lazyOverlapChecker) sync(ctx context.Context) error { - level.Info(c.logger).Log("msg", "gathering all existing blocks from the remote bucket") if err := c.bucket.Iter(ctx, "", func(path string) error { id, ok := block.IsBlockDir(path) if !ok { @@ -253,6 +252,7 @@ func (c *lazyOverlapChecker) sync(ctx context.Context) error { func (c *lazyOverlapChecker) IsOverlapping(ctx context.Context, newMeta tsdb.BlockMeta) error { if !c.synced { + level.Info(c.logger).Log("msg", "gathering all existing blocks from the remote bucket for check", "id", newMeta.ULID.String()) if err := c.sync(ctx); err != nil { return err } @@ -280,8 +280,8 @@ func (s *Shipper) Sync(ctx context.Context) (uploaded int, err error) { meta, err := ReadMetaFile(s.dir) if err != nil { // If we encounter any error, proceed with an empty meta file and overwrite it later. - // The meta file is only used to deduplicate uploads, which are properly handled - // by the system if their occur anyway. + // The meta file is only used to avoid unnecessary bucket.Exists call, + // which are properly handled by the system if their occur anyway. if !os.IsNotExist(err) { level.Warn(s.logger).Log("msg", "reading meta file failed, will override it", "err", err) } @@ -316,6 +316,15 @@ func (s *Shipper) Sync(ctx context.Context) (uploaded int, err error) { return nil } + // Check against bucket if the meta file for this block exists. + ok, err := s.bucket.Exists(ctx, path.Join(m.ULID.String(), block.MetaFilename)) + if err != nil { + return errors.Wrap(err, "check exists") + } + if ok { + return nil + } + // We only ship of the first compacted block level as normal flow. if m.Compaction.Level > 1 { if !s.uploadCompacted { @@ -329,15 +338,6 @@ func (s *Shipper) Sync(ctx context.Context) (uploaded int, err error) { } } - // Check against bucket if the meta file for this block exists. - ok, err := s.bucket.Exists(ctx, path.Join(m.ULID.String(), block.MetaFilename)) - if err != nil { - return errors.Wrap(err, "check exists") - } - if ok { - return nil - } - if err := s.upload(ctx, m); err != nil { level.Error(s.logger).Log("msg", "shipping failed", "block", m.ULID, "err", err) // No error returned, just log line. 
This is because we want other blocks to be uploaded even From 4b08be3cfcb82cca062c601024f44b1a0e808b0a Mon Sep 17 00:00:00 2001 From: Vladimir Glafirov Date: Tue, 19 Mar 2019 12:58:32 +0100 Subject: [PATCH 11/43] Azure maturity level (#936) * Migrate to new azblob version * Added error handling * Changed error message * Set Azure maturity to Stable --- .dep-finished | 0 docs/storage.md | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 .dep-finished diff --git a/.dep-finished b/.dep-finished new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/storage.md b/docs/storage.md index 1962ba3472..8395d44568 100644 --- a/docs/storage.md +++ b/docs/storage.md @@ -12,7 +12,7 @@ Current object storage client implementations: |----------------------|-------------------|-----------|---------------| | Google Cloud Storage | Stable (production usage) | yes | @bwplotka | | AWS S3 | Stable (production usage) | yes | @bwplotka | -| Azure Storage Account | Alpha | yes | @vglafirov | +| Azure Storage Account | Stable (production usage) | yes | @vglafirov | | OpenStack Swift | Beta (working PoCs, testing usage) | no | @sudhi-vm | | Tencent COS | Beta (testing usage) | no | @jojohappy | From 7f8b28474c210c8fceb97f371e3b5856566b5ac5 Mon Sep 17 00:00:00 2001 From: Xiang Dai <764524258@qq.com> Date: Tue, 19 Mar 2019 19:58:53 +0800 Subject: [PATCH 12/43] fix wrong link (#940) Signed-off-by: Xiang Dai <764524258@qq.com> --- docs/getting_started.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting_started.md b/docs/getting_started.md index 9c1f933e6a..6a899ac57b 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -30,7 +30,7 @@ The `thanos` binary should now be in your `$PATH` and is the only thing required Thanos bases on vanilla Prometheus (v2.2.1+). -For exact Prometheus version list Thanos was tested against you can find [here](../Makefile#L36) +For exact Prometheus version list Thanos was tested against you can find [here](../Makefile#L42) ## [Sidecar](components/sidecar.md) From 12546c2d7fa186fa85671c9e509aebf69976939d Mon Sep 17 00:00:00 2001 From: Tollef Fog Heen Date: Wed, 20 Mar 2019 11:48:01 +0100 Subject: [PATCH 13/43] Use $labelvalue and $labelselector rather than hard coding app and thanos-query (#951) --- examples/grafana/thanos-query.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/grafana/thanos-query.json b/examples/grafana/thanos-query.json index 816c4cc190..9f9f91d4ac 100644 --- a/examples/grafana/thanos-query.json +++ b/examples/grafana/thanos-query.json @@ -956,7 +956,7 @@ "multi": false, "name": "pod", "options": [], - "query": "label_values(thanos_build_info{app=\"thanos-query\"}, kubernetes_pod_name)", + "query": "label_values(thanos_build_info{$labelselector=\"$labelvalue\"}, kubernetes_pod_name)", "refresh": 1, "regex": "", "sort": 0, From 44272a6f07048cb694aeb2fbbf4f76a7db93a9c1 Mon Sep 17 00:00:00 2001 From: Joseph Date: Wed, 20 Mar 2019 18:53:24 +0800 Subject: [PATCH 14/43] query: support POST method for query/query_range endpoints (#939) Signed-off-by: Joseph Lee --- pkg/query/api/v1.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/query/api/v1.go b/pkg/query/api/v1.go index 61c1603b57..b05a109207 100644 --- a/pkg/query/api/v1.go +++ b/pkg/query/api/v1.go @@ -168,7 +168,10 @@ func (api *API) Register(r *route.Router, tracer opentracing.Tracer, logger log. 
r.Options("/*path", instr("options", api.options)) r.Get("/query", instr("query", api.query)) + r.Post("/query", instr("query", api.query)) + r.Get("/query_range", instr("query_range", api.queryRange)) + r.Post("/query_range", instr("query_range", api.queryRange)) r.Get("/label/:name/values", instr("label_values", api.labelValues)) From 9f4fa5b7a0b13a1107893f44bd53b27e21eb394f Mon Sep 17 00:00:00 2001 From: Michael Dai Date: Wed, 20 Mar 2019 19:27:09 +0800 Subject: [PATCH 15/43] Fixed lag between file-sd and dns-sd (#933) Signed-off-by: jojohappy --- cmd/thanos/query.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index df86dbad0c..a32c7c6c7a 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -355,6 +355,7 @@ func runQuery( } fileSDCache.Update(update) stores.Update(ctxUpdate) + dnsProvider.Resolve(ctxUpdate, append(fileSDCache.Addresses(), storeAddrs...)) case <-ctxUpdate.Done(): return nil } From 8e0e4dc59b43082f7dd8815946e9e48b60527823 Mon Sep 17 00:00:00 2001 From: Tollef Fog Heen Date: Wed, 20 Mar 2019 13:53:33 +0100 Subject: [PATCH 16/43] Change y-axis unit to rps for rps graph (#950) --- examples/grafana/thanos-query.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/grafana/thanos-query.json b/examples/grafana/thanos-query.json index 9f9f91d4ac..85601c8243 100644 --- a/examples/grafana/thanos-query.json +++ b/examples/grafana/thanos-query.json @@ -155,7 +155,7 @@ }, "yaxes": [ { - "format": "s", + "format": "reqps", "label": null, "logBase": 1, "max": null, From 51e3672ed70140cb0f804776e047877c4b58c315 Mon Sep 17 00:00:00 2001 From: Xiang Dai <764524258@qq.com> Date: Thu, 21 Mar 2019 19:26:25 +0800 Subject: [PATCH 17/43] Remove some white noise (#947) * Remove some white noise Signed-off-by: Xiang Dai <764524258@qq.com> * remove white noise in genflagdocs Signed-off-by: Xiang Dai <764524258@qq.com> --- CHANGELOG.md | 12 +-- CONTRIBUTING.md | 2 +- MAINTAINERS.md | 2 +- README.md | 12 +-- benchmark/README.md | 4 +- docs/components/bucket.md | 42 +++++----- docs/components/compact.md | 18 ++-- docs/components/query.md | 40 ++++----- docs/components/rule.md | 46 +++++----- docs/components/sidecar.md | 40 ++++----- docs/components/store.md | 34 ++++---- docs/design.md | 2 +- docs/getting_started.md | 20 ++--- .../approved/201809_gossip-removal.md | 20 ++--- .../201901-read-write-operations-bucket.md | 84 +++++++++---------- ...201807_store_instance_high_availability.md | 2 +- docs/proposals/rejected/config.md | 14 ++-- docs/release_process.md | 30 +++---- docs/service_discovery.md | 4 +- docs/storage.md | 6 +- scripts/genflagdocs.sh | 3 + 21 files changed, 220 insertions(+), 217 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5dd5fdd7a4..eae8477f48 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,7 +23,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel - [#851](https://github.com/improbable-eng/thanos/pull/851) New read API endpoint for api/v1/rules and api/v1/alerts. - [#873](https://github.com/improbable-eng/thanos/pull/873) Store: fix set index cache LRU -:warning: **WARING** :warning: #873 fix fixes actual handling of `index-cache-size`. Handling of limit for this cache was +:warning: **WARING** :warning: #873 fix fixes actual handling of `index-cache-size`. Handling of limit for this cache was broken so it was unbounded all the time. From this release actual value matters and is extremely low by default. 
To "revert" the old behaviour (no boundary), use a large enough value. @@ -55,14 +55,14 @@ the old behaviour (no boundary), use a large enough value. - [#649](https://github.com/improbable-eng/thanos/issues/649) - Fixed store label values api to add also external label values. - [#396](https://github.com/improbable-eng/thanos/issues/396) - Fixed sidecar logic for proxying series that has more than 2^16 samples from Prometheus. - [#732](https://github.com/improbable-eng/thanos/pull/732) - Fixed S3 authentication sequence. You can see new sequence enumerated [here](https://github.com/improbable-eng/thanos/blob/master/docs/storage.md#aws-s3-configuration) -- [#745](https://github.com/improbable-eng/thanos/pull/745) - Fixed race conditions and edge cases for Thanos Querier fanout logic. +- [#745](https://github.com/improbable-eng/thanos/pull/745) - Fixed race conditions and edge cases for Thanos Querier fanout logic. - [#651](https://github.com/improbable-eng/thanos/issues/651) - Fixed index cache when asked buffer size is bigger than cache max size. ### Changed - [#529](https://github.com/improbable-eng/thanos/pull/529) Massive improvement for compactor. Downsampling memory consumption was reduce to only store labels and single chunks per each series. - Qurerier UI: Store page now shows the store APIs per component type. -- Prometheus and TSDB deps are now up to date with ~2.7.0 Prometheus version. Lot's of things has changed. See details [here #704](https://github.com/improbable-eng/thanos/pull/704) Known changes that affects us: +- Prometheus and TSDB deps are now up to date with ~2.7.0 Prometheus version. Lot's of things has changed. See details [here #704](https://github.com/improbable-eng/thanos/pull/704) Known changes that affects us: - prometheus/prometheus/discovery/file - [ENHANCEMENT] Discovery: Improve performance of previously slow updates of changes of targets. #4526 - [BUGFIX] Wait for service discovery to stop before exiting #4508 ?? @@ -83,11 +83,11 @@ the old behaviour (no boundary), use a large enough value. - S3 provider: - Added `put_user_metadata` option to config. - Added `insecure_skip_verify` option to config. - + ### Deprecated - + - Tests against Prometheus below v2.2.1. This does not mean *lack* of support for those. Only that we don't tests the compatibility anymore. See [#758](https://github.com/improbable-eng/thanos/issues/758) for details. - + ## [v0.2.1](https://github.com/improbable-eng/thanos/releases/tag/v0.2.1) - 2018.12.27 ### Added diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9f11ecfb4c..6efb35b70f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -15,7 +15,7 @@ The philosophy of Thanos and our community is borrowing much from UNIX philosoph * Write components that work together * e.g. blocks should be stored in native prometheus format * Make it easy to read, write, and, run components - * e.g. reduce complexity in system design and implementation + * e.g. 
reduce complexity in system design and implementation ## Adding New Features / Components diff --git a/MAINTAINERS.md b/MAINTAINERS.md index a931dc3a4e..e674254d2d 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -4,7 +4,7 @@ |-----------------------|------------------------|--------------------------|------------------------------------------------------------| | Bartłomiej Płotka | bwplotka@gmail.com | `@bwplotka` | [@bwplotka](https://github.com/bwplotka) | | Dominic Green | dom@improbable.io | `@domgreen` | [@domgreen](https://github.com/domgreen) | -| Frederic Branczyk | fbranczyk@gmail.com | `@brancz` | [@brancz](https://github.com/brancz) | +| Frederic Branczyk | fbranczyk@gmail.com | `@brancz` | [@brancz](https://github.com/brancz) | | Giedrius Statkevičius | giedriuswork@gmail.com | `@Giedrius Statkevičius` | [@GiedriusS](https://github.com/GiedriusS) | ## Storage plugins maintainers diff --git a/README.md b/README.md index 209d83b83e..a734ba329c 100644 --- a/README.md +++ b/README.md @@ -8,13 +8,13 @@ ## Overview -Thanos is a set of components that can be composed into a highly available metric -system with unlimited storage capacity, which can be added seamlessly on top of existing -Prometheus deployments. +Thanos is a set of components that can be composed into a highly available metric +system with unlimited storage capacity, which can be added seamlessly on top of existing +Prometheus deployments. -Thanos leverages the Prometheus 2.0 storage format to cost-efficiently store historical metric +Thanos leverages the Prometheus 2.0 storage format to cost-efficiently store historical metric data in any object storage while retaining fast query latencies. Additionally, it provides -a global query view across all Prometheus installations and can merge data from Prometheus +a global query view across all Prometheus installations and can merge data from Prometheus HA pairs on the fly. Concretely the aims of the project are: @@ -65,7 +65,7 @@ Contributions are very welcome! See our [CONTRIBUTING.md](CONTRIBUTING.md) for m ## Community -Thanos is an open source project and we welcome new contributers and members +Thanos is an open source project and we welcome new contributers and members of the community. Here are ways to get in touch with the community: * Slack: [#thanos](https://join.slack.com/t/improbable-eng/shared_invite/enQtMzQ1ODcyMzQ5MjM4LWY5ZWZmNGM2ODc5MmViNmQ3ZTA3ZTY3NzQwOTBlMTkzZmIxZTIxODk0OWU3YjZhNWVlNDU3MDlkZGViZjhkMjc) diff --git a/benchmark/README.md b/benchmark/README.md index 2453cd4b1a..2a410d84eb 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -38,14 +38,14 @@ We started a prometheus instance with thanos sidecar, scraping 50 targets every * 0.022 average query time (min: 0.018s, max: 0.073s) * *0.812 queries per second* -This shows an added latency of *85-150%* when using thanos query. This is more or less expected, as network operations will have to be done twice. We took a profile of thanos query while under load, finding that about a third of the time is being spent evaluating the promql queries. We are looking into including newer versions of the prometheus libraries into thanos that include optimisations to this component. +This shows an added latency of *85-150%* when using thanos query. This is more or less expected, as network operations will have to be done twice. We took a profile of thanos query while under load, finding that about a third of the time is being spent evaluating the promql queries. 
We are looking into including newer versions of the prometheus libraries into thanos that include optimisations to this component. Although we have not tested federated prometheus in the same controlled environment, in theory it should incur a similar overhead, as we will still be performing two network hops. ### Store performance To test the store component, we generated 1 year of simulated metrics (100 timeseries taking random values every 15s, a total of 210 million samples). We were able to run heavy queries to touch all 210 million of these samples, e.g. a sum over 100 timeseries takes about 34.6 seconds. Smaller queries, for example fetching 1 year of samples from a single timeseries, were able to run in about 500 milliseconds. -When enabling downsampling over these timeseries, we were able to reduce query times by over 90%. +When enabling downsampling over these timeseries, we were able to reduce query times by over 90%. ### Ingestion To try to find the limits of a single thanos-query service, we spun up a number prometheus instances, each scraping 10 metric-producing endpoints every second. We attached a thanos-query endpoint in front of these scrapers, and ran queries that would touch fetch a most recent metric from each of them. Each metric producing endpoint would serve 100 metrics, taking random values, and the query would fetch the most recent value from each of these metrics: diff --git a/docs/components/bucket.md b/docs/components/bucket.md index 8d20ec378e..e887a036ec 100644 --- a/docs/components/bucket.md +++ b/docs/components/bucket.md @@ -36,18 +36,18 @@ Flags: --version Show application version. --log.level=info Log filtering level. --log.format=logfmt Log format to use. - --gcloudtrace.project=GCLOUDTRACE.PROJECT + --gcloudtrace.project=GCLOUDTRACE.PROJECT GCP project to send Google Cloud Trace tracings to. If empty, tracing will be disabled. - --gcloudtrace.sample-factor=1 + --gcloudtrace.sample-factor=1 How often we send traces (1/). If 0 no trace will be sent periodically, unless forced by baggage item. See `pkg/tracing/tracing.go` for details. - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag. Object store configuration in YAML. @@ -86,35 +86,35 @@ Flags: --version Show application version. --log.level=info Log filtering level. --log.format=logfmt Log format to use. - --gcloudtrace.project=GCLOUDTRACE.PROJECT + --gcloudtrace.project=GCLOUDTRACE.PROJECT GCP project to send Google Cloud Trace tracings to. If empty, tracing will be disabled. - --gcloudtrace.sample-factor=1 + --gcloudtrace.sample-factor=1 How often we send traces (1/). If 0 no trace will be sent periodically, unless forced by baggage item. See `pkg/tracing/tracing.go` for details. - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag. Object store configuration in YAML. - --objstore-backup.config-file= + --objstore-backup.config-file= Path to YAML file that contains object store-backup configuration. Used for repair logic to backup blocks before removal. - --objstore-backup.config= + --objstore-backup.config= Alternative to 'objstore-backup.config-file' flag. Object store-backup configuration in YAML. Used for repair logic to backup blocks before removal. 
-r, --repair Attempt to repair blocks for which issues were detected - -i, --issues=index_issue... ... + -i, --issues=index_issue... ... Issues to verify (and optionally repair). Possible values: [duplicated_compaction index_issue overlapped_blocks] - --id-whitelist=ID-WHITELIST ... + --id-whitelist=ID-WHITELIST ... Block IDs to verify (and optionally repair) only. If none is specified, all blocks will be verified. Repeated field @@ -143,18 +143,18 @@ Flags: --version Show application version. --log.level=info Log filtering level. --log.format=logfmt Log format to use. - --gcloudtrace.project=GCLOUDTRACE.PROJECT + --gcloudtrace.project=GCLOUDTRACE.PROJECT GCP project to send Google Cloud Trace tracings to. If empty, tracing will be disabled. - --gcloudtrace.sample-factor=1 + --gcloudtrace.sample-factor=1 How often we send traces (1/). If 0 no trace will be sent periodically, unless forced by baggage item. See `pkg/tracing/tracing.go` for details. - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag. Object store configuration in YAML. -o, --output="" Optional format in which to print each block's @@ -184,21 +184,21 @@ Flags: --version Show application version. --log.level=info Log filtering level. --log.format=logfmt Log format to use. - --gcloudtrace.project=GCLOUDTRACE.PROJECT + --gcloudtrace.project=GCLOUDTRACE.PROJECT GCP project to send Google Cloud Trace tracings to. If empty, tracing will be disabled. - --gcloudtrace.sample-factor=1 + --gcloudtrace.sample-factor=1 How often we send traces (1/). If 0 no trace will be sent periodically, unless forced by baggage item. See `pkg/tracing/tracing.go` for details. - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag. Object store configuration in YAML. - -l, --selector=="" ... + -l, --selector=="" ... Selects blocks based on label, e.g. '-l key1="value1" -l key2="value2"'. All key value pairs must match. diff --git a/docs/components/compact.md b/docs/components/compact.md index 2eb6a99824..efd1455fc9 100644 --- a/docs/components/compact.md +++ b/docs/components/compact.md @@ -36,38 +36,38 @@ Flags: --version Show application version. --log.level=info Log filtering level. --log.format=logfmt Log format to use. - --gcloudtrace.project=GCLOUDTRACE.PROJECT + --gcloudtrace.project=GCLOUDTRACE.PROJECT GCP project to send Google Cloud Trace tracings to. If empty, tracing will be disabled. - --gcloudtrace.sample-factor=1 + --gcloudtrace.sample-factor=1 How often we send traces (1/). If 0 no trace will be sent periodically, unless forced by baggage item. See `pkg/tracing/tracing.go` for details. - --http-address="0.0.0.0:10902" + --http-address="0.0.0.0:10902" Listen host:port for HTTP endpoints. --data-dir="./data" Data directory in which to cache blocks and process compactions. - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag. Object store configuration in YAML. --sync-delay=30m Minimum age of fresh (non-compacted) blocks before they are being processed. - --retention.resolution-raw=0d + --retention.resolution-raw=0d How long to retain raw samples in bucket. 
0d - disables this retention - --retention.resolution-5m=0d + --retention.resolution-5m=0d How long to retain samples of resolution 1 (5 minutes) in bucket. 0d - disables this retention - --retention.resolution-1h=0d + --retention.resolution-1h=0d How long to retain samples of resolution 2 (1 hour) in bucket. 0d - disables this retention -w, --wait Do not exit after all compactions have been processed and wait for new work. - --block-sync-concurrency=20 + --block-sync-concurrency=20 Number of goroutines to use when syncing block metadata from object storage. diff --git a/docs/components/query.md b/docs/components/query.md index f79960c10b..3a08f70603 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -140,17 +140,17 @@ Flags: --version Show application version. --log.level=info Log filtering level. --log.format=logfmt Log format to use. - --gcloudtrace.project=GCLOUDTRACE.PROJECT + --gcloudtrace.project=GCLOUDTRACE.PROJECT GCP project to send Google Cloud Trace tracings to. If empty, tracing will be disabled. - --gcloudtrace.sample-factor=1 + --gcloudtrace.sample-factor=1 How often we send traces (1/). If 0 no trace will be sent periodically, unless forced by baggage item. See `pkg/tracing/tracing.go` for details. - --http-address="0.0.0.0:10902" + --http-address="0.0.0.0:10902" Listen host:port for HTTP endpoints. - --grpc-address="0.0.0.0:10901" + --grpc-address="0.0.0.0:10901" Listen ip:port address for gRPC endpoints (StoreAPI). Make sure this address is routable from other components if you use gossip, @@ -160,51 +160,51 @@ Flags: disable TLS --grpc-server-tls-key="" TLS Key for the gRPC server, leave blank to disable TLS - --grpc-server-tls-client-ca="" + --grpc-server-tls-client-ca="" TLS CA to verify clients against. If no client CA is specified, there is no client verification on server side. (tls.NoClientCert) - --grpc-advertise-address=GRPC-ADVERTISE-ADDRESS + --grpc-advertise-address=GRPC-ADVERTISE-ADDRESS Explicit (external) host:port address to advertise for gRPC StoreAPI in gossip cluster. If empty, 'grpc-address' will be used. - --cluster.address="0.0.0.0:10900" + --cluster.address="0.0.0.0:10900" Listen ip:port address for gossip cluster. - --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS + --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS Explicit (external) ip:port address to advertise for gossip in gossip cluster. Used internally for membership only. - --cluster.peers=CLUSTER.PEERS ... + --cluster.peers=CLUSTER.PEERS ... Initial peers to join the cluster. It can be either , or . A lookup resolution is done only at the startup. - --cluster.gossip-interval= + --cluster.gossip-interval= Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth. Default is used from a specified network-type. - --cluster.pushpull-interval= + --cluster.pushpull-interval= Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage. Default is used from a specified network-type. - --cluster.refresh-interval=1m + --cluster.refresh-interval=1m Interval for membership to refresh cluster.peers state, 0 disables refresh. - --cluster.secret-key=CLUSTER.SECRET-KEY + --cluster.secret-key=CLUSTER.SECRET-KEY Initial secret key to encrypt cluster gossip. Can be one of AES-128, AES-192, or AES-256 in hexadecimal format. 
- --cluster.network-type=lan + --cluster.network-type=lan Network type with predefined peers configurations. Sets of configurations accounting the latency differences between network types: local, lan, wan. --cluster.disable If true gossip will be disabled and no cluster related server will be started. - --http-advertise-address=HTTP-ADVERTISE-ADDRESS + --http-advertise-address=HTTP-ADVERTISE-ADDRESS Explicit (external) host:port address to advertise for HTTP QueryAPI in gossip cluster. If empty, 'http-address' will be used. @@ -214,7 +214,7 @@ Flags: --grpc-client-tls-key="" TLS Key for the client's certificate --grpc-client-tls-ca="" TLS CA Certificates to use to verify gRPC servers - --grpc-client-server-name="" + --grpc-client-server-name="" Server name to verify the hostname on the returned gRPC certificates. See https://tools.ietf.org/html/rfc4366#section-3.1 @@ -244,12 +244,12 @@ Flags: --query.timeout=2m Maximum time to process query by query node. --query.max-concurrent=20 Maximum number of queries processed concurrently by query node. - --query.replica-label=QUERY.REPLICA-LABEL + --query.replica-label=QUERY.REPLICA-LABEL Label to treat as a replica indicator along which data is deduplicated. Still you will be able to query without deduplication using 'dedup=false' parameter. - --selector-label=="" ... + --selector-label=="" ... Query selector labels that will be exposed in info endpoint (repeated). --store= ... Addresses of statically configured store API @@ -257,13 +257,13 @@ Flags: prefixed with 'dns+' or 'dnssrv+' to detect store API servers through respective DNS lookups. - --store.sd-files= ... + --store.sd-files= ... Path to files that contain addresses of store API servers. The path can be a glob pattern (repeatable). --store.sd-interval=5m Refresh interval to re-read file SD files. It is used as a resync fallback. - --store.sd-dns-interval=30s + --store.sd-dns-interval=30s Interval between DNS resolutions. --query.auto-downsampling Enable automatic adjustment (step / 5) to what source of data should be used in store gateways diff --git a/docs/components/rule.md b/docs/components/rule.md index be35f0d6e6..60b856223b 100644 --- a/docs/components/rule.md +++ b/docs/components/rule.md @@ -47,17 +47,17 @@ Flags: --version Show application version. --log.level=info Log filtering level. --log.format=logfmt Log format to use. - --gcloudtrace.project=GCLOUDTRACE.PROJECT + --gcloudtrace.project=GCLOUDTRACE.PROJECT GCP project to send Google Cloud Trace tracings to. If empty, tracing will be disabled. - --gcloudtrace.sample-factor=1 + --gcloudtrace.sample-factor=1 How often we send traces (1/). If 0 no trace will be sent periodically, unless forced by baggage item. See `pkg/tracing/tracing.go` for details. - --http-address="0.0.0.0:10902" + --http-address="0.0.0.0:10902" Listen host:port for HTTP endpoints. - --grpc-address="0.0.0.0:10901" + --grpc-address="0.0.0.0:10901" Listen ip:port address for gRPC endpoints (StoreAPI). Make sure this address is routable from other components if you use gossip, @@ -67,51 +67,51 @@ Flags: disable TLS --grpc-server-tls-key="" TLS Key for the gRPC server, leave blank to disable TLS - --grpc-server-tls-client-ca="" + --grpc-server-tls-client-ca="" TLS CA to verify clients against. If no client CA is specified, there is no client verification on server side. 
(tls.NoClientCert) - --grpc-advertise-address=GRPC-ADVERTISE-ADDRESS + --grpc-advertise-address=GRPC-ADVERTISE-ADDRESS Explicit (external) host:port address to advertise for gRPC StoreAPI in gossip cluster. If empty, 'grpc-address' will be used. - --cluster.address="0.0.0.0:10900" + --cluster.address="0.0.0.0:10900" Listen ip:port address for gossip cluster. - --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS + --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS Explicit (external) ip:port address to advertise for gossip in gossip cluster. Used internally for membership only. - --cluster.peers=CLUSTER.PEERS ... + --cluster.peers=CLUSTER.PEERS ... Initial peers to join the cluster. It can be either , or . A lookup resolution is done only at the startup. - --cluster.gossip-interval= + --cluster.gossip-interval= Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth. Default is used from a specified network-type. - --cluster.pushpull-interval= + --cluster.pushpull-interval= Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage. Default is used from a specified network-type. - --cluster.refresh-interval=1m + --cluster.refresh-interval=1m Interval for membership to refresh cluster.peers state, 0 disables refresh. - --cluster.secret-key=CLUSTER.SECRET-KEY + --cluster.secret-key=CLUSTER.SECRET-KEY Initial secret key to encrypt cluster gossip. Can be one of AES-128, AES-192, or AES-256 in hexadecimal format. - --cluster.network-type=lan + --cluster.network-type=lan Network type with predefined peers configurations. Sets of configurations accounting the latency differences between network types: local, lan, wan. --cluster.disable If true gossip will be disabled and no cluster related server will be started. - --label=="" ... + --label=="" ... Labels to be applied to all generated metrics (repeated). Similar to external labels for Prometheus, used to identify ruler and its @@ -122,7 +122,7 @@ Flags: --eval-interval=30s The default evaluation interval to use. --tsdb.block-duration=2h Block duration for TSDB block. --tsdb.retention=48h Block retention time on local disk. - --alertmanagers.url=ALERTMANAGERS.URL ... + --alertmanagers.url=ALERTMANAGERS.URL ... Alertmanager replica URLs to push firing alerts. Ruler claims success if push to at least one alertmanager from discovered @@ -132,12 +132,12 @@ Flags: defaults to 9093 or the SRV record's value. The URL path is used as a prefix for the regular Alertmanager API path. - --alertmanagers.send-timeout=10s + --alertmanagers.send-timeout=10s Timeout for sending alerts to alertmanager - --alert.query-url=ALERT.QUERY-URL + --alert.query-url=ALERT.QUERY-URL The external Thanos Query URL that would be set in all alerts 'Source' field - --alert.label-drop=ALERT.LABEL-DROP ... + --alert.label-drop=ALERT.LABEL-DROP ... Labels by name to drop before sending to alertmanager. This allows alert to be deduplicated on replica label (repeated). @@ -165,10 +165,10 @@ Flags: stripped prefix value in X-Forwarded-Prefix header. This allows thanos UI to be served on a sub-path. - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag. Object store configuration in YAML. --query= ... 
Addresses of statically configured query API @@ -176,13 +176,13 @@ Flags: prefixed with 'dns+' or 'dnssrv+' to detect query API servers through respective DNS lookups. - --query.sd-files= ... + --query.sd-files= ... Path to file that contain addresses of query peers. The path can be a glob pattern (repeatable). --query.sd-interval=5m Refresh interval to re-read file SD files. (used as a fallback) - --query.sd-dns-interval=30s + --query.sd-dns-interval=30s Interval between DNS resolutions. ``` diff --git a/docs/components/sidecar.md b/docs/components/sidecar.md index fdb1955205..4966681a1b 100644 --- a/docs/components/sidecar.md +++ b/docs/components/sidecar.md @@ -10,7 +10,7 @@ Prometheus servers connected to the Thanos cluster via the sidecar are subject t * The `--web.enable-lifecycle` flag is enabled if you want to use `reload.*` flags. * The `--storage.tsdb.min-block-duration` and `--storage.tsdb.max-block-duration` must be set to equal values to disable local compaction. The default of `2h` is recommended. -The retention is recommended to not be lower than three times the block duration. This achieves resilience in the face of connectivity issues +The retention is recommended to not be lower than three times the block duration. This achieves resilience in the face of connectivity issues to the object storage since all local data will remain available within the Thanos cluster. If connectivity gets restored the backlog of blocks gets uploaded to the object storage. ```console @@ -51,17 +51,17 @@ Flags: --version Show application version. --log.level=info Log filtering level. --log.format=logfmt Log format to use. - --gcloudtrace.project=GCLOUDTRACE.PROJECT + --gcloudtrace.project=GCLOUDTRACE.PROJECT GCP project to send Google Cloud Trace tracings to. If empty, tracing will be disabled. - --gcloudtrace.sample-factor=1 + --gcloudtrace.sample-factor=1 How often we send traces (1/). If 0 no trace will be sent periodically, unless forced by baggage item. See `pkg/tracing/tracing.go` for details. - --http-address="0.0.0.0:10902" + --http-address="0.0.0.0:10902" Listen host:port for HTTP endpoints. - --grpc-address="0.0.0.0:10901" + --grpc-address="0.0.0.0:10901" Listen ip:port address for gRPC endpoints (StoreAPI). Make sure this address is routable from other components if you use gossip, @@ -71,65 +71,65 @@ Flags: disable TLS --grpc-server-tls-key="" TLS Key for the gRPC server, leave blank to disable TLS - --grpc-server-tls-client-ca="" + --grpc-server-tls-client-ca="" TLS CA to verify clients against. If no client CA is specified, there is no client verification on server side. (tls.NoClientCert) - --grpc-advertise-address=GRPC-ADVERTISE-ADDRESS + --grpc-advertise-address=GRPC-ADVERTISE-ADDRESS Explicit (external) host:port address to advertise for gRPC StoreAPI in gossip cluster. If empty, 'grpc-address' will be used. - --cluster.address="0.0.0.0:10900" + --cluster.address="0.0.0.0:10900" Listen ip:port address for gossip cluster. - --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS + --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS Explicit (external) ip:port address to advertise for gossip in gossip cluster. Used internally for membership only. - --cluster.peers=CLUSTER.PEERS ... + --cluster.peers=CLUSTER.PEERS ... Initial peers to join the cluster. It can be either , or . A lookup resolution is done only at the startup. - --cluster.gossip-interval= + --cluster.gossip-interval= Interval between sending gossip messages. 
By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth. Default is used from a specified network-type. - --cluster.pushpull-interval= + --cluster.pushpull-interval= Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage. Default is used from a specified network-type. - --cluster.refresh-interval=1m + --cluster.refresh-interval=1m Interval for membership to refresh cluster.peers state, 0 disables refresh. - --cluster.secret-key=CLUSTER.SECRET-KEY + --cluster.secret-key=CLUSTER.SECRET-KEY Initial secret key to encrypt cluster gossip. Can be one of AES-128, AES-192, or AES-256 in hexadecimal format. - --cluster.network-type=lan + --cluster.network-type=lan Network type with predefined peers configurations. Sets of configurations accounting the latency differences between network types: local, lan, wan. --cluster.disable If true gossip will be disabled and no cluster related server will be started. - --prometheus.url=http://localhost:9090 + --prometheus.url=http://localhost:9090 URL at which to reach Prometheus's API. For better performance use local network. --tsdb.path="./data" Data directory of TSDB. --reloader.config-file="" Config file watched by the reloader. - --reloader.config-envsubst-file="" + --reloader.config-envsubst-file="" Output file for environment variable substituted config file. - --reloader.rule-dir=RELOADER.RULE-DIR ... + --reloader.rule-dir=RELOADER.RULE-DIR ... Rule directories for the reloader to refresh (repeated field). - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag. Object store configuration in YAML. diff --git a/docs/components/store.md b/docs/components/store.md index 840795ac18..8b9d5fc79c 100644 --- a/docs/components/store.md +++ b/docs/components/store.md @@ -36,17 +36,17 @@ Flags: --version Show application version. --log.level=info Log filtering level. --log.format=logfmt Log format to use. - --gcloudtrace.project=GCLOUDTRACE.PROJECT + --gcloudtrace.project=GCLOUDTRACE.PROJECT GCP project to send Google Cloud Trace tracings to. If empty, tracing will be disabled. - --gcloudtrace.sample-factor=1 + --gcloudtrace.sample-factor=1 How often we send traces (1/). If 0 no trace will be sent periodically, unless forced by baggage item. See `pkg/tracing/tracing.go` for details. - --http-address="0.0.0.0:10902" + --http-address="0.0.0.0:10902" Listen host:port for HTTP endpoints. - --grpc-address="0.0.0.0:10901" + --grpc-address="0.0.0.0:10901" Listen ip:port address for gRPC endpoints (StoreAPI). Make sure this address is routable from other components if you use gossip, @@ -56,44 +56,44 @@ Flags: disable TLS --grpc-server-tls-key="" TLS Key for the gRPC server, leave blank to disable TLS - --grpc-server-tls-client-ca="" + --grpc-server-tls-client-ca="" TLS CA to verify clients against. If no client CA is specified, there is no client verification on server side. (tls.NoClientCert) - --grpc-advertise-address=GRPC-ADVERTISE-ADDRESS + --grpc-advertise-address=GRPC-ADVERTISE-ADDRESS Explicit (external) host:port address to advertise for gRPC StoreAPI in gossip cluster. If empty, 'grpc-address' will be used. - --cluster.address="0.0.0.0:10900" + --cluster.address="0.0.0.0:10900" Listen ip:port address for gossip cluster. 
- --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS + --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS Explicit (external) ip:port address to advertise for gossip in gossip cluster. Used internally for membership only. - --cluster.peers=CLUSTER.PEERS ... + --cluster.peers=CLUSTER.PEERS ... Initial peers to join the cluster. It can be either , or . A lookup resolution is done only at the startup. - --cluster.gossip-interval= + --cluster.gossip-interval= Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth. Default is used from a specified network-type. - --cluster.pushpull-interval= + --cluster.pushpull-interval= Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage. Default is used from a specified network-type. - --cluster.refresh-interval=1m + --cluster.refresh-interval=1m Interval for membership to refresh cluster.peers state, 0 disables refresh. - --cluster.secret-key=CLUSTER.SECRET-KEY + --cluster.secret-key=CLUSTER.SECRET-KEY Initial secret key to encrypt cluster gossip. Can be one of AES-128, AES-192, or AES-256 in hexadecimal format. - --cluster.network-type=lan + --cluster.network-type=lan Network type with predefined peers configurations. Sets of configurations accounting the latency differences between @@ -104,15 +104,15 @@ Flags: --index-cache-size=250MB Maximum size of items held in the index cache. --chunk-pool-size=2GB Maximum size of concurrently allocatable bytes for chunks. - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag. Object store configuration in YAML. --sync-block-duration=3m Repeat interval for syncing the blocks between local and remote view. - --block-sync-concurrency=20 + --block-sync-concurrency=20 Number of goroutines to use when syncing blocks from object storage. diff --git a/docs/design.md b/docs/design.md index 1b85081cd2..803632c1bd 100644 --- a/docs/design.md +++ b/docs/design.md @@ -66,7 +66,7 @@ A store node acts as a gateway to block data that is stored in an object storage It continuously synchronizes which blocks exist in the bucket and translates requests for metric data into object storage requests. It implements various strategies to minimize the number of requests to the object storage such as filtering relevant blocks by their meta data (e.g. time range and labels) and caching frequent index lookups. -The Prometheus 2.0 storage layout is optimized for minimal read amplification. For example, sample data for the same time series is sequentially aligned in a chunk file. Similarly, series for the same metric name are sequentially aligned as well. +The Prometheus 2.0 storage layout is optimized for minimal read amplification. For example, sample data for the same time series is sequentially aligned in a chunk file. Similarly, series for the same metric name are sequentially aligned as well. The store node is aware of the files' layout and translates data requests into a plan of a minimum amount of object storage request. Each requests may fetch up to hundreds of thousands of chunks at once. This is essential to satisfy even big queries with a limited amount of requests to the object storage. Currently only index data is cached. 
Chunk data could be cached but is orders of magnitude larger in size. In the current state, fetching chunk data from the object storage already only accounts for a small fraction of end-to-end latency. Thus, there's currently no incentive to increase the store nodes resource requirements/limit its scalability by adding chunk caching. diff --git a/docs/getting_started.md b/docs/getting_started.md index 6a899ac57b..993248fbdd 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -1,8 +1,8 @@ # Getting started -Thanos provides a global query view, data backup, and historical data access as its core features in a single binary. All three features can be run independently of each other. This allows you to have a subset of Thanos features ready for immediate benefit or testing, while also making it flexible for gradual roll outs in more complex environments. +Thanos provides a global query view, data backup, and historical data access as its core features in a single binary. All three features can be run independently of each other. This allows you to have a subset of Thanos features ready for immediate benefit or testing, while also making it flexible for gradual roll outs in more complex environments. -In this quick-start guide, we will configure Thanos and all components mentioned to work against a Google Cloud Storage bucket. +In this quick-start guide, we will configure Thanos and all components mentioned to work against a Google Cloud Storage bucket. At the moment, Thanos is able to use [different storage providers](storage.md), with the ability to add more providers as necessary. ## Requirements @@ -34,13 +34,13 @@ For exact Prometheus version list Thanos was tested against you can find [here]( ## [Sidecar](components/sidecar.md) -Thanos integrates with existing Prometheus servers through a [Sidecar process](https://docs.microsoft.com/en-us/azure/architecture/patterns/sidecar#solution), which runs on the same machine or in the same pod as the Prometheus server. +Thanos integrates with existing Prometheus servers through a [Sidecar process](https://docs.microsoft.com/en-us/azure/architecture/patterns/sidecar#solution), which runs on the same machine or in the same pod as the Prometheus server. -The purpose of the Sidecar is to backup Prometheus data into an Object Storage bucket, and giving other Thanos components access to the Prometheus instance the Sidecar is attached to. +The purpose of the Sidecar is to backup Prometheus data into an Object Storage bucket, and giving other Thanos components access to the Prometheus instance the Sidecar is attached to. [More details about the Sidecar's functions are available at the sidecar documentation page](components/sidecar.md). -NOTE: If you want to use `reload.*` flags for sidecar, make sure you enable `reload` Prometheus endpoint with flag `--web.enable-lifecycle` +NOTE: If you want to use `reload.*` flags for sidecar, make sure you enable `reload` Prometheus endpoint with flag `--web.enable-lifecycle` ### Backups @@ -104,7 +104,7 @@ The Query component is stateless and horizontally scalable and can be deployed w Query also implements Prometheus's offical HTTP API and can thus be used with external tools such as Grafana. It also serves a derivative of Prometheus's UI for ad-hoc querying. -Below, we will set up a Query to connect to our Sidecars, and expose its HTTP UI. +Below, we will set up a Query to connect to our Sidecars, and expose its HTTP UI. 
``` thanos query \ @@ -147,9 +147,9 @@ Go to the configured HTTP address, and you should now be able to query across al ## Communication Between Components The only required communication between nodes is for Thanos Querier to be able to reach gRPC storeAPIs you provide. Thanos Querier periodically calls Info endpoint to collect up-to-date metadata as well as checking the health of given StoreAPI. -The metadata includes the information about time windows and external labels for each node. +The metadata includes the information about time windows and external labels for each node. -There are various ways to tell query component about the StoreAPIs it should query data from. The simplest way is to use a static list of well known addresses to query. +There are various ways to tell query component about the StoreAPIs it should query data from. The simplest way is to use a static list of well known addresses to query. These are repeatable so can add as many endpoint as needed. You can put DNS domain prefixed by `dns://` or `dns+srv://` to have Thanos Query do an `A` or `SRV` lookup to get all required IPs to communicate with. ``` @@ -158,7 +158,7 @@ thanos query \ --grpc-address 0.0.0.0:19092 \ # gRPC endpoint for Store API --store 1.2.3.4:19090 \ # Static gRPC Store API Address for the query node to query --store 1.2.3.5:19090 \ # Also repeatable - --store dns://rest.thanos.peers:19092 # Use DNS lookup for getting all registered IPs as separate StoreAPIs + --store dns://rest.thanos.peers:19092 # Use DNS lookup for getting all registered IPs as separate StoreAPIs ``` Read more details [here](/docs/service_discovery.md) @@ -208,7 +208,7 @@ thanos query \ --cluster.peers 127.0.0.1:19392 \ # Another cluster peer (many can be added to discover nodes) --store 1.2.3.4:19090 \ # Static gRPC Store API Address for the query node to query --store 1.2.3.5:19090 \ # Also repeatable - --store dns://rest.thanos.peers:19092 # Use DNS lookup for getting all registered IPs as separate StoreAPIs + --store dns://rest.thanos.peers:19092 # Use DNS lookup for getting all registered IPs as separate StoreAPIs ``` When to use gossip vs store flags? diff --git a/docs/proposals/approved/201809_gossip-removal.md b/docs/proposals/approved/201809_gossip-removal.md index a509ca8891..e19905c1bd 100644 --- a/docs/proposals/approved/201809_gossip-removal.md +++ b/docs/proposals/approved/201809_gossip-removal.md @@ -9,36 +9,36 @@ Ticket: https://github.com/improbable-eng/thanos/issues/484 ## Summary It is becoming clear that we need to remove gossip protocol as our main way of communication between Thanos Querier and -other components. Static configuration seems to be well enough for our simple use cases. To give users more flexibility -(similar to gossip auto-join logic), we already wanted to introduce a [File SD](https://github.com/improbable-eng/thanos/issues/492) +other components. Static configuration seems to be well enough for our simple use cases. To give users more flexibility +(similar to gossip auto-join logic), we already wanted to introduce a [File SD](https://github.com/improbable-eng/thanos/issues/492) that allows changing `StoreAPI`s on-the-fly. ## Motivation -[Gossip](https://en.wikipedia.org/wiki/Gossip_protocol) protocol (with the [membership](https://github.com/hashicorp/memberlist) implementation) +[Gossip](https://en.wikipedia.org/wiki/Gossip_protocol) protocol (with the [membership](https://github.com/hashicorp/memberlist) implementation) was built into Thanos from the very beginning. 
The main advantages over other solution to connect components were: * Auto-join and auto-drop of components based on health checks. * Propagation of tiny metadata. After a couple of month of maintaining Thanos project and various discussions with different users, we realized that those advantages -are not outstanding anymore and are not worth keeping, compared to the issues gossip causes. There are numerous reasons why we should +are not outstanding anymore and are not worth keeping, compared to the issues gossip causes. There are numerous reasons why we should deprecate gossip: * Gossip has been proven to be extremely confusing for the new users. Peer logic and really confusing `cluster.advertise-address` that -was sometimes able to magically deduce private IP address (and sometimes not!) were leading to lots of questions and issues. +was sometimes able to magically deduce private IP address (and sometimes not!) were leading to lots of questions and issues. Something that was made for quick ramp-up into Thanos ("just click the button and it auto-joins everything") created lots of confusion and made it harder to experiment. -* Auto-join logic has its price. All peers connected were aware of each other. All were included in metadata propagation. +* Auto-join logic has its price. All peers connected were aware of each other. All were included in metadata propagation. Membership project community made an awesome job to optimize N^N communication (all-to-all), but nevertheless, it is something that is totally unnecessary to Thanos. And it led to wrong assumptions that e.g sidecar needs to be aware of another sidecar etc. *Thanos Querier is the only component that requires to be aware of other `StoreAPI`s*. It is clear that `all-to-all` communication is neither necessary nor educative. -* Unless specifically configured (which requires advanced knowledge) Gossip uses mix of TCP and UPD underneath its -custom app level protocol. This is a no-go if you use L7 loadbalancers and proxies. -* Global gossip is really difficult to achieve. BTW, If you know how to setup this, please write a blog about it! (: +* Unless specifically configured (which requires advanced knowledge) Gossip uses mix of TCP and UPD underneath its +custom app level protocol. This is a no-go if you use L7 loadbalancers and proxies. +* Global gossip is really difficult to achieve. BTW, If you know how to setup this, please write a blog about it! (: * With the addition of the simplest possible solution to give Thanos Querier knowledge where are `StoreAPI`s (static `--store` flag), we needed to implement health check and metadata propagation anyway. In fact, `StoreAPI.Info` was already there all the time. * Gossip operates per `peer` level and there is no way you can abstract multiple peers behind loadbalancer. This hides easy -solutions from our eyes, e.g how to make Store Gateway HA. Without gossip, you can just use Kubernetes HA Service or any other loadbalancer. +solutions from our eyes, e.g how to make Store Gateway HA. Without gossip, you can just use Kubernetes HA Service or any other loadbalancer. To support Store Gateway HA for gossip we would end up implementing LB logic in Thanos Querier (like proposed [here](https://github.com/improbable-eng/thanos/pull/404)) * At some point we want to be flexible and allow other discovery mechanisms. Gossip does not work for everyone and static flags are too.. static. (: We need [File SD](https://github.com/improbable-eng/thanos/issues/492) for flexibility anyway. 
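For illustration, a minimal Go sketch of the metadata exchange the proposal relies on instead of gossip: a client dials a statically configured StoreAPI address and calls the existing `Info` RPC to learn its external labels and time range. The address below is a placeholder and error handling is reduced to the essentials; this is a sketch of the idea, not the querier's actual discovery code.

```go
package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/improbable-eng/thanos/pkg/store/storepb"
	"google.golang.org/grpc"
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	// "sidecar.example.com:10901" is a placeholder for any statically configured
	// --store address; in a real deployment it could also come from File SD.
	conn, err := grpc.DialContext(ctx, "sidecar.example.com:10901", grpc.WithInsecure(), grpc.WithBlock())
	if err != nil {
		log.Fatalf("dial StoreAPI: %v", err)
	}
	defer conn.Close()

	// Info returns the external labels and the time range the store can serve.
	// This is the metadata gossip used to propagate; with static configuration
	// a client simply polls it, which doubles as a health check.
	info, err := storepb.NewStoreClient(conn).Info(ctx, &storepb.InfoRequest{})
	if err != nil {
		log.Fatalf("info: %v", err)
	}
	fmt.Printf("labels=%v minTime=%d maxTime=%d\n", info.Labels, info.MinTime, info.MaxTime)
}
```

Doing effectively this periodically for every configured or file-SD-discovered address covers both health checking and metadata propagation without any peer-to-peer protocol.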
diff --git a/docs/proposals/approved/201901-read-write-operations-bucket.md b/docs/proposals/approved/201901-read-write-operations-bucket.md index b7cf0bdae7..811f347dcd 100644 --- a/docs/proposals/approved/201901-read-write-operations-bucket.md +++ b/docs/proposals/approved/201901-read-write-operations-bucket.md @@ -4,7 +4,7 @@ Status: draft | in-review | rejected | **accepted** | complete Implementation Owner: [@bwplotka](https://github.com/bwplotka) -Tickets: +Tickets: * https://github.com/improbable-eng/thanos/issues/298 (eventual consistency) * https://github.com/improbable-eng/thanos/issues/377 (eventual consistency & partial upload) * https://github.com/improbable-eng/thanos/issues/564 (retention vs store gateway) @@ -14,32 +14,32 @@ Tickets: Our goals here are: -* A) Define consistent way of having multiple readers and multiple writers (technically multiple appenders and single remover) on shared object storage that can be -eventual consistent. -* B) Allow readers to sync the object storage changes in eventual consistent manner: (e.g to poll for changes +* A) Define consistent way of having multiple readers and multiple writers (technically multiple appenders and single remover) on shared object storage that can be +eventual consistent. +* B) Allow readers to sync the object storage changes in eventual consistent manner: (e.g to poll for changes every X minutes instead of watch and react to changes immediately) * C) Allow readers to detect and handle partial uploads consistently without any coordination. -## Motivation +## Motivation Thanos performs similar operations as Prometheus do on TSDB blocks with the following differences: * It operates on Object Storage API instead of local filesystem. * Operations are done from multiple processes that lack coordination. -Moving from lock-based logic to coordination free and from strongly consistent local filesystem to potentially eventually -consistent remote simplified "filesystem" in form of Object Storage API, causes additional cases that we need to consider +Moving from lock-based logic to coordination free and from strongly consistent local filesystem to potentially eventually +consistent remote simplified "filesystem" in form of Object Storage API, causes additional cases that we need to consider in Thanos system, like: * Thanos sidecar or compactor crashes during the process of uploading the block. It uploaded index, 2 chunk files and crashed. How to ensure readers (e.g compactor, store gateway) will handle this gracefully? * Thanos compactor uploads compacted block and deletes source blocks. After next sync iteration it does not see a new block (read after write eventual consistency). It sees gap, wrongly plans next compaction and causes non-resolvable overlap. * Thanos compactor uploads compacted block and deletes source blocks. Thanos Store Gateway syncs every 3m so it missed that fact. Next query that hits store gateway tries to fetch deleted source blocks and fails. -Currently, we have only basic safeguards against those in form of [`syncDelay`](https://github.com/improbable-eng/thanos/blob/bc088285a1b4bf464fdf2539e4b365b805874eed/pkg/compact/compact.go#L187). +Currently, we have only basic safeguards against those in form of [`syncDelay`](https://github.com/improbable-eng/thanos/blob/bc088285a1b4bf464fdf2539e4b365b805874eed/pkg/compact/compact.go#L187). That helps with operations between sidecar and compactor but does not fix inter-compactor logic in case of eventual consistent storage or partial upload failures. 
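As a rough illustration of what the `syncDelay` safeguard boils down to, the sketch below derives a block's creation time from its ULID (the block directory name) and treats blocks younger than the delay as not yet ready. The helper name and the example block ID are made up for illustration; this is not the compactor's actual code.

```go
package main

import (
	"fmt"
	"time"

	"github.com/oklog/ulid"
)

// isReady reports whether a block is old enough to be considered by readers
// and the compactor. The block directory name is its ULID, whose timestamp
// component tells us roughly when the block was created.
func isReady(blockID string, syncDelay time.Duration) (bool, error) {
	id, err := ulid.Parse(blockID)
	if err != nil {
		return false, err
	}
	createdAt := ulid.Time(id.Time())
	return time.Since(createdAt) >= syncDelay, nil
}

func main() {
	// The block ID below is made up purely for illustration.
	ready, err := isReady("01D6B1JDN2VJWJYFK0P0YDJ1BN", 15*time.Minute)
	if err != nil {
		panic(err)
	}
	fmt.Println("block ready for reading/compaction:", ready)
}
```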
-We also partially assumed strong consistency for object storage, because [GCS has strong consistency](https://cloud.google.com/storage/docs/consistency) and S3 has it as well, +We also partially assumed strong consistency for object storage, because [GCS has strong consistency](https://cloud.google.com/storage/docs/consistency) and S3 has it as well, but with [some caveats](https://codeburst.io/quick-explanation-of-the-s3-consistency-model-6c9f325e3f82) that we met. -However with increasing number of other providers (ceph, hdfs, COS, Azure, Swift) with no (or unknown) strong consistency +However with increasing number of other providers (ceph, hdfs, COS, Azure, Swift) with no (or unknown) strong consistency guarantee this assumption needs to go away. We have an explicit API for Series fetching in form of gRPC StoreAPI, but we lack the verbose API for other side: the object storage format with assumptions on behavior that will @@ -67,7 +67,7 @@ Below diagram shows the block states defined by our rules. ### Why? -To deal with A and B, so overall eventual consistency, we will leverage existing feature of metric format (TSDB) which is +To deal with A and B, so overall eventual consistency, we will leverage existing feature of metric format (TSDB) which is *block immutability*. In object storage format, a block is a directory named with [`ULID`](https://github.com/oklog/ulid) that contains `chunks` dir with one or more chunk files (each not bigger than 512MB), `index` and `meta.json` files. To ensure every component has the same state of metrics we create first strict rule: @@ -76,7 +76,7 @@ has the same state of metrics we create first strict rule: With this rule alone we restrict bucket operations only to: * Block upload -* Block removal +* Block removal As those operations are not atomic itself, partial blocks or inconsistent block view may occur for short time (read-after-write consistency). To mitigate this we introduce second rule: @@ -93,7 +93,7 @@ To ensure we have block fully uploaded we define: > 3 . TSDB `meta.json` file within block *have to be* uploaded as the **last** one in the block folder. -Currently and as proposed `meta.json` is our integrity marker (it is uploaded as the last one). It matters even when we don't have strong consistency, because +Currently and as proposed `meta.json` is our integrity marker (it is uploaded as the last one). It matters even when we don't have strong consistency, because it helps us detect the case of `upload` crashed in between of upload. We only upload meta.json after successful uploads of rest of the files for the block. With those rule one could say "why to wait 15 minutes if we can just check if meta.json is parsable". In eventual consistency world @@ -104,11 +104,11 @@ Thanks of meta.json being uploaded at the end we can form 4th rule: > 4 . After 15 minutes (or time configured by `syncDelay`), if there is no fully loadable `meta.json`, we assume block is malformed, partially uploaded or marked for deletion and can be *removed* by compactor. [Compactor change needed] -This is to detect partial uploads caused by for example compactor being rolled out or crashed. Other writers managed by `shipper` package relies on +This is to detect partial uploads caused by for example compactor being rolled out or crashed. Other writers managed by `shipper` package relies on fact that blocks are persisted on disk so the upload operation can be retried. Similar to writers, readers are excluding all blocks fresher than 15m or older but without `meta.json`. 
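A minimal sketch of the upload order that rules 1-3 imply: chunk files and the index are uploaded first, and `meta.json` is written only after everything else succeeded, so that a readable `meta.json` acts as the integrity marker. The `bucket` interface here is a simplified stand-in for illustration, not the actual objstore API, and the function is ours, not the shipper's code.

```go
package example

import (
	"context"
	"fmt"
	"path"
)

// bucket is a simplified stand-in for an object storage client.
type bucket interface {
	UploadFile(ctx context.Context, src, dst string) error
}

// uploadBlock sketches the required ordering: chunks and index go first,
// meta.json is written last so that a readable meta.json marks the block
// as fully uploaded ("ready").
func uploadBlock(ctx context.Context, bkt bucket, localDir, blockID string, chunkFiles []string) error {
	for _, c := range chunkFiles {
		if err := bkt.UploadFile(ctx, path.Join(localDir, "chunks", c), path.Join(blockID, "chunks", c)); err != nil {
			return fmt.Errorf("upload chunk %s: %v", c, err)
		}
	}
	if err := bkt.UploadFile(ctx, path.Join(localDir, "index"), path.Join(blockID, "index")); err != nil {
		return fmt.Errorf("upload index: %v", err)
	}
	// Only now does the block become visible as complete to readers; if the
	// writer crashes before this point, rule 4 lets the compactor clean it up.
	return bkt.UploadFile(ctx, path.Join(localDir, "meta.json"), path.Join(blockID, "meta.json"))
}
```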
Overall those 4 rules -solved issue C) and A), B) in terms of block upload. +solved issue C) and A), B) in terms of block upload. However, sometimes we need to change blocks for reasons, like: @@ -116,8 +116,8 @@ However, sometimes we need to change blocks for reasons, like: * potentially in future: `delete series` operations. * repair operations (e.g fixes against Prometheus or Thanos bugs like index issues). -Since all blocks are immutable, such changes are performed by rewriting block (or compacting multiple) into new block. This -makes `removal` operations a first citizen object lifecycle. The main difficult point during this process is to make sure all +Since all blocks are immutable, such changes are performed by rewriting block (or compacting multiple) into new block. This +makes `removal` operations a first citizen object lifecycle. The main difficult point during this process is to make sure all readers are synced and aware that new block is ready in place of the old one(s), so writer can remove old block(s). In eventual consistency system we don't have that information without additional coordination. To mitigate we propose 2 new rules. All to support lazy deletion: @@ -126,7 +126,7 @@ we don't have that information without additional coordination. To mitigate we p This should be the case currently for all components except compactor. For example store having overlapping blocks with even the same data or sidecar and store exposing same data via StoreAPI is handled gracefully by frontend. -The major difference here is compactor. Currently compactor does not allow overlap. It immediately halts compactor and waits for +The major difference here is compactor. Currently compactor does not allow overlap. It immediately halts compactor and waits for manual actions. This is on purpose to not allow block malformation by blocks which should be not compacted together. "Overlaps with the same data" is crucial here as valid overlaps are when: @@ -144,7 +144,7 @@ This rule ensures that we can detect *when* to delete block, and we delete it on This also means that repairing block gracefully (rewriting for deleting series or other reason that does not need to be unblocked), is as easy as uploading new block. Compactor will delete older overlapped block eventually. There is caveat for rule 2nd rule (block being ready) for compactor. In case of compaction process we compact and we want to be aware of this block later on. Because of -eventual consistent storage, we cannot, so we potentially have uploaded a block that is "hidden" for first 15 minutes. This is bad, as compactor +eventual consistent storage, we cannot, so we potentially have uploaded a block that is "hidden" for first 15 minutes. This is bad, as compactor will see the source blocks (see 6th rule why we don't delete those immediately) and will compact same blocks for next 15 min (until we can spot the uploaded block). To mitigate this compactor: * Attempts reading meta.json from not-ready block @@ -155,13 +155,13 @@ In the worst case it is possible that compactor will compact twice same source b To match partial upload safeguards we want to delete block in reverse order: > 7 . To schedule delete operation, delete `meta.json` file. All components will exclude this block and compactor will do eventual deletion assuming the block is partially uploaded. 
[Compactor change needed] - -We schedule deletions instead of doing them straight away for 3 reasons: + +We schedule deletions instead of doing them straight away for 3 reasons: * Readers that have loaded this block can still access index and metrics for some time. On next sync they will notice lack of meta.json and assume partial block which excludes it from block being loaded. * Only compactor deletes metrics and index. * In further delete steps, starting with meta.json first ensures integrity mark being deleted first, so in case of deletion process being stopped, we can treat this block as partial block (rule 4th) and delete it gracefully. -There might be exception for malformed blocks that blocks compaction or reader operations. Since we may need to unblock the system +There might be exception for malformed blocks that blocks compaction or reader operations. Since we may need to unblock the system immediately the block can be forcibly removed meaning that query failures may occur (reader loaded block, but not aware block was deleted). > 8 . Compactor waits minimum 15m (`deleteDelay`) before deleting the whole `To Delete` block. [Compactor change needed] @@ -171,20 +171,20 @@ spotting lack of meta.json first. After 15 minutes we are ok to delete the whole ## Risks -* What if someone will set too low `syncDelay`? And upload itself will take more than `syncDelay` time. Block will be assumed as `ToDelete` state and will be removed. Other use case is when Prometheus is partitioned/misconfigured from object storage for longer time (hours?). Once up it will upload all blocks with ULID older than time.Now-sync-delay. How to ensure that will not happen? +* What if someone will set too low `syncDelay`? And upload itself will take more than `syncDelay` time. Block will be assumed as `ToDelete` state and will be removed. Other use case is when Prometheus is partitioned/misconfigured from object storage for longer time (hours?). Once up it will upload all blocks with ULID older than time.Now-sync-delay. How to ensure that will not happen? * Grab time for syncDelay from object meta instead? Not from ULID? That is quite hard. - * Thanks of `deleteDelay` it might have still some time to recover, we might rely on that. + * Thanks of `deleteDelay` it might have still some time to recover, we might rely on that. * Add safeguard on upload path? `upload timeout` has to smaller than `syncDelay`. * Minimum syncDelay - + * What if one block is malformed. Readers cannot handle it and crashes. How repair procedure will work? We can have repair process that can download block locally, rewrite it and fix it and upload. Problem is that it will take `syncDelay` time to appear in system. Since we are blocked, we need to make the block available immediately, ignoring eventual consistency limitations. Potential solutions: * Just delete problematic block and live with 15 minute of delay for not having some portion of metrics? Will it not cause any downsampling issues? * Use `ignore_delay` option and avoid Fresh state as possible. Eventual consistency issue may hit us. - -* Do we allow repair procedure when compactor is running? This is quite unsafe as compaction/downsamlping operation might be in progress -so repair procedure might be avoided. Turning off compactor might be tricky as well in potential HA mode. Maybe we want to introduce + +* Do we allow repair procedure when compactor is running? This is quite unsafe as compaction/downsamlping operation might be in progress +so repair procedure might be avoided. 
Turning off compactor might be tricky as well in potential HA mode. Maybe we want to introduce "locking" some external labels from compactor operations. If yes, how to do it in eventual consistency system? @@ -200,7 +200,7 @@ so repair procedure might be avoided. Turning off compactor might be tricky as w * [Low] Ignore `HealthyOverlapped` to reduce resource consumption for store gateway. * Test common and edge cases e.g: * Worst case scenario for compaction partial upload and multiple duplicated compaction. - * Rule 5th validation: Handling valid and invalid overlaps for various components. + * Rule 5th validation: Handling valid and invalid overlaps for various components. ## Alternatives: @@ -209,7 +209,7 @@ so repair procedure might be avoided. Turning off compactor might be tricky as w As mentioned in #Motivation section this would block community adoption as only a few of object storages has strong or even clearly stated consistency guarantees. -### Full immutability +### Full immutability By not removing any object, we may simplify problem. However we need hard deletions, because they are: - enormously reducing storage size (e.g retention is exactly for this) @@ -218,7 +218,7 @@ By not removing any object, we may simplify problem. However we need hard deleti ### Special integrity JSON file We may want to add some file to commit and mark integrity of the block. That file may hold some information like -files we expect to have in the directory and maybe some hashes of those. +files we expect to have in the directory and maybe some hashes of those. This was rejected for we can reuse existing meta.json as "integrity" marker (e.g if after 15m we don't have meta file, it is partially uploaded and should be removed). @@ -228,33 +228,33 @@ The main drawback of this is changing or required additional things from TSDB bl ### Additional coordination based on system with strong consistency (e.g etcd) -As this would help with mentioned issues, Thanos aims for being coordination free and having no additional system dependency to +As this would help with mentioned issues, Thanos aims for being coordination free and having no additional system dependency to stay operationally simple. - + ### Optimistic/vertical compaction Currently, compactor requires the blocks to appear in order of creation. That might be not the case for eventual consistent - storage. In those cases compactor assumes gap (no block being produced) and compacts with the mentioned gap. Once the new - block is actually available for reading it causes compactor to halt due to overlap. [Vertical compaction](https://github.com/prometheus/tsdb/pull/370) + storage. In those cases compactor assumes gap (no block being produced) and compacts with the mentioned gap. Once the new + block is actually available for reading it causes compactor to halt due to overlap. [Vertical compaction](https://github.com/prometheus/tsdb/pull/370) might help here as can handle those overlaps gracefully, by compacting those together. This however: - + * Heavily complicates the compaction * Malformed block in case of simple misconfiguration (misconfigured external labels - quite often happens) * There are lots of edge cases for eventual consistent storage that will cause vertical compaction to be required quite often which will affect performance. * Does not help with partial upload and syncing issues - + As vertical compaction might be something we can support in future, it clearly does not help with problems we stated here. 
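The lazy-deletion scheme above can also be summarized in code. Below is a minimal, hypothetical Go sketch of how a compactor-like component could classify a block purely from its ULID age and the presence of `meta.json` (the integrity marker that is uploaded last and deleted first). The `classify` helper, the `blockState` names and the hard-coded thresholds are illustrative assumptions only, not the Thanos implementation.

```go
package main

import (
	"crypto/rand"
	"fmt"
	"time"

	"github.com/oklog/ulid"
)

const (
	syncDelay   = 15 * time.Minute // Blocks younger than this may still be uploading ("fresh").
	deleteDelay = 15 * time.Minute // Grace period between scheduling deletion and removing files.
)

type blockState string

const (
	fresh             blockState = "fresh"                // Too young to judge; readers and compactor ignore it.
	ready             blockState = "ready"                // meta.json present and old enough; safe to read and compact.
	partialOrToDelete blockState = "partial-or-to-delete" // Old enough but no meta.json; partially uploaded or scheduled for deletion.
)

// classify applies the rules sketched in this proposal: the block age comes from
// the ULID creation time, and meta.json acts as the integrity marker.
func classify(id ulid.ULID, hasMeta bool, now time.Time) blockState {
	if now.Sub(ulid.Time(id.Time())) < syncDelay {
		return fresh
	}
	if hasMeta {
		return ready
	}
	// A block in this state is only physically removed once it has stayed
	// without meta.json for at least deleteDelay.
	return partialOrToDelete
}

func main() {
	id := ulid.MustNew(ulid.Now(), rand.Reader)
	fmt.Println(classify(id, true, time.Now()))                      // fresh: just created.
	fmt.Println(classify(id, false, time.Now().Add(30*time.Minute))) // partial-or-to-delete.
}
```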
- + ### Do not introduce `deleteDelay` - + We may avoid introducing additional state, by adding mitigation for not having delayed removal. For example for retention apply or manual block deletion, when we would delete block immediately we can have query failures (object does not exists for getrange operations) - + We can avoid it by: -* Adding `retention` flag to readers (compactor have it, but store gateway does not). Cons: - * This will increase configuration burden as you need to configure retention for each resolution on each reader. +* Adding `retention` flag to readers (compactor have it, but store gateway does not). Cons: + * This will increase configuration burden as you need to configure retention for each resolution on each reader. * This does not cover other deletion reasons, like manual removal (e.g for repair) * Handle not exists errors for queries against store gateway gracefully. For instance, if we have block loaded and bucket operation will return object not exists. We can return no data. Cons: * We might hide actual errors. No difference between "healthy" no exists error and failure one. diff --git a/docs/proposals/rejected/201807_store_instance_high_availability.md b/docs/proposals/rejected/201807_store_instance_high_availability.md index 03d8971ad0..ad01d35f81 100644 --- a/docs/proposals/rejected/201807_store_instance_high_availability.md +++ b/docs/proposals/rejected/201807_store_instance_high_availability.md @@ -9,7 +9,7 @@ Implementation owner: [@mattbostock](https://github.com/mattbostock) This proposal makes total sense and solves our goals when using gossip. However there exists a very easy solution to this problem in form of using just static entry with any loadbalancer like Kubernetes Service to load balance -through different Store Gateways. Those are technically stateless, so request can fetch the data independently. +through different Store Gateways. Those are technically stateless, so request can fetch the data independently. ## Motivation diff --git a/docs/proposals/rejected/config.md b/docs/proposals/rejected/config.md index 8bab0f0f8d..5396a24a14 100644 --- a/docs/proposals/rejected/config.md +++ b/docs/proposals/rejected/config.md @@ -16,7 +16,7 @@ Please see [Issue 387](https://github.com/improbable-eng/thanos/pull/387) and [P Currently, each scraper manages their own configuration via [Prometheus Configuration](https://prometheus.io/docs/prometheus/latest/configuration/configuration/) which contains information about the [scrape_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#%3Cscrape_config%3E) and targets that the scraper will be collecting metrics from. -As we start to dynamically scale the collection of metrics (new Prometheus instances) or increase the targets for a current tenant we wish to keep the collection of metrics to a consistent node and not re-allocate shards to other scrapers. +As we start to dynamically scale the collection of metrics (new Prometheus instances) or increase the targets for a current tenant we wish to keep the collection of metrics to a consistent node and not re-allocate shards to other scrapers. The reason for adding this as a component within the thanos project is primarily due to the need of a sidecar to interact with the Prometheus instance collecting metrics. In the scale down scenario when we want to remove all targets from a scraper we would also want to force the collection / upload of the WAL block from the running Prometheus instance before removing the Prometheus instance. 
During this period of scaling down / draining of the scraper we would also want the sidecar and Prometheus to continue to serve the data from within the Prometheus instance itself till we can be sure that Store node can fetch the data for our users. ALong with this we are looking to have a sidecar that updates targets via `file_sd_config` this could be a separate sidecar to thanos sidecar but adds another component into the mix. The implementation is also likely to borrow from the thanos codebase to identify which Prometheus instances are in the cluster to assign targets. @@ -61,17 +61,17 @@ We believe that a central point for configuration and management is better in th ┌──────────────────────┐ ┌────────────┬─────────┐ │ Prometheus │ Sidecar │ │ Prometheus │ Sidecar │ └─────────────────┬────┘ └────────────┴────┬────┘ - │ │ + │ │ GetConfig GetConfig │ │ - v v + v v ┌─────────────────────────────┐ │ Config │ └──────────────┬──────────────┘ - │ - Read files - │ - v + │ + Read files + │ + v ┌─────────────────────────────┐ │ prom.yml │ └─────────────────────────────┘ diff --git a/docs/release_process.md b/docs/release_process.md index ab4bd04542..1fdb0f4f1f 100644 --- a/docs/release_process.md +++ b/docs/release_process.md @@ -8,7 +8,7 @@ NOTE: As [Semantic Versioning](http://semver.org/spec/v2.0.0.html) states all 0. We aim for *at least* 1 release per 6 weeks. However, no strict dates are planned. -No release candidates are required until major version. +No release candidates are required until major version. Additionally we aim for `master` branch being stable. @@ -16,44 +16,44 @@ Additionally we aim for `master` branch being stable. Process of cutting a new *minor* Thanos release: -1. Add PR on branch `release-.` that will start minor release branch and prepare changes to cut release. +1. Add PR on branch `release-.` that will start minor release branch and prepare changes to cut release. 1. Bump [VERSION file](/VERSION) 1. Update [CHANGELOG file](/CHANGELOG.md) - Note that `CHANGELOG.md` should only document changes relevant to users of Thanos, including external API changes, performance improvements, and new features. Do not document changes of internal interfaces, code refactorings and clean-ups, changes to the build process, etc. People interested in these are asked to refer to the git history. + Note that `CHANGELOG.md` should only document changes relevant to users of Thanos, including external API changes, performance improvements, and new features. Do not document changes of internal interfaces, code refactorings and clean-ups, changes to the build process, etc. People interested in these are asked to refer to the git history. Format is described in `CHANGELOG.md`. 1. Double check backward compatibility: - 1. *In case of version after `v1+.y.z`*, double check if none of the changes break API compatibility. This should be done in PR review process, but double check is good to have. + 1. *In case of version after `v1+.y.z`*, double check if none of the changes break API compatibility. This should be done in PR review process, but double check is good to have. 1. In case of `v0.y.z`, document all incompatibilities in changelog. - + 1. 
After review, merge the PR and immediately after this tag a version: - + ```bash $ tag=$(< VERSION) $ git tag -s "v${tag}" -m "v${tag}" $ git push origin "v${tag}" ``` - + Signing a tag with a GPG key is appreciated, but in case you can't add a GPG key to your Github account using the following [procedure](https://help.github.com/articles/generating-a-gpg-key/), you can replace the `-s` flag by `-a` flag of the `git tag` command to only annotate the tag without signing. - + 1. Once a tag is created, the release process through CircleCI will be triggered for this tag. - + 1. You must create a Github Release using the UI for this tag, as otherwise CircleCI will not be able to upload tarballs for this tag. Also, you must create the Github Release using a Github user that has granted access rights to CircleCI. List of maintainers is available [here](/MAINTAINERS.md) - + 1. Go to the releases page of the project, click on the `Draft a new release` button and select the tag you just pushed. Describe release and post relevant entry from changelog. Click `Save draft` rather than `Publish release` at this time. (This will prevent the release being visible before it has got the binaries attached to it.) - - 1. Once tarballs are published on release page, you can click `Publish` and release is complete. - + + 1. Once tarballs are published on release page, you can click `Publish` and release is complete. + 1. Announce `#thanos` slack channel. - + 1. After release create a second PR adding `-master` [VERSION file](/VERSION) suffix to the end of version. This will ensure master built images will have different version then released one. ## Branch management and versioning strategy We use [Semantic Versioning](http://semver.org/). -NOTE: We have a separate branch for each minor release, named `release-.`, e.g. `release-0.1`, `release-0.2`. but they are +NOTE: We have a separate branch for each minor release, named `release-.`, e.g. `release-0.1`, `release-0.2`. but they are *NOT* maintained as we don't have major version yet. ## Pre-releases (release candidates) diff --git a/docs/service_discovery.md b/docs/service_discovery.md index cb73be98a2..ef6c694ef6 100644 --- a/docs/service_discovery.md +++ b/docs/service_discovery.md @@ -6,13 +6,13 @@ This logic is meant to replace Gossip that [is planned to be removed.](/docs/pro Currently places that uses Thanos SD: * `Thanos Query` needs to know about [StoreAPI](https://github.com/improbable-eng/thanos/blob/d3fb337da94d11c78151504b1fccb1d7e036f394/pkg/store/storepb/rpc.proto#L14) servers in order to query metrics from them. * `Thanos Rule` needs to know about `QueryAPI` servers in order to evaluate recording and alerting rules. -* (Only static option with DNS discovery): `Thanos Rule` needs to know about `Alertmanagers` HA replicas in order to send alerts. +* (Only static option with DNS discovery): `Thanos Rule` needs to know about `Alertmanagers` HA replicas in order to send alerts. Currently there are several ways to configure this and they are described below in details: * Static Flags * File SD -* DNS SD +* DNS SD ## Static Flags diff --git a/docs/storage.md b/docs/storage.md index 8395d44568..3460a7c969 100644 --- a/docs/storage.md +++ b/docs/storage.md @@ -4,7 +4,7 @@ Thanos supports any object stores that can be implemented against Thanos [objsto All clients are configured using `--objstore.config-file` to reference to the configuration file or `--objstore.config` to put yaml config directly. 
-## Implementations +## Implementations Current object storage client implementations: @@ -71,7 +71,7 @@ By default Thanos will try to retrieve credentials from the following sources: 1. From `~/.aws/credentials` 1. IAM credentials retrieved from an instance profile. -NOTE: Getting access key from config file and secret key from other method (and vice versa) is not supported. +NOTE: Getting access key from config file and secret key from other method (and vice versa) is not supported. ### AWS Policies @@ -198,7 +198,7 @@ config: ### OpenStack Swift Configuration Thanos uses [gophercloud](http://gophercloud.io/) client to upload Prometheus data into [OpenStack Swift](https://docs.openstack.org/swift/latest/). -Below is an example configuration file for thanos to use OpenStack swift container as an object store. +Below is an example configuration file for thanos to use OpenStack swift container as an object store. [embedmd]:# (flags/config_swift.txt yaml) ```yaml diff --git a/scripts/genflagdocs.sh b/scripts/genflagdocs.sh index a3d3b58f7a..84fa0bdbc8 100755 --- a/scripts/genflagdocs.sh +++ b/scripts/genflagdocs.sh @@ -44,6 +44,9 @@ for x in "${bucketCommands[@]}"; do ./thanos bucket "${x}" --help &> "docs/components/flags/bucket_${x}.txt" done +# remove white noise +sed -i 's/[ \t]*$//' docs/components/flags/*.txt + go run scripts/bucketcfggen/main.go --output-dir=docs/flags # Change dir so embedmd understand the local references made in our markdown doc. From 8f95cbced47dee37117d50b4e4dd29accb315012 Mon Sep 17 00:00:00 2001 From: Xiang Dai <764524258@qq.com> Date: Fri, 22 Mar 2019 19:04:13 +0800 Subject: [PATCH 18/43] Add help when make failed (#958) refer to [comment](https://github.com/golang/go/issues/29278#issuecomment-447571410) Signed-off-by: Xiang Dai <764524258@qq.com> --- docs/getting_started.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/getting_started.md b/docs/getting_started.md index 993248fbdd..e1e83bb5ed 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -26,6 +26,19 @@ make The `thanos` binary should now be in your `$PATH` and is the only thing required to deploy any of its components. +If you use `golang` below `1.11.5`, you may meet below error: +``` +go: verifying github.com/grpc-ecosystem/go-grpc-middleware@v1.0.0: checksum mismatch + downloaded: h1:BWIsLfhgKhV5g/oF34aRjniBHLTZe5DNekSjbAjIS6c= + go.sum: h1:Iju5GlWwrvL6UBg4zJJt3btmonfrMlCDdsejg4CZE7c= +Makefile:183: recipe for target 'go-mod-tidy' failed +``` + +You can run following cmd then `make` would pass: +``` +go clean -modcache +``` + ## [Prometheus](https://prometheus.io/) Thanos bases on vanilla Prometheus (v2.2.1+). From 731b269c2d214090d5775b8df565ea098710c2e6 Mon Sep 17 00:00:00 2001 From: Peter Szalai Date: Fri, 22 Mar 2019 12:26:36 +0000 Subject: [PATCH 19/43] objstore.s3: add trace functionality (#937) * add trace functionality to S3 client `minio.Client` has a `TraceOn` method which will be called when one set the `traceon: true` in the bucket config. 
This was a feature request here https://github.com/improbable-eng/thanos/issues/530 * typo * make docs * change s3 trace configuration --- docs/storage.md | 3 +++ pkg/objstore/s3/s3.go | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/docs/storage.md b/docs/storage.md index 3460a7c969..46bdfc8cc2 100644 --- a/docs/storage.md +++ b/docs/storage.md @@ -51,6 +51,8 @@ config: http_config: idle_conn_timeout: 0s insecure_skip_verify: false + trace: + enable: false ``` AWS region to endpoint mapping can be found in this [link](https://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region) @@ -62,6 +64,7 @@ For debug and testing purposes you can set * `insecure: true` to switch to plain insecure HTTP instead of HTTPS * `http_config.insecure_skip_verify: true` to disable TLS certificate verification (if your S3 based storage is using a self-signed certificate, for example) +* `trace.enable: true` to enable the minio client's verbose logging. Each request and response will be logged into the debug logger, so debug level logging must be enabled for this functionality. ### Credentials By default Thanos will try to retrieve credentials from the following sources: diff --git a/pkg/objstore/s3/s3.go b/pkg/objstore/s3/s3.go index 2bfc1041ee..c0fe3b98d9 100644 --- a/pkg/objstore/s3/s3.go +++ b/pkg/objstore/s3/s3.go @@ -44,6 +44,11 @@ type Config struct { SecretKey string `yaml:"secret_key"` PutUserMetadata map[string]string `yaml:"put_user_metadata"` HTTPConfig HTTPConfig `yaml:"http_config"` + TraceConfig TraceConfig `yaml:"trace"` +} + +type TraceConfig struct { + Enable bool `yaml:"enable"` } // HTTPConfig stores the http.Transport configuration for the s3 minio client. @@ -152,6 +157,11 @@ func NewBucketWithConfig(logger log.Logger, config Config, component string) (*B sse = encrypt.NewSSE() } + if config.TraceConfig.Enable { + logWriter := log.NewStdlibAdapter(level.Debug(logger), log.MessageKey("s3TraceMsg")) + client.TraceOn(logWriter) + } + bkt := &Bucket{ logger: logger, name: config.Bucket, From b777825c4ddedd9b43abb4a1a5e155de5f61e772 Mon Sep 17 00:00:00 2001 From: Sylvain Rabot Date: Fri, 22 Mar 2019 16:19:02 +0100 Subject: [PATCH 20/43] Fix panic when Azure error does not match pattern (#961) Signed-off-by: Sylvain Rabot --- pkg/objstore/azure/helpers.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/objstore/azure/helpers.go b/pkg/objstore/azure/helpers.go index 61b1b900f3..da6e96f89c 100644 --- a/pkg/objstore/azure/helpers.go +++ b/pkg/objstore/azure/helpers.go @@ -65,5 +65,9 @@ func getBlobURL(ctx context.Context, accountName, accountKey, containerName, blo func parseError(errorCode string) string { re, _ := regexp.Compile(`X-Ms-Error-Code:\D*\[(\w+)\]`) - return re.FindStringSubmatch(errorCode)[1] + match := re.FindStringSubmatch(errorCode) + if match != nil && len(match) == 2 { + return match[1] + } + return errorCode } From e756fe11d3259368b4d840dc1391229d7a515a67 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Fri, 22 Mar 2019 17:41:01 +0100 Subject: [PATCH 21/43] Make config reloader file writes atomic (#962) * Make config reloader file writes atomic This addresses an issue found in the Prometheus Operator, which reuses this reloader sidecar, but which then also has a second sidecar which may trigger rule-based reloads while the config sidecar is in the middle of writing out its config (in a non-atomic way): https://github.com/coreos/prometheus-operator/issues/2501 I didn't add a test for this because it's hard to catch the original 
problem to begin with, but it has happened. Signed-off-by: Julius Volz * Explicitly ignore os.Remove() error Signed-off-by: Julius Volz --- pkg/reloader/reloader.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pkg/reloader/reloader.go b/pkg/reloader/reloader.go index 68c9d1709d..e666590451 100644 --- a/pkg/reloader/reloader.go +++ b/pkg/reloader/reloader.go @@ -199,9 +199,16 @@ func (r *Reloader) apply(ctx context.Context) error { return errors.Wrap(err, "expand environment variables") } - if err := ioutil.WriteFile(r.cfgOutputFile, b, 0666); err != nil { + tmpFile := r.cfgOutputFile + ".tmp" + defer func() { + _ = os.Remove(tmpFile) + }() + if err := ioutil.WriteFile(tmpFile, b, 0666); err != nil { return errors.Wrap(err, "write file") } + if err := os.Rename(tmpFile, r.cfgOutputFile); err != nil { + return errors.Wrap(err, "rename file") + } } } From f24d55565d6a99b48f741c204c7e3f291f2a6b84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Giedrius=20Statkevi=C4=8Dius?= Date: Sat, 23 Mar 2019 12:27:32 +0200 Subject: [PATCH 22/43] store: add ability to limit max num of samples / concurrent queries (#798) * store: add ability to limit max samples / conc. queries * store/bucket: account for the RawChunk case Convert raw chunks into XOR encoded chunks and call the NumSamples() method on them to calculate the number of samples. Rip out the samples calculation into a different function because it is used in two different places. * store/bucket_e2e_test: adjust sample limit size It should be actually 30 - I miscalculated this. * store/bucket: add metric thanos_bucket_store_queries_limited_total * store/bucket: register queriesLimited metric * store: make changes according to the review comments * docs/store: update * store: gating naming changes, add span/extra metric * store: improve error messages * store/limiter: improve error messages * store/gate: time -> seconds * store/bucket_e2e_test: narrow down the first query * store/bucket: check for negative maxConcurrent * cmd/store: clarify help message * pkg/store: hook thanos_bucket_store_queries_limited into Limiter * store/bucket_test: fix NewBucketStore call * docs: update again * store/gate: spelling fix * store/gate: spelling fix #2 * store/bucket: remove pointless newline * store/gate: generalize gate timing Make the metric show in general how much time it takes for queries to wait at the gate. * store/gate: convert the g.gateTiming metric into a histogram * store/bucket: change comment wording * store/bucket: remove type from maxSamplesPerChunk Let Go decide by itself what kind of type it needs. * store/bucket: rename metric into thanos_bucket_store_queries_dropped * thanos/store: clarify help message Literally explain what it means in the help message so that it would be clearer. * store/gate: rename metric to thanos_bucket_store_queries_in_flight More fitting as decided by everyone. * store/gate: fix MustRegister() call * docs: update * store/bucket: clarify the name of the span Make it more clearer about what it is for. * store/bucket: inline calculation into the function call No need to create an extra variable in a hot path in the code if we can inline it and it will be just as clear. 
* CHANGELOG: add item about this * store/gate: reduce number of buckets * store/bucket: rename metric to thanos_bucket_store_queries_dropped_total * store/bucket: move defer out of code block * store/gate: generalize gate for different kinds of subsystems * store/limiter: remove non-nil check * CHANGELOG: fixes * store/limiter: convert failedCounter to non-ptr * store/limiter: remove invalid comment * *: update according to review comments * CHANGELOG: update * *: fix according to review * *: fix according to review * *: make docs * CHANGELOG: clean up * CHANGELOG: update * *: queries_in_flight_total -> queries_in_flight * store/bucket: do not wraper samplesLimiter error The original error already informs us about what is going wrong. * store/bucket: err -> errors.Wrap It's still useful to know that we are talking about samples here exactly. * store: make store.grpc.series-max-concurrency 20 by default Setting it to 0 by default doesn't make sense since the Go channel becomes unbuffered and all queries will timeout. Set it to 20 by default since that's the limit on Thanos Query and naturally there won't be more than 20 by default so it's good. * CHANGELOG: add warning about new limit --- CHANGELOG.md | 19 ++++++++- cmd/thanos/store.go | 12 ++++++ docs/components/store.md | 9 +++++ pkg/store/bucket.go | 76 +++++++++++++++++++++++++++++++----- pkg/store/bucket_e2e_test.go | 8 ++-- pkg/store/bucket_test.go | 2 +- pkg/store/gate.go | 64 ++++++++++++++++++++++++++++++ pkg/store/limiter.go | 31 +++++++++++++++ 8 files changed, 206 insertions(+), 15 deletions(-) create mode 100644 pkg/store/gate.go create mode 100644 pkg/store/limiter.go diff --git a/CHANGELOG.md b/CHANGELOG.md index eae8477f48..0f573b42ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,23 @@ We use *breaking* word for marking changes that are not backward compatible (rel ### Added - [#811](https://github.com/improbable-eng/thanos/pull/811) Remote write receiver +- [#798](https://github.com/improbable-eng/thanos/pull/798) Ability to limit the maximum concurrent about of Series() calls in Thanos Store and the maximum amount of samples. + +New options: + +* `--store.grpc.series-sample-limit` limits the amount of samples that might be retrieved on a single Series() call. By default it is 0. Consider enabling it by setting it to more than 0 if you are running on limited resources. +* `--store.grpc.series-max-concurrency` limits the number of concurrent Series() calls in Thanos Store. By default it is 20. Considering making it lower or bigger depending on the scale of your deployment. + +New metrics: +* `thanos_bucket_store_queries_dropped_total` shows how many queries were dropped due to the samples limit; +* `thanos_bucket_store_queries_concurrent_max` is a constant metric which shows how many Series() calls can concurrently be executed by Thanos Store; +* `thanos_bucket_store_queries_in_flight` shows how many queries are currently "in flight" i.e. they are being executed; +* `thanos_bucket_store_gate_duration_seconds` shows how many seconds it took for queries to pass through the gate in both cases - when that fails and when it does not. + +New tracing span: +* `store_query_gate_ismyturn` shows how long it took for a query to pass (or not) through the gate. + +:warning: **WARNING** :warning: #798 adds a new default limit to Thanos Store: `--store.grpc.series-max-concurrency`. Most likely you will want to make it the same as `--query.max-concurrent` on Thanos Query. 
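The new `--store.grpc.series-max-concurrency` limit effectively puts a semaphore in front of `Series()` calls. The following self-contained sketch illustrates that idea only; the actual implementation added by this PR wraps Prometheus' `gate` package, as shown further down in this patch.

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// gate admits at most cap(ch) callers at a time, mirroring the idea behind
// --store.grpc.series-max-concurrency. With a capacity of 0 the channel is
// unbuffered and no caller can ever enter, which is why a 0 default is avoided.
type gate struct{ ch chan struct{} }

func newGate(maxConcurrent int) *gate {
	return &gate{ch: make(chan struct{}, maxConcurrent)}
}

// start blocks until a slot is free or the caller's context expires.
func (g *gate) start(ctx context.Context) error {
	select {
	case g.ch <- struct{}{}:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// done releases the slot so the next waiting query can proceed.
func (g *gate) done() { <-g.ch }

func main() {
	g := newGate(2) // Pretend --store.grpc.series-max-concurrency=2.
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	for i := 0; i < 3; i++ {
		if err := g.start(ctx); err != nil {
			// The third call waits a full second and then gives up, since the
			// first two slots were never released with done().
			fmt.Println("query", i, "gave up waiting at the gate:", err)
			return
		}
		fmt.Println("query", i, "admitted")
	}
}
```

As the warning above suggests, keeping this limit aligned with `--query.max-concurrent` on Thanos Query keeps queries from piling up at this gate.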
### Fixed - [#921](https://github.com/improbable-eng/thanos/pull/921) `thanos_objstore_bucket_last_successful_upload_time` now does not appear when no blocks have been uploaded so far @@ -23,7 +40,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel - [#851](https://github.com/improbable-eng/thanos/pull/851) New read API endpoint for api/v1/rules and api/v1/alerts. - [#873](https://github.com/improbable-eng/thanos/pull/873) Store: fix set index cache LRU -:warning: **WARING** :warning: #873 fix fixes actual handling of `index-cache-size`. Handling of limit for this cache was +:warning: **WARNING** :warning: #873 fix fixes actual handling of `index-cache-size`. Handling of limit for this cache was broken so it was unbounded all the time. From this release actual value matters and is extremely low by default. To "revert" the old behaviour (no boundary), use a large enough value. diff --git a/cmd/thanos/store.go b/cmd/thanos/store.go index ddf9893061..7f77e15412 100644 --- a/cmd/thanos/store.go +++ b/cmd/thanos/store.go @@ -36,6 +36,12 @@ func registerStore(m map[string]setupFunc, app *kingpin.Application, name string chunkPoolSize := cmd.Flag("chunk-pool-size", "Maximum size of concurrently allocatable bytes for chunks."). Default("2GB").Bytes() + maxSampleCount := cmd.Flag("store.grpc.series-sample-limit", + "Maximum amount of samples returned via a single Series call. 0 means no limit. NOTE: for efficiency we take 120 as the number of samples in chunk (it cannot be bigger than that), so the actual number of samples might be lower, even though the maximum could be hit."). + Default("0").Uint() + + maxConcurrent := cmd.Flag("store.grpc.series-max-concurrency", "Maximum number of concurrent Series calls.").Default("20").Int() + objStoreConfig := regCommonObjStoreFlags(cmd, "", true) syncInterval := cmd.Flag("sync-block-duration", "Repeat interval for syncing the blocks between local and remote view."). @@ -63,6 +69,8 @@ func registerStore(m map[string]setupFunc, app *kingpin.Application, name string peer, uint64(*indexCacheSize), uint64(*chunkPoolSize), + uint64(*maxSampleCount), + int(*maxConcurrent), name, debugLogging, *syncInterval, @@ -87,6 +95,8 @@ func runStore( peer cluster.Peer, indexCacheSizeBytes uint64, chunkPoolSizeBytes uint64, + maxSampleCount uint64, + maxConcurrent int, component string, verbose bool, syncInterval time.Duration, @@ -117,6 +127,8 @@ func runStore( dataDir, indexCacheSizeBytes, chunkPoolSizeBytes, + maxSampleCount, + maxConcurrent, verbose, blockSyncConcurrency, ) diff --git a/docs/components/store.md b/docs/components/store.md index 8b9d5fc79c..7d26d79627 100644 --- a/docs/components/store.md +++ b/docs/components/store.md @@ -104,6 +104,15 @@ Flags: --index-cache-size=250MB Maximum size of items held in the index cache. --chunk-pool-size=2GB Maximum size of concurrently allocatable bytes for chunks. + --store.grpc.series-sample-limit=0 + Maximum amount of samples returned via a single + Series call. 0 means no limit. NOTE: for + efficiency we take 120 as the number of samples + in chunk (it cannot be bigger than that), so + the actual number of samples might be lower, + even though the maximum could be hit. + --store.grpc.series-max-concurrency=20 + Maximum number of concurrent Series calls. --objstore.config-file= Path to YAML file that contains object store configuration. 
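The sample limit is enforced against an upper-bound estimate rather than an exact count: every chunk a query touches is assumed to hold the maximum of 120 samples, which is why the help text says the actual number of samples might be lower even when the limit trips. A small sketch of that arithmetic with made-up numbers follows; the real check lives in the chunk preload path added to `pkg/store/bucket.go` below.

```go
package main

import "fmt"

// maxSamplesPerChunk is the worst-case number of samples in a single TSDB chunk.
const maxSamplesPerChunk = 120

// checkSampleLimit estimates the worst-case sample count for a query from the
// number of chunks it would have to download. A limit of 0 disables the check.
func checkSampleLimit(numChunks, limit uint64) error {
	if limit == 0 {
		return nil
	}
	if estimate := numChunks * maxSamplesPerChunk; estimate > limit {
		return fmt.Errorf("limit %d violated (estimated %d samples across %d chunks)", limit, estimate, numChunks)
	}
	return nil
}

func main() {
	// With --store.grpc.series-sample-limit=50000, a query selecting 500 chunks
	// is rejected: 500 * 120 = 60000 > 50000, even if the chunks actually hold
	// fewer than 50000 samples in total.
	fmt.Println(checkSampleLimit(500, 50000))
	// 400 chunks pass: 400 * 120 = 48000 <= 50000.
	fmt.Println(checkSampleLimit(400, 50000))
}
```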
diff --git a/pkg/store/bucket.go b/pkg/store/bucket.go index 6c38ee81a7..d03167499e 100644 --- a/pkg/store/bucket.go +++ b/pkg/store/bucket.go @@ -22,6 +22,7 @@ import ( "github.com/improbable-eng/thanos/pkg/block/metadata" "github.com/improbable-eng/thanos/pkg/compact/downsample" "github.com/improbable-eng/thanos/pkg/component" + "github.com/improbable-eng/thanos/pkg/extprom" "github.com/improbable-eng/thanos/pkg/objstore" "github.com/improbable-eng/thanos/pkg/pool" "github.com/improbable-eng/thanos/pkg/runutil" @@ -42,6 +43,14 @@ import ( "google.golang.org/grpc/status" ) +// maxSamplesPerChunk is approximately the max number of samples that we may have in any given chunk. This is needed +// for precalculating the number of samples that we may have to retrieve and decode for any given query +// without downloading them. Please take a look at https://github.com/prometheus/tsdb/pull/397 to know +// where this number comes from. Long story short: TSDB is made in such a way, and it is made in such a way +// because you barely get any improvements in compression when the number of samples is beyond this. +// Take a look at Figure 6 in this whitepaper http://www.vldb.org/pvldb/vol8/p1816-teller.pdf. +const maxSamplesPerChunk = 120 + type bucketStoreMetrics struct { blocksLoaded prometheus.Gauge blockLoads prometheus.Counter @@ -57,6 +66,8 @@ type bucketStoreMetrics struct { seriesMergeDuration prometheus.Histogram resultSeriesCount prometheus.Summary chunkSizeBytes prometheus.Histogram + queriesDropped prometheus.Counter + queriesLimit prometheus.Gauge } func newBucketStoreMetrics(reg prometheus.Registerer) *bucketStoreMetrics { @@ -132,6 +143,15 @@ func newBucketStoreMetrics(reg prometheus.Registerer) *bucketStoreMetrics { }, }) + m.queriesDropped = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_queries_dropped_total", + Help: "Number of queries that were dropped due to the sample limit.", + }) + m.queriesLimit = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "thanos_bucket_store_queries_concurrent_max", + Help: "Number of maximum concurrent queries.", + }) + if reg != nil { reg.MustRegister( m.blockLoads, @@ -148,6 +168,8 @@ func newBucketStoreMetrics(reg prometheus.Registerer) *bucketStoreMetrics { m.seriesMergeDuration, m.resultSeriesCount, m.chunkSizeBytes, + m.queriesDropped, + m.queriesLimit, ) } return &m @@ -173,7 +195,12 @@ type BucketStore struct { // Number of goroutines to use when syncing blocks from object storage. blockSyncConcurrency int - partitioner partitioner + // Query gate which limits the maximum amount of concurrent queries. + queryGate *Gate + + // samplesLimiter limits the number of samples per each Series() call. 
+ samplesLimiter *Limiter + partitioner partitioner } // NewBucketStore creates a new bucket backed store that implements the store API against @@ -185,12 +212,19 @@ func NewBucketStore( dir string, indexCacheSizeBytes uint64, maxChunkPoolBytes uint64, + maxSampleCount uint64, + maxConcurrent int, debugLogging bool, blockSyncConcurrency int, ) (*BucketStore, error) { if logger == nil { logger = log.NewNopLogger() } + + if maxConcurrent < 0 { + return nil, errors.Errorf("max concurrency value cannot be lower than 0 (got %v)", maxConcurrent) + } + indexCache, err := newIndexCache(reg, indexCacheSizeBytes) if err != nil { return nil, errors.Wrap(err, "create index cache") @@ -202,6 +236,7 @@ func NewBucketStore( const maxGapSize = 512 * 1024 + metrics := newBucketStoreMetrics(reg) s := &BucketStore{ logger: logger, bucket: bucket, @@ -212,14 +247,18 @@ func NewBucketStore( blockSets: map[uint64]*bucketBlockSet{}, debugLogging: debugLogging, blockSyncConcurrency: blockSyncConcurrency, + queryGate: NewGate(maxConcurrent, extprom.NewSubsystem(reg, "thanos_bucket_store")), + samplesLimiter: NewLimiter(maxSampleCount, metrics.queriesDropped), partitioner: gapBasedPartitioner{maxGapSize: maxGapSize}, } - s.metrics = newBucketStoreMetrics(reg) + s.metrics = metrics if err := os.MkdirAll(dir, 0777); err != nil { return nil, errors.Wrap(err, "create dir") } + s.metrics.queriesLimit.Set(float64(maxConcurrent)) + return s, nil } @@ -472,7 +511,7 @@ func (s *bucketSeriesSet) Err() error { return s.err } -func (s *BucketStore) blockSeries( +func blockSeries( ctx context.Context, ulid ulid.ULID, extLset map[string]string, @@ -480,6 +519,7 @@ func (s *BucketStore) blockSeries( chunkr *bucketChunkReader, matchers []labels.Matcher, req *storepb.SeriesRequest, + samplesLimiter *Limiter, ) (storepb.SeriesSet, *queryStats, error) { ps, err := indexr.ExpandedPostings(matchers) if err != nil { @@ -557,7 +597,7 @@ func (s *BucketStore) blockSeries( } // Preload all chunks that were marked in the previous stage. - if err := chunkr.preload(); err != nil { + if err := chunkr.preload(samplesLimiter); err != nil { return nil, nil, errors.Wrap(err, "preload chunks") } @@ -661,10 +701,17 @@ func debugFoundBlockSetOverview(logger log.Logger, mint, maxt int64, lset labels } // Series implements the storepb.StoreServer interface. -// TODO(bwplotka): It buffers all chunks in memory and only then streams to client. -// 1. Either count chunk sizes and error out too big query. -// 2. Stream posting -> series -> chunk all together. 
func (s *BucketStore) Series(req *storepb.SeriesRequest, srv storepb.Store_SeriesServer) (err error) { + { + span, _ := tracing.StartSpan(srv.Context(), "store_query_gate_ismyturn") + err := s.queryGate.IsMyTurn(srv.Context()) + span.Finish() + if err != nil { + return errors.Wrapf(err, "failed to wait for turn") + } + } + defer s.queryGate.Done() + matchers, err := translateMatchers(req.Matchers) if err != nil { return status.Error(codes.InvalidArgument, err.Error()) @@ -703,13 +750,14 @@ func (s *BucketStore) Series(req *storepb.SeriesRequest, srv storepb.Store_Serie defer runutil.CloseWithLogOnErr(s.logger, chunkr, "series block") g.Add(func() error { - part, pstats, err := s.blockSeries(ctx, + part, pstats, err := blockSeries(ctx, b.meta.ULID, b.meta.Thanos.Labels, indexr, chunkr, blockMatchers, req, + s.samplesLimiter, ) if err != nil { return errors.Wrapf(err, "fetch series for block %s", b.meta.ULID) @@ -1589,11 +1637,21 @@ func (r *bucketChunkReader) addPreload(id uint64) error { } // preload all added chunk IDs. Must be called before the first call to Chunk is made. -func (r *bucketChunkReader) preload() error { +func (r *bucketChunkReader) preload(samplesLimiter *Limiter) error { const maxChunkSize = 16000 var g run.Group + numChunks := uint64(0) + for _, offsets := range r.preloads { + for range offsets { + numChunks++ + } + } + if err := samplesLimiter.Check(numChunks * maxSamplesPerChunk); err != nil { + return errors.Wrap(err, "exceeded samples limit") + } + for seq, offsets := range r.preloads { sort.Slice(offsets, func(i, j int) bool { return offsets[i] < offsets[j] diff --git a/pkg/store/bucket_e2e_test.go b/pkg/store/bucket_e2e_test.go index 8133e9bc86..997767d055 100644 --- a/pkg/store/bucket_e2e_test.go +++ b/pkg/store/bucket_e2e_test.go @@ -35,7 +35,7 @@ func (s *storeSuite) Close() { s.wg.Wait() } -func prepareStoreWithTestBlocks(t testing.TB, dir string, bkt objstore.Bucket, manyParts bool) *storeSuite { +func prepareStoreWithTestBlocks(t testing.TB, dir string, bkt objstore.Bucket, manyParts bool, maxSampleCount uint64) *storeSuite { series := []labels.Labels{ labels.FromStrings("a", "1", "b", "1"), labels.FromStrings("a", "1", "b", "2"), @@ -87,7 +87,7 @@ func prepareStoreWithTestBlocks(t testing.TB, dir string, bkt objstore.Bucket, m testutil.Ok(t, os.RemoveAll(dir2)) } - store, err := NewBucketStore(log.NewLogfmtLogger(os.Stderr), nil, bkt, dir, 100, 0, false, 20) + store, err := NewBucketStore(log.NewLogfmtLogger(os.Stderr), nil, bkt, dir, 100, 0, maxSampleCount, 20, false, 20) testutil.Ok(t, err) s.store = store @@ -334,7 +334,7 @@ func TestBucketStore_e2e(t *testing.T) { testutil.Ok(t, err) defer func() { testutil.Ok(t, os.RemoveAll(dir)) }() - s := prepareStoreWithTestBlocks(t, dir, bkt, false) + s := prepareStoreWithTestBlocks(t, dir, bkt, false, 0) defer s.Close() testBucketStore_e2e(t, ctx, s) @@ -363,7 +363,7 @@ func TestBucketStore_ManyParts_e2e(t *testing.T) { testutil.Ok(t, err) defer func() { testutil.Ok(t, os.RemoveAll(dir)) }() - s := prepareStoreWithTestBlocks(t, dir, bkt, true) + s := prepareStoreWithTestBlocks(t, dir, bkt, true, 0) defer s.Close() testBucketStore_e2e(t, ctx, s) diff --git a/pkg/store/bucket_test.go b/pkg/store/bucket_test.go index b0d43f23ce..18f953c298 100644 --- a/pkg/store/bucket_test.go +++ b/pkg/store/bucket_test.go @@ -283,7 +283,7 @@ func TestBucketStore_Info(t *testing.T) { dir, err := ioutil.TempDir("", "prometheus-test") testutil.Ok(t, err) - bucketStore, err := NewBucketStore(nil, nil, nil, dir, 2e5, 2e5, false, 20) + 
bucketStore, err := NewBucketStore(nil, nil, nil, dir, 2e5, 2e5, 0, 0, false, 20) testutil.Ok(t, err) resp, err := bucketStore.Info(ctx, &storepb.InfoRequest{}) diff --git a/pkg/store/gate.go b/pkg/store/gate.go new file mode 100644 index 0000000000..dbbcb7d72a --- /dev/null +++ b/pkg/store/gate.go @@ -0,0 +1,64 @@ +package store + +import ( + "context" + "time" + + "github.com/improbable-eng/thanos/pkg/extprom" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/prometheus/pkg/gate" +) + +// Gate wraps the Prometheus gate with extra metrics. +type Gate struct { + g *gate.Gate + inflightQueries prometheus.Gauge + gateTiming prometheus.Histogram +} + +// NewGate returns a new query gate. +func NewGate(maxConcurrent int, reg *extprom.SubsystemRegisterer) *Gate { + g := &Gate{ + g: gate.New(maxConcurrent), + } + g.inflightQueries = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "queries_in_flight", + Help: "Number of queries that are currently in flight.", + Subsystem: reg.Subsystem(), + }) + g.gateTiming = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "gate_duration_seconds", + Help: "How many seconds it took for queries to wait at the gate.", + Buckets: []float64{ + 0.01, 0.05, 0.1, 0.25, 0.6, 1, 2, 3.5, 5, 10, + }, + Subsystem: reg.Subsystem(), + }) + + if r := reg.Registerer(); r != nil { + r.MustRegister(g.inflightQueries, g.gateTiming) + } + + return g +} + +// IsMyTurn iniates a new query and waits until it's our turn to fulfill a query request. +func (g *Gate) IsMyTurn(ctx context.Context) error { + start := time.Now() + defer func() { + g.gateTiming.Observe(float64(time.Now().Sub(start))) + }() + + if err := g.g.Start(ctx); err != nil { + return err + } + + g.inflightQueries.Inc() + return nil +} + +// Done finishes a query. +func (g *Gate) Done() { + g.inflightQueries.Dec() + g.g.Done() +} diff --git a/pkg/store/limiter.go b/pkg/store/limiter.go new file mode 100644 index 0000000000..2c332a2c6b --- /dev/null +++ b/pkg/store/limiter.go @@ -0,0 +1,31 @@ +package store + +import ( + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" +) + +// Limiter is a simple mechanism for checking if something has passed a certain threshold. +type Limiter struct { + limit uint64 + + // Counter metric which we will increase if Check() fails. + failedCounter prometheus.Counter +} + +// NewLimiter returns a new limiter with a specified limit. 0 disables the limit. +func NewLimiter(limit uint64, ctr prometheus.Counter) *Limiter { + return &Limiter{limit: limit, failedCounter: ctr} +} + +// Check checks if the passed number exceeds the limits or not. 
+func (l *Limiter) Check(num uint64) error { + if l.limit == 0 { + return nil + } + if num > l.limit { + l.failedCounter.Inc() + return errors.Errorf("limit %v violated (got %v)", l.limit, num) + } + return nil +} From cf6a67a903a40e5cb5af0655bd80df8cf22948e1 Mon Sep 17 00:00:00 2001 From: Milian Reichardt Date: Sat, 23 Mar 2019 11:43:48 +0100 Subject: [PATCH 23/43] bucket verify: sort metas by MinTime before overlap check (#966) --- CHANGELOG.md | 1 + pkg/verifier/overlapped_blocks.go | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f573b42ab..fa6032cdb7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ New tracing span: ### Fixed - [#921](https://github.com/improbable-eng/thanos/pull/921) `thanos_objstore_bucket_last_successful_upload_time` now does not appear when no blocks have been uploaded so far +- [#966](https://github.com/improbable-eng/thanos/pull/966) Bucket: verify no longer warns about overlapping blocks, that overlap `0s` ## [v0.3.2](https://github.com/improbable-eng/thanos/releases/tag/v0.3.2) - 2019.03.04 diff --git a/pkg/verifier/overlapped_blocks.go b/pkg/verifier/overlapped_blocks.go index 675442ace7..072fe54aec 100644 --- a/pkg/verifier/overlapped_blocks.go +++ b/pkg/verifier/overlapped_blocks.go @@ -2,7 +2,6 @@ package verifier import ( "context" - "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" "github.com/improbable-eng/thanos/pkg/block" @@ -11,6 +10,7 @@ import ( "github.com/oklog/ulid" "github.com/pkg/errors" "github.com/prometheus/tsdb" + "sort" ) const OverlappedBlocksIssueID = "overlapped_blocks" @@ -66,6 +66,11 @@ func fetchOverlaps(ctx context.Context, logger log.Logger, bkt objstore.Bucket) overlaps := map[string]tsdb.Overlaps{} for k, groupMetas := range metas { + + sort.Slice(groupMetas, func(i, j int) bool { + return groupMetas[i].MinTime < groupMetas[j].MinTime + }) + o := tsdb.OverlappingBlocks(groupMetas) if len(o) > 0 { overlaps[k] = o From f25194851c337f40a2cfa92b1014638b6439c250 Mon Sep 17 00:00:00 2001 From: Martin Chodur Date: Sun, 24 Mar 2019 00:35:38 +0100 Subject: [PATCH 24/43] feat sidecar: added info log about successful load of external labels (#965) Signed-off-by: Martin Chodur --- cmd/thanos/sidecar.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cmd/thanos/sidecar.go b/cmd/thanos/sidecar.go index 7334cbd4ec..1e31b54d15 100644 --- a/cmd/thanos/sidecar.go +++ b/cmd/thanos/sidecar.go @@ -22,12 +22,12 @@ import ( "github.com/improbable-eng/thanos/pkg/store" "github.com/improbable-eng/thanos/pkg/store/storepb" "github.com/oklog/run" - "github.com/opentracing/opentracing-go" + opentracing "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/tsdb/labels" "google.golang.org/grpc" - "gopkg.in/alecthomas/kingpin.v2" + kingpin "gopkg.in/alecthomas/kingpin.v2" ) func registerSidecar(m map[string]setupFunc, app *kingpin.Application, name string) { @@ -137,6 +137,10 @@ func runSidecar( return err } + level.Info(logger).Log( + "msg", "successfully loaded prometheus external labels", + "external_labels", m.Labels().String(), + ) promUp.Set(1) lastHeartbeat.Set(float64(time.Now().UnixNano()) / 1e9) return nil From c75388a3ac729498d330408a095c722f4b6b160a Mon Sep 17 00:00:00 2001 From: Jack Neely Date: Mon, 25 Mar 2019 11:28:32 -0400 Subject: [PATCH 25/43] compact: accept malformed index (#953) * Allow malformed index patch Handle cases where we detect that 
postings have labels listed incorrectly due to Prometheus Isuue #5372. With a command line option set these specific errors can be ignored as they happen with Prometheus 2.8.0 and lesser versions. * Fix space in structured logging key * Fix tests to not ignore out of order label errors in postings * Skip verification of newly compacted block if allow malformed index The VerifyIndex() function explicitly states testing of the invariant ordering, so rather than adding additional parameters to change its behavior when --debug.accept-malformed-index is set, we skip the verification step on the newly compacted TSDB block. This allows compaction to happen as normal when out of order labels are present in the index. * PR feedback for acceptMalformedIndex * Use fields instead of function parameters * use the same variable name everywhere * acceptMalformedIndex: Fix tests reflecting field vs parameters * Route acceptMalformedIndex option via the Syncer * accept-malformed-index: PR feedback for comments and error msgs --- cmd/thanos/compact.go | 8 +++++++- pkg/block/index.go | 26 +++++++++++++++++++++++++- pkg/compact/compact.go | 15 +++++++++++++-- pkg/compact/compact_e2e_test.go | 5 +++-- 4 files changed, 48 insertions(+), 6 deletions(-) diff --git a/cmd/thanos/compact.go b/cmd/thanos/compact.go index dd462d6b51..c95cf113d6 100644 --- a/cmd/thanos/compact.go +++ b/cmd/thanos/compact.go @@ -66,6 +66,9 @@ func registerCompact(m map[string]setupFunc, app *kingpin.Application, name stri haltOnError := cmd.Flag("debug.halt-on-error", "Halt the process if a critical compaction error is detected."). Hidden().Default("true").Bool() + acceptMalformedIndex := cmd.Flag("debug.accept-malformed-index", + "Compaction index verification will ignore out of order label names."). + Hidden().Default("false").Bool() httpAddr := regHTTPAddrFlag(cmd) @@ -102,6 +105,7 @@ func registerCompact(m map[string]setupFunc, app *kingpin.Application, name stri objStoreConfig, time.Duration(*syncDelay), *haltOnError, + *acceptMalformedIndex, *wait, map[compact.ResolutionLevel]time.Duration{ compact.ResolutionLevelRaw: time.Duration(*retentionRaw), @@ -125,6 +129,7 @@ func runCompact( objStoreConfig *pathOrContent, syncDelay time.Duration, haltOnError bool, + acceptMalformedIndex bool, wait bool, retentionByResolution map[compact.ResolutionLevel]time.Duration, component string, @@ -162,7 +167,8 @@ func runCompact( } }() - sy, err := compact.NewSyncer(logger, reg, bkt, syncDelay, blockSyncConcurrency) + sy, err := compact.NewSyncer(logger, reg, bkt, syncDelay, + blockSyncConcurrency, acceptMalformedIndex) if err != nil { return errors.Wrap(err, "create syncer") } diff --git a/pkg/block/index.go b/pkg/block/index.go index 232bf40d37..777654f2e6 100644 --- a/pkg/block/index.go +++ b/pkg/block/index.go @@ -16,6 +16,7 @@ import ( "github.com/prometheus/tsdb/fileutil" "github.com/go-kit/kit/log" + "github.com/go-kit/kit/log/level" "github.com/improbable-eng/thanos/pkg/runutil" "github.com/oklog/ulid" "github.com/pkg/errors" @@ -248,6 +249,20 @@ type Stats struct { // Specifically we mean here chunks with minTime == block.maxTime and maxTime > block.MaxTime. These are // are segregated into separate counters. These chunks are safe to be deleted, since they are duplicated across 2 blocks. Issue347OutsideChunks int + // OutOfOrderLabels represents the number of postings that contained out + // of order labels, a bug present in Prometheus 2.8.0 and below. 
+ OutOfOrderLabels int +} + +// PrometheusIssue5372Err returns an error if the Stats object indicates +// postings with out of order labels. This is corrected by Prometheus Issue +// #5372 and affects Prometheus versions 2.8.0 and below. +func (i Stats) PrometheusIssue5372Err() error { + if i.OutOfOrderLabels > 0 { + return errors.Errorf("index contains %d postings with out of order labels", + i.OutOfOrderLabels) + } + return nil } // Issue347OutsideChunksErr returns error if stats indicates issue347 block issue, that is repaired explicitly before compaction (on plan block). @@ -301,6 +316,10 @@ func (i Stats) AnyErr() error { errMsg = append(errMsg, err.Error()) } + if err := i.PrometheusIssue5372Err(); err != nil { + errMsg = append(errMsg, err.Error()) + } + if len(errMsg) > 0 { return errors.New(strings.Join(errMsg, ", ")) } @@ -348,7 +367,12 @@ func GatherIndexIssueStats(logger log.Logger, fn string, minTime int64, maxTime l0 := lset[0] for _, l := range lset[1:] { if l.Name < l0.Name { - return stats, errors.Errorf("out-of-order label set %s for series %d", lset, id) + stats.OutOfOrderLabels++ + level.Warn(logger).Log("msg", + "out-of-order label set: known bug in Prometheus 2.8.0 and below", + "labelset", fmt.Sprintf("%s", lset), + "series", fmt.Sprintf("%d", id), + ) } l0 = l } diff --git a/pkg/compact/compact.go b/pkg/compact/compact.go index ca05adfb4a..6fd89b76e9 100644 --- a/pkg/compact/compact.go +++ b/pkg/compact/compact.go @@ -47,6 +47,7 @@ type Syncer struct { blocksMtx sync.Mutex blockSyncConcurrency int metrics *syncerMetrics + acceptMalformedIndex bool } type syncerMetrics struct { @@ -128,7 +129,7 @@ func newSyncerMetrics(reg prometheus.Registerer) *syncerMetrics { // NewSyncer returns a new Syncer for the given Bucket and directory. // Blocks must be at least as old as the sync delay for being considered. 
-func NewSyncer(logger log.Logger, reg prometheus.Registerer, bkt objstore.Bucket, syncDelay time.Duration, blockSyncConcurrency int) (*Syncer, error) { +func NewSyncer(logger log.Logger, reg prometheus.Registerer, bkt objstore.Bucket, syncDelay time.Duration, blockSyncConcurrency int, acceptMalformedIndex bool) (*Syncer, error) { if logger == nil { logger = log.NewNopLogger() } @@ -140,6 +141,7 @@ func NewSyncer(logger log.Logger, reg prometheus.Registerer, bkt objstore.Bucket bkt: bkt, metrics: newSyncerMetrics(reg), blockSyncConcurrency: blockSyncConcurrency, + acceptMalformedIndex: acceptMalformedIndex, }, nil } @@ -291,6 +293,7 @@ func (c *Syncer) Groups() (res []*Group, err error) { c.bkt, labels.FromMap(m.Thanos.Labels), m.Thanos.Downsample.Resolution, + c.acceptMalformedIndex, c.metrics.compactions.WithLabelValues(GroupKey(*m)), c.metrics.compactionFailures.WithLabelValues(GroupKey(*m)), c.metrics.garbageCollectedBlocks, @@ -436,6 +439,7 @@ type Group struct { resolution int64 mtx sync.Mutex blocks map[ulid.ULID]*metadata.Meta + acceptMalformedIndex bool compactions prometheus.Counter compactionFailures prometheus.Counter groupGarbageCollectedBlocks prometheus.Counter @@ -447,6 +451,7 @@ func newGroup( bkt objstore.Bucket, lset labels.Labels, resolution int64, + acceptMalformedIndex bool, compactions prometheus.Counter, compactionFailures prometheus.Counter, groupGarbageCollectedBlocks prometheus.Counter, @@ -460,6 +465,7 @@ func newGroup( labels: lset, resolution: resolution, blocks: map[ulid.ULID]*metadata.Meta{}, + acceptMalformedIndex: acceptMalformedIndex, compactions: compactions, compactionFailures: compactionFailures, groupGarbageCollectedBlocks: groupGarbageCollectedBlocks, @@ -769,6 +775,11 @@ func (cg *Group) compact(ctx context.Context, dir string, comp tsdb.Compactor) ( if err := stats.Issue347OutsideChunksErr(); err != nil { return false, ulid.ULID{}, issue347Error(errors.Wrapf(err, "invalid, but reparable block %s", pdir), meta.ULID) } + + if err := stats.PrometheusIssue5372Err(); !cg.acceptMalformedIndex && err != nil { + return false, ulid.ULID{}, errors.Wrapf(err, + "block id %s, try running with --debug.accept-malformed-index", id) + } } level.Debug(cg.logger).Log("msg", "downloaded and verified blocks", "blocks", fmt.Sprintf("%v", plan), "duration", time.Since(begin)) @@ -816,7 +827,7 @@ func (cg *Group) compact(ctx context.Context, dir string, comp tsdb.Compactor) ( } // Ensure the output block is valid. - if err := block.VerifyIndex(cg.logger, filepath.Join(bdir, block.IndexFilename), newMeta.MinTime, newMeta.MaxTime); err != nil { + if err := block.VerifyIndex(cg.logger, filepath.Join(bdir, block.IndexFilename), newMeta.MinTime, newMeta.MaxTime); !cg.acceptMalformedIndex && err != nil { return false, ulid.ULID{}, halt(errors.Wrapf(err, "invalid result block %s", bdir)) } diff --git a/pkg/compact/compact_e2e_test.go b/pkg/compact/compact_e2e_test.go index 6f2194c41f..738048c5c2 100644 --- a/pkg/compact/compact_e2e_test.go +++ b/pkg/compact/compact_e2e_test.go @@ -32,7 +32,7 @@ func TestSyncer_SyncMetas_e2e(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second) defer cancel() - sy, err := NewSyncer(nil, nil, bkt, 0, 1) + sy, err := NewSyncer(nil, nil, bkt, 0, 1, false) testutil.Ok(t, err) // Generate 15 blocks. Initially the first 10 are synced into memory and only the last @@ -134,7 +134,7 @@ func TestSyncer_GarbageCollect_e2e(t *testing.T) { } // Do one initial synchronization with the bucket. 
- sy, err := NewSyncer(nil, nil, bkt, 0, 1) + sy, err := NewSyncer(nil, nil, bkt, 0, 1, false) testutil.Ok(t, err) testutil.Ok(t, sy.SyncMetas(ctx)) @@ -244,6 +244,7 @@ func TestGroup_Compact_e2e(t *testing.T) { bkt, extLset, 124, + false, metrics.compactions.WithLabelValues(""), metrics.compactionFailures.WithLabelValues(""), metrics.garbageCollectedBlocks, From db0d3eec6daa4561e509031d715ffcd761187c3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Tue, 26 Mar 2019 10:00:54 +0000 Subject: [PATCH 26/43] Fixed reloader tests to remove flakiness, simplified them. (#969) Signed-off-by: Bartek Plotka --- pkg/reloader/reloader.go | 69 +++--- pkg/reloader/reloader_test.go | 386 +++++++++------------------------- 2 files changed, 147 insertions(+), 308 deletions(-) diff --git a/pkg/reloader/reloader.go b/pkg/reloader/reloader.go index e666590451..88b87a80bb 100644 --- a/pkg/reloader/reloader.go +++ b/pkg/reloader/reloader.go @@ -81,7 +81,7 @@ type Reloader struct { cfgFile string cfgOutputFile string ruleDirs []string - ruleInterval time.Duration + watchInterval time.Duration retryInterval time.Duration lastCfgHash []byte @@ -105,55 +105,74 @@ func New(logger log.Logger, reloadURL *url.URL, cfgFile string, cfgOutputFile st cfgFile: cfgFile, cfgOutputFile: cfgOutputFile, ruleDirs: ruleDirs, - ruleInterval: 3 * time.Minute, + watchInterval: 3 * time.Minute, retryInterval: 5 * time.Second, } } -// Watch starts to watch the config file and rules and process them until the context +// We cannot detect everything via watch. Watch interval controls how often we re-read given dirs non-recursively. +func (r *Reloader) WithWatchInterval(duration time.Duration) { + r.watchInterval = duration +} + +// Watch starts to watch periodically the config file and rules and process them until the context // gets canceled. Config file gets env expanded if cfgOutputFile is specified and reload is trigger if // config or rules changed. +// Watch watchers periodically based on r.watchInterval. +// For config file it watches it directly as well via fsnotify. +// It watches rule dirs as well, but lot's of edge cases are missing, so rely on interval mostly. func (r *Reloader) Watch(ctx context.Context) error { - configWatcher, err := fsnotify.NewWatcher() + watcher, err := fsnotify.NewWatcher() if err != nil { return errors.Wrap(err, "create watcher") } - defer runutil.CloseWithLogOnErr(r.logger, configWatcher, "config watcher close") + defer runutil.CloseWithLogOnErr(r.logger, watcher, "config watcher close") + watchables := map[string]struct{}{} if r.cfgFile != "" { - if err := configWatcher.Add(r.cfgFile); err != nil { - return errors.Wrap(err, "add config file watch") + watchables[filepath.Dir(r.cfgFile)] = struct{}{} + if err := watcher.Add(r.cfgFile); err != nil { + return errors.Wrapf(err, "add config file %s to watcher", r.cfgFile) } - level.Info(r.logger).Log( - "msg", "started watching config file for changes", - "in", r.cfgFile, - "out", r.cfgOutputFile) - err := r.apply(ctx) - if err != nil { + if err := r.apply(ctx); err != nil { return err } } - tick := time.NewTicker(r.ruleInterval) + // Watch rule dirs in best effort manner. 
+ for _, ruleDir := range r.ruleDirs { + watchables[filepath.Dir(ruleDir)] = struct{}{} + if err := watcher.Add(ruleDir); err != nil { + return errors.Wrapf(err, "add rule dir %s to watcher", ruleDir) + } + } + + tick := time.NewTicker(r.watchInterval) defer tick.Stop() + level.Info(r.logger).Log( + "msg", "started watching config file and non-recursively rule dirs for changes", + "cfg", r.cfgFile, + "out", r.cfgOutputFile, + "dirs", strings.Join(r.ruleDirs, ",")) + for { select { case <-ctx.Done(): return nil case <-tick.C: - case event := <-configWatcher.Events: - if event.Name != r.cfgFile { + case event := <-watcher.Events: + // TODO(bwplotka): Add metric if we are not cycling CPU here too much. + if _, ok := watchables[filepath.Dir(event.Name)]; !ok { continue } - case err := <-configWatcher.Errors: + case err := <-watcher.Errors: level.Error(r.logger).Log("msg", "watch error", "err", err) continue } - err := r.apply(ctx) - if err != nil { + if err := r.apply(ctx); err != nil { // Critical error. return err } @@ -162,7 +181,7 @@ func (r *Reloader) Watch(ctx context.Context) error { // apply triggers Prometheus reload if rules or config changed. If cfgOutputFile is set, we also // expand env vars into config file before reloading. -// Reload is retried in retryInterval until ruleInterval. +// Reload is retried in retryInterval until watchInterval. func (r *Reloader) apply(ctx context.Context) error { var ( cfgHash []byte @@ -254,8 +273,10 @@ func (r *Reloader) apply(ctx context.Context) error { } // Retry trigger reload until it succeeded or next tick is near. - retryCtx, cancel := context.WithTimeout(ctx, r.ruleInterval) - err := runutil.RetryWithLog(r.logger, r.retryInterval, retryCtx.Done(), func() error { + retryCtx, cancel := context.WithTimeout(ctx, r.watchInterval) + defer cancel() + + if err := runutil.RetryWithLog(r.logger, r.retryInterval, retryCtx.Done(), func() error { if err := r.triggerReload(ctx); err != nil { return errors.Wrap(err, "trigger reload") } @@ -268,9 +289,7 @@ func (r *Reloader) apply(ctx context.Context) error { "cfg_out", r.cfgOutputFile, "rule_dirs", strings.Join(r.ruleDirs, ", ")) return nil - }) - cancel() - if err != nil { + }); err != nil { level.Error(r.logger).Log("msg", "Failed to trigger reload. Retrying.", "err", err) } diff --git a/pkg/reloader/reloader_test.go b/pkg/reloader/reloader_test.go index 1eea9fa926..7d3438543e 100644 --- a/pkg/reloader/reloader_test.go +++ b/pkg/reloader/reloader_test.go @@ -11,6 +11,7 @@ import ( "path" "strings" "sync" + "sync/atomic" "testing" "time" @@ -24,13 +25,11 @@ func TestReloader_ConfigApply(t *testing.T) { l, err := net.Listen("tcp", "localhost:0") testutil.Ok(t, err) - reloads := 0 + reloads := &atomic.Value{} + reloads.Store(0) i := 0 - promHandlerMu := sync.Mutex{} srv := &http.Server{} srv.Handler = http.HandlerFunc(func(resp http.ResponseWriter, r *http.Request) { - promHandlerMu.Lock() - defer promHandlerMu.Unlock() i++ if i%2 == 0 { // Every second request, fail to ensure that retry works. @@ -38,7 +37,7 @@ func TestReloader_ConfigApply(t *testing.T) { return } - reloads++ + reloads.Store(reloads.Load().(int) + 1) // The only writer. resp.WriteHeader(http.StatusOK) }) go func() { @@ -61,6 +60,7 @@ func TestReloader_ConfigApply(t *testing.T) { output = path.Join(dir, "out", "cfg.yaml") ) reloader := New(nil, reloadURL, input, output, nil) + reloader.watchInterval = 9999 * time.Hour // Disable interval to test watch logic only. 
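An aside on the reloader change above: the reworked `Watch` combines direct fsnotify watches (fast reaction for paths that can actually be registered) with a coarse ticker that re-applies everything every `watchInterval`, so changes that fsnotify misses are still picked up on the next tick. A minimal, self-contained sketch of that watch-plus-interval pattern follows; it is not the Thanos code itself, and it assumes the `github.com/fsnotify/fsnotify` package plus a caller-supplied `apply` callback:

```go
package watchexample

import (
	"context"
	"log"
	"time"

	"github.com/fsnotify/fsnotify"
)

// watchLoop reacts quickly to file system events on the given paths and,
// independently of that, re-runs apply on every tick. The ticker is the
// safety net: even if an event is missed (nested dirs, symlink swaps),
// the next tick re-applies the current state.
func watchLoop(ctx context.Context, paths []string, interval time.Duration, apply func() error) error {
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		return err
	}
	defer watcher.Close()

	for _, p := range paths {
		if err := watcher.Add(p); err != nil {
			return err
		}
	}

	tick := time.NewTicker(interval)
	defer tick.Stop()

	for {
		select {
		case <-ctx.Done():
			return nil
		case <-tick.C: // Periodic re-apply; the main mechanism.
		case event := <-watcher.Events:
			log.Println("fs event:", event.Name) // Fast path for direct changes.
		case err := <-watcher.Errors:
			log.Println("watch error:", err)
			continue
		}
		if err := apply(); err != nil {
			return err
		}
	}
}
```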
reloader.retryInterval = 100 * time.Millisecond testNoConfig(t, reloader) @@ -77,206 +77,100 @@ config: testutil.Ok(t, os.Setenv("TEST_RELOADER_THANOS_ENV", "2")) testutil.Ok(t, os.Setenv("TEST_RELOADER_THANOS_ENV2", "3")) - reloadsFn := func() int { - promHandlerMu.Lock() - promHandlerMu.Unlock() - return reloads - } - testInitialApply(t, reloader, reloadsFn, output) - - reloads = 0 - reloader.lastCfgHash = []byte{} - reloader.lastRuleHash = []byte{} - testOnChangeApply(t, reloader, reloadsFn, input, output) -} - -func testNoConfig(t *testing.T, reloader *Reloader) { - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) - err := reloader.Watch(ctx) - cancel() - testutil.NotOk(t, err) - testutil.Assert(t, strings.HasSuffix(err.Error(), "no such file or directory"), "expect error since there is no input config.") -} - -func testUnsetVariables(t *testing.T, reloader *Reloader) { - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) - err := reloader.Watch(ctx) - cancel() - testutil.NotOk(t, err) - testutil.Assert(t, strings.HasSuffix(err.Error(), `found reference to unset environment variable "TEST_RELOADER_THANOS_ENV"`), "expect error since there envvars are not set.") -} - -func testInitialApply(t *testing.T, reloader *Reloader, reloadsFn func() int, output string) { - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + g := sync.WaitGroup{} + g.Add(1) go func() { + defer g.Done() + defer cancel() + + reloadsSeen := 0 for { select { case <-ctx.Done(): - case <-time.After(500 * time.Millisecond): + return + case <-time.After(300 * time.Millisecond): } - if ctx.Err() != nil { - break + rel := reloads.Load().(int) + if rel <= reloadsSeen { + continue } + reloadsSeen = rel - if reloadsFn() > 0 { - break - } - } - }() - err := reloader.Watch(ctx) - cancel() - testutil.Ok(t, err) + switch rel { + case 1: + // Initial apply seen (without doing nothing) - testutil.Equals(t, 1, reloadsFn()) - f, err := ioutil.ReadFile(output) - testutil.Ok(t, err) + // Output looks as expected? + f, err := ioutil.ReadFile(output) + testutil.Ok(t, err) - testutil.Equals(t, ` + testutil.Equals(t, ` config: a: 1 b: 2 c: 3 `, string(f)) -} - -func testOnChangeApply(t *testing.T, reloader *Reloader, reloadsFn func() int, input string, output string) { - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) - go func() { - for { - select { - case <-ctx.Done(): - case <-time.After(500 * time.Millisecond): - } - if ctx.Err() != nil { - break - } - - if reloadsFn() == 1 { + // Change config, expect reload in another iteration. 
testutil.Ok(t, ioutil.WriteFile(input, []byte(` config: a: changed b: $(TEST_RELOADER_THANOS_ENV) c: $(TEST_RELOADER_THANOS_ENV2) `), os.ModePerm)) - continue - } - if reloadsFn() > 1 { - break - } - } - }() - err := reloader.Watch(ctx) - cancel() - testutil.Ok(t, err) - - testutil.Equals(t, 2, reloadsFn()) - f, err := ioutil.ReadFile(output) - testutil.Ok(t, err) + case 2: + f, err := ioutil.ReadFile(output) + testutil.Ok(t, err) - testutil.Equals(t, ` + testutil.Equals(t, ` config: a: changed b: 2 c: 3 `, string(f)) -} - -func TestReloader_RuleApply(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - - l, err := net.Listen("tcp", "localhost:0") - testutil.Ok(t, err) - - reloads := 0 - i := 0 - promHandlerMu := sync.Mutex{} - srv := &http.Server{} - srv.Handler = http.HandlerFunc(func(resp http.ResponseWriter, r *http.Request) { - promHandlerMu.Lock() - defer promHandlerMu.Unlock() - - i++ - if i%2 == 0 { - // Every second request, fail to ensure that retry works. - resp.WriteHeader(http.StatusServiceUnavailable) - return - } - - reloads++ - resp.WriteHeader(http.StatusOK) - }) - go func() { - _ = srv.Serve(l) - }() - defer func() { testutil.Ok(t, srv.Close()) }() - - reloadURL, err := url.Parse(fmt.Sprintf("http://%s", l.Addr().String())) - testutil.Ok(t, err) - - dir, err := ioutil.TempDir("", "reloader-rules-test") - testutil.Ok(t, err) - defer func() { testutil.Ok(t, os.RemoveAll(dir)) }() - - reloader := New(nil, reloadURL, "", "", []string{dir}) - reloader.ruleInterval = 100 * time.Millisecond - reloader.retryInterval = 100 * time.Millisecond - - reloadsFn := func() int { - promHandlerMu.Lock() - promHandlerMu.Unlock() - return reloads - } - - testutil.Ok(t, ioutil.WriteFile(dir+"/rule1.yaml", []byte("rule"), os.ModePerm)) - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - go func() { - for { - select { - case <-ctx.Done(): - case <-time.After(300 * time.Millisecond): } - if ctx.Err() != nil { - break - } - - if reloadsFn() == 1 { - testutil.Ok(t, ioutil.WriteFile(dir+"/rule2.yaml", []byte("rule2"), os.ModePerm)) - continue - } - if reloadsFn() == 2 { - // Change rule 1. - testutil.Ok(t, ioutil.WriteFile(dir+"/rule1.yaml", []byte("rule1-changed"), os.ModePerm)) - continue - } - if reloadsFn() > 2 { - break + if rel > 1 { + // All good. 
+ return } } }() err = reloader.Watch(ctx) cancel() - testutil.Ok(t, err) + g.Wait() + testutil.Equals(t, 2, reloads.Load().(int)) +} - testutil.Equals(t, 3, reloadsFn()) +func testNoConfig(t *testing.T, reloader *Reloader) { + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + err := reloader.Watch(ctx) + cancel() + testutil.NotOk(t, err) + testutil.Assert(t, strings.HasSuffix(err.Error(), "no such file or directory"), "expect error since there is no input config.") } -func TestReloader_RuleApplySymlink(t *testing.T) { +func testUnsetVariables(t *testing.T, reloader *Reloader) { + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + err := reloader.Watch(ctx) + cancel() + testutil.NotOk(t, err) + testutil.Assert(t, strings.HasSuffix(err.Error(), `found reference to unset environment variable "TEST_RELOADER_THANOS_ENV"`), "expect error since there envvars are not set.") +} + +func TestReloader_RuleApply(t *testing.T) { defer leaktest.CheckTimeout(t, 10*time.Second)() l, err := net.Listen("tcp", "localhost:0") testutil.Ok(t, err) - reloads := 0 + reloads := &atomic.Value{} + reloads.Store(0) i := 0 - promHandlerMu := sync.Mutex{} srv := &http.Server{} srv.Handler = http.HandlerFunc(func(resp http.ResponseWriter, r *http.Request) { - promHandlerMu.Lock() - defer promHandlerMu.Unlock() - i++ if i%2 == 0 { // Every second request, fail to ensure that retry works. @@ -284,7 +178,7 @@ func TestReloader_RuleApplySymlink(t *testing.T) { return } - reloads++ + reloads.Store(reloads.Load().(int) + 1) // The only writer. resp.WriteHeader(http.StatusOK) }) go func() { @@ -299,148 +193,74 @@ func TestReloader_RuleApplySymlink(t *testing.T) { testutil.Ok(t, err) defer func() { testutil.Ok(t, os.RemoveAll(dir)) }() - sourceDir, err := ioutil.TempDir("", "reload-rules-test-source") - testutil.Ok(t, err) - defer func() { testutil.Ok(t, os.RemoveAll(sourceDir)) }() - - reloader := New(nil, reloadURL, "", "", []string{dir}) - reloader.ruleInterval = 100 * time.Millisecond - reloader.retryInterval = 100 * time.Millisecond - - reloadsFn := func() int { - promHandlerMu.Lock() - promHandlerMu.Unlock() - return reloads - } - - var ( - rules1 string = sourceDir + "/rules-v1.yaml" - rules2 = sourceDir + "/rules-v2.yaml" - watchedSymlink = dir + "/rules.yaml" - atomicSwapSymlink = sourceDir + "/atomic" - ) - // write our source configs - testutil.Ok(t, ioutil.WriteFile(rules1, []byte("rules"), os.ModePerm)) - testutil.Ok(t, ioutil.WriteFile(rules2, []byte("rules-modified"), os.ModePerm)) - // start with v1 - testutil.Ok(t, os.Symlink(rules1, watchedSymlink)) - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - go func() { - for { - select { - case <-ctx.Done(): - case <-time.After(300 * time.Millisecond): - } - - if ctx.Err() != nil { - break - } - - if reloadsFn() == 1 { // first load - // swap the symlink atomically - testutil.Ok(t, os.Symlink(rules2, atomicSwapSymlink)) - testutil.Ok(t, os.Rename(atomicSwapSymlink, watchedSymlink)) - } - - if reloadsFn() > 1 { - break - } - } - }() - err = reloader.Watch(ctx) - cancel() + dir2, err := ioutil.TempDir("", "reload-rules-test2") testutil.Ok(t, err) - testutil.Equals(t, 2, reloadsFn()) -} + defer func() { testutil.Ok(t, os.RemoveAll(dir2)) }() -func TestReloader_RuleApplySymlinkDirs(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + // Symlinked directory. 
+ testutil.Ok(t, os.Mkdir(path.Join(dir2, "rule-dir"), os.ModePerm)) + testutil.Ok(t, os.Symlink(path.Join(dir2, "rule-dir"), path.Join(dir, "rule-dir"))) - l, err := net.Listen("tcp", "localhost:0") - testutil.Ok(t, err) + reloader := New(nil, reloadURL, "", "", []string{dir, path.Join(dir, "rule-dir")}) + reloader.watchInterval = 100 * time.Millisecond + reloader.retryInterval = 100 * time.Millisecond - reloads := 0 - i := 0 - promHandlerMu := sync.Mutex{} - srv := &http.Server{} - srv.Handler = http.HandlerFunc(func(resp http.ResponseWriter, r *http.Request) { - promHandlerMu.Lock() - defer promHandlerMu.Unlock() + // Some initial state. + testutil.Ok(t, ioutil.WriteFile(path.Join(dir, "rule1.yaml"), []byte("rule"), os.ModePerm)) + testutil.Ok(t, ioutil.WriteFile(path.Join(dir2, "rule3-source.yaml"), []byte("rule3"), os.ModePerm)) + testutil.Ok(t, ioutil.WriteFile(path.Join(dir2, "rule-dir", "rule4.yaml"), []byte("rule4"), os.ModePerm)) - i++ - if i%2 == 0 { - // Every second request, fail to ensure that retry works. - resp.WriteHeader(http.StatusServiceUnavailable) - return - } - - reloads++ - resp.WriteHeader(http.StatusOK) - }) + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + g := sync.WaitGroup{} + g.Add(1) go func() { - _ = srv.Serve(l) - }() - defer func() { testutil.Ok(t, srv.Close()) }() - - reloadURL, err := url.Parse(fmt.Sprintf("http://%s", l.Addr().String())) - testutil.Ok(t, err) + defer g.Done() + defer cancel() - workingDir, err := ioutil.TempDir("", "reloader-rules-test") - testutil.Ok(t, err) - defer func() { testutil.Ok(t, os.RemoveAll(workingDir)) }() - - watchedSymlink := workingDir + "/watched" - atomicSwapSymlink := workingDir + "/atomic" - sourceDirA := workingDir + "/a" - sourceDirB := workingDir + "/b" - testutil.Ok(t, os.Mkdir(sourceDirA, os.ModePerm)) - testutil.Ok(t, os.Mkdir(sourceDirB, os.ModePerm)) - // write our source configs - testutil.Ok(t, ioutil.WriteFile(sourceDirA+"/rules.yaml", []byte("rules"), os.ModePerm)) - testutil.Ok(t, ioutil.WriteFile(sourceDirB+"/rules.yaml", []byte("rules-modified"), os.ModePerm)) - // start with v1 - testutil.Ok(t, os.Symlink(sourceDirA, watchedSymlink)) - // set up reloader - reloader := New(nil, reloadURL, "", "", []string{watchedSymlink}) - reloader.ruleInterval = 100 * time.Millisecond - reloader.retryInterval = 100 * time.Millisecond - reloadsFn := func() int { - promHandlerMu.Lock() - promHandlerMu.Unlock() - return reloads - } - - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - go func() { + reloadsSeen := 0 for { select { case <-ctx.Done(): + return case <-time.After(300 * time.Millisecond): } - if ctx.Err() != nil { - break - } - - if reloadsFn() == 1 { // first load - // swap the symlink atomically - testutil.Ok(t, os.Symlink(sourceDirB, atomicSwapSymlink)) - testutil.Ok(t, os.Rename(atomicSwapSymlink, watchedSymlink)) + rel := reloads.Load().(int) + if rel != 0 && rel <= reloadsSeen { + continue } - - if reloadsFn() == 2 { - // swap the symlink back - testutil.Ok(t, os.Symlink(sourceDirA, atomicSwapSymlink)) - testutil.Ok(t, os.Rename(atomicSwapSymlink, watchedSymlink)) + reloadsSeen = rel + + t.Log("Performing step number", rel) + + switch rel { + case 0: + // Add new rule file. + testutil.Ok(t, ioutil.WriteFile(path.Join(dir, "rule2.yaml"), []byte("rule2"), os.ModePerm)) + case 1: + // Change rule 1 in place. + testutil.Ok(t, ioutil.WriteFile(path.Join(dir, "rule1.yaml"), []byte("rule1-changed"), os.ModePerm)) + case 2: + // Add new rule as symlink. 
+ testutil.Ok(t, os.Symlink(path.Join(dir2, "rule3-source.yaml"), path.Join(dir2, "rule3.yaml"))) + testutil.Ok(t, os.Rename(path.Join(dir2, "rule3.yaml"), path.Join(dir, "rule3.yaml"))) + case 3: + // Change rule in symlink. + testutil.Ok(t, ioutil.WriteFile(path.Join(dir2, "rule3-source.yaml"), []byte("rule3-changed"), os.ModePerm)) + case 4: + // Change rule in symlinked directory.. + testutil.Ok(t, ioutil.WriteFile(path.Join(dir2, "rule-dir", "rule4.yaml"), []byte("rule4-changed"), os.ModePerm)) } - - if reloadsFn() > 2 { - break + if rel > 4 { + // All good. + return } } }() err = reloader.Watch(ctx) cancel() + g.Wait() + testutil.Ok(t, err) - testutil.Equals(t, 3, reloadsFn()) + testutil.Equals(t, 5, reloads.Load().(int)) } From faae80ad6fa3c95cecc9976e8c5e7c1ee3d88cfa Mon Sep 17 00:00:00 2001 From: Xiang Dai <764524258@qq.com> Date: Tue, 26 Mar 2019 19:20:45 +0800 Subject: [PATCH 27/43] Add more help about make failed (#973) Signed-off-by: Xiang Dai <764524258@qq.com> --- docs/getting_started.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/getting_started.md b/docs/getting_started.md index e1e83bb5ed..e73d71f372 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -26,7 +26,7 @@ make The `thanos` binary should now be in your `$PATH` and is the only thing required to deploy any of its components. -If you use `golang` below `1.11.5`, you may meet below error: +You may meet below error: ``` go: verifying github.com/grpc-ecosystem/go-grpc-middleware@v1.0.0: checksum mismatch downloaded: h1:BWIsLfhgKhV5g/oF34aRjniBHLTZe5DNekSjbAjIS6c= @@ -34,11 +34,13 @@ go: verifying github.com/grpc-ecosystem/go-grpc-middleware@v1.0.0: checksum mism Makefile:183: recipe for target 'go-mod-tidy' failed ``` -You can run following cmd then `make` would pass: +If your `golang` version is `1.11.4`, you can run following cmd then `make` would pass: ``` go clean -modcache ``` +If your `golang` version is below `1.11.4`, highly recommend you upgrade to `1.11.4` or above. + ## [Prometheus](https://prometheus.io/) Thanos bases on vanilla Prometheus (v2.2.1+). From 5b74df2ae745e61f1ba09c347e63dea271df6d2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Tue, 26 Mar 2019 11:26:35 +0000 Subject: [PATCH 28/43] Replace extprom with native Registry Wrapper. (#971) Signed-off-by: Bartek Plotka --- cmd/thanos/query.go | 9 ++++++--- cmd/thanos/rule.go | 12 ++++++++---- pkg/discovery/dns/provider.go | 14 ++++---------- pkg/extprom/extprom.go | 36 ++++++++++++----------------------- pkg/extprom/extprom_test.go | 14 -------------- pkg/store/bucket.go | 9 ++++++--- pkg/store/gate.go | 31 ++++++++++++++---------------- 7 files changed, 50 insertions(+), 75 deletions(-) delete mode 100644 pkg/extprom/extprom_test.go diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index a32c7c6c7a..98b52808fb 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -29,7 +29,7 @@ import ( "github.com/improbable-eng/thanos/pkg/tracing" "github.com/improbable-eng/thanos/pkg/ui" "github.com/oklog/run" - "github.com/opentracing/opentracing-go" + opentracing "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/route" @@ -39,7 +39,7 @@ import ( "github.com/prometheus/tsdb/labels" "google.golang.org/grpc" "google.golang.org/grpc/credentials" - "gopkg.in/alecthomas/kingpin.v2" + kingpin "gopkg.in/alecthomas/kingpin.v2" ) // registerQuery registers a query command. 
@@ -276,7 +276,10 @@ func runQuery( } fileSDCache := cache.New() - dnsProvider := dns.NewProvider(logger, extprom.NewSubsystem(reg, "query_store_api")) + dnsProvider := dns.NewProvider( + logger, + extprom.WrapRegistererWithPrefix("thanos_querier_store_apis", reg), + ) var ( stores = query.NewStoreSet( diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index a3847105b4..e9b4a87a0e 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -30,7 +30,7 @@ import ( "github.com/improbable-eng/thanos/pkg/extprom" "github.com/improbable-eng/thanos/pkg/objstore/client" "github.com/improbable-eng/thanos/pkg/promclient" - "github.com/improbable-eng/thanos/pkg/rule/api" + v1 "github.com/improbable-eng/thanos/pkg/rule/api" "github.com/improbable-eng/thanos/pkg/runutil" "github.com/improbable-eng/thanos/pkg/shipper" "github.com/improbable-eng/thanos/pkg/store" @@ -38,7 +38,7 @@ import ( "github.com/improbable-eng/thanos/pkg/tracing" "github.com/improbable-eng/thanos/pkg/ui" "github.com/oklog/run" - "github.com/opentracing/opentracing-go" + opentracing "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" @@ -52,7 +52,7 @@ import ( "github.com/prometheus/prometheus/util/strutil" "github.com/prometheus/tsdb/labels" "google.golang.org/grpc" - "gopkg.in/alecthomas/kingpin.v2" + kingpin "gopkg.in/alecthomas/kingpin.v2" ) // registerRule registers a rule command. @@ -257,7 +257,11 @@ func runRule( // FileSD query addresses. fileSDCache := cache.New() - dnsProvider := dns.NewProvider(logger, extprom.NewSubsystem(reg, "rule_query")) + + dnsProvider := dns.NewProvider( + logger, + extprom.WrapRegistererWithPrefix("thanos_ruler_query_apis", reg), + ) // Hit the HTTP query API of query peers in randomized order until we get a result // back or the context get canceled. diff --git a/pkg/discovery/dns/provider.go b/pkg/discovery/dns/provider.go index e07b5afced..7d1e1357c1 100644 --- a/pkg/discovery/dns/provider.go +++ b/pkg/discovery/dns/provider.go @@ -5,8 +5,6 @@ import ( "strings" "sync" - "github.com/improbable-eng/thanos/pkg/extprom" - "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" "github.com/prometheus/client_golang/prometheus" @@ -25,28 +23,24 @@ type Provider struct { } // NewProvider returns a new empty provider with a default resolver. -func NewProvider(logger log.Logger, reg *extprom.SubsystemRegisterer) *Provider { +func NewProvider(logger log.Logger, reg prometheus.Registerer) *Provider { p := &Provider{ resolver: NewResolver(), resolved: make(map[string][]string), logger: logger, resolverLookupsCount: prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "thanos", - Subsystem: reg.Subsystem(), Name: "dns_lookups_total", Help: "The number of DNS lookups resolutions attempts", }), resolverFailuresCount: prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "thanos", - Subsystem: reg.Subsystem(), Name: "dns_failures_total", Help: "The number of DNS lookup failures", }), } - if r := reg.Registerer(); r != nil { - r.MustRegister(p.resolverLookupsCount) - r.MustRegister(p.resolverFailuresCount) + if reg != nil { + reg.MustRegister(p.resolverLookupsCount) + reg.MustRegister(p.resolverFailuresCount) } return p diff --git a/pkg/extprom/extprom.go b/pkg/extprom/extprom.go index 4489119961..f2d7d09ab1 100644 --- a/pkg/extprom/extprom.go +++ b/pkg/extprom/extprom.go @@ -1,33 +1,21 @@ -// Package extprom is covering code that is used for extending native Prometheus packages functionality. 
- package extprom -import ( - "github.com/prometheus/client_golang/prometheus" -) - -// SubsystemRegisterer type allows for subsystem specification. All packages that uses this type -// should use the gien subsystem. -// Registerer methods works even if registerer reference points to nil. -type SubsystemRegisterer struct { - registerer prometheus.Registerer - subsystem string -} +import "github.com/prometheus/client_golang/prometheus" -func (r *SubsystemRegisterer) Registerer() prometheus.Registerer { - if r == nil { +// WrapRegistererWithPrefix is like prometheus.WrapRegistererWithPrefix but it passes nil straight through +// which allows nil check. +func WrapRegistererWithPrefix(prefix string, reg prometheus.Registerer) prometheus.Registerer { + if reg == nil { return nil } - return r.registerer + return prometheus.WrapRegistererWithPrefix(prefix, reg) } -func (r *SubsystemRegisterer) Subsystem() string { - if r == nil { - return "" +// WrapRegistererWith is like prometheus.WrapRegistererWith but it passes nil straight through +// which allows nil check. +func WrapRegistererWith(labels prometheus.Labels, reg prometheus.Registerer) prometheus.Registerer { + if reg == nil { + return nil } - return r.subsystem -} - -func NewSubsystem(reg prometheus.Registerer, subsystem string) *SubsystemRegisterer { - return &SubsystemRegisterer{registerer: reg, subsystem: subsystem} + return prometheus.WrapRegistererWith(labels, reg) } diff --git a/pkg/extprom/extprom_test.go b/pkg/extprom/extprom_test.go deleted file mode 100644 index bac67e7672..0000000000 --- a/pkg/extprom/extprom_test.go +++ /dev/null @@ -1,14 +0,0 @@ -package extprom - -import ( - "testing" - - "github.com/improbable-eng/thanos/pkg/testutil" -) - -func TestNilRegisterer(t *testing.T) { - var r *SubsystemRegisterer - - testutil.Equals(t, "", r.Subsystem()) - testutil.Equals(t, nil, r.Registerer()) -} diff --git a/pkg/store/bucket.go b/pkg/store/bucket.go index d03167499e..fb295d167a 100644 --- a/pkg/store/bucket.go +++ b/pkg/store/bucket.go @@ -247,9 +247,12 @@ func NewBucketStore( blockSets: map[uint64]*bucketBlockSet{}, debugLogging: debugLogging, blockSyncConcurrency: blockSyncConcurrency, - queryGate: NewGate(maxConcurrent, extprom.NewSubsystem(reg, "thanos_bucket_store")), - samplesLimiter: NewLimiter(maxSampleCount, metrics.queriesDropped), - partitioner: gapBasedPartitioner{maxGapSize: maxGapSize}, + queryGate: NewGate( + maxConcurrent, + extprom.WrapRegistererWithPrefix("thanos_bucket_store_series", reg), + ), + samplesLimiter: NewLimiter(maxSampleCount, metrics.queriesDropped), + partitioner: gapBasedPartitioner{maxGapSize: maxGapSize}, } s.metrics = metrics diff --git a/pkg/store/gate.go b/pkg/store/gate.go index dbbcb7d72a..0b6d00de1a 100644 --- a/pkg/store/gate.go +++ b/pkg/store/gate.go @@ -4,7 +4,6 @@ import ( "context" "time" - "github.com/improbable-eng/thanos/pkg/extprom" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/prometheus/pkg/gate" ) @@ -17,26 +16,24 @@ type Gate struct { } // NewGate returns a new query gate. 
-func NewGate(maxConcurrent int, reg *extprom.SubsystemRegisterer) *Gate { +func NewGate(maxConcurrent int, reg prometheus.Registerer) *Gate { g := &Gate{ g: gate.New(maxConcurrent), + inflightQueries: prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "gate_queries_in_flight", + Help: "Number of queries that are currently in flight.", + }), + gateTiming: prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "gate_duration_seconds", + Help: "How many seconds it took for queries to wait at the gate.", + Buckets: []float64{ + 0.01, 0.05, 0.1, 0.25, 0.6, 1, 2, 3.5, 5, 10, + }, + }), } - g.inflightQueries = prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "queries_in_flight", - Help: "Number of queries that are currently in flight.", - Subsystem: reg.Subsystem(), - }) - g.gateTiming = prometheus.NewHistogram(prometheus.HistogramOpts{ - Name: "gate_duration_seconds", - Help: "How many seconds it took for queries to wait at the gate.", - Buckets: []float64{ - 0.01, 0.05, 0.1, 0.25, 0.6, 1, 2, 3.5, 5, 10, - }, - Subsystem: reg.Subsystem(), - }) - if r := reg.Registerer(); r != nil { - r.MustRegister(g.inflightQueries, g.gateTiming) + if reg != nil { + reg.MustRegister(g.inflightQueries, g.gateTiming) } return g From 441a769b10208da018a8fb74f27bc6a2d47b402a Mon Sep 17 00:00:00 2001 From: Povilas Versockas Date: Tue, 26 Mar 2019 13:47:58 +0200 Subject: [PATCH 29/43] query: Add store.response-timeout (#928) * Add store.receive-timeout * Apply suggestions from code review Co-Authored-By: povilasv * Apply suggestions from code review Co-Authored-By: povilasv * Fixes after review * Update pkg/store/proxy.go Co-Authored-By: povilasv * Fixes after review --- cmd/thanos/query.go | 6 +- docs/components/query.md | 5 ++ pkg/query/storeset.go | 4 ++ pkg/store/proxy.go | 99 ++++++++++++++++++++++------ pkg/store/proxy_test.go | 139 ++++++++++++++++++++++++++++++++++++++- 5 files changed, 230 insertions(+), 23 deletions(-) diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 98b52808fb..792ff7362c 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -91,6 +91,8 @@ func registerQuery(m map[string]setupFunc, app *kingpin.Application, name string enablePartialResponse := cmd.Flag("query.partial-response", "Enable partial response for queries if no partial_response param is specified."). Default("true").Bool() + storeResponseTimeout := modelDuration(cmd.Flag("store.response-timeout", "If a Store doesn't send any data in this specified duration then a Store will be ignored and partial data will be returned if it's enabled. 
0 disables timeout.").Default("0ms")) + m[name] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ bool) error { peer, err := newPeerFn(logger, reg, true, *httpAdvertiseAddr, true) if err != nil { @@ -139,6 +141,7 @@ func registerQuery(m map[string]setupFunc, app *kingpin.Application, name string *webPrefixHeaderName, *maxConcurrentQueries, time.Duration(*queryTimeout), + time.Duration(*storeResponseTimeout), *replicaLabel, peer, selectorLset, @@ -254,6 +257,7 @@ func runQuery( webPrefixHeaderName string, maxConcurrentQueries int, queryTimeout time.Duration, + storeResponseTimeout time.Duration, replicaLabel string, peer cluster.Peer, selectorLset labels.Labels, @@ -307,7 +311,7 @@ func runQuery( }, dialOpts, ) - proxy = store.NewProxyStore(logger, stores.Get, component.Query, selectorLset) + proxy = store.NewProxyStore(logger, stores.Get, component.Query, selectorLset, storeResponseTimeout) queryableCreator = query.NewQueryableCreator(logger, proxy, replicaLabel) engine = promql.NewEngine( promql.EngineOpts{ diff --git a/docs/components/query.md b/docs/components/query.md index 3a08f70603..972796eeca 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -270,5 +270,10 @@ Flags: if no max_source_resolution param is specified. --query.partial-response Enable partial response for queries if no partial_response param is specified. + --store.response-timeout=0ms + If a Store doesn't send any data in this + specified duration then a Store will be ignored + and partial data will be returned if it's + enabled. 0 disables timeout. ``` diff --git a/pkg/query/storeset.go b/pkg/query/storeset.go index bb9c7698df..a660e6566e 100644 --- a/pkg/query/storeset.go +++ b/pkg/query/storeset.go @@ -196,6 +196,10 @@ func (s *storeRef) String() string { return fmt.Sprintf("Addr: %s Labels: %v Mint: %d Maxt: %d", s.addr, s.Labels(), mint, maxt) } +func (s *storeRef) Addr() string { + return s.addr +} + func (s *storeRef) close() { runutil.CloseWithLogOnErr(s.logger, s.cc, fmt.Sprintf("store %v connection close", s.addr)) } diff --git a/pkg/store/proxy.go b/pkg/store/proxy.go index 556bf77da6..316f803d6d 100644 --- a/pkg/store/proxy.go +++ b/pkg/store/proxy.go @@ -7,6 +7,7 @@ import ( "math" "strings" "sync" + "time" "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" @@ -32,6 +33,8 @@ type Client interface { TimeRange() (mint int64, maxt int64) String() string + // Addr returns address of a Client. + Addr() string } // ProxyStore implements the store API that proxies request to all given underlying stores. @@ -40,6 +43,8 @@ type ProxyStore struct { stores func() []Client component component.StoreAPI selectorLabels labels.Labels + + responseTimeout time.Duration } // NewProxyStore returns a new ProxyStore that uses the given clients that implements storeAPI to fan-in all series to the client. 
@@ -49,15 +54,18 @@ func NewProxyStore( stores func() []Client, component component.StoreAPI, selectorLabels labels.Labels, + responseTimeout time.Duration, ) *ProxyStore { if logger == nil { logger = log.NewNopLogger() } + s := &ProxyStore{ - logger: logger, - stores: stores, - component: component, - selectorLabels: selectorLabels, + logger: logger, + stores: stores, + component: component, + selectorLabels: selectorLabels, + responseTimeout: responseTimeout, } return s } @@ -147,7 +155,11 @@ func (s *ProxyStore) Series(r *storepb.SeriesRequest, srv storepb.Store_SeriesSe } storeDebugMsgs = append(storeDebugMsgs, fmt.Sprintf("store %s queried", st)) - sc, err := st.Series(gctx, r) + // This is used to cancel this stream when one operations takes too long. + seriesCtx, closeSeries := context.WithCancel(gctx) + defer closeSeries() + + sc, err := st.Series(seriesCtx, r) if err != nil { storeID := fmt.Sprintf("%v", storepb.LabelsToString(st.Labels())) if storeID == "" { @@ -162,12 +174,13 @@ func (s *ProxyStore) Series(r *storepb.SeriesRequest, srv storepb.Store_SeriesSe continue } - // Schedule streamSeriesSet that translates gRPC streamed response into seriesSet (if series) or respCh if warnings. - seriesSet = append(seriesSet, startStreamSeriesSet(gctx, wg, sc, respSender, st.String(), !r.PartialResponseDisabled)) + // Schedule streamSeriesSet that translates gRPC streamed response + // into seriesSet (if series) or respCh if warnings. + seriesSet = append(seriesSet, startStreamSeriesSet(seriesCtx, s.logger, closeSeries, + wg, sc, respSender, st.String(), !r.PartialResponseDisabled, s.responseTimeout)) } level.Debug(s.logger).Log("msg", strings.Join(storeDebugMsgs, ";")) - if len(seriesSet) == 0 { // This is indicates that configured StoreAPIs are not the ones end user expects err := errors.New("No store matched for this query") @@ -196,7 +209,6 @@ func (s *ProxyStore) Series(r *storepb.SeriesRequest, srv storepb.Store_SeriesSe return err } return nil - } type warnSender interface { @@ -206,6 +218,9 @@ type warnSender interface { // streamSeriesSet iterates over incoming stream of series. // All errors are sent out of band via warning channel. 
type streamSeriesSet struct { + ctx context.Context + logger log.Logger + stream storepb.Store_SeriesClient warnCh warnSender @@ -215,30 +230,44 @@ type streamSeriesSet struct { errMtx sync.Mutex err error - name string + name string + partialResponse bool + + responseTimeout time.Duration + closeSeries context.CancelFunc } func startStreamSeriesSet( ctx context.Context, + logger log.Logger, + closeSeries context.CancelFunc, wg *sync.WaitGroup, stream storepb.Store_SeriesClient, warnCh warnSender, name string, partialResponse bool, + responseTimeout time.Duration, ) *streamSeriesSet { s := &streamSeriesSet{ - stream: stream, - warnCh: warnCh, - recvCh: make(chan *storepb.Series, 10), - name: name, + ctx: ctx, + logger: logger, + closeSeries: closeSeries, + stream: stream, + warnCh: warnCh, + recvCh: make(chan *storepb.Series, 10), + name: name, + partialResponse: partialResponse, + responseTimeout: responseTimeout, } wg.Add(1) go func() { defer wg.Done() defer close(s.recvCh) + for { r, err := s.stream.Recv() + if err == io.EOF { return } @@ -248,14 +277,15 @@ func startStreamSeriesSet( } if err != nil { + wrapErr := errors.Wrapf(err, "receive series from %s", s.name) if partialResponse { - s.warnCh.send(storepb.NewWarnSeriesResponse(errors.Wrap(err, "receive series"))) + s.warnCh.send(storepb.NewWarnSeriesResponse(wrapErr)) return } s.errMtx.Lock() - defer s.errMtx.Unlock() - s.err = err + s.err = wrapErr + s.errMtx.Unlock() return } @@ -269,10 +299,39 @@ func startStreamSeriesSet( return s } -// Next blocks until new message is received or stream is closed. +// Next blocks until new message is received or stream is closed or operation is timed out. func (s *streamSeriesSet) Next() (ok bool) { - s.currSeries, ok = <-s.recvCh - return ok + ctx := s.ctx + timeoutMsg := fmt.Sprintf("failed to receive any data from %s", s.name) + + if s.responseTimeout != 0 { + timeoutMsg = fmt.Sprintf("failed to receive any data in %s from %s", s.responseTimeout.String(), s.name) + + timeoutCtx, done := context.WithTimeout(s.ctx, s.responseTimeout) + defer done() + ctx = timeoutCtx + } + + select { + case s.currSeries, ok = <-s.recvCh: + return ok + case <-ctx.Done(): + // closeSeries to shutdown a goroutine in startStreamSeriesSet. 
+ s.closeSeries() + + err := errors.Wrap(ctx.Err(), timeoutMsg) + if s.partialResponse { + level.Warn(s.logger).Log("err", err, "msg", "returning partial response") + s.warnCh.send(storepb.NewWarnSeriesResponse(err)) + return false + } + s.errMtx.Lock() + s.err = err + s.errMtx.Unlock() + + level.Warn(s.logger).Log("err", err, "msg", "partial response disabled; aborting request") + return false + } } func (s *streamSeriesSet) At() ([]storepb.Label, []storepb.AggrChunk) { diff --git a/pkg/store/proxy_test.go b/pkg/store/proxy_test.go index fa3e7e110a..ed1c1d3b0a 100644 --- a/pkg/store/proxy_test.go +++ b/pkg/store/proxy_test.go @@ -4,6 +4,7 @@ import ( "context" "io" "math" + "os" "testing" "time" @@ -42,6 +43,9 @@ func (c *testClient) String() string { return "test" } +func (c *testClient) Addr() string { + return "testaddr" +} func TestProxyStore_Info(t *testing.T) { defer leaktest.CheckTimeout(t, 10*time.Second)() @@ -51,7 +55,7 @@ func TestProxyStore_Info(t *testing.T) { q := NewProxyStore(nil, func() []Client { return nil }, component.Query, - nil, + nil, 0*time.Second, ) resp, err := q.Info(ctx, &storepb.InfoRequest{}) @@ -400,11 +404,135 @@ func TestProxyStore_Series(t *testing.T) { expectedErr: errors.New("fetch series for [name:\"ext\" value:\"1\" ] test: error!"), }, } { + + if ok := t.Run(tc.title, func(t *testing.T) { + q := NewProxyStore(nil, + func() []Client { return tc.storeAPIs }, + component.Query, + tc.selectorLabels, + 0*time.Second, + ) + + s := newStoreSeriesServer(context.Background()) + + err := q.Series(tc.req, s) + if tc.expectedErr != nil { + testutil.NotOk(t, err) + testutil.Equals(t, tc.expectedErr.Error(), err.Error()) + return + } + + testutil.Ok(t, err) + + seriesEqual(t, tc.expectedSeries, s.SeriesSet) + testutil.Equals(t, tc.expectedWarningsLen, len(s.Warnings), "got %v", s.Warnings) + }); !ok { + return + } + } +} + +func TestProxyStore_SeriesSlowStores(t *testing.T) { + enable := os.Getenv("THANOS_ENABLE_STORE_READ_TIMEOUT_TESTS") + if enable == "" { + t.Skip("enable THANOS_ENABLE_STORE_READ_TIMEOUT_TESTS to run store-read-timeout tests") + } + + defer leaktest.CheckTimeout(t, 20*time.Second)() + + for _, tc := range []struct { + title string + storeAPIs []Client + selectorLabels tlabels.Labels + + req *storepb.SeriesRequest + + expectedSeries []rawSeries + expectedErr error + expectedWarningsLen int + }{ + { + title: "partial response disabled one thanos query is slow to respond", + storeAPIs: []Client{ + &testClient{ + StoreClient: &mockedStoreAPI{ + RespSeries: []*storepb.SeriesResponse{ + storepb.NewWarnSeriesResponse(errors.New("warning")), + storeSeriesResponse(t, labels.FromStrings("a", "b"), []sample{{1, 1}, {2, 2}, {3, 3}}), + }, + RespDuration: 10 * time.Second, + }, + labels: []storepb.Label{{Name: "ext", Value: "1"}}, + minTime: 1, + maxTime: 300, + }, + &testClient{ + StoreClient: &mockedStoreAPI{ + RespSeries: []*storepb.SeriesResponse{ + storepb.NewWarnSeriesResponse(errors.New("warning")), + storeSeriesResponse(t, labels.FromStrings("a", "b"), []sample{{1, 1}, {2, 2}, {3, 3}}), + }, + }, + labels: []storepb.Label{{Name: "ext", Value: "1"}}, + minTime: 1, + maxTime: 300, + }, + }, + req: &storepb.SeriesRequest{ + MinTime: 1, + MaxTime: 300, + Matchers: []storepb.LabelMatcher{{Name: "ext", Value: "1", Type: storepb.LabelMatcher_EQ}}, + PartialResponseDisabled: true, + }, + expectedErr: errors.New("test: failed to receive any data in 4s from test: context deadline exceeded"), + }, + { + title: "partial response enabled one thanos query is 
slow to respond", + storeAPIs: []Client{ + &testClient{ + StoreClient: &mockedStoreAPI{ + RespSeries: []*storepb.SeriesResponse{ + storepb.NewWarnSeriesResponse(errors.New("warning")), + storeSeriesResponse(t, labels.FromStrings("a", "b"), []sample{{1, 1}, {2, 2}, {3, 3}}), + }, + }, + labels: []storepb.Label{{Name: "ext", Value: "1"}}, + minTime: 1, + maxTime: 300, + }, + &testClient{ + StoreClient: &mockedStoreAPI{ + RespSeries: []*storepb.SeriesResponse{ + storepb.NewWarnSeriesResponse(errors.New("warning")), + storeSeriesResponse(t, labels.FromStrings("b", "c"), []sample{{1, 1}, {2, 2}, {3, 3}}), + }, + RespDuration: 10 * time.Second, + }, + labels: []storepb.Label{{Name: "ext", Value: "1"}}, + minTime: 1, + maxTime: 300, + }, + }, + req: &storepb.SeriesRequest{ + MinTime: 1, + MaxTime: 300, + Matchers: []storepb.LabelMatcher{{Name: "ext", Value: "1", Type: storepb.LabelMatcher_EQ}}, + }, + expectedSeries: []rawSeries{ + { + lset: []storepb.Label{{Name: "a", Value: "b"}}, + samples: []sample{{1, 1}, {2, 2}, {3, 3}}, + }, + }, + expectedWarningsLen: 2, + }, + } { if ok := t.Run(tc.title, func(t *testing.T) { q := NewProxyStore(nil, func() []Client { return tc.storeAPIs }, component.Query, tc.selectorLabels, + 4*time.Second, ) s := newStoreSeriesServer(context.Background()) @@ -446,6 +574,7 @@ func TestProxyStore_Series_RequestParamsProxied(t *testing.T) { func() []Client { return cls }, component.Query, nil, + 0*time.Second, ) ctx := context.Background() @@ -504,6 +633,7 @@ func TestProxyStore_Series_RegressionFillResponseChannel(t *testing.T) { func() []Client { return cls }, component.Query, tlabels.FromStrings("fed", "a"), + 0*time.Second, ) ctx := context.Background() @@ -541,6 +671,7 @@ func TestProxyStore_LabelValues(t *testing.T) { func() []Client { return cls }, component.Query, nil, + 0*time.Second, ) ctx := context.Background() @@ -700,6 +831,7 @@ type mockedStoreAPI struct { RespSeries []*storepb.SeriesResponse RespLabelValues *storepb.LabelValuesResponse RespError error + RespDuration time.Duration LastSeriesReq *storepb.SeriesRequest LastLabelValuesReq *storepb.LabelValuesRequest @@ -712,7 +844,7 @@ func (s *mockedStoreAPI) Info(ctx context.Context, req *storepb.InfoRequest, _ . func (s *mockedStoreAPI) Series(ctx context.Context, req *storepb.SeriesRequest, _ ...grpc.CallOption) (storepb.Store_SeriesClient, error) { s.LastSeriesReq = req - return &StoreSeriesClient{ctx: ctx, respSet: s.RespSeries}, s.RespError + return &StoreSeriesClient{ctx: ctx, respSet: s.RespSeries, respDur: s.RespDuration}, s.RespError } func (s *mockedStoreAPI) LabelNames(ctx context.Context, req *storepb.LabelNamesRequest, _ ...grpc.CallOption) (*storepb.LabelNamesResponse, error) { @@ -732,9 +864,12 @@ type StoreSeriesClient struct { ctx context.Context i int respSet []*storepb.SeriesResponse + respDur time.Duration } func (c *StoreSeriesClient) Recv() (*storepb.SeriesResponse, error) { + time.Sleep(c.respDur) + if c.i >= len(c.respSet) { return nil, io.EOF } From 7465db9f12431e48950a5d6e5ef9939b8dc5e992 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Tue, 26 Mar 2019 12:01:55 +0000 Subject: [PATCH 30/43] e2e: Killed gossip e2e tests. 
(#972) Signed-off-by: Bartek Plotka --- test/e2e/query_test.go | 35 ++++------ test/e2e/rule_test.go | 18 ++--- test/e2e/spinup_test.go | 124 +++++++++++---------------------- test/e2e/store_gateway_test.go | 6 +- 4 files changed, 59 insertions(+), 124 deletions(-) diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go index bc1c458857..7ef9c7ac8b 100644 --- a/test/e2e/query_test.go +++ b/test/e2e/query_test.go @@ -25,36 +25,25 @@ var ( firstPromPort = promHTTPPort(1) remoteWriteEndpoint = fmt.Sprintf("http://%s/api/v1/receive", remoteWriteReceiveHTTP(1)) - queryGossipSuite = newSpinupSuite(). - Add(scraper(1, defaultPromConfig("prom-"+firstPromPort, 0), true)). - Add(scraper(2, defaultPromConfig("prom-ha", 0), true)). - Add(scraper(3, defaultPromConfig("prom-ha", 1), true)). - Add(querier(1, "replica"), queryCluster(1)). - Add(querier(2, "replica"), queryCluster(2)) - queryStaticFlagsSuite = newSpinupSuite(). - Add(scraper(1, defaultPromConfig("prom-"+firstPromPort, 0), false)). - Add(scraper(2, defaultPromConfig("prom-ha", 0), false)). - Add(scraper(3, defaultPromConfig("prom-ha", 1), false)). - Add(querierWithStoreFlags(1, "replica", sidecarGRPC(1), sidecarGRPC(2), sidecarGRPC(3), remoteWriteReceiveGRPC(1)), ""). - Add(querierWithStoreFlags(2, "replica", sidecarGRPC(1), sidecarGRPC(2), sidecarGRPC(3), remoteWriteReceiveGRPC(1)), ""). - Add(receiver(1, defaultPromRemoteWriteConfig(remoteWriteEndpoint)), "") + Add(scraper(1, defaultPromConfig("prom-"+firstPromPort, 0))). + Add(scraper(2, defaultPromConfig("prom-ha", 0))). + Add(scraper(3, defaultPromConfig("prom-ha", 1))). + Add(querierWithStoreFlags(1, "replica", sidecarGRPC(1), sidecarGRPC(2), sidecarGRPC(3), remoteWriteReceiveGRPC(1))). + Add(querierWithStoreFlags(2, "replica", sidecarGRPC(1), sidecarGRPC(2), sidecarGRPC(3), remoteWriteReceiveGRPC(1))). + Add(receiver(1, defaultPromRemoteWriteConfig(remoteWriteEndpoint))) queryFileSDSuite = newSpinupSuite(). - Add(scraper(1, defaultPromConfig("prom-"+firstPromPort, 0), false)). - Add(scraper(2, defaultPromConfig("prom-ha", 0), false)). - Add(scraper(3, defaultPromConfig("prom-ha", 1), false)). - Add(querierWithFileSD(1, "replica", sidecarGRPC(1), sidecarGRPC(2), sidecarGRPC(3), remoteWriteReceiveGRPC(1)), ""). - Add(querierWithFileSD(2, "replica", sidecarGRPC(1), sidecarGRPC(2), sidecarGRPC(3), remoteWriteReceiveGRPC(1)), ""). - Add(receiver(1, defaultPromRemoteWriteConfig(remoteWriteEndpoint)), "") + Add(scraper(1, defaultPromConfig("prom-"+firstPromPort, 0))). + Add(scraper(2, defaultPromConfig("prom-ha", 0))). + Add(scraper(3, defaultPromConfig("prom-ha", 1))). + Add(querierWithFileSD(1, "replica", sidecarGRPC(1), sidecarGRPC(2), sidecarGRPC(3), remoteWriteReceiveGRPC(1))). + Add(querierWithFileSD(2, "replica", sidecarGRPC(1), sidecarGRPC(2), sidecarGRPC(3), remoteWriteReceiveGRPC(1))). + Add(receiver(1, defaultPromRemoteWriteConfig(remoteWriteEndpoint))) ) func TestQuery(t *testing.T) { for _, tt := range []testConfig{ - { - "gossip", - queryGossipSuite, - }, { "staticFlag", queryStaticFlagsSuite, diff --git a/test/e2e/rule_test.go b/test/e2e/rule_test.go index a6b8a03f4d..4af72bd648 100644 --- a/test/e2e/rule_test.go +++ b/test/e2e/rule_test.go @@ -29,31 +29,21 @@ groups: ` var ( - ruleGossipSuite = newSpinupSuite(). - Add(querier(1, ""), queryCluster(1)). - Add(ruler(1, alwaysFireRule)). - Add(ruler(2, alwaysFireRule)). - Add(alertManager(1), "") - ruleStaticFlagsSuite = newSpinupSuite(). - Add(querierWithStoreFlags(1, "", rulerGRPC(1), rulerGRPC(2)), ""). 
+ Add(querierWithStoreFlags(1, "", rulerGRPC(1), rulerGRPC(2))). Add(rulerWithQueryFlags(1, alwaysFireRule, queryHTTP(1))). Add(rulerWithQueryFlags(2, alwaysFireRule, queryHTTP(1))). - Add(alertManager(1), "") + Add(alertManager(1)) ruleFileSDSuite = newSpinupSuite(). - Add(querierWithFileSD(1, "", rulerGRPC(1), rulerGRPC(2)), ""). + Add(querierWithFileSD(1, "", rulerGRPC(1), rulerGRPC(2))). Add(rulerWithFileSD(1, alwaysFireRule, queryHTTP(1))). Add(rulerWithFileSD(2, alwaysFireRule, queryHTTP(1))). - Add(alertManager(1), "") + Add(alertManager(1)) ) func TestRule(t *testing.T) { for _, tt := range []testConfig{ - { - "gossip", - ruleGossipSuite, - }, { "staticFlag", ruleStaticFlagsSuite, diff --git a/test/e2e/spinup_test.go b/test/e2e/spinup_test.go index cdf8dd18d6..592c1e7e69 100644 --- a/test/e2e/spinup_test.go +++ b/test/e2e/spinup_test.go @@ -28,17 +28,14 @@ var ( promHTTP = func(i int) string { return fmt.Sprintf("localhost:%s", promHTTPPort(i)) } promRemoteWriteHTTP = func(i int) string { return fmt.Sprintf("localhost:%s", promHTTPPort(100+i)) } - sidecarGRPC = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19090+i) } - sidecarHTTP = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19190+i) } - sidecarCluster = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19390+i) } + sidecarGRPC = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19090+i) } + sidecarHTTP = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19190+i) } - queryGRPC = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19490+i) } - queryHTTP = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19590+i) } - queryCluster = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19690+i) } + queryGRPC = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19490+i) } + queryHTTP = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19590+i) } - rulerGRPC = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19790+i) } - rulerHTTP = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19890+i) } - rulerCluster = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19990+i) } + rulerGRPC = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19790+i) } + rulerHTTP = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 19890+i) } remoteWriteReceiveHTTP = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 18690+i) } remoteWriteReceiveGRPC = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 18790+i) } @@ -50,11 +47,10 @@ var ( minioHTTP = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 20290+i) } ) -type cmdScheduleFunc func(workDir string, clusterPeerFlags []string) ([]*exec.Cmd, error) +type cmdScheduleFunc func(workDir string) ([]*exec.Cmd, error) type spinupSuite struct { cmdScheduleFuncs []cmdScheduleFunc - clusterPeerFlags []string minioConfig s3.Config withPreStartedMinio bool @@ -62,21 +58,13 @@ type spinupSuite struct { func newSpinupSuite() *spinupSuite { return &spinupSuite{} } -func (s *spinupSuite) Add(cmdSchedule cmdScheduleFunc, gossipAddress string) *spinupSuite { +func (s *spinupSuite) Add(cmdSchedule cmdScheduleFunc) *spinupSuite { s.cmdScheduleFuncs = append(s.cmdScheduleFuncs, cmdSchedule) - if gossipAddress != "" { - s.clusterPeerFlags = append(s.clusterPeerFlags, fmt.Sprintf("--cluster.peers"), gossipAddress) - } return s } -func scraper(i int, config string, gossip bool) (cmdScheduleFunc, string) { - gossipAddress := "" - if gossip { - gossipAddress = sidecarCluster(i) - } - - return func(workDir string, 
clusterPeerFlags []string) ([]*exec.Cmd, error) { +func scraper(i int, config string) cmdScheduleFunc { + return func(workDir string) ([]*exec.Cmd, error) { promDir := fmt.Sprintf("%s/data/prom%d", workDir, i) if err := os.MkdirAll(promDir, 0777); err != nil { return nil, errors.Wrap(err, "create prom dir failed") @@ -93,35 +81,19 @@ func scraper(i int, config string, gossip bool) (cmdScheduleFunc, string) { "--log.level", "info", "--web.listen-address", promHTTP(i), )) - args := []string{ - "sidecar", + return append(cmds, exec.Command("thanos", "sidecar", "--debug.name", fmt.Sprintf("sidecar-%d", i), "--grpc-address", sidecarGRPC(i), "--http-address", sidecarHTTP(i), "--prometheus.url", fmt.Sprintf("http://%s", promHTTP(i)), "--tsdb.path", promDir, - "--cluster.address", sidecarCluster(i), - "--log.level", "debug", - } - - if gossip { - args = append(args, []string{ - "--cluster.advertise-address", sidecarCluster(i), - "--cluster.gossip-interval", "200ms", - "--cluster.pushpull-interval", "200ms", - }...) - args = append(args, clusterPeerFlags...) - } else { - args = append(args, "--cluster.disable") - } - cmds = append(cmds, exec.Command("thanos", args...)) - - return cmds, nil - }, gossipAddress + "--cluster.disable", + "--log.level", "debug")), nil + } } func receiver(i int, config string) cmdScheduleFunc { - return func(workDir string, clusterPeerFlags []string) ([]*exec.Cmd, error) { + return func(workDir string) ([]*exec.Cmd, error) { promDir := fmt.Sprintf("%s/data/remote-write-prom%d", workDir, i) if err := os.MkdirAll(promDir, 0777); err != nil { return nil, errors.Wrap(err, "create prom dir failed") @@ -138,29 +110,21 @@ func receiver(i int, config string) cmdScheduleFunc { "--log.level", "info", "--web.listen-address", promRemoteWriteHTTP(i), )) - args := []string{ - "receive", + return append(cmds, exec.Command("thanos", "receive", "--debug.name", fmt.Sprintf("remote-write-receive-%d", i), "--grpc-address", remoteWriteReceiveGRPC(i), "--http-address", remoteWriteReceiveMetricHTTP(i), "--remote-write.address", remoteWriteReceiveHTTP(i), "--tsdb.path", promDir, - "--log.level", "debug", - } - - cmds = append(cmds, exec.Command("thanos", args...)) - - return cmds, nil + "--log.level", "debug")), nil } } func querier(i int, replicaLabel string, staticStores ...string) cmdScheduleFunc { - return func(_ string, clusterPeerFlags []string) ([]*exec.Cmd, error) { + return func(_ string) ([]*exec.Cmd, error) { args := append(defaultQuerierFlags(i, replicaLabel), - "--cluster.advertise-address", queryCluster(i), "--cluster.gossip-interval", "200ms", "--cluster.pushpull-interval", "200ms") - args = append(args, clusterPeerFlags...) 
for _, s := range staticStores { args = append(args, "--store", s) } @@ -169,20 +133,18 @@ func querier(i int, replicaLabel string, staticStores ...string) cmdScheduleFunc } func querierWithStoreFlags(i int, replicaLabel string, storesAddresses ...string) cmdScheduleFunc { - return func(_ string, _ []string) ([]*exec.Cmd, error) { + return func(_ string) ([]*exec.Cmd, error) { args := defaultQuerierFlags(i, replicaLabel) for _, addr := range storesAddresses { args = append(args, "--store", addr) } - args = append(args, "--cluster.disable") - return []*exec.Cmd{exec.Command("thanos", args...)}, nil } } func querierWithFileSD(i int, replicaLabel string, storesAddresses ...string) cmdScheduleFunc { - return func(workDir string, _ []string) ([]*exec.Cmd, error) { + return func(workDir string) ([]*exec.Cmd, error) { queryFileSDDir := fmt.Sprintf("%s/data/queryFileSd%d", workDir, i) if err := os.MkdirAll(queryFileSDDir, 0777); err != nil { return nil, errors.Wrap(err, "create prom dir failed") @@ -192,18 +154,18 @@ func querierWithFileSD(i int, replicaLabel string, storesAddresses ...string) cm return nil, errors.Wrap(err, "creating prom config failed") } - args := append(defaultQuerierFlags(i, replicaLabel), + args := append( + defaultQuerierFlags(i, replicaLabel), "--store.sd-files", path.Join(queryFileSDDir, "filesd.json"), - "--store.sd-interval", "5s") - - args = append(args, "--cluster.disable") + "--store.sd-interval", "5s", + ) return []*exec.Cmd{exec.Command("thanos", args...)}, nil } } func storeGateway(i int, bucketConfig []byte) cmdScheduleFunc { - return func(workDir string, _ []string) ([]*exec.Cmd, error) { + return func(workDir string) ([]*exec.Cmd, error) { dbDir := fmt.Sprintf("%s/data/store-gateway%d", workDir, i) if err := os.MkdirAll(dbDir, 0777); err != nil { @@ -225,7 +187,7 @@ func storeGateway(i int, bucketConfig []byte) cmdScheduleFunc { } func alertManager(i int) cmdScheduleFunc { - return func(workDir string, clusterPeerFlags []string) ([]*exec.Cmd, error) { + return func(workDir string) ([]*exec.Cmd, error) { dir := fmt.Sprintf("%s/data/alertmanager%d", workDir, i) if err := os.MkdirAll(dir, 0777); err != nil { @@ -251,8 +213,8 @@ receivers: } } -func ruler(i int, rules string) (cmdScheduleFunc, string) { - return func(workDir string, clusterPeerFlags []string) ([]*exec.Cmd, error) { +func ruler(i int, rules string) cmdScheduleFunc { + return func(workDir string) ([]*exec.Cmd, error) { dbDir := fmt.Sprintf("%s/data/rule%d", workDir, i) if err := os.MkdirAll(dbDir, 0777); err != nil { @@ -264,17 +226,14 @@ func ruler(i int, rules string) (cmdScheduleFunc, string) { } args := append(defaultRulerFlags(i, dbDir), - "--cluster.advertise-address", rulerCluster(i), "--cluster.gossip-interval", "200ms", "--cluster.pushpull-interval", "200ms") - args = append(args, clusterPeerFlags...) 
- return []*exec.Cmd{exec.Command("thanos", args...)}, nil - }, rulerCluster(i) + } } -func rulerWithQueryFlags(i int, rules string, queryAddresses ...string) (cmdScheduleFunc, string) { - return func(workDir string, clusterPeerFlags []string) ([]*exec.Cmd, error) { +func rulerWithQueryFlags(i int, rules string, queryAddresses ...string) cmdScheduleFunc { + return func(workDir string) ([]*exec.Cmd, error) { dbDir := fmt.Sprintf("%s/data/rule%d", workDir, i) if err := os.MkdirAll(dbDir, 0777); err != nil { @@ -290,14 +249,12 @@ func rulerWithQueryFlags(i int, rules string, queryAddresses ...string) (cmdSche for _, addr := range queryAddresses { args = append(args, "--query", addr) } - args = append(args, "--cluster.disable") - return []*exec.Cmd{exec.Command("thanos", args...)}, nil - }, "" + } } -func rulerWithFileSD(i int, rules string, queryAddresses ...string) (cmdScheduleFunc, string) { - return func(workDir string, clusterPeerFlags []string) ([]*exec.Cmd, error) { +func rulerWithFileSD(i int, rules string, queryAddresses ...string) cmdScheduleFunc { + return func(workDir string) ([]*exec.Cmd, error) { dbDir := fmt.Sprintf("%s/data/rule%d", workDir, i) if err := os.MkdirAll(dbDir, 0777); err != nil { @@ -320,14 +277,13 @@ func rulerWithFileSD(i int, rules string, queryAddresses ...string) (cmdSchedule args := append(defaultRulerFlags(i, dbDir), "--query.sd-files", path.Join(ruleFileSDDir, "filesd.json"), "--query.sd-interval", "5s") - args = append(args, "--cluster.disable") return []*exec.Cmd{exec.Command("thanos", args...)}, nil - }, "" + } } func minio(accessKey string, secretKey string) cmdScheduleFunc { - return func(workDir string, clusterPeerFlags []string) ([]*exec.Cmd, error) { + return func(workDir string) ([]*exec.Cmd, error) { dbDir := fmt.Sprintf("%s/data/minio", workDir) if err := os.MkdirAll(dbDir, 0777); err != nil { @@ -374,7 +330,7 @@ func (s *spinupSuite) Exec(t testing.TB, ctx context.Context, testName string) ( // Start minio before anything else. // NewTestBucketFromConfig is responsible for healthchecking by creating a requested bucket in retry loop. minioExit, err = newSpinupSuite(). - Add(minio(s.minioConfig.AccessKey, s.minioConfig.SecretKey), ""). + Add(minio(s.minioConfig.AccessKey, s.minioConfig.SecretKey)). 
Exec(t, ctx, testName+"_minio") if err != nil { return nil, errors.Wrap(err, "start minio") @@ -431,7 +387,7 @@ func (s *spinupSuite) Exec(t testing.TB, ctx context.Context, testName string) ( var commands []*exec.Cmd for _, cmdFunc := range s.cmdScheduleFuncs { - cmds, err := cmdFunc(dir, s.clusterPeerFlags) + cmds, err := cmdFunc(dir) if err != nil { return nil, err } @@ -506,7 +462,7 @@ func defaultQuerierFlags(i int, replicaLabel string) []string { "--http-address", queryHTTP(i), "--log.level", "debug", "--query.replica-label", replicaLabel, - "--cluster.address", queryCluster(i), + "--cluster.disable", "--store.sd-dns-interval", "5s", } } @@ -521,7 +477,7 @@ func defaultRulerFlags(i int, dbDir string) []string { "--alertmanagers.url", "http://127.0.0.1:29093", "--grpc-address", rulerGRPC(i), "--http-address", rulerHTTP(i), - "--cluster.address", rulerCluster(i), + "--cluster.disable", "--log.level", "debug", "--query.sd-dns-interval", "5s", } diff --git a/test/e2e/store_gateway_test.go b/test/e2e/store_gateway_test.go index 8f69f3dafa..6c36a963c8 100644 --- a/test/e2e/store_gateway_test.go +++ b/test/e2e/store_gateway_test.go @@ -19,7 +19,7 @@ import ( "github.com/prometheus/common/model" "github.com/prometheus/prometheus/pkg/timestamp" "github.com/prometheus/tsdb/labels" - "gopkg.in/yaml.v2" + yaml "gopkg.in/yaml.v2" ) func TestStoreGatewayQuery(t *testing.T) { @@ -43,8 +43,8 @@ func TestStoreGatewayQuery(t *testing.T) { exit, err := newSpinupSuite(). WithPreStartedMinio(s3Config). - Add(storeGateway(1, config), ""). - Add(querier(1, "replica", storeGatewayGRPC(1)), ""). + Add(storeGateway(1, config)). + Add(querier(1, "replica", storeGatewayGRPC(1))). Exec(t, ctx, "test_store_gateway_query") if err != nil { t.Errorf("spinup failed: %v", err) From 07e090a6c9ae2ef27ad309200147bd060f338228 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Tue, 26 Mar 2019 15:43:48 +0100 Subject: [PATCH 31/43] config: Add possibility to inline ServiceAccount into GCS config (#963) * Add possibility to inline ServiceAccount into GCS config * Update comment in pkg/objstore/gcs/gcs.go Co-Authored-By: metalmatze * Generate docs for GCS config --- docs/storage.md | 30 +++++++++++++++++++++++++++++- go.mod | 1 + pkg/objstore/gcs/gcs.go | 23 ++++++++++++++++++++--- 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/docs/storage.md b/docs/storage.md index 46bdfc8cc2..96582fe962 100644 --- a/docs/storage.md +++ b/docs/storage.md @@ -155,9 +155,13 @@ For example: type: GCS config: bucket: "" + service_account: "" ``` -Application credentials are configured via JSON file, the client looks for: +### Using GOOGLE_APPLICATION_CREDENTIALS + +Application credentials are configured via JSON file and only the bucket needs to be specified, +the client looks for: 1. A JSON file whose path is specified by the `GOOGLE_APPLICATION_CREDENTIALS` environment variable. @@ -171,6 +175,30 @@ Application credentials are configured via JSON file, the client looks for: You can read more on how to get application credential json file in [https://cloud.google.com/docs/authentication/production](https://cloud.google.com/docs/authentication/production) +### Using inline a Service Account + +Another possibility is to inline the ServiceAccount into the Thanos configuration and only maintain one file. +This feature was added, so that the Prometheus Operator only needs to take care of one secret file. 
+ +```yaml +type: GCS +config: + bucket: "thanos" + service_account: |- + { + "type": "service_account", + "project_id": "project", + "private_key_id": "abcdefghijklmnopqrstuvwxyz12345678906666", + "private_key": "-----BEGIN PRIVATE KEY-----\...\n-----END PRIVATE KEY-----\n", + "client_email": "project@thanos.iam.gserviceaccount.com", + "client_id": "123456789012345678901", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/thanos%40gitpods.iam.gserviceaccount.com" + } +``` + ### GCS Policies For deployment: diff --git a/go.mod b/go.mod index 9932f12049..01b4034b4b 100644 --- a/go.mod +++ b/go.mod @@ -38,6 +38,7 @@ require ( github.com/prometheus/tsdb v0.4.0 go.opencensus.io v0.19.0 // indirect golang.org/x/net v0.0.0-20190213061140-3a22650c66bd + golang.org/x/oauth2 v0.0.0-20181203162652-d668ce993890 golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4 golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223 // indirect golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2 diff --git a/pkg/objstore/gcs/gcs.go b/pkg/objstore/gcs/gcs.go index 634ebc4741..7743b9449f 100644 --- a/pkg/objstore/gcs/gcs.go +++ b/pkg/objstore/gcs/gcs.go @@ -16,6 +16,7 @@ import ( "github.com/improbable-eng/thanos/pkg/objstore" "github.com/pkg/errors" "github.com/prometheus/common/version" + "golang.org/x/oauth2/google" "google.golang.org/api/iterator" "google.golang.org/api/option" yaml "gopkg.in/yaml.v2" @@ -26,7 +27,8 @@ const DirDelim = "/" // Config stores the configuration for gcs bucket. type Config struct { - Bucket string `yaml:"bucket"` + Bucket string `yaml:"bucket"` + ServiceAccount string `yaml:"service_account"` } // Bucket implements the store.Bucket and shipper.Bucket interfaces against GCS. @@ -47,8 +49,23 @@ func NewBucket(ctx context.Context, logger log.Logger, conf []byte, component st if gc.Bucket == "" { return nil, errors.New("missing Google Cloud Storage bucket name for stored blocks") } - gcsOptions := option.WithUserAgent(fmt.Sprintf("thanos-%s/%s (%s)", component, version.Version, runtime.Version())) - gcsClient, err := storage.NewClient(ctx, gcsOptions) + + var opts []option.ClientOption + + // If ServiceAccount is provided, use them in GCS client, otherwise fallback to Google default logic. + if gc.ServiceAccount != "" { + credentials, err := google.CredentialsFromJSON(ctx, []byte(gc.ServiceAccount)) + if err != nil { + return nil, errors.Wrap(err, "failed to create credentials from JSON") + } + opts = append(opts, option.WithCredentials(credentials)) + } + + opts = append(opts, + option.WithUserAgent(fmt.Sprintf("thanos-%s/%s (%s)", component, version.Version, runtime.Version())), + ) + + gcsClient, err := storage.NewClient(ctx, opts...) if err != nil { return nil, err } From d65ef8d9a023b5bca414b3b9b51476b294fa24a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Tue, 26 Mar 2019 15:37:32 +0000 Subject: [PATCH 32/43] reloader: Minor fix to test - to improve for slow CI. 
(#978) Signed-off-by: Bartek Plotka --- pkg/reloader/reloader_test.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/reloader/reloader_test.go b/pkg/reloader/reloader_test.go index 7d3438543e..dcf636e7d1 100644 --- a/pkg/reloader/reloader_test.go +++ b/pkg/reloader/reloader_test.go @@ -218,6 +218,7 @@ func TestReloader_RuleApply(t *testing.T) { defer cancel() reloadsSeen := 0 + init := false for { select { case <-ctx.Done(): @@ -226,9 +227,11 @@ func TestReloader_RuleApply(t *testing.T) { } rel := reloads.Load().(int) - if rel != 0 && rel <= reloadsSeen { + if init && rel <= reloadsSeen { continue } + init = true + reloadsSeen = rel t.Log("Performing step number", rel) From c3a73e6b623c3ffc7d8b765da3350f822232dd61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Tue, 26 Mar 2019 19:19:23 +0000 Subject: [PATCH 33/43] Added a new maintainer to the list: @povilasv welcome! (#975) Signed-off-by: Bartek Plotka --- MAINTAINERS.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS.md b/MAINTAINERS.md index e674254d2d..fef2fa6287 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -5,7 +5,18 @@ | Bartłomiej Płotka | bwplotka@gmail.com | `@bwplotka` | [@bwplotka](https://github.com/bwplotka) | | Dominic Green | dom@improbable.io | `@domgreen` | [@domgreen](https://github.com/domgreen) | | Frederic Branczyk | fbranczyk@gmail.com | `@brancz` | [@brancz](https://github.com/brancz) | -| Giedrius Statkevičius | giedriuswork@gmail.com | `@Giedrius Statkevičius` | [@GiedriusS](https://github.com/GiedriusS) | +| Giedrius Statkevičius | giedriuswork@gmail.com | `@Giedrius Statkevičius` | [@GiedriusS](https://github.com/GiedriusS) | +| Povilas Versockas | p.versockas@gmail.com | `@povilasv` | [@povilasv](https://github.com/povilasv) + +We are bunch of people from different companies with various interests and skills. +We are from different parts of Europe: Germany, Lithuania, Poland and UK. +We have something in common though: We all share the love for OpenSource, Golang, Prometheus, :coffee: and Observability topics. + +As either Software Developers or SRE (or both!) we've chosen to maintain (mostly in our free time) Thanos, the de facto way to scale awesome [Prometheus](https://prometheus.io) project. + +Feel free to contact us (preferably on Slack) anytime for feedback, questions or :beers:/:coffee:/:tea:. + +Especially feedback, please share if you have ideas what we can do better! ## Storage plugins maintainers From a5c3d2c937b356745789e1db334e0a98a7e5d991 Mon Sep 17 00:00:00 2001 From: Xiang Dai <764524258@qq.com> Date: Wed, 27 Mar 2019 17:23:35 +0800 Subject: [PATCH 34/43] Add comment about bzr (#948) * Add comment about bzr To fix below issue: $ make which: no bzr in (/pkgs/go/bin:/pkgs/go_1.10/go/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/root/bin) >> formatting code No bzr binary found. 
make: *** [check-bzr] Error 1 Signed-off-by: Xiang Dai <764524258@qq.com> * add git dep Signed-off-by: Xiang Dai <764524258@qq.com> * fix small languange issue Signed-off-by: Xiang Dai <764524258@qq.com> * rebase master branch Signed-off-by: Xiang Dai <764524258@qq.com> * explain why need git and bzr Signed-off-by: Xiang Dai <764524258@qq.com> * fix one nit Signed-off-by: Xiang Dai <764524258@qq.com> --- docs/getting_started.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/getting_started.md b/docs/getting_started.md index e73d71f372..dad1bdb553 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -15,8 +15,9 @@ At the moment, Thanos is able to use [different storage providers](storage.md), You can find the latest Thanos release [here](https://github.com/improbable-eng/thanos/releases). -If you want to build Thanos from source - -with a working installation of the Go [toolchain](https://github.com/golang/tools) (`GOPATH`, `PATH=${GOPATH}/bin:${PATH}`), Thanos can be downloaded and built by running: + +If you want to build Thanos from source, make sure you have installed `bzr` and `git`. `bzr` is required, because `go` modules will use whatever VCS dependency use and in our case a single deps is using `bzr`. +And that you have a working installation of the Go [toolchain](https://github.com/golang/tools) (`GOPATH`, `PATH=${GOPATH}/bin:${PATH}`), Thanos can be downloaded and built by running: ``` go get -d github.com/improbable-eng/thanos/... From 57a58a75254d60be5fdb74cb63ceff829318117c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Giedrius=20Statkevi=C4=8Dius?= Date: Thu, 28 Mar 2019 14:29:32 +0200 Subject: [PATCH 35/43] store/proxy: set {Min,Max}Time accordingly to the nodes (#982) * store/proxy: set {Min,Max}Time accordingly to the nodes It doesn't make sense to advertise that a Thanos Query node has data since the beginning of the time and till the end of time if it only has nodes which have specific time ranges of data. Change it accordingly by checking each time range of each node and setting it to the min/max values. * store/proxy: add handling for edge case In case there are no configured stores at Thanos Query we want to revert to the old behaviour and inform the Info() callers that indeed we do have all of the data. --- pkg/store/proxy.go | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/pkg/store/proxy.go b/pkg/store/proxy.go index 316f803d6d..9324d62247 100644 --- a/pkg/store/proxy.go +++ b/pkg/store/proxy.go @@ -75,9 +75,31 @@ func (s *ProxyStore) Info(ctx context.Context, r *storepb.InfoRequest) (*storepb res := &storepb.InfoResponse{ Labels: make([]storepb.Label, 0, len(s.selectorLabels)), StoreType: s.component.ToProto(), - MinTime: 0, - MaxTime: math.MaxInt64, } + + MinTime := int64(math.MaxInt64) + MaxTime := int64(0) + + stores := s.stores() + for _, s := range stores { + mint, maxt := s.TimeRange() + if mint < MinTime { + MinTime = mint + } + if maxt > MaxTime { + MaxTime = maxt + } + } + + // Edge case: we have all of the data if there are no stores. 
+ if len(stores) == 0 { + MinTime = 0 + MaxTime = math.MaxInt64 + } + + res.MaxTime = MaxTime + res.MinTime = MinTime + for _, l := range s.selectorLabels { res.Labels = append(res.Labels, storepb.Label{ Name: l.Name, @@ -393,7 +415,7 @@ func (s *ProxyStore) LabelValues(ctx context.Context, r *storepb.LabelValuesRequ store := st g.Go(func() error { resp, err := store.LabelValues(gctx, &storepb.LabelValuesRequest{ - Label: r.Label, + Label: r.Label, PartialResponseDisabled: r.PartialResponseDisabled, }) if err != nil { From 7a83cda50c8ad0f2bd95f960103a3bc0ac14a33c Mon Sep 17 00:00:00 2001 From: Johnathan Falk Date: Thu, 28 Mar 2019 11:37:26 -0400 Subject: [PATCH 36/43] Fix version info after change to go mods. (#991) --- .promu.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.promu.yml b/.promu.yml index 3bf9e9a742..812de03eae 100644 --- a/.promu.yml +++ b/.promu.yml @@ -6,11 +6,11 @@ build: path: ./cmd/thanos flags: -a -tags netgo ldflags: | - -X {{repoPath}}/vendor/github.com/prometheus/common/version.Version={{.Version}} - -X {{repoPath}}/vendor/github.com/prometheus/common/version.Revision={{.Revision}} - -X {{repoPath}}/vendor/github.com/prometheus/common/version.Branch={{.Branch}} - -X {{repoPath}}/vendor/github.com/prometheus/common/version.BuildUser={{user}}@{{host}} - -X {{repoPath}}/vendor/github.com/prometheus/common/version.BuildDate={{date "20060102-15:04:05"}} + -X github.com/prometheus/common/version.Version={{.Version}} + -X github.com/prometheus/common/version.Revision={{.Revision}} + -X github.com/prometheus/common/version.Branch={{.Branch}} + -X github.com/prometheus/common/version.BuildUser={{user}}@{{host}} + -X github.com/prometheus/common/version.BuildDate={{date "20060102-15:04:05"}} crossbuild: platforms: - linux/amd64 From 757835acf6e7bb2e4f8a977862da65fa3249abc1 Mon Sep 17 00:00:00 2001 From: Xiang Dai <764524258@qq.com> Date: Sun, 31 Mar 2019 04:38:18 +0800 Subject: [PATCH 37/43] use origin mod check (#987) Signed-off-by: Xiang Dai <764524258@qq.com> --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index f88731471d..c5a699aed4 100644 --- a/Makefile +++ b/Makefile @@ -183,8 +183,8 @@ go-mod-tidy: check-git check-bzr @go mod tidy .PHONY: check-go-mod -check-go-mod: go-mod-tidy - @git diff --exit-code go.mod go.sum > /dev/null || echo >&2 "go.mod and/or go.sum have uncommited changes. See CONTRIBUTING.md." +check-go-mod: + @go mod verify # tooling deps. TODO(bwplotka): Pin them all to certain version! .PHONY: check-git From 9997bc8e534fb4afe96e6a3ff4c7ba443fc1923b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Giedrius=20Statkevi=C4=8Dius?= Date: Mon, 1 Apr 2019 13:05:57 +0300 Subject: [PATCH 38/43] Makefile: fix mac OS build (#995) On one of my mac OS machines the `find . -type f ...` expands to *a lot* of files, and then all of them are passed as parameters to `goimports`. Unfortunately, it seems like that hits the default limits of ARG_MAX i.e. the maximum amount of bytes that can be allocated for the parameters of a single command line. Fix this by only passing the directories. `goimports` already descends to all child directories and checks all of the files. 
Also, it properly checks only the Go files as implemented here: https://github.com/golang/tools/blob/master/cmd/goimports/goimports.go#L64 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c5a699aed4..ac9c472017 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ PREFIX ?= $(shell pwd) -FILES ?= $(shell find . -type f -name '*.go' -not -path "./vendor/*") +DIRECTORIES ?= $(shell find . -path './*' -prune -type d -not -path "./vendor") DOCKER_IMAGE_NAME ?= thanos DOCKER_IMAGE_TAG ?= $(subst /,-,$(shell git rev-parse --abbrev-ref HEAD))-$(shell date +%Y-%m-%d)-$(shell git rev-parse --short HEAD) # $GOPATH/bin might not be in $PATH, so we can't assume `which` would give use @@ -132,7 +132,7 @@ errcheck: $(ERRCHECK) .PHONY: format format: $(GOIMPORTS) @echo ">> formatting code" - @$(GOIMPORTS) -w $(FILES) + @$(GOIMPORTS) -w $(DIRECTORIES) # proto generates golang files from Thanos proto files. .PHONY: proto From 84cad51f5e590b5d017fec2c4ce9681dde92518b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Tue, 2 Apr 2019 11:06:50 +0100 Subject: [PATCH 39/43] ruler: Added support for strict rule groups that does not allow partial_response (#970) Signed-off-by: Bartek Plotka --- CHANGELOG.md | 6 + cmd/thanos/query.go | 2 +- cmd/thanos/rule.go | 185 ++++++++++++------- docs/components/query.md | 34 +++- docs/components/rule.md | 135 ++++++++++++-- pkg/promclient/promclient.go | 152 ++++++++++++++-- pkg/rule/api/v1.go | 122 +++++++------ pkg/rule/api/v1_test.go | 59 +++--- pkg/rule/rule.go | 169 +++++++++++++++++ pkg/rule/rule_test.go | 124 +++++++++++++ pkg/store/bucket.go | 2 +- pkg/store/prompb/remote.pb.go | 46 ++++- pkg/store/storepb/custom.go | 8 + pkg/store/storepb/rpc.pb.go | 248 +++++++++++++++++-------- pkg/store/storepb/rpc.proto | 28 ++- pkg/store/storepb/types.pb.go | 31 +++- pkg/ui/rule.go | 26 +-- scripts/genproto.sh | 2 +- test/e2e/query_test.go | 30 +++- test/e2e/rule_test.go | 319 +++++++++++++++++++++++++++++++-- test/e2e/spinup_test.go | 227 ++++++++++++++++------- test/e2e/store_gateway_test.go | 30 +++- 22 files changed, 1596 insertions(+), 389 deletions(-) create mode 100644 pkg/rule/rule.go create mode 100644 pkg/rule/rule_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index fa6032cdb7..03627e3283 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,12 @@ New tracing span: :warning: **WARNING** :warning: #798 adds a new default limit to Thanos Store: `--store.grpc.series-max-concurrency`. Most likely you will want to make it the same as `--query.max-concurrent` on Thanos Query. +- [#970](https://github.com/improbable-eng/thanos/pull/970) Added `PartialResponseStrategy` field for `RuleGroups` for `Ruler`. + +### Changed +- [#970](https://github.com/improbable-eng/thanos/pull/970) Deprecated partial_response_disabled proto field. Added partial_response_strategy instead. Both in gRPC and Query API. +- [#970](https://github.com/improbable-eng/thanos/pull/970) No `PartialResponseStrategy` field for `RuleGroups` by default means `abort` strategy (old PartialResponse disabled) as this is recommended option for Rules and alerts. 
+ ### Fixed - [#921](https://github.com/improbable-eng/thanos/pull/921) `thanos_objstore_bucket_last_successful_upload_time` now does not appear when no blocks have been uploaded so far - [#966](https://github.com/improbable-eng/thanos/pull/966) Bucket: verify no longer warns about overlapping blocks, that overlap `0s` diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 792ff7362c..fd360e60af 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -282,7 +282,7 @@ func runQuery( fileSDCache := cache.New() dnsProvider := dns.NewProvider( logger, - extprom.WrapRegistererWithPrefix("thanos_querier_store_apis", reg), + extprom.WrapRegistererWithPrefix("thanos_querier_store_apis_", reg), ) var ( diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index e9b4a87a0e..e71d57976c 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -30,6 +30,7 @@ import ( "github.com/improbable-eng/thanos/pkg/extprom" "github.com/improbable-eng/thanos/pkg/objstore/client" "github.com/improbable-eng/thanos/pkg/promclient" + thanosrule "github.com/improbable-eng/thanos/pkg/rule" v1 "github.com/improbable-eng/thanos/pkg/rule/api" "github.com/improbable-eng/thanos/pkg/runutil" "github.com/improbable-eng/thanos/pkg/shipper" @@ -227,13 +228,23 @@ func runRule( Name: "thanos_rule_loaded_rules", Help: "Loaded rules partitioned by file and group", }, - []string{"file", "group"}, + []string{"part_resp_strategy", "file", "group"}, ) + ruleEvalWarnings := prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "thanos_rule_evaluation_with_warnings_total", + Help: "The total number of rule evaluation that were successful but had warnings which can indicate partial error.", + }, []string{"strategy"}, + ) + ruleEvalWarnings.WithLabelValues(strings.ToLower(storepb.PartialResponseStrategy_ABORT.String())) + ruleEvalWarnings.WithLabelValues(strings.ToLower(storepb.PartialResponseStrategy_WARN.String())) + reg.MustRegister(configSuccess) reg.MustRegister(configSuccessTime) reg.MustRegister(duplicatedQuery) reg.MustRegister(alertMngrAddrResolutionErrors) reg.MustRegister(rulesLoaded) + reg.MustRegister(ruleEvalWarnings) for _, addr := range queryAddrs { if addr == "" { @@ -260,57 +271,16 @@ func runRule( dnsProvider := dns.NewProvider( logger, - extprom.WrapRegistererWithPrefix("thanos_ruler_query_apis", reg), + extprom.WrapRegistererWithPrefix("thanos_ruler_query_apis_", reg), ) - // Hit the HTTP query API of query peers in randomized order until we get a result - // back or the context get canceled. - queryFn := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { - var addrs []string - - // Add addresses from gossip. - peers := peer.PeerStates(cluster.PeerTypeQuery) - var ids []string - for id := range peers { - ids = append(ids, id) - } - sort.Slice(ids, func(i int, j int) bool { - return strings.Compare(ids[i], ids[j]) < 0 - }) - for _, id := range ids { - addrs = append(addrs, peers[id].QueryAPIAddr) - } - - // Add DNS resolved addresses from static flags and file SD. - // TODO(bwplotka): Consider generating addresses in *url.URL - addrs = append(addrs, dnsProvider.Addresses()...) 
- - removeDuplicateQueryAddrs(logger, duplicatedQuery, addrs) - - for _, i := range rand.Perm(len(addrs)) { - u, err := url.Parse(fmt.Sprintf("http://%s", addrs[i])) - if err != nil { - return nil, errors.Wrapf(err, "url parse %s", addrs[i]) - } - - span, ctx := tracing.StartSpan(ctx, "/rule_instant_query HTTP[client]") - v, err := promclient.PromqlQueryInstant(ctx, logger, u, q, t, true) - span.Finish() - return v, err - } - return nil, errors.Errorf("no query peer reachable") - } - // Run rule evaluation and alert notifications. var ( alertmgrs = newAlertmanagerSet(alertmgrURLs) alertQ = alert.NewQueue(logger, reg, 10000, 100, labelsTSDBToProm(lset), alertExcludeLabels) - mgr *rules.Manager + ruleMgrs = thanosrule.Managers{} ) { - ctx, cancel := context.WithCancel(context.Background()) - ctx = tracing.ContextWithTracer(ctx, tracer) - notify := func(ctx context.Context, expr string, alerts ...*rules.Alert) { res := make([]*alert.Alert, 0, len(alerts)) for _, alrt := range alerts { @@ -331,26 +301,38 @@ func runRule( } alertQ.Push(res) } - st := tsdb.Adapter(db, 0) - mgr = rules.NewManager(&rules.ManagerOptions{ - Context: ctx, - QueryFunc: queryFn, + + opts := rules.ManagerOptions{ NotifyFunc: notify, Logger: log.With(logger, "component", "rules"), Appendable: st, - Registerer: reg, ExternalURL: nil, TSDB: st, - }) - g.Add(func() error { - mgr.Run() - <-ctx.Done() - mgr.Stop() - return nil - }, func(error) { - cancel() - }) + } + + for _, strategy := range storepb.PartialResponseStrategy_value { + s := storepb.PartialResponseStrategy(strategy) + + ctx, cancel := context.WithCancel(context.Background()) + ctx = tracing.ContextWithTracer(ctx, tracer) + + opts := opts + opts.Registerer = extprom.WrapRegistererWith(prometheus.Labels{"strategy": strings.ToLower(s.String())}, reg) + opts.Context = ctx + opts.QueryFunc = queryFunc(logger, peer, dnsProvider, duplicatedQuery, ruleEvalWarnings, s) + + ruleMgrs[s] = rules.NewManager(&opts) + g.Add(func() error { + ruleMgrs[s].Run() + <-ctx.Done() + + return nil + }, func(error) { + cancel() + ruleMgrs[s].Stop() + }) + } } { var storeLset []storepb.Label @@ -469,11 +451,13 @@ func runRule( level.Error(logger).Log("msg", "retrieving rule files failed. Ignoring file.", "pattern", pat, "err", err) continue } + files = append(files, fs...) 
} level.Info(logger).Log("msg", "reload rule files", "numFiles", len(files)) - if err := mgr.Update(evalInterval, files); err != nil { + + if err := ruleMgrs.Update(dataDir, evalInterval, files); err != nil { configSuccess.Set(0) level.Error(logger).Log("msg", "reloading rules failed", "err", err) continue @@ -483,9 +467,12 @@ func runRule( configSuccessTime.Set(float64(time.Now().UnixNano()) / 1e9) rulesLoaded.Reset() - for _, group := range mgr.RuleGroups() { - rulesLoaded.WithLabelValues(group.File(), group.Name()).Set(float64(len(group.Rules()))) + for s, mgr := range ruleMgrs { + for _, group := range mgr.RuleGroups() { + rulesLoaded.WithLabelValues(s.String(), group.File(), group.Name()).Set(float64(len(group.Rules()))) + } } + } }, func(error) { close(cancel) @@ -569,9 +556,9 @@ func runRule( "web.prefix-header": webPrefixHeaderName, } - ui.NewRuleUI(logger, mgr, alertQueryURL.String(), flagsMap).Register(router.WithPrefix(webRoutePrefix)) + ui.NewRuleUI(logger, ruleMgrs, alertQueryURL.String(), flagsMap).Register(router.WithPrefix(webRoutePrefix)) - api := v1.NewAPI(logger, mgr) + api := v1.NewAPI(logger, ruleMgrs) api.Register(router.WithPrefix(path.Join(webRoutePrefix, "/api/v1")), tracer, logger) mux := http.NewServeMux() @@ -767,3 +754,75 @@ func removeDuplicateQueryAddrs(logger log.Logger, duplicatedQueriers prometheus. } return deduplicated } + +// queryFunc returns query function that hits the HTTP query API of query peers in randomized order until we get a result +// back or the context get canceled. +func queryFunc( + logger log.Logger, + peer cluster.Peer, + dnsProvider *dns.Provider, + duplicatedQuery prometheus.Counter, + ruleEvalWarnings *prometheus.CounterVec, + partialResponseStrategy storepb.PartialResponseStrategy, +) rules.QueryFunc { + var spanID string + + switch partialResponseStrategy { + case storepb.PartialResponseStrategy_WARN: + spanID = "/rule_instant_query HTTP[client]" + case storepb.PartialResponseStrategy_ABORT: + spanID = "/rule_instant_query_part_resp_abort HTTP[client]" + default: + // Programming error will be caught by tests. + panic(errors.Errorf("unknown partial response strategy %v", partialResponseStrategy).Error()) + } + + return func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { + var addrs []string + + // Add addresses from gossip. + peers := peer.PeerStates(cluster.PeerTypeQuery) + var ids []string + for id := range peers { + ids = append(ids, id) + } + sort.Slice(ids, func(i int, j int) bool { + return strings.Compare(ids[i], ids[j]) < 0 + }) + for _, id := range ids { + addrs = append(addrs, peers[id].QueryAPIAddr) + } + + // Add DNS resolved addresses from static flags and file SD. + // TODO(bwplotka): Consider generating addresses in *url.URL + addrs = append(addrs, dnsProvider.Addresses()...) 
+ + removeDuplicateQueryAddrs(logger, duplicatedQuery, addrs) + + for _, i := range rand.Perm(len(addrs)) { + u, err := url.Parse(fmt.Sprintf("http://%s", addrs[i])) + if err != nil { + return nil, errors.Wrapf(err, "url parse %s", addrs[i]) + } + + span, ctx := tracing.StartSpan(ctx, spanID) + v, warns, err := promclient.PromqlQueryInstant(ctx, logger, u, q, t, promclient.QueryOptions{ + Deduplicate: true, + PartialResponseStrategy: partialResponseStrategy, + }) + span.Finish() + + if err != nil { + level.Error(logger).Log("err", err, "query", q) + } + + if err == nil && len(warns) > 0 { + ruleEvalWarnings.WithLabelValues(strings.ToLower(partialResponseStrategy.String())).Inc() + // TODO(bwplotka): Propagate those to UI, probably requires changing rule manager code ): + level.Warn(logger).Log("warnings", strings.Join(warns, ", "), "query", q) + } + return v, err + } + return nil, errors.Errorf("no query peer reachable") + } +} diff --git a/docs/components/query.md b/docs/components/query.md index 972796eeca..cd38da1053 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -53,8 +53,33 @@ This logic can also be controlled via parameter on QueryAPI. More details below. Overall QueryAPI exposed by Thanos is guaranteed to be compatible with Prometheus 2.x. -However, for additional Thanos features, Thanos, on top of Prometheus adds several -additional parameters listed below as well as custom response fields. +However, for additional Thanos features, Thanos, on top of Prometheus adds +* partial response behaviour +* several additional parameters listed below +* custom response fields. + +### Partial Response + +QueryAPI and StoreAPI has additional behaviour controlled via query parameter called [PartialResponseStrategy](../../pkg/store/storepb/rpc.pb.go). + +This parameter controls tradeoff between accuracy and availability. + +Partial response is a potentially missed result within query against QueryAPI or StoreAPI. This can happen if one +of StoreAPIs is returning error or timeout whereas couple of others returns success. It does not mean you are missing data, +you might lucky enough that you actually get the correct data as the broken StoreAPI did not have anything for your query. + +If partial response happen QueryAPI returns human readable warnings explained [here](query.md#CustomResponseFields) + +NOTE that having warning does not necessary means partial response (e.g no store matched query warning) + +See [this](query.md#PartialResponseStrategy) on how to control this behaviour. + +Querier also allows to configure different timeouts: +* `--query.timeout` +* `--store.response-timeout` + +If you prefer availability over accuracy you can set tighter timeout to underlying StoreAPI than overall query timeout. If partial response +strategy is NOT `abort`, this will "ignore" slower StoreAPIs producing just warning with 200 status code response. ### Deduplication Enabled @@ -77,7 +102,9 @@ Max source resolution is max resolution in seconds we want to use for data we qu * 5m -> we will use max 5m downsampling. * 1h -> we will use max 1h downsampling. -### Partial Response / Error Enabled +### Partial Response Strategy + +// TODO(bwplotka): Update. This will change to "strategy" soon as [PartialResponseStrategy enum here](../../pkg/store/storepb/rpc.proto) | HTTP URL/FORM parameter | Type | Default | Example | |----|----|----|----| @@ -92,6 +119,7 @@ return warning. 
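For illustration, here is a minimal sketch of driving these parameters from Go through the `promclient` helpers extended in this patch series. The Querier address and the `up` query are placeholders, not part of the patch; `QueryOptions` is encoded into the `dedup` and `partial_response` form parameters described above, with `WARN` mapping to `partial_response=true` and `ABORT` to `partial_response=false`.

```go
package main

import (
	"context"
	"fmt"
	"net/url"
	"time"

	"github.com/go-kit/kit/log"
	"github.com/improbable-eng/thanos/pkg/promclient"
	"github.com/improbable-eng/thanos/pkg/store/storepb"
)

func main() {
	// Placeholder Querier address; adjust to your deployment.
	base, err := url.Parse("http://localhost:10902")
	if err != nil {
		panic(err)
	}

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	// QueryOptions is turned into the `dedup` and `partial_response`
	// query parameters of /api/v1/query.
	vec, warns, err := promclient.QueryInstant(ctx, log.NewNopLogger(), base, "up", time.Now(), promclient.QueryOptions{
		Deduplicate:             true,
		PartialResponseStrategy: storepb.PartialResponseStrategy_WARN,
	})
	if err != nil {
		panic(err)
	}
	// With the WARN strategy, partial responses surface as warnings instead of errors.
	for _, w := range warns {
		fmt.Println("warning:", w)
	}
	fmt.Println("samples returned:", len(vec))
}
```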
Any additional field does not break compatibility, however there is no guarantee that Grafana or any other client will understand those. Currently Thanos UI exposed by Thanos understands + ```go type queryData struct { ResultType promql.ValueType `json:"resultType"` diff --git a/docs/components/rule.md b/docs/components/rule.md index 60b856223b..fb7304d757 100644 --- a/docs/components/rule.md +++ b/docs/components/rule.md @@ -1,36 +1,139 @@ -# Rule +# Rule (aka Ruler) -_**NOTE:** The rule component is experimental since it has conceptual tradeoffs that might not be favorable for most use cases. It is recommended to keep deploying rules to the relevant Prometheus servers._ +_**NOTE:** It is recommended to ma deploying rules inside the relevant Prometheus servers locally. Use ruler only on specific cases. Read details[below](rule.md#Risk) why._ _The rule component should in particular not be used to circumvent solving rule deployment properly at the configuration management level._ -The rule component evaluates Prometheus recording and alerting rules against random query nodes in its cluster. Rule results are written back to disk in the Prometheus 2.0 storage format. Rule nodes at the same time participate in the cluster themselves as source store nodes and upload their generated TSDB blocks to an object store. +The rule component evaluates Prometheus recording and alerting rules against chosen query API via repeated `--query` (or FileSD via `--query.sd`). If more then one query is passed, round robin balancing is performed. + +Rule results are written back to disk in the Prometheus 2.0 storage format. Rule nodes at the same time participate in the system as source store nodes, which means that they expose StoreAPI and upload their generated TSDB blocks to an object store. -The data of each rule node can be labeled to satisfy the clusters labeling scheme. High-availability pairs can be run in parallel and should be distinguished by the designated replica label, just like regular Prometheus servers. +You can think of Rule as a simplified Prometheus that does not require a sidecar and does not scrape and do PromQL evaluation (no QueryAPI). + +The data of each Rule node can be labeled to satisfy the clusters labeling scheme. High-availability pairs can be run in parallel and should be distinguished by the designated replica label, just like regular Prometheus servers. +Read more about Ruler in HA in [here](rule.md#Ruler_HA) ``` $ thanos rule \ - --data-dir "/path/to/data" \ - --eval-interval "30s" \ - --rule-file "/path/to/rules/*.rules.yaml" \ - --alert.query-url "http://0.0.0.0:9090" \ - --alertmanagers.url "alert.thanos.io" \ - --cluster.peers "thanos-cluster.example.org" \ - --objstore.config-file "bucket.yml" + --data-dir "/path/to/data" \ + --eval-interval "30s" \ + --rule-file "/path/to/rules/*.rules.yaml" \ + --alert.query-url "http://0.0.0.0:9090" \ # This tells what query URL to link to in UI. + --alertmanagers.url "alert.thanos.io" \ + --query "query.example.org" \ + --query "query2.example.org" \ + --objstore.config-file "bucket.yml" \ + --label 'monitor_cluster="cluster1"' + --label 'replica="A" ``` -The content of `bucket.yml`: +## Risk + +Ruler has conceptual tradeoffs that might not be favorable for most use cases. The main tradeoff is its dependence on +query reliability. For Prometheus it is unlikely to have alert/recording rule evaluation failure as evaluation is local. 
+ +For Ruler the read path is distributed, since most likely ruler is querying Thanos Querier which gets data from remote Store APIs. + +This means that **query failure** are more likely to happen, that's why clear strategy on what will happen to alert and during query +unavailability is the key. + +## Partial Response + +See [this](query.md#PartialResponse) on initial info. + +Rule allows to specify rule groups with additional field that controls PartialResponseStrategy e.g: ```yaml -type: GCS -config: - bucket: example-bucket +groups: +- name: "warn strategy" + partial_response_strategy: "warn" + rules: + - alert: "some" + expr: "up" +- name: "abort strategy" + partial_response_strategy: "abort" + rules: + - alert: "some" + expr: "up" +- name: "by default strategy is abort" + rules: + - alert: "some" + expr: "up" ``` +It is recommended to keep partial response to `abort` for alerts and that is the default as well. + +Essentially for alerting having partial response can result in symptom being missed by Rule's alert. + +## Must have: essential Ruler alerts! + +To be sure that alerting works it is essential to monitor Ruler and alert from another **Scraper (Prometheus + sidecar)** that sits in same cluster. + +The most important metrics to alert on are: + +* `thanos_alert_sender_alerts_dropped_total`. If greater than 0 it means that rule triggered alerts are not being sent to alertmanager which might +indicate connection, incompatibility or misconfiguration problems. + +* `prometheus_rule_evaluation_failures_total`. If greater than 0 it means that rule failed to be evaluated which results in +either gap in rule or potentially ignored alert. Alert heavily on this if this happens for longer than your alert thresholds. +`strategy` label will tell you if failures comes from rules that tolerates [partial response](rule.md#PartialResponse) or not. + +* `prometheus_rule_group_last_duration_seconds < prometheus_rule_group_interval_seconds` If the difference is heavy it means +that rule evaluation took more time than scheduled interval. It can indicate your query backend (e.g Querier) takes too much time +to evaluate the query, that is not fast enough to fill the rule. This might indicate other problems like slow StoreAPis or +too complex query expression in rule. + +* `thanos_rule_evaluation_with_warnings_total`. If you choose to use Rules and Alerts with [partial response strategy](rule.md#PartialResponse) +equals "warn", this metric will tell you how many evaluation ended up with some kind of warning. To see the actual warnings +see WARN log level. This might suggest that those evaluations returns partial response and might be or not accurate. + +Those metrics are important for vanilla Prometheus as well, but even more important when we rely on (sometimes WAN) network. + +// TODO(bwplotka): Rereview them after recent changes in metrics. +See [alerts](/examples/alerts/alerts.md#Ruler) for more example alerts for ruler. + +NOTE: It is also recommend to set an mocked Alert on Ruler that checks if query is up. This might be something simple like `vector(1)` query, just +to check if Querier is live. + +## Performance. + As rule nodes outsource query processing to query nodes, they should generally experience little load. If necessary, functional sharding can be applied by splitting up the sets of rules between HA pairs. Rules are processed with deduplicated data according to the replica label configured on query nodes. 
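Following up on the essential Ruler alerts listed above, a minimal rule file for the scraping Prometheus (the one monitoring the Ruler) might look like the sketch below. The metric names are the ones documented on this page; the alert names, thresholds and `for` durations are only illustrative assumptions and should be tuned to your setup:

```yaml
groups:
- name: thanos-rule-meta-alerts
  rules:
  - alert: ThanosRuleIsDroppingAlerts
    # Alerts produced by the Ruler are not reaching Alertmanager.
    expr: rate(thanos_alert_sender_alerts_dropped_total[5m]) > 0
    for: 5m
  - alert: ThanosRuleEvaluationFailures
    # Rule evaluations are failing; recording rules or alerts may have gaps.
    expr: rate(prometheus_rule_evaluation_failures_total[5m]) > 0
    for: 10m
```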
-## Deployment +## External labels + +It is *mandatory* to add certain external labels to indicate the ruler origin (e.g `label='replica="A"'` or for `cluster`). +Otherwise running multiple ruler replicas will be not possible, resulting in clash during compaction. + +NOTE: It is advised to put different external labels than labels given by other sources we are recording or alerting against. + +For example: + +* Ruler is in cluster `mon1` and we have Prometheus in cluster `eu1` +* By default we could try having consistent labels so we have `cluster=eu1` for Prometheus and `cluster=mon1` for Ruler. +* We configure `ScraperIsDown` alert that monitors service from `work1` cluster. +* When triggered this alert results in `ScraperIsDown{cluster=mon1}` since external labels always *replace* source labels. + +This effectively drops the important metadata and makes it impossible to tell in what exactly `cluster` the `ScraperIsDown` alert found problem +without falling back to manual query. + +## Ruler UI + +On HTTP address ruler exposes its UI that shows mainly Alerts and Rules page (similar to Prometheus Alerts page). +Each alert is linked to query that alert is performing that you can click to navigate to configured `alert.query-url`. + +## Ruler HA + +Ruler aims to use similar approach as Prometheus does. You can configure external labels, as well as simple relabelling. + +In case of Ruler in HA you need to make sure you have following labelling setup: + +* Labels that identifies the HA group ruler and replica label with different value for each ruler instance, e.g: +`cluster="eu1", replica="A"` and `cluster=eu1, replica="B"` by using `--label` flag. +* Labels that needs to be dropped just before sending to alermanager in order for alertmanger to deduplicate alerts e.g +`--alertmanager.label-drop="replica"`. + +Full relabelling is planned to be done in future and is tracked here: https://github.com/improbable-eng/thanos/issues/660 ## Flags diff --git a/pkg/promclient/promclient.go b/pkg/promclient/promclient.go index e70fbb245b..1622a16340 100644 --- a/pkg/promclient/promclient.go +++ b/pkg/promclient/promclient.go @@ -3,9 +3,13 @@ package promclient import ( + "bufio" + "bytes" + "compress/gzip" "context" "encoding/json" "fmt" + "io" "io/ioutil" "net/http" "net/url" @@ -18,14 +22,17 @@ import ( "time" "github.com/go-kit/kit/log" + "github.com/go-kit/kit/log/level" "github.com/improbable-eng/thanos/pkg/runutil" + "github.com/improbable-eng/thanos/pkg/store/storepb" "github.com/improbable-eng/thanos/pkg/tracing" "github.com/pkg/errors" "github.com/prometheus/common/model" promlabels "github.com/prometheus/prometheus/pkg/labels" + "github.com/prometheus/prometheus/pkg/textparse" "github.com/prometheus/prometheus/promql" "github.com/prometheus/tsdb/labels" - "gopkg.in/yaml.v2" + yaml "gopkg.in/yaml.v2" ) // IsWALFileAccesible returns no error if WAL dir can be found. 
This helps to tell @@ -242,24 +249,55 @@ func Snapshot(ctx context.Context, logger log.Logger, base *url.URL, skipHead bo return path.Join("snapshots", d.Data.Name), nil } +type QueryOptions struct { + Deduplicate bool + PartialResponseStrategy storepb.PartialResponseStrategy +} + +func (p *QueryOptions) AddTo(values url.Values) error { + values.Add("dedup", fmt.Sprintf("%v", p.Deduplicate)) + + var partialResponseValue string + switch p.PartialResponseStrategy { + case storepb.PartialResponseStrategy_WARN: + partialResponseValue = strconv.FormatBool(true) + case storepb.PartialResponseStrategy_ABORT: + partialResponseValue = strconv.FormatBool(false) + default: + return errors.Errorf("unknown partial response strategy %v", p.PartialResponseStrategy) + } + + // TODO(bwplotka): Apply change from bool to strategy in Query API as well. + values.Add("partial_response", partialResponseValue) + + return nil +} + // QueryInstant performs instant query and returns results in model.Vector type. -func QueryInstant(ctx context.Context, logger log.Logger, base *url.URL, query string, t time.Time, dedup bool) (model.Vector, error) { +func QueryInstant(ctx context.Context, logger log.Logger, base *url.URL, query string, t time.Time, opts QueryOptions) (model.Vector, []string, error) { if logger == nil { logger = log.NewNopLogger() } - u := *base - u.Path = path.Join(u.Path, "/api/v1/query") - - params := url.Values{} + params, err := url.ParseQuery(base.RawQuery) + if err != nil { + return nil, nil, errors.Wrapf(err, "parse raw query %s", base.RawQuery) + } params.Add("query", query) params.Add("time", t.Format(time.RFC3339Nano)) - params.Add("dedup", fmt.Sprintf("%v", dedup)) + if err := opts.AddTo(params); err != nil { + return nil, nil, errors.Wrap(err, "add thanos opts query params") + } + + u := *base + u.Path = path.Join(u.Path, "/api/v1/query") u.RawQuery = params.Encode() + level.Debug(logger).Log("msg", "querying instant", "url", u.String()) + req, err := http.NewRequest("GET", u.String(), nil) if err != nil { - return nil, err + return nil, nil, errors.Wrap(err, "create GET request") } req = req.WithContext(ctx) @@ -269,7 +307,7 @@ func QueryInstant(ctx context.Context, logger log.Logger, base *url.URL, query s } resp, err := client.Do(req) if err != nil { - return nil, err + return nil, nil, errors.Wrapf(err, "perform GET request against %s", u.String()) } defer runutil.CloseWithLogOnErr(logger, resp.Body, "query body") @@ -280,10 +318,13 @@ func QueryInstant(ctx context.Context, logger log.Logger, base *url.URL, query s ResultType string `json:"resultType"` Result json.RawMessage `json:"result"` } `json:"data"` + + // Extra field supported by Thanos Querier. 
+ Warnings []string `json:"warnings"` } if err = json.NewDecoder(resp.Body).Decode(&m); err != nil { - return nil, err + return nil, nil, errors.Wrap(err, "decode query instant response") } var vectorResult model.Vector @@ -293,24 +334,24 @@ func QueryInstant(ctx context.Context, logger log.Logger, base *url.URL, query s switch m.Data.ResultType { case promql.ValueTypeVector: if err = json.Unmarshal(m.Data.Result, &vectorResult); err != nil { - return nil, err + return nil, nil, errors.Wrap(err, "decode result into ValueTypeVector") } case promql.ValueTypeScalar: vectorResult, err = convertScalarJSONToVector(m.Data.Result) if err != nil { - return nil, err + return nil, nil, errors.Wrap(err, "decode result into ValueTypeScalar") } default: - return nil, errors.Errorf("unknown response type: '%q'", m.Data.ResultType) + return nil, nil, errors.Errorf("unknown response type: '%q'", m.Data.ResultType) } - return vectorResult, nil + return vectorResult, m.Warnings, nil } // PromqlQueryInstant performs instant query and returns results in promql.Vector type that is compatible with promql package. -func PromqlQueryInstant(ctx context.Context, logger log.Logger, base *url.URL, query string, t time.Time, dedup bool) (promql.Vector, error) { - vectorResult, err := QueryInstant(ctx, logger, base, query, t, dedup) +func PromqlQueryInstant(ctx context.Context, logger log.Logger, base *url.URL, query string, t time.Time, opts QueryOptions) (promql.Vector, []string, error) { + vectorResult, warnings, err := QueryInstant(ctx, logger, base, query, t, opts) if err != nil { - return nil, err + return nil, nil, err } vec := make(promql.Vector, 0, len(vectorResult)) @@ -332,7 +373,7 @@ func PromqlQueryInstant(ctx context.Context, logger log.Logger, base *url.URL, q }) } - return vec, nil + return vec, warnings, nil } // Scalar response consists of array with mixed types so it needs to be @@ -362,3 +403,78 @@ func convertScalarJSONToVector(scalarJSONResult json.RawMessage) (model.Vector, Value: resultValue, Timestamp: resultTime}}, nil } + +// MetricValues returns current value of instant query and returns results in model.Vector type. 
+func MetricValues(ctx context.Context, logger log.Logger, base *url.URL, perMetricFn func(metric promlabels.Labels, val float64) error) error { + if logger == nil { + logger = log.NewNopLogger() + } + + u := *base + u.Path = path.Join(u.Path, "/metrics") + + req, err := http.NewRequest("GET", u.String(), nil) + if err != nil { + return errors.Wrap(err, "create GET request") + } + + req.Header.Add("Accept", `application/openmetrics-text; version=0.0.1,text/plain;version=0.0.4;q=0.5,*/*;q=0.1`) + req.Header.Add("Accept-Encoding", "gzip") + + req = req.WithContext(ctx) + + client := &http.Client{ + Transport: tracing.HTTPTripperware(logger, http.DefaultTransport), + } + resp, err := client.Do(req) + if err != nil { + return errors.Wrapf(err, "perform GET request against %s", u.String()) + } + defer runutil.CloseWithLogOnErr(logger, resp.Body, "metrics body") + + if resp.StatusCode != http.StatusOK { + return errors.Errorf("server returned HTTP status %s", resp.Status) + } + + b := &bytes.Buffer{} + if resp.Header.Get("Content-Encoding") != "gzip" { + _, err = io.Copy(b, resp.Body) + if err != nil { + return err + } + } else { + buf := bufio.NewReader(resp.Body) + gzipr, err := gzip.NewReader(buf) + if err != nil { + return err + } + _, err = io.Copy(b, gzipr) + _ = gzipr.Close() + if err != nil { + return err + } + } + + p := textparse.New(b.Bytes(), resp.Header.Get("Content-Type")) + for { + var et textparse.Entry + if et, err = p.Next(); err != nil { + if err == io.EOF { + return nil + } + return err + } + + if et != textparse.EntrySeries { + continue + } + + var lset promlabels.Labels + _ = p.Metric(&lset) + _, _, v := p.Series() + + if err := perMetricFn(lset, v); err != nil { + return err + } + } +} diff --git a/pkg/rule/api/v1.go b/pkg/rule/api/v1.go index be2dc5abf4..df15e1750f 100644 --- a/pkg/rule/api/v1.go +++ b/pkg/rule/api/v1.go @@ -6,31 +6,32 @@ import ( "time" "github.com/NYTimes/gziphandler" + "github.com/go-kit/kit/log" qapi "github.com/improbable-eng/thanos/pkg/query/api" + thanosrule "github.com/improbable-eng/thanos/pkg/rule" + "github.com/improbable-eng/thanos/pkg/store/storepb" "github.com/improbable-eng/thanos/pkg/tracing" - "github.com/prometheus/client_golang/prometheus" - - "github.com/go-kit/kit/log" "github.com/opentracing/opentracing-go" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/route" "github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/rules" ) type API struct { - logger log.Logger - now func() time.Time - rulesRetriever rulesRetriever + logger log.Logger + now func() time.Time + ruleRetriever RulesRetriever } func NewAPI( logger log.Logger, - rr rulesRetriever, + ruleRetriever RulesRetriever, ) *API { return &API{ - logger: logger, - now: time.Now, - rulesRetriever: rr, + logger: logger, + now: time.Now, + ruleRetriever: ruleRetriever, } } @@ -54,20 +55,20 @@ func (api *API) Register(r *route.Router, tracer opentracing.Tracer, logger log. 
} -type rulesRetriever interface { - RuleGroups() []*rules.Group - AlertingRules() []*rules.AlertingRule +type RulesRetriever interface { + RuleGroups() []thanosrule.Group + AlertingRules() []thanosrule.AlertingRule } func (api *API) rules(r *http.Request) (interface{}, []error, *qapi.ApiError) { - ruleGroups := api.rulesRetriever.RuleGroups() - res := &RuleDiscovery{RuleGroups: make([]*RuleGroup, len(ruleGroups))} - for i, grp := range ruleGroups { + res := &RuleDiscovery{} + for _, grp := range api.ruleRetriever.RuleGroups() { apiRuleGroup := &RuleGroup{ - Name: grp.Name(), - File: grp.File(), - Interval: grp.Interval().Seconds(), - Rules: []rule{}, + Name: grp.Name(), + File: grp.File(), + Interval: grp.Interval().Seconds(), + Rules: []rule{}, + PartialResponseStrategy: grp.PartialResponseStrategy.String(), } for _, r := range grp.Rules() { @@ -79,17 +80,18 @@ func (api *API) rules(r *http.Request) (interface{}, []error, *qapi.ApiError) { } switch rule := r.(type) { - case *rules.AlertingRule: + case thanosrule.AlertingRule: enrichedRule = alertingRule{ - Name: rule.Name(), - Query: rule.Query().String(), - Duration: rule.Duration().Seconds(), - Labels: rule.Labels(), - Annotations: rule.Annotations(), - Alerts: rulesAlertsToAPIAlerts(rule.ActiveAlerts()), - Health: rule.Health(), - LastError: lastError, - Type: "alerting", + Name: rule.Name(), + Query: rule.Query().String(), + Duration: rule.Duration().Seconds(), + Labels: rule.Labels(), + Annotations: rule.Annotations(), + Alerts: rulesAlertsToAPIAlerts(grp.PartialResponseStrategy, rule.ActiveAlerts()), + Health: rule.Health(), + LastError: lastError, + Type: "alerting", + PartialResponseStrategy: rule.PartialResponseStrategy.String(), } case *rules.RecordingRule: enrichedRule = recordingRule{ @@ -107,22 +109,20 @@ func (api *API) rules(r *http.Request) (interface{}, []error, *qapi.ApiError) { apiRuleGroup.Rules = append(apiRuleGroup.Rules, enrichedRule) } - res.RuleGroups[i] = apiRuleGroup + res.RuleGroups = append(res.RuleGroups, apiRuleGroup) } + return res, nil, nil } func (api *API) alerts(r *http.Request) (interface{}, []error, *qapi.ApiError) { - alertingRules := api.rulesRetriever.AlertingRules() - alerts := []*Alert{} - - for _, alertingRule := range alertingRules { + var alerts []*Alert + for _, alertingRule := range api.ruleRetriever.AlertingRules() { alerts = append( alerts, - rulesAlertsToAPIAlerts(alertingRule.ActiveAlerts())..., + rulesAlertsToAPIAlerts(alertingRule.PartialResponseStrategy, alertingRule.ActiveAlerts())..., ) } - res := &AlertDiscovery{Alerts: alerts} return res, nil, nil @@ -133,22 +133,24 @@ type AlertDiscovery struct { } type Alert struct { - Labels labels.Labels `json:"labels"` - Annotations labels.Labels `json:"annotations"` - State string `json:"state"` - ActiveAt *time.Time `json:"activeAt,omitempty"` - Value float64 `json:"value"` + Labels labels.Labels `json:"labels"` + Annotations labels.Labels `json:"annotations"` + State string `json:"state"` + ActiveAt *time.Time `json:"activeAt,omitempty"` + Value float64 `json:"value"` + PartialResponseStrategy string `json:"partial_response_strategy"` } -func rulesAlertsToAPIAlerts(rulesAlerts []*rules.Alert) []*Alert { +func rulesAlertsToAPIAlerts(s storepb.PartialResponseStrategy, rulesAlerts []*rules.Alert) []*Alert { apiAlerts := make([]*Alert, len(rulesAlerts)) for i, ruleAlert := range rulesAlerts { apiAlerts[i] = &Alert{ - Labels: ruleAlert.Labels, - Annotations: ruleAlert.Annotations, - State: ruleAlert.State.String(), - ActiveAt: 
&ruleAlert.ActiveAt, - Value: ruleAlert.Value, + PartialResponseStrategy: s.String(), + Labels: ruleAlert.Labels, + Annotations: ruleAlert.Annotations, + State: ruleAlert.State.String(), + ActiveAt: &ruleAlert.ActiveAt, + Value: ruleAlert.Value, } } @@ -165,22 +167,24 @@ type RuleGroup struct { // In order to preserve rule ordering, while exposing type (alerting or recording) // specific properties, both alerting and recording rules are exposed in the // same array. - Rules []rule `json:"rules"` - Interval float64 `json:"interval"` + Rules []rule `json:"rules"` + Interval float64 `json:"interval"` + PartialResponseStrategy string `json:"partial_response_strategy"` } type rule interface{} type alertingRule struct { - Name string `json:"name"` - Query string `json:"query"` - Duration float64 `json:"duration"` - Labels labels.Labels `json:"labels"` - Annotations labels.Labels `json:"annotations"` - Alerts []*Alert `json:"alerts"` - Health rules.RuleHealth `json:"health"` - LastError string `json:"lastError,omitempty"` - Type string `json:"type"` + Name string `json:"name"` + Query string `json:"query"` + Duration float64 `json:"duration"` + Labels labels.Labels `json:"labels"` + Annotations labels.Labels `json:"annotations"` + Alerts []*Alert `json:"alerts"` + Health rules.RuleHealth `json:"health"` + LastError string `json:"lastError,omitempty"` + Type string `json:"type"` + PartialResponseStrategy string `json:"partial_response_strategy"` } type recordingRule struct { diff --git a/pkg/rule/api/v1_test.go b/pkg/rule/api/v1_test.go index af4034bd8d..f3e2c8d585 100644 --- a/pkg/rule/api/v1_test.go +++ b/pkg/rule/api/v1_test.go @@ -12,6 +12,7 @@ import ( "github.com/go-kit/kit/log" qapi "github.com/improbable-eng/thanos/pkg/query/api" + thanosrule "github.com/improbable-eng/thanos/pkg/rule" "github.com/prometheus/common/route" "github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/promql" @@ -23,7 +24,7 @@ type rulesRetrieverMock struct { testing *testing.T } -func (m rulesRetrieverMock) RuleGroups() []*rules.Group { +func (m rulesRetrieverMock) RuleGroups() []thanosrule.Group { var ar rulesRetrieverMock arules := ar.AlertingRules() storage := testutil.NewStorage(m.testing) @@ -59,10 +60,10 @@ func (m rulesRetrieverMock) RuleGroups() []*rules.Group { r = append(r, recordingRule) group := rules.NewGroup("grp", "/path/to/file", time.Second, r, false, opts) - return []*rules.Group{group} + return []thanosrule.Group{{Group: group}} } -func (m rulesRetrieverMock) AlertingRules() []*rules.AlertingRule { +func (m rulesRetrieverMock) AlertingRules() []thanosrule.AlertingRule { expr1, err := promql.ParseExpr(`absent(test_metric3) != 1`) if err != nil { m.testing.Fatalf("unable to parse alert expression: %s", err) @@ -90,9 +91,9 @@ func (m rulesRetrieverMock) AlertingRules() []*rules.AlertingRule { true, log.NewNopLogger(), ) - var r []*rules.AlertingRule - r = append(r, rule1) - r = append(r, rule2) + var r []thanosrule.AlertingRule + r = append(r, thanosrule.AlertingRule{AlertingRule: rule1}) + r = append(r, thanosrule.AlertingRule{AlertingRule: rule2}) return r } @@ -122,7 +123,10 @@ func TestEndpoints(t *testing.T) { algr.testing = t algr.AlertingRules() algr.RuleGroups() - api := NewAPI(nil, algr) + api := NewAPI( + nil, + algr, + ) testEndpoints(t, api) }) } @@ -142,29 +146,32 @@ func testEndpoints(t *testing.T, api *API) { response: &RuleDiscovery{ RuleGroups: []*RuleGroup{ { - Name: "grp", - File: "/path/to/file", - Interval: 1, + Name: "grp", + File: "/path/to/file", + 
Interval: 1, + PartialResponseStrategy: "WARN", Rules: []rule{ alertingRule{ - Name: "test_metric3", - Query: "absent(test_metric3) != 1", - Duration: 1, - Labels: labels.Labels{}, - Annotations: labels.Labels{}, - Alerts: []*Alert{}, - Health: "unknown", - Type: "alerting", + Name: "test_metric3", + Query: "absent(test_metric3) != 1", + Duration: 1, + Labels: labels.Labels{}, + Annotations: labels.Labels{}, + Alerts: []*Alert{}, + Health: "unknown", + Type: "alerting", + PartialResponseStrategy: "WARN", }, alertingRule{ - Name: "test_metric4", - Query: "up == 1", - Duration: 1, - Labels: labels.Labels{}, - Annotations: labels.Labels{}, - Alerts: []*Alert{}, - Health: "unknown", - Type: "alerting", + Name: "test_metric4", + Query: "up == 1", + Duration: 1, + Labels: labels.Labels{}, + Annotations: labels.Labels{}, + Alerts: []*Alert{}, + Health: "unknown", + Type: "alerting", + PartialResponseStrategy: "WARN", }, recordingRule{ Name: "recording-rule-1", diff --git a/pkg/rule/rule.go b/pkg/rule/rule.go new file mode 100644 index 0000000000..11d988a897 --- /dev/null +++ b/pkg/rule/rule.go @@ -0,0 +1,169 @@ +package thanosrule + +import ( + "fmt" + "io/ioutil" + "os" + "path" + "path/filepath" + "strings" + "time" + + "github.com/improbable-eng/thanos/pkg/store/storepb" + "github.com/pkg/errors" + "github.com/prometheus/prometheus/pkg/rulefmt" + "github.com/prometheus/prometheus/rules" + "github.com/prometheus/tsdb" + yaml "gopkg.in/yaml.v2" +) + +const tmpRuleDir = ".tmp-rules" + +type Group struct { + *rules.Group + + PartialResponseStrategy storepb.PartialResponseStrategy +} + +type AlertingRule struct { + *rules.AlertingRule + + PartialResponseStrategy storepb.PartialResponseStrategy +} + +type RuleGroups struct { + Groups []RuleGroup `yaml:"groups"` +} + +type RuleGroup struct { + rulefmt.RuleGroup + + PartialResponseStrategy storepb.PartialResponseStrategy `yaml:"partial_response_strategy"` +} + +type Managers map[storepb.PartialResponseStrategy]*rules.Manager + +func (m Managers) RuleGroups() []Group { + var res []Group + for s, r := range m { + for _, group := range r.RuleGroups() { + res = append(res, Group{Group: group, PartialResponseStrategy: s}) + } + } + return res +} + +func (m Managers) AlertingRules() []AlertingRule { + var res []AlertingRule + for s, r := range m { + for _, r := range r.AlertingRules() { + res = append(res, AlertingRule{AlertingRule: r, PartialResponseStrategy: s}) + } + } + return res +} + +func (r *RuleGroup) UnmarshalYAML(unmarshal func(interface{}) error) error { + rs := struct { + String string `yaml:"partial_response_strategy"` + }{} + + errMsg := fmt.Sprintf("failed to unmarshal 'partial_response_strategy'. Possible values are %s", strings.Join(storepb.PartialResponseStrategyValues, ",")) + if err := unmarshal(&rs); err != nil { + return errors.Wrapf(err, errMsg) + } + + rg := rulefmt.RuleGroup{} + if err := unmarshal(&rg); err != nil { + return errors.Wrapf(err, errMsg) + } + + p, ok := storepb.PartialResponseStrategy_value[strings.ToUpper(rs.String)] + if !ok { + if rs.String != "" { + return errors.Errorf("%s. Got: %s", errMsg, rs.String) + } + + // NOTE: For Rule default is abort as this is recommended for alerting. + p = storepb.PartialResponseStrategy_value[storepb.PartialResponseStrategy_ABORT.String()] + } + + r.RuleGroup = rg + r.PartialResponseStrategy = storepb.PartialResponseStrategy(p) + return nil +} + +// Update updates rules from given files to all managers we hold. 
We decide which groups should go where, based on +// special field in RuleGroup file. +func (m *Managers) Update(dataDir string, evalInterval time.Duration, files []string) error { + var ( + errs tsdb.MultiError + filesMap = map[storepb.PartialResponseStrategy][]string{} + ) + + if err := os.RemoveAll(path.Join(dataDir, tmpRuleDir)); err != nil { + return errors.Wrapf(err, "rm %s", path.Join(dataDir, tmpRuleDir)) + } + if err := os.MkdirAll(path.Join(dataDir, tmpRuleDir), os.ModePerm); err != nil { + return errors.Wrapf(err, "mkdir %s", path.Join(dataDir, tmpRuleDir)) + } + + for _, fn := range files { + b, err := ioutil.ReadFile(fn) + if err != nil { + errs = append(errs, err) + continue + } + + var rg RuleGroups + if err := yaml.Unmarshal(b, &rg); err != nil { + errs = append(errs, err) + continue + } + + // NOTE: This is very ugly, but we need to reparse it into tmp dir without the field to have to reuse + // rules.Manager. The problem is that it uses yaml.UnmarshalStrict for some reasons. + mapped := map[storepb.PartialResponseStrategy]*rulefmt.RuleGroups{} + for _, rg := range rg.Groups { + if _, ok := mapped[rg.PartialResponseStrategy]; !ok { + mapped[rg.PartialResponseStrategy] = &rulefmt.RuleGroups{} + } + + mapped[rg.PartialResponseStrategy].Groups = append( + mapped[rg.PartialResponseStrategy].Groups, + rg.RuleGroup, + ) + } + + for s, rg := range mapped { + b, err := yaml.Marshal(rg) + if err != nil { + errs = append(errs, err) + continue + } + + newFn := path.Join(dataDir, tmpRuleDir, filepath.Base(fn)+"."+s.String()) + if err := ioutil.WriteFile(newFn, b, os.ModePerm); err != nil { + errs = append(errs, err) + continue + } + + filesMap[s] = append(filesMap[s], newFn) + } + + } + + for s, fs := range filesMap { + updater, ok := (*m)[s] + if !ok { + errs = append(errs, errors.Errorf("no updater found for %v", s)) + continue + } + if err := updater.Update(evalInterval, fs); err != nil { + errs = append(errs, err) + continue + } + } + + return errs.Err() +} diff --git a/pkg/rule/rule_test.go b/pkg/rule/rule_test.go new file mode 100644 index 0000000000..932a102d15 --- /dev/null +++ b/pkg/rule/rule_test.go @@ -0,0 +1,124 @@ +package thanosrule + +import ( + "io/ioutil" + "os" + "path" + "sort" + "strings" + "testing" + "time" + + "github.com/go-kit/kit/log" + "github.com/improbable-eng/thanos/pkg/store/storepb" + "github.com/improbable-eng/thanos/pkg/testutil" + "github.com/prometheus/prometheus/rules" +) + +func TestUpdate(t *testing.T) { + dir, err := ioutil.TempDir("", "test_rule_rule_groups") + testutil.Ok(t, err) + defer func() { testutil.Ok(t, os.RemoveAll(dir)) }() + + testutil.Ok(t, ioutil.WriteFile(path.Join(dir, "no_strategy.yaml"), []byte(` +groups: +- name: "something1" + rules: + - alert: "some" + expr: "up" +`), os.ModePerm)) + testutil.Ok(t, ioutil.WriteFile(path.Join(dir, "abort.yaml"), []byte(` +groups: +- name: "something2" + partial_response_strategy: "abort" + rules: + - alert: "some" + expr: "up" +`), os.ModePerm)) + testutil.Ok(t, ioutil.WriteFile(path.Join(dir, "warn.yaml"), []byte(` +groups: +- name: "something3" + partial_response_strategy: "warn" + rules: + - alert: "some" + expr: "up" +`), os.ModePerm)) + testutil.Ok(t, ioutil.WriteFile(path.Join(dir, "wrong.yaml"), []byte(` +groups: +- name: "something4" + partial_response_strategy: "afafsdgsdgs" # Err 1 + rules: + - alert: "some" + expr: "up" +`), os.ModePerm)) + testutil.Ok(t, ioutil.WriteFile(path.Join(dir, "combined.yaml"), []byte(` +groups: +- name: "something5" + partial_response_strategy: "warn" + 
rules: + - alert: "some" + expr: "up" +- name: "something6" + partial_response_strategy: "abort" + rules: + - alert: "some" + expr: "up" +- name: "something7" + rules: + - alert: "some" + expr: "up" +`), os.ModePerm)) + testutil.Ok(t, ioutil.WriteFile(path.Join(dir, "combined-wrong.yaml"), []byte(` +groups: +- name: "something8" + partial_response_strategy: "warn" + rules: + - alert: "some" + expr: "up" +- name: "something9" + partial_response_strategy: "adad" # Err 2 + rules: + - alert: "some" + expr: "up" +`), os.ModePerm)) + + opts := rules.ManagerOptions{ + Logger: log.NewLogfmtLogger(os.Stderr), + } + m := Managers{ + storepb.PartialResponseStrategy_ABORT: rules.NewManager(&opts), + storepb.PartialResponseStrategy_WARN: rules.NewManager(&opts), + } + + err = m.Update(dir, 10*time.Second, []string{ + path.Join(dir, "no_strategy.yaml"), + path.Join(dir, "abort.yaml"), + path.Join(dir, "warn.yaml"), + path.Join(dir, "wrong.yaml"), + path.Join(dir, "combined.yaml"), + path.Join(dir, "combined_wrong.yaml"), + }) + + testutil.NotOk(t, err) + testutil.Assert(t, strings.HasPrefix(err.Error(), "2 errors: failed to unmarshal 'partial_response_strategy'"), err.Error()) + + g := m[storepb.PartialResponseStrategy_WARN].RuleGroups() + testutil.Equals(t, 2, len(g)) + + sort.Slice(g, func(i, j int) bool { + return g[i].Name() < g[j].Name() + }) + testutil.Equals(t, "something3", g[0].Name()) + testutil.Equals(t, "something5", g[1].Name()) + + g = m[storepb.PartialResponseStrategy_ABORT].RuleGroups() + testutil.Equals(t, 4, len(g)) + + sort.Slice(g, func(i, j int) bool { + return g[i].Name() < g[j].Name() + }) + testutil.Equals(t, "something1", g[0].Name()) + testutil.Equals(t, "something2", g[1].Name()) + testutil.Equals(t, "something6", g[2].Name()) + testutil.Equals(t, "something7", g[3].Name()) +} diff --git a/pkg/store/bucket.go b/pkg/store/bucket.go index fb295d167a..d2070229f1 100644 --- a/pkg/store/bucket.go +++ b/pkg/store/bucket.go @@ -249,7 +249,7 @@ func NewBucketStore( blockSyncConcurrency: blockSyncConcurrency, queryGate: NewGate( maxConcurrent, - extprom.WrapRegistererWithPrefix("thanos_bucket_store_series", reg), + extprom.WrapRegistererWithPrefix("thanos_bucket_store_series_", reg), ), samplesLimiter: NewLimiter(maxSampleCount, metrics.queriesDropped), partitioner: gapBasedPartitioner{maxGapSize: maxGapSize}, diff --git a/pkg/store/prompb/remote.pb.go b/pkg/store/prompb/remote.pb.go index 06a0952c98..8da7ac6b6d 100644 --- a/pkg/store/prompb/remote.pb.go +++ b/pkg/store/prompb/remote.pb.go @@ -53,7 +53,7 @@ func (LabelMatcher_Type) EnumDescriptor() ([]byte, []int) { } type WriteRequest struct { - Timeseries []TimeSeries `protobuf:"bytes,1,rep,name=timeseries" json:"timeseries"` + Timeseries []TimeSeries `protobuf:"bytes,1,rep,name=timeseries,proto3" json:"timeseries"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` @@ -93,7 +93,7 @@ func (m *WriteRequest) XXX_DiscardUnknown() { var xxx_messageInfo_WriteRequest proto.InternalMessageInfo type ReadRequest struct { - Queries []Query `protobuf:"bytes,1,rep,name=queries" json:"queries"` + Queries []Query `protobuf:"bytes,1,rep,name=queries,proto3" json:"queries"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` @@ -134,7 +134,7 @@ var xxx_messageInfo_ReadRequest proto.InternalMessageInfo type ReadResponse struct { // In same order as the request's queries. 
- Results []QueryResult `protobuf:"bytes,1,rep,name=results" json:"results"` + Results []QueryResult `protobuf:"bytes,1,rep,name=results,proto3" json:"results"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` @@ -176,8 +176,8 @@ var xxx_messageInfo_ReadResponse proto.InternalMessageInfo type Query struct { StartTimestampMs int64 `protobuf:"varint,1,opt,name=start_timestamp_ms,json=startTimestampMs,proto3" json:"start_timestamp_ms,omitempty"` EndTimestampMs int64 `protobuf:"varint,2,opt,name=end_timestamp_ms,json=endTimestampMs,proto3" json:"end_timestamp_ms,omitempty"` - Matchers []LabelMatcher `protobuf:"bytes,3,rep,name=matchers" json:"matchers"` - Hints *ReadHints `protobuf:"bytes,4,opt,name=hints" json:"hints,omitempty"` + Matchers []LabelMatcher `protobuf:"bytes,3,rep,name=matchers,proto3" json:"matchers"` + Hints *ReadHints `protobuf:"bytes,4,opt,name=hints,proto3" json:"hints,omitempty"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` @@ -217,7 +217,7 @@ func (m *Query) XXX_DiscardUnknown() { var xxx_messageInfo_Query proto.InternalMessageInfo type QueryResult struct { - Timeseries []TimeSeries `protobuf:"bytes,1,rep,name=timeseries" json:"timeseries"` + Timeseries []TimeSeries `protobuf:"bytes,1,rep,name=timeseries,proto3" json:"timeseries"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` @@ -298,8 +298,8 @@ func (m *Sample) XXX_DiscardUnknown() { var xxx_messageInfo_Sample proto.InternalMessageInfo type TimeSeries struct { - Labels []Label `protobuf:"bytes,1,rep,name=labels" json:"labels"` - Samples []Sample `protobuf:"bytes,2,rep,name=samples" json:"samples"` + Labels []Label `protobuf:"bytes,1,rep,name=labels,proto3" json:"labels"` + Samples []Sample `protobuf:"bytes,2,rep,name=samples,proto3" json:"samples"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` @@ -863,6 +863,9 @@ func encodeVarintRemote(dAtA []byte, offset int, v uint64) int { return offset + 1 } func (m *WriteRequest) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if len(m.Timeseries) > 0 { @@ -878,6 +881,9 @@ func (m *WriteRequest) Size() (n int) { } func (m *ReadRequest) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if len(m.Queries) > 0 { @@ -893,6 +899,9 @@ func (m *ReadRequest) Size() (n int) { } func (m *ReadResponse) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if len(m.Results) > 0 { @@ -908,6 +917,9 @@ func (m *ReadResponse) Size() (n int) { } func (m *Query) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if m.StartTimestampMs != 0 { @@ -933,6 +945,9 @@ func (m *Query) Size() (n int) { } func (m *QueryResult) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if len(m.Timeseries) > 0 { @@ -948,6 +963,9 @@ func (m *QueryResult) Size() (n int) { } func (m *Sample) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if m.Value != 0 { @@ -963,6 +981,9 @@ func (m *Sample) Size() (n int) { } func (m *TimeSeries) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if len(m.Labels) > 0 { @@ -984,6 +1005,9 @@ func (m *TimeSeries) Size() (n int) { } func (m *Label) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l l = len(m.Name) @@ -1001,6 +1025,9 @@ func (m *Label) Size() (n int) { } func (m *LabelMatcher) Size() (n int) { + if m == nil { + return 0 + } var l int 
_ = l if m.Type != 0 { @@ -1021,6 +1048,9 @@ func (m *LabelMatcher) Size() (n int) { } func (m *ReadHints) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if m.StepMs != 0 { diff --git a/pkg/store/storepb/custom.go b/pkg/store/storepb/custom.go index 18e2035e69..2097282408 100644 --- a/pkg/store/storepb/custom.go +++ b/pkg/store/storepb/custom.go @@ -6,6 +6,14 @@ import ( "github.com/prometheus/prometheus/pkg/labels" ) +var PartialResponseStrategyValues = func() []string { + var s []string + for k := range PartialResponseStrategy_value { + s = append(s, k) + } + return s +}() + func NewWarnSeriesResponse(err error) *SeriesResponse { return &SeriesResponse{ Result: &SeriesResponse_Warning{ diff --git a/pkg/store/storepb/rpc.pb.go b/pkg/store/storepb/rpc.pb.go index e9ddda8711..a176361ddb 100644 --- a/pkg/store/storepb/rpc.pb.go +++ b/pkg/store/storepb/rpc.pb.go @@ -8,8 +8,10 @@ import fmt "fmt" import math "math" import _ "github.com/gogo/protobuf/gogoproto" -import context "golang.org/x/net/context" -import grpc "google.golang.org/grpc" +import ( + context "golang.org/x/net/context" + grpc "google.golang.org/grpc" +) import io "io" @@ -56,7 +58,30 @@ func (x StoreType) String() string { return proto.EnumName(StoreType_name, int32(x)) } func (StoreType) EnumDescriptor() ([]byte, []int) { - return fileDescriptor_rpc_b2f04ff11750c7dd, []int{0} + return fileDescriptor_rpc_f4f04914f1106c76, []int{0} +} + +type PartialResponseStrategy int32 + +const ( + PartialResponseStrategy_WARN PartialResponseStrategy = 0 + PartialResponseStrategy_ABORT PartialResponseStrategy = 1 +) + +var PartialResponseStrategy_name = map[int32]string{ + 0: "WARN", + 1: "ABORT", +} +var PartialResponseStrategy_value = map[string]int32{ + "WARN": 0, + "ABORT": 1, +} + +func (x PartialResponseStrategy) String() string { + return proto.EnumName(PartialResponseStrategy_name, int32(x)) +} +func (PartialResponseStrategy) EnumDescriptor() ([]byte, []int) { + return fileDescriptor_rpc_f4f04914f1106c76, []int{1} } type Aggr int32 @@ -91,7 +116,7 @@ func (x Aggr) String() string { return proto.EnumName(Aggr_name, int32(x)) } func (Aggr) EnumDescriptor() ([]byte, []int) { - return fileDescriptor_rpc_b2f04ff11750c7dd, []int{1} + return fileDescriptor_rpc_f4f04914f1106c76, []int{2} } type InfoRequest struct { @@ -104,7 +129,7 @@ func (m *InfoRequest) Reset() { *m = InfoRequest{} } func (m *InfoRequest) String() string { return proto.CompactTextString(m) } func (*InfoRequest) ProtoMessage() {} func (*InfoRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_rpc_b2f04ff11750c7dd, []int{0} + return fileDescriptor_rpc_f4f04914f1106c76, []int{0} } func (m *InfoRequest) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -134,7 +159,7 @@ func (m *InfoRequest) XXX_DiscardUnknown() { var xxx_messageInfo_InfoRequest proto.InternalMessageInfo type InfoResponse struct { - Labels []Label `protobuf:"bytes,1,rep,name=labels" json:"labels"` + Labels []Label `protobuf:"bytes,1,rep,name=labels,proto3" json:"labels"` MinTime int64 `protobuf:"varint,2,opt,name=min_time,json=minTime,proto3" json:"min_time,omitempty"` MaxTime int64 `protobuf:"varint,3,opt,name=max_time,json=maxTime,proto3" json:"max_time,omitempty"` StoreType StoreType `protobuf:"varint,4,opt,name=storeType,proto3,enum=thanos.StoreType" json:"storeType,omitempty"` @@ -147,7 +172,7 @@ func (m *InfoResponse) Reset() { *m = InfoResponse{} } func (m *InfoResponse) String() string { return proto.CompactTextString(m) } func (*InfoResponse) ProtoMessage() {} func 
(*InfoResponse) Descriptor() ([]byte, []int) { - return fileDescriptor_rpc_b2f04ff11750c7dd, []int{1} + return fileDescriptor_rpc_f4f04914f1106c76, []int{1} } func (m *InfoResponse) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -177,22 +202,24 @@ func (m *InfoResponse) XXX_DiscardUnknown() { var xxx_messageInfo_InfoResponse proto.InternalMessageInfo type SeriesRequest struct { - MinTime int64 `protobuf:"varint,1,opt,name=min_time,json=minTime,proto3" json:"min_time,omitempty"` - MaxTime int64 `protobuf:"varint,2,opt,name=max_time,json=maxTime,proto3" json:"max_time,omitempty"` - Matchers []LabelMatcher `protobuf:"bytes,3,rep,name=matchers" json:"matchers"` - MaxResolutionWindow int64 `protobuf:"varint,4,opt,name=max_resolution_window,json=maxResolutionWindow,proto3" json:"max_resolution_window,omitempty"` - Aggregates []Aggr `protobuf:"varint,5,rep,packed,name=aggregates,enum=thanos.Aggr" json:"aggregates,omitempty"` - PartialResponseDisabled bool `protobuf:"varint,6,opt,name=partial_response_disabled,json=partialResponseDisabled,proto3" json:"partial_response_disabled,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` + MinTime int64 `protobuf:"varint,1,opt,name=min_time,json=minTime,proto3" json:"min_time,omitempty"` + MaxTime int64 `protobuf:"varint,2,opt,name=max_time,json=maxTime,proto3" json:"max_time,omitempty"` + Matchers []LabelMatcher `protobuf:"bytes,3,rep,name=matchers,proto3" json:"matchers"` + MaxResolutionWindow int64 `protobuf:"varint,4,opt,name=max_resolution_window,json=maxResolutionWindow,proto3" json:"max_resolution_window,omitempty"` + Aggregates []Aggr `protobuf:"varint,5,rep,packed,name=aggregates,proto3,enum=thanos.Aggr" json:"aggregates,omitempty"` + // Deprecated. Use partial_response_strategy instead. 
+ PartialResponseDisabled bool `protobuf:"varint,6,opt,name=partial_response_disabled,json=partialResponseDisabled,proto3" json:"partial_response_disabled,omitempty"` + PartialResponseStrategy PartialResponseStrategy `protobuf:"varint,7,opt,name=partial_response_strategy,json=partialResponseStrategy,proto3,enum=thanos.PartialResponseStrategy" json:"partial_response_strategy,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` } func (m *SeriesRequest) Reset() { *m = SeriesRequest{} } func (m *SeriesRequest) String() string { return proto.CompactTextString(m) } func (*SeriesRequest) ProtoMessage() {} func (*SeriesRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_rpc_b2f04ff11750c7dd, []int{2} + return fileDescriptor_rpc_f4f04914f1106c76, []int{2} } func (m *SeriesRequest) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -235,7 +262,7 @@ func (m *SeriesResponse) Reset() { *m = SeriesResponse{} } func (m *SeriesResponse) String() string { return proto.CompactTextString(m) } func (*SeriesResponse) ProtoMessage() {} func (*SeriesResponse) Descriptor() ([]byte, []int) { - return fileDescriptor_rpc_b2f04ff11750c7dd, []int{3} + return fileDescriptor_rpc_f4f04914f1106c76, []int{3} } func (m *SeriesResponse) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -271,7 +298,7 @@ type isSeriesResponse_Result interface { } type SeriesResponse_Series struct { - Series *Series `protobuf:"bytes,1,opt,name=series,oneof"` + Series *Series `protobuf:"bytes,1,opt,name=series,proto3,oneof"` } type SeriesResponse_Warning struct { Warning string `protobuf:"bytes,2,opt,name=warning,proto3,oneof"` @@ -382,7 +409,7 @@ func (m *LabelNamesRequest) Reset() { *m = LabelNamesRequest{} } func (m *LabelNamesRequest) String() string { return proto.CompactTextString(m) } func (*LabelNamesRequest) ProtoMessage() {} func (*LabelNamesRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_rpc_b2f04ff11750c7dd, []int{4} + return fileDescriptor_rpc_f4f04914f1106c76, []int{4} } func (m *LabelNamesRequest) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -412,8 +439,8 @@ func (m *LabelNamesRequest) XXX_DiscardUnknown() { var xxx_messageInfo_LabelNamesRequest proto.InternalMessageInfo type LabelNamesResponse struct { - Names []string `protobuf:"bytes,1,rep,name=names" json:"names,omitempty"` - Warnings []string `protobuf:"bytes,2,rep,name=warnings" json:"warnings,omitempty"` + Names []string `protobuf:"bytes,1,rep,name=names,proto3" json:"names,omitempty"` + Warnings []string `protobuf:"bytes,2,rep,name=warnings,proto3" json:"warnings,omitempty"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` @@ -423,7 +450,7 @@ func (m *LabelNamesResponse) Reset() { *m = LabelNamesResponse{} } func (m *LabelNamesResponse) String() string { return proto.CompactTextString(m) } func (*LabelNamesResponse) ProtoMessage() {} func (*LabelNamesResponse) Descriptor() ([]byte, []int) { - return fileDescriptor_rpc_b2f04ff11750c7dd, []int{5} + return fileDescriptor_rpc_f4f04914f1106c76, []int{5} } func (m *LabelNamesResponse) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -464,7 +491,7 @@ func (m *LabelValuesRequest) Reset() { *m = LabelValuesRequest{} } func (m *LabelValuesRequest) String() string { return proto.CompactTextString(m) } func (*LabelValuesRequest) ProtoMessage() {} func (*LabelValuesRequest) Descriptor() ([]byte, []int) { - return 
fileDescriptor_rpc_b2f04ff11750c7dd, []int{6} + return fileDescriptor_rpc_f4f04914f1106c76, []int{6} } func (m *LabelValuesRequest) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -494,8 +521,8 @@ func (m *LabelValuesRequest) XXX_DiscardUnknown() { var xxx_messageInfo_LabelValuesRequest proto.InternalMessageInfo type LabelValuesResponse struct { - Values []string `protobuf:"bytes,1,rep,name=values" json:"values,omitempty"` - Warnings []string `protobuf:"bytes,2,rep,name=warnings" json:"warnings,omitempty"` + Values []string `protobuf:"bytes,1,rep,name=values,proto3" json:"values,omitempty"` + Warnings []string `protobuf:"bytes,2,rep,name=warnings,proto3" json:"warnings,omitempty"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` @@ -505,7 +532,7 @@ func (m *LabelValuesResponse) Reset() { *m = LabelValuesResponse{} } func (m *LabelValuesResponse) String() string { return proto.CompactTextString(m) } func (*LabelValuesResponse) ProtoMessage() {} func (*LabelValuesResponse) Descriptor() ([]byte, []int) { - return fileDescriptor_rpc_b2f04ff11750c7dd, []int{7} + return fileDescriptor_rpc_f4f04914f1106c76, []int{7} } func (m *LabelValuesResponse) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -544,6 +571,7 @@ func init() { proto.RegisterType((*LabelValuesRequest)(nil), "thanos.LabelValuesRequest") proto.RegisterType((*LabelValuesResponse)(nil), "thanos.LabelValuesResponse") proto.RegisterEnum("thanos.StoreType", StoreType_name, StoreType_value) + proto.RegisterEnum("thanos.PartialResponseStrategy", PartialResponseStrategy_name, PartialResponseStrategy_value) proto.RegisterEnum("thanos.Aggr", Aggr_name, Aggr_value) } @@ -555,8 +583,9 @@ var _ grpc.ClientConn // is compatible with the grpc package it is being compiled against. const _ = grpc.SupportPackageIsVersion4 -// Client API for Store service - +// StoreClient is the client API for Store service. +// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://godoc.org/google.golang.org/grpc#ClientConn.NewStream. type StoreClient interface { // / Info returns meta information about a store e.g labels that makes that store unique as well as time range that is // / available. @@ -637,8 +666,7 @@ func (c *storeClient) LabelValues(ctx context.Context, in *LabelValuesRequest, o return out, nil } -// Server API for Store service - +// StoreServer is the server API for Store service. type StoreServer interface { // / Info returns meta information about a store e.g labels that makes that store unique as well as time range that is // / available. 
@@ -896,6 +924,11 @@ func (m *SeriesRequest) MarshalTo(dAtA []byte) (int, error) { } i++ } + if m.PartialResponseStrategy != 0 { + dAtA[i] = 0x38 + i++ + i = encodeVarintRpc(dAtA, i, uint64(m.PartialResponseStrategy)) + } if m.XXX_unrecognized != nil { i += copy(dAtA[i:], m.XXX_unrecognized) } @@ -1132,6 +1165,9 @@ func encodeVarintRpc(dAtA []byte, offset int, v uint64) int { return offset + 1 } func (m *InfoRequest) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if m.XXX_unrecognized != nil { @@ -1141,6 +1177,9 @@ func (m *InfoRequest) Size() (n int) { } func (m *InfoResponse) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if len(m.Labels) > 0 { @@ -1165,6 +1204,9 @@ func (m *InfoResponse) Size() (n int) { } func (m *SeriesRequest) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if m.MinTime != 0 { @@ -1192,6 +1234,9 @@ func (m *SeriesRequest) Size() (n int) { if m.PartialResponseDisabled { n += 2 } + if m.PartialResponseStrategy != 0 { + n += 1 + sovRpc(uint64(m.PartialResponseStrategy)) + } if m.XXX_unrecognized != nil { n += len(m.XXX_unrecognized) } @@ -1199,6 +1244,9 @@ func (m *SeriesRequest) Size() (n int) { } func (m *SeriesResponse) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if m.Result != nil { @@ -1211,6 +1259,9 @@ func (m *SeriesResponse) Size() (n int) { } func (m *SeriesResponse_Series) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if m.Series != nil { @@ -1220,6 +1271,9 @@ func (m *SeriesResponse_Series) Size() (n int) { return n } func (m *SeriesResponse_Warning) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l l = len(m.Warning) @@ -1227,6 +1281,9 @@ func (m *SeriesResponse_Warning) Size() (n int) { return n } func (m *LabelNamesRequest) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if m.PartialResponseDisabled { @@ -1239,6 +1296,9 @@ func (m *LabelNamesRequest) Size() (n int) { } func (m *LabelNamesResponse) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if len(m.Names) > 0 { @@ -1260,6 +1320,9 @@ func (m *LabelNamesResponse) Size() (n int) { } func (m *LabelValuesRequest) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l l = len(m.Label) @@ -1276,6 +1339,9 @@ func (m *LabelValuesRequest) Size() (n int) { } func (m *LabelValuesResponse) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if len(m.Values) > 0 { @@ -1657,6 +1723,10 @@ func (m *SeriesRequest) Unmarshal(dAtA []byte) error { if postIndex > l { return io.ErrUnexpectedEOF } + var elementCount int + if elementCount != 0 && len(m.Aggregates) == 0 { + m.Aggregates = make([]Aggr, 0, elementCount) + } for iNdEx < postIndex { var v Aggr for shift := uint(0); ; shift += 7 { @@ -1698,6 +1768,25 @@ func (m *SeriesRequest) Unmarshal(dAtA []byte) error { } } m.PartialResponseDisabled = bool(v != 0) + case 7: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field PartialResponseStrategy", wireType) + } + m.PartialResponseStrategy = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowRpc + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.PartialResponseStrategy |= (PartialResponseStrategy(b) & 0x7F) << shift + if b < 0x80 { + break + } + } default: iNdEx = preIndex skippy, err := skipRpc(dAtA[iNdEx:]) @@ -2326,51 +2415,54 @@ var ( ErrIntOverflowRpc = fmt.Errorf("proto: integer overflow") ) -func init() { proto.RegisterFile("rpc.proto", fileDescriptor_rpc_b2f04ff11750c7dd) 
} - -var fileDescriptor_rpc_b2f04ff11750c7dd = []byte{ - // 683 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x7c, 0x54, 0xc1, 0x6e, 0xda, 0x4c, - 0x10, 0xc6, 0x36, 0x36, 0x78, 0x48, 0x90, 0xb3, 0x21, 0xf9, 0x8d, 0x7f, 0x89, 0x22, 0x4e, 0x28, - 0xad, 0x92, 0x96, 0x4a, 0x95, 0xda, 0x1b, 0x10, 0x47, 0x41, 0x4d, 0x40, 0x5d, 0x20, 0x69, 0x7b, - 0x49, 0x4d, 0xb2, 0x71, 0x2c, 0x19, 0xdb, 0xf5, 0x9a, 0x26, 0xb9, 0xf6, 0x35, 0x7a, 0xeb, 0xd3, - 0xe4, 0xd8, 0x27, 0xa8, 0x5a, 0x9e, 0xa4, 0xf2, 0x7a, 0x0d, 0xb8, 0x4a, 0xb9, 0xed, 0x7c, 0xdf, - 0x78, 0xe6, 0xdb, 0x99, 0xcf, 0x0b, 0x6a, 0x18, 0x5c, 0xee, 0x07, 0xa1, 0x1f, 0xf9, 0x48, 0x89, - 0x6e, 0x2c, 0xcf, 0xa7, 0x46, 0x29, 0xba, 0x0f, 0x08, 0x4d, 0x40, 0xa3, 0x62, 0xfb, 0xb6, 0xcf, - 0x8e, 0x07, 0xf1, 0x29, 0x41, 0x1b, 0x9b, 0x50, 0xea, 0x79, 0xd7, 0x3e, 0x26, 0x9f, 0x67, 0x84, - 0x46, 0x8d, 0xef, 0x02, 0x6c, 0x24, 0x31, 0x0d, 0x7c, 0x8f, 0x12, 0xf4, 0x14, 0x14, 0xd7, 0x9a, - 0x10, 0x97, 0xea, 0x42, 0x5d, 0x6a, 0x96, 0x5a, 0x9b, 0xfb, 0x49, 0xed, 0xfd, 0x93, 0x18, 0xed, - 0xe4, 0x1f, 0x7e, 0x3e, 0xc9, 0x61, 0x9e, 0x82, 0xaa, 0x50, 0x9c, 0x3a, 0xde, 0x45, 0xe4, 0x4c, - 0x89, 0x2e, 0xd6, 0x85, 0xa6, 0x84, 0x0b, 0x53, 0xc7, 0x1b, 0x39, 0x53, 0xc2, 0x28, 0xeb, 0x2e, - 0xa1, 0x24, 0x4e, 0x59, 0x77, 0x8c, 0x3a, 0x00, 0x95, 0x46, 0x7e, 0x48, 0x46, 0xf7, 0x01, 0xd1, - 0xf3, 0x75, 0xa1, 0x59, 0x6e, 0x6d, 0xa5, 0x5d, 0x86, 0x29, 0x81, 0x97, 0x39, 0x8d, 0x6f, 0x22, - 0x6c, 0x0e, 0x49, 0xe8, 0x10, 0xca, 0x65, 0x67, 0x1a, 0x0b, 0xff, 0x6e, 0x2c, 0x66, 0x1b, 0xbf, - 0x8a, 0xa9, 0xe8, 0xf2, 0x86, 0x84, 0x54, 0x97, 0xd8, 0xed, 0x2a, 0x99, 0xdb, 0x9d, 0x26, 0x24, - 0xbf, 0xe4, 0x22, 0x17, 0xb5, 0x60, 0x27, 0x2e, 0x19, 0x12, 0xea, 0xbb, 0xb3, 0xc8, 0xf1, 0xbd, - 0x8b, 0x5b, 0xc7, 0xbb, 0xf2, 0x6f, 0x99, 0x78, 0x09, 0x6f, 0x4f, 0xad, 0x3b, 0xbc, 0xe0, 0xce, - 0x19, 0x85, 0x9e, 0x01, 0x58, 0xb6, 0x1d, 0x12, 0xdb, 0x8a, 0x08, 0xd5, 0xe5, 0xba, 0xd4, 0x2c, - 0xb7, 0x36, 0xd2, 0x6e, 0x6d, 0xdb, 0x0e, 0xf1, 0x0a, 0x8f, 0xde, 0x40, 0x35, 0xb0, 0xc2, 0xc8, - 0xb1, 0xdc, 0xb8, 0x0b, 0xdb, 0xc4, 0xc5, 0x95, 0x43, 0xad, 0x89, 0x4b, 0xae, 0x74, 0xa5, 0x2e, - 0x34, 0x8b, 0xf8, 0x3f, 0x9e, 0x90, 0x6e, 0xea, 0x90, 0xd3, 0x8d, 0x4f, 0x50, 0x4e, 0x87, 0xc3, - 0x77, 0xd8, 0x04, 0x85, 0x32, 0x84, 0xcd, 0xa6, 0xd4, 0x2a, 0x2f, 0xa6, 0xcb, 0xd0, 0xe3, 0x1c, - 0xe6, 0x3c, 0x32, 0xa0, 0x70, 0x6b, 0x85, 0x9e, 0xe3, 0xd9, 0x6c, 0x56, 0xea, 0x71, 0x0e, 0xa7, - 0x40, 0xa7, 0x08, 0x4a, 0x48, 0xe8, 0xcc, 0x8d, 0x1a, 0x03, 0xd8, 0x62, 0xf3, 0xe9, 0x5b, 0xd3, - 0xe5, 0x0a, 0xd6, 0x4a, 0x16, 0xd6, 0x4b, 0x3e, 0x02, 0xb4, 0x5a, 0x90, 0xcb, 0xae, 0x80, 0xec, - 0xc5, 0x00, 0x73, 0x9e, 0x8a, 0x93, 0x00, 0x19, 0x50, 0xe4, 0x8a, 0xa8, 0x2e, 0x32, 0x62, 0x11, - 0x37, 0xae, 0x79, 0x9d, 0x33, 0xcb, 0x9d, 0x2d, 0x95, 0x55, 0x40, 0x66, 0xfe, 0x64, 0x2a, 0x54, - 0x9c, 0x04, 0xeb, 0xf5, 0x8a, 0xeb, 0xf5, 0xf6, 0x60, 0x3b, 0xd3, 0x87, 0x0b, 0xde, 0x05, 0xe5, - 0x0b, 0x43, 0xb8, 0x62, 0x1e, 0xad, 0x93, 0xbc, 0x87, 0x41, 0x5d, 0x78, 0x1c, 0x95, 0xa0, 0x30, - 0xee, 0xbf, 0xed, 0x0f, 0xce, 0xfb, 0x5a, 0x0e, 0xa9, 0x20, 0xbf, 0x1b, 0x9b, 0xf8, 0x83, 0x26, - 0xa0, 0x22, 0xe4, 0xf1, 0xf8, 0xc4, 0xd4, 0xc4, 0x38, 0x63, 0xd8, 0x3b, 0x34, 0xbb, 0x6d, 0xac, - 0x49, 0x71, 0xc6, 0x70, 0x34, 0xc0, 0xa6, 0x96, 0x8f, 0x71, 0x6c, 0x76, 0xcd, 0xde, 0x99, 0xa9, - 0xc9, 0x7b, 0x1d, 0xc8, 0xc7, 0x8e, 0x42, 0x05, 0x90, 0x70, 0xfb, 0x3c, 0x29, 0xd5, 0x1d, 0x8c, - 0xfb, 0x23, 0x4d, 0x88, 0xb1, 0xe1, 0xf8, 0x54, 0x13, 0xe3, 0xc3, 0x69, 0xaf, 0xaf, 0x49, 0xec, - 0xd0, 0x7e, 0x9f, 
0xd4, 0x60, 0x59, 0x26, 0xd6, 0xe4, 0xd6, 0x57, 0x11, 0x64, 0x26, 0x0c, 0xbd, - 0x80, 0x7c, 0xfc, 0x22, 0xa0, 0xed, 0xd4, 0x35, 0x2b, 0xef, 0x85, 0x51, 0xc9, 0x82, 0x7c, 0x10, - 0xaf, 0x41, 0x49, 0xac, 0x85, 0x76, 0xb2, 0x56, 0x4b, 0x3f, 0xdb, 0xfd, 0x1b, 0x4e, 0x3e, 0x7c, - 0x2e, 0xa0, 0x2e, 0xc0, 0xd2, 0x0a, 0xa8, 0x9a, 0xf9, 0x1f, 0x57, 0xfd, 0x66, 0x18, 0x8f, 0x51, - 0xbc, 0xff, 0x11, 0x94, 0x56, 0xf6, 0x83, 0xb2, 0xa9, 0x19, 0x73, 0x18, 0xff, 0x3f, 0xca, 0x25, - 0x75, 0x3a, 0xd5, 0x87, 0xdf, 0xb5, 0xdc, 0xc3, 0xbc, 0x26, 0xfc, 0x98, 0xd7, 0x84, 0x5f, 0xf3, - 0x9a, 0xf0, 0xb1, 0xc0, 0x5e, 0xa1, 0x60, 0x32, 0x51, 0xd8, 0xf3, 0xf9, 0xf2, 0x4f, 0x00, 0x00, - 0x00, 0xff, 0xff, 0x33, 0x33, 0x9b, 0x3a, 0x76, 0x05, 0x00, 0x00, +func init() { proto.RegisterFile("rpc.proto", fileDescriptor_rpc_f4f04914f1106c76) } + +var fileDescriptor_rpc_f4f04914f1106c76 = []byte{ + // 729 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x7c, 0x54, 0xcf, 0x6e, 0xda, 0x4e, + 0x10, 0xc6, 0x36, 0x18, 0x18, 0x12, 0xe4, 0x6c, 0x48, 0x62, 0xfc, 0x93, 0x08, 0xe2, 0x84, 0xf2, + 0xab, 0x48, 0x4b, 0xa5, 0x4a, 0xed, 0x0d, 0x88, 0xa3, 0xa0, 0x26, 0xd0, 0x2e, 0x10, 0xfa, 0xe7, + 0x90, 0x9a, 0x64, 0xe3, 0x58, 0x02, 0x9b, 0x7a, 0x4d, 0x93, 0x5c, 0xfb, 0x28, 0x7d, 0x9a, 0x1c, + 0xfb, 0x04, 0x55, 0x9b, 0xa7, 0xe8, 0xb1, 0xda, 0xf5, 0x1a, 0x70, 0x9b, 0x70, 0xdb, 0xfd, 0xbe, + 0xf1, 0xcc, 0xb7, 0x33, 0x9f, 0x07, 0xb2, 0xfe, 0xf4, 0xbc, 0x36, 0xf5, 0xbd, 0xc0, 0x43, 0x6a, + 0x70, 0x65, 0xb9, 0x1e, 0x35, 0x72, 0xc1, 0xed, 0x94, 0xd0, 0x10, 0x34, 0x0a, 0xb6, 0x67, 0x7b, + 0xfc, 0xb8, 0xcf, 0x4e, 0x21, 0x5a, 0x59, 0x87, 0x5c, 0xdb, 0xbd, 0xf4, 0x30, 0xf9, 0x3c, 0x23, + 0x34, 0xa8, 0x7c, 0x93, 0x60, 0x2d, 0xbc, 0xd3, 0xa9, 0xe7, 0x52, 0x82, 0xfe, 0x07, 0x75, 0x6c, + 0x8d, 0xc8, 0x98, 0xea, 0x52, 0x59, 0xa9, 0xe6, 0xea, 0xeb, 0xb5, 0x30, 0x77, 0xed, 0x98, 0xa1, + 0xcd, 0xe4, 0xdd, 0x8f, 0xdd, 0x04, 0x16, 0x21, 0xa8, 0x08, 0x99, 0x89, 0xe3, 0x9e, 0x05, 0xce, + 0x84, 0xe8, 0x72, 0x59, 0xaa, 0x2a, 0x38, 0x3d, 0x71, 0xdc, 0xbe, 0x33, 0x21, 0x9c, 0xb2, 0x6e, + 0x42, 0x4a, 0x11, 0x94, 0x75, 0xc3, 0xa9, 0x7d, 0xc8, 0xd2, 0xc0, 0xf3, 0x49, 0xff, 0x76, 0x4a, + 0xf4, 0x64, 0x59, 0xaa, 0xe6, 0xeb, 0x1b, 0x51, 0x95, 0x5e, 0x44, 0xe0, 0x45, 0x4c, 0xe5, 0xb7, + 0x0c, 0xeb, 0x3d, 0xe2, 0x3b, 0x84, 0x0a, 0xd9, 0xb1, 0xc2, 0xd2, 0xe3, 0x85, 0xe5, 0x78, 0xe1, + 0x17, 0x8c, 0x0a, 0xce, 0xaf, 0x88, 0x4f, 0x75, 0x85, 0xbf, 0xae, 0x10, 0x7b, 0xdd, 0x49, 0x48, + 0x8a, 0x47, 0xce, 0x63, 0x51, 0x1d, 0xb6, 0x58, 0x4a, 0x9f, 0x50, 0x6f, 0x3c, 0x0b, 0x1c, 0xcf, + 0x3d, 0xbb, 0x76, 0xdc, 0x0b, 0xef, 0x9a, 0x8b, 0x57, 0xf0, 0xe6, 0xc4, 0xba, 0xc1, 0x73, 0x6e, + 0xc8, 0x29, 0xf4, 0x04, 0xc0, 0xb2, 0x6d, 0x9f, 0xd8, 0x56, 0x40, 0xa8, 0x9e, 0x2a, 0x2b, 0xd5, + 0x7c, 0x7d, 0x2d, 0xaa, 0xd6, 0xb0, 0x6d, 0x1f, 0x2f, 0xf1, 0xe8, 0x15, 0x14, 0xa7, 0x96, 0x1f, + 0x38, 0xd6, 0x98, 0x55, 0xe1, 0x93, 0x38, 0xbb, 0x70, 0xa8, 0x35, 0x1a, 0x93, 0x0b, 0x5d, 0x2d, + 0x4b, 0xd5, 0x0c, 0xde, 0x11, 0x01, 0xd1, 0xa4, 0x0e, 0x04, 0x8d, 0x3e, 0x3e, 0xf0, 0x2d, 0x0d, + 0x7c, 0x2b, 0x20, 0xf6, 0xad, 0x9e, 0xe6, 0xed, 0xdd, 0x8d, 0x0a, 0xbf, 0x89, 0xe7, 0xe8, 0x89, + 0xb0, 0x7f, 0x92, 0x47, 0x44, 0xe5, 0x13, 0xe4, 0xa3, 0xce, 0x0b, 0x83, 0x54, 0x41, 0xa5, 0x1c, + 0xe1, 0x8d, 0xcf, 0xd5, 0xf3, 0xf3, 0xd1, 0x71, 0xf4, 0x28, 0x81, 0x05, 0x8f, 0x0c, 0x48, 0x5f, + 0x5b, 0xbe, 0xeb, 0xb8, 0x36, 0x1f, 0x44, 0xf6, 0x28, 0x81, 0x23, 0xa0, 0x99, 0x01, 0xd5, 0x27, + 0x74, 0x36, 0x0e, 0x2a, 0x5d, 0xd8, 0xe0, 0xcd, 0xef, 0x58, 0x93, 0xc5, 0x7c, 0x57, 
0xf6, 0x43, + 0x5a, 0xd9, 0x8f, 0xca, 0x21, 0xa0, 0xe5, 0x84, 0x42, 0x76, 0x01, 0x52, 0x2e, 0x03, 0xb8, 0xad, + 0xb3, 0x38, 0xbc, 0x20, 0x03, 0x32, 0x42, 0x11, 0xd5, 0x65, 0x4e, 0xcc, 0xef, 0x95, 0x4b, 0x91, + 0xe7, 0xd4, 0x1a, 0xcf, 0x16, 0xca, 0x0a, 0x90, 0xe2, 0xe6, 0xe7, 0x2a, 0xb2, 0x38, 0xbc, 0xac, + 0xd6, 0x2b, 0xaf, 0xd6, 0xdb, 0x86, 0xcd, 0x58, 0x1d, 0x21, 0x78, 0x1b, 0xd4, 0x2f, 0x1c, 0x11, + 0x8a, 0xc5, 0x6d, 0x95, 0xe4, 0x3d, 0x0c, 0xd9, 0xf9, 0x0f, 0x84, 0x72, 0x90, 0x1e, 0x74, 0x5e, + 0x77, 0xba, 0xc3, 0x8e, 0x96, 0x40, 0x59, 0x48, 0xbd, 0x1d, 0x98, 0xf8, 0xbd, 0x26, 0xa1, 0x0c, + 0x24, 0xf1, 0xe0, 0xd8, 0xd4, 0x64, 0x16, 0xd1, 0x6b, 0x1f, 0x98, 0xad, 0x06, 0xd6, 0x14, 0x16, + 0xd1, 0xeb, 0x77, 0xb1, 0xa9, 0x25, 0x19, 0x8e, 0xcd, 0x96, 0xd9, 0x3e, 0x35, 0xb5, 0xd4, 0x5e, + 0x0d, 0x76, 0x1e, 0x71, 0x0d, 0xcb, 0x34, 0x6c, 0x60, 0x91, 0xbe, 0xd1, 0xec, 0xe2, 0xbe, 0x26, + 0xed, 0x35, 0x21, 0xc9, 0xec, 0x8d, 0xd2, 0xa0, 0xe0, 0xc6, 0x30, 0xe4, 0x5a, 0xdd, 0x41, 0xa7, + 0xaf, 0x49, 0x0c, 0xeb, 0x0d, 0x4e, 0x34, 0x99, 0x1d, 0x4e, 0xda, 0x1d, 0x4d, 0xe1, 0x87, 0xc6, + 0xbb, 0xb0, 0x26, 0x8f, 0x32, 0xb1, 0x96, 0xaa, 0x7f, 0x95, 0x21, 0xc5, 0x1f, 0x82, 0x9e, 0x41, + 0x92, 0xad, 0x27, 0xb4, 0x19, 0xb9, 0x6c, 0x69, 0x79, 0x19, 0x85, 0x38, 0x28, 0x1a, 0xf7, 0x12, + 0xd4, 0xd0, 0x8a, 0x68, 0x2b, 0x6e, 0xcd, 0xe8, 0xb3, 0xed, 0xbf, 0xe1, 0xf0, 0xc3, 0xa7, 0x12, + 0x6a, 0x01, 0x2c, 0xac, 0x83, 0x8a, 0xb1, 0xe5, 0xb0, 0xec, 0x4f, 0xc3, 0x78, 0x88, 0x12, 0xf5, + 0x0f, 0x21, 0xb7, 0x34, 0x4f, 0x14, 0x0f, 0x8d, 0x99, 0xc9, 0xf8, 0xef, 0x41, 0x2e, 0xcc, 0xd3, + 0x2c, 0xde, 0xfd, 0x2a, 0x25, 0xee, 0xee, 0x4b, 0xd2, 0xf7, 0xfb, 0x92, 0xf4, 0xf3, 0xbe, 0x24, + 0x7d, 0x48, 0xf3, 0x95, 0x38, 0x1d, 0x8d, 0x54, 0xbe, 0xcb, 0x9f, 0xff, 0x09, 0x00, 0x00, 0xff, + 0xff, 0x92, 0x5a, 0x97, 0xd8, 0x03, 0x06, 0x00, 0x00, } diff --git a/pkg/store/storepb/rpc.proto b/pkg/store/storepb/rpc.proto index 2c264f1c15..95a9d59e4d 100644 --- a/pkg/store/storepb/rpc.proto +++ b/pkg/store/storepb/rpc.proto @@ -12,10 +12,6 @@ option (gogoproto.unmarshaler_all) = true; option (gogoproto.goproto_getters_all) = false; /// Store reprents API against instance that stores XOR encoded values with label set metadata (e.g Prometheus metrics). -/// -/// Partial Response is supported unless `partial_response_disabled` is true. When disabled any error that will result -/// in partial data returned (e.g missing chunk series because of underlying storeAPI is temporarily not available) is -/// failing the request. service Store { /// Info returns meta information about a store e.g labels that makes that store unique as well as time range that is /// available. @@ -51,6 +47,20 @@ message InfoResponse { StoreType storeType = 4; } +/// PartialResponseStrategy controls partial response handling. +enum PartialResponseStrategy { + /// WARN strategy tells server to treat any error that will related to single StoreAPI (e.g missing chunk series because of underlying + /// storeAPI is temporarily not available) as warning which will not fail the whole query (still OK response). + /// Server should produce those as a warnings field in response. + WARN = 0; + /// ABORT strategy tells server to treat any error that will related to single StoreAPI (e.g missing chunk series because of underlying + /// storeAPI is temporarily not available) as the gRPC error that aborts the query. + /// + /// This is especially useful for any rule/alert evaluations on top of StoreAPI which usually does not tolerate partial + /// errors. 
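+  /// For example, the Thanos Ruler defaults alerting rule groups to ABORT, so a failing StoreAPI surfaces as a rule evaluation failure rather than a silently incomplete alerting result.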
+ ABORT = 1;
+}
+
 message SeriesRequest { int64 min_time = 1; int64 max_time = 2;
@@ -59,7 +69,11 @@ message SeriesRequest { int64 max_resolution_window = 4; repeated Aggr aggregates = 5;
+ // Deprecated. Use partial_response_strategy instead.
 bool partial_response_disabled = 6;
+
+ // TODO(bwplotka): Move Thanos components to use strategy instead. Including QueryAPI.
+ PartialResponseStrategy partial_response_strategy = 7;
 }
 enum Aggr {
@@ -83,6 +97,9 @@ message SeriesResponse { message LabelNamesRequest { bool partial_response_disabled = 1;
+
+ // TODO(bwplotka): Move Thanos components to use strategy instead. Including QueryAPI.
+ PartialResponseStrategy partial_response_strategy = 2;
 }
 message LabelNamesResponse {
@@ -94,6 +111,9 @@ message LabelValuesRequest { string label = 1; bool partial_response_disabled = 2;
+
+ // TODO(bwplotka): Move Thanos components to use strategy instead. Including QueryAPI.
+ PartialResponseStrategy partial_response_strategy = 3;
 }
 message LabelValuesResponse {
diff --git a/pkg/store/storepb/types.pb.go b/pkg/store/storepb/types.pb.go
index 0f6767b97b..9344fbc9d0 100644
--- a/pkg/store/storepb/types.pb.go
+++ b/pkg/store/storepb/types.pb.go
@@ -153,8 +153,8 @@ func (m *Chunk) XXX_DiscardUnknown() { var xxx_messageInfo_Chunk proto.InternalMessageInfo type Series struct {
- Labels []Label `protobuf:"bytes,1,rep,name=labels" json:"labels"`
- Chunks []AggrChunk `protobuf:"bytes,2,rep,name=chunks" json:"chunks"`
+ Labels []Label `protobuf:"bytes,1,rep,name=labels,proto3" json:"labels"`
+ Chunks []AggrChunk `protobuf:"bytes,2,rep,name=chunks,proto3" json:"chunks"`
 XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"`
@@ -196,12 +196,12 @@ var xxx_messageInfo_Series proto.InternalMessageInfo type AggrChunk struct { MinTime int64 `protobuf:"varint,1,opt,name=min_time,json=minTime,proto3" json:"min_time,omitempty"` MaxTime int64 `protobuf:"varint,2,opt,name=max_time,json=maxTime,proto3" json:"max_time,omitempty"`
- Raw *Chunk `protobuf:"bytes,3,opt,name=raw" json:"raw,omitempty"`
- Count *Chunk `protobuf:"bytes,4,opt,name=count" json:"count,omitempty"`
- Sum *Chunk `protobuf:"bytes,5,opt,name=sum" json:"sum,omitempty"`
- Min *Chunk `protobuf:"bytes,6,opt,name=min" json:"min,omitempty"`
- Max *Chunk `protobuf:"bytes,7,opt,name=max" json:"max,omitempty"`
- Counter *Chunk `protobuf:"bytes,8,opt,name=counter" json:"counter,omitempty"`
+ Raw *Chunk `protobuf:"bytes,3,opt,name=raw,proto3" json:"raw,omitempty"`
+ Count *Chunk `protobuf:"bytes,4,opt,name=count,proto3" json:"count,omitempty"`
+ Sum *Chunk `protobuf:"bytes,5,opt,name=sum,proto3" json:"sum,omitempty"`
+ Min *Chunk `protobuf:"bytes,6,opt,name=min,proto3" json:"min,omitempty"`
+ Max *Chunk `protobuf:"bytes,7,opt,name=max,proto3" json:"max,omitempty"`
+ Counter *Chunk `protobuf:"bytes,8,opt,name=counter,proto3" json:"counter,omitempty"`
 XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"`
@@ -541,6 +541,9 @@ func encodeVarintTypes(dAtA []byte, offset int, v uint64) int { return offset + 1 } func (m *Label) Size() (n int) {
+ if m == nil {
+ return 0
+ }
 var l int _ = l l = len(m.Name)
@@ -558,6 +561,9 @@ func (m *Label) Size() (n int) { } func (m *Chunk) Size() (n int) {
+ if m == nil {
+ return 0
+ }
 var l int _ = l if m.Type != 0 {
@@ -574,6 +580,9 @@ func (m *Chunk) Size() (n int) { } func (m *Series) Size() (n int) {
+ if m == nil {
+ return 0
+ }
 var l int _ = l if len(m.Labels) > 0 {
@@ -595,6 +604,9
@@ func (m *Series) Size() (n int) { } func (m *AggrChunk) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if m.MinTime != 0 { @@ -634,6 +646,9 @@ func (m *AggrChunk) Size() (n int) { } func (m *LabelMatcher) Size() (n int) { + if m == nil { + return 0 + } var l int _ = l if m.Type != 0 { diff --git a/pkg/ui/rule.go b/pkg/ui/rule.go index ff4b9412ca..2493e7f0f3 100644 --- a/pkg/ui/rule.go +++ b/pkg/ui/rule.go @@ -10,6 +10,8 @@ import ( "sort" "github.com/go-kit/kit/log" + thanosrule "github.com/improbable-eng/thanos/pkg/rule" + "github.com/improbable-eng/thanos/pkg/store/storepb" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/route" "github.com/prometheus/prometheus/rules" @@ -20,16 +22,16 @@ type Rule struct { flagsMap map[string]string - ruleManager *rules.Manager - queryURL string + ruleManagers thanosrule.Managers + queryURL string } -func NewRuleUI(logger log.Logger, ruleManager *rules.Manager, queryURL string, flagsMap map[string]string) *Rule { +func NewRuleUI(logger log.Logger, ruleManagers map[storepb.PartialResponseStrategy]*rules.Manager, queryURL string, flagsMap map[string]string) *Rule { return &Rule{ - BaseUI: NewBaseUI(logger, "rule_menu.html", ruleTmplFuncs(queryURL)), - flagsMap: flagsMap, - ruleManager: ruleManager, - queryURL: queryURL, + BaseUI: NewBaseUI(logger, "rule_menu.html", ruleTmplFuncs(queryURL)), + flagsMap: flagsMap, + ruleManagers: ruleManagers, + queryURL: queryURL, } } @@ -96,7 +98,7 @@ func ruleTmplFuncs(queryURL string) template.FuncMap { } func (ru *Rule) alerts(w http.ResponseWriter, r *http.Request) { - alerts := ru.ruleManager.AlertingRules() + alerts := ru.ruleManagers.AlertingRules() alertsSorter := byAlertStateAndNameSorter{alerts: alerts} sort.Sort(alertsSorter) @@ -111,13 +113,15 @@ func (ru *Rule) alerts(w http.ResponseWriter, r *http.Request) { prefix := GetWebPrefix(ru.logger, ru.flagsMap, r) + // TODO(bwplotka): Update HTML to include partial response. ru.executeTemplate(w, "alerts.html", prefix, alertStatus) } func (ru *Rule) rules(w http.ResponseWriter, r *http.Request) { prefix := GetWebPrefix(ru.logger, ru.flagsMap, r) - ru.executeTemplate(w, "rules.html", prefix, ru.ruleManager) + // TODO(bwplotka): Update HTML to include partial response. + ru.executeTemplate(w, "rules.html", prefix, ru.ruleManagers) } // root redirects / requests to /graph, taking into account the path prefix value @@ -139,12 +143,12 @@ func (ru *Rule) Register(r *route.Router) { // AlertStatus bundles alerting rules and the mapping of alert states to row classes. type AlertStatus struct { - AlertingRules []*rules.AlertingRule + AlertingRules []thanosrule.AlertingRule AlertStateToRowClass map[rules.AlertState]string } type byAlertStateAndNameSorter struct { - alerts []*rules.AlertingRule + alerts []thanosrule.AlertingRule } func (s byAlertStateAndNameSorter) Len() int { diff --git a/scripts/genproto.sh b/scripts/genproto.sh index fa2933831f..5749e699ea 100755 --- a/scripts/genproto.sh +++ b/scripts/genproto.sh @@ -18,7 +18,7 @@ if ! 
[[ $(${PROTOC_BIN} --version) =~ "3.4.0" ]]; then exit 255 fi -THANOS_ROOT="${GOPATH}/src/github.com/improbable-eng/thanos" +THANOS_ROOT=$(pwd) PROM_PATH="${THANOS_ROOT}/pkg/store/storepb" GOGOPROTO_ROOT="${THANOS_ROOT}/vendor/github.com/gogo/protobuf" GOGOPROTO_PATH="${GOGOPROTO_ROOT}:${GOGOPROTO_ROOT}/protobuf" diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go index 7ef9c7ac8b..d8b4fc2607 100644 --- a/test/e2e/query_test.go +++ b/test/e2e/query_test.go @@ -91,11 +91,22 @@ func testQuerySimple(t *testing.T, conf testConfig) { default: } - var err error - res, err = promclient.QueryInstant(ctx, nil, urlParse(t, "http://"+queryHTTP(1)), "up", time.Now(), false) + var ( + err error + warnings []string + ) + res, warnings, err = promclient.QueryInstant(ctx, nil, urlParse(t, "http://"+queryHTTP(1)), "up", time.Now(), promclient.QueryOptions{ + Deduplicate: false, + }) if err != nil { return err } + + if len(warnings) > 0 { + // we don't expect warnings. + return errors.Errorf("unexpected warnings %s", warnings) + } + expectedRes := 4 if conf.name == "gossip" { expectedRes = 3 @@ -146,11 +157,22 @@ func testQuerySimple(t *testing.T, conf testConfig) { default: } - var err error - res, err = promclient.QueryInstant(ctx, nil, urlParse(t, "http://"+queryHTTP(1)), "up", time.Now(), true) + var ( + err error + warnings []string + ) + res, warnings, err = promclient.QueryInstant(ctx, nil, urlParse(t, "http://"+queryHTTP(1)), "up", time.Now(), promclient.QueryOptions{ + Deduplicate: true, + }) if err != nil { return err } + + if len(warnings) > 0 { + // we don't expect warnings. + return errors.Errorf("unexpected warnings %s", warnings) + } + expectedRes := 3 if conf.name == "gossip" { expectedRes = 2 diff --git a/test/e2e/rule_test.go b/test/e2e/rule_test.go index 4af72bd648..6432638cc4 100644 --- a/test/e2e/rule_test.go +++ b/test/e2e/rule_test.go @@ -3,42 +3,68 @@ package e2e_test import ( "context" "encoding/json" + "fmt" + "io/ioutil" + "math" "net/http" + "os" + "path" "sort" "testing" "time" "github.com/improbable-eng/thanos/pkg/promclient" "github.com/improbable-eng/thanos/pkg/runutil" + "github.com/improbable-eng/thanos/pkg/store/storepb" "github.com/improbable-eng/thanos/pkg/testutil" "github.com/pkg/errors" "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/pkg/timestamp" ) -const alwaysFireRule = ` +const ( + testAlertRuleAbortOnPartialResponse = ` groups: - name: example + # Abort should be a default: partial_response_strategy: "ABORT" rules: - - alert: AlwaysFiring - expr: vector(1) + - alert: TestAlert_AbortOnPartialResponse + # It must be based on actual metrics otherwise call to StoreAPI would be not involved. + expr: absent(some_metric) labels: severity: page annotations: - summary: "I always complain" + summary: "I always complain, but I don't allow partial response in query." ` + testAlertRuleWarnOnPartialResponse = ` +groups: +- name: example + partial_response_strategy: "WARN" + rules: + - alert: TestAlert_WarnOnPartialResponse + # It must be based on actual metric, otherwise call to StoreAPI would be not involved. + expr: absent(some_metric) + labels: + severity: page + annotations: + summary: "I always complain and allow partial response in query." +` +) var ( + alertsToTest = []string{testAlertRuleAbortOnPartialResponse, testAlertRuleWarnOnPartialResponse} + ruleStaticFlagsSuite = newSpinupSuite(). Add(querierWithStoreFlags(1, "", rulerGRPC(1), rulerGRPC(2))). 
- Add(rulerWithQueryFlags(1, alwaysFireRule, queryHTTP(1))). - Add(rulerWithQueryFlags(2, alwaysFireRule, queryHTTP(1))). + Add(rulerWithQueryFlags(1, alertsToTest, queryHTTP(1))). + Add(rulerWithQueryFlags(2, alertsToTest, queryHTTP(1))). Add(alertManager(1)) ruleFileSDSuite = newSpinupSuite(). Add(querierWithFileSD(1, "", rulerGRPC(1), rulerGRPC(2))). - Add(rulerWithFileSD(1, alwaysFireRule, queryHTTP(1))). - Add(rulerWithFileSD(2, alwaysFireRule, queryHTTP(1))). + Add(rulerWithFileSD(1, alertsToTest, queryHTTP(1))). + Add(rulerWithFileSD(2, alertsToTest, queryHTTP(1))). Add(alertManager(1)) ) @@ -82,14 +108,28 @@ func testRuleComponent(t *testing.T, conf testConfig) { { "__name__": "ALERTS", "severity": "page", - "alertname": "AlwaysFiring", + "alertname": "TestAlert_AbortOnPartialResponse", + "alertstate": "firing", + "replica": "1", + }, + { + "__name__": "ALERTS", + "severity": "page", + "alertname": "TestAlert_AbortOnPartialResponse", + "alertstate": "firing", + "replica": "2", + }, + { + "__name__": "ALERTS", + "severity": "page", + "alertname": "TestAlert_WarnOnPartialResponse", "alertstate": "firing", "replica": "1", }, { "__name__": "ALERTS", "severity": "page", - "alertname": "AlwaysFiring", + "alertname": "TestAlert_WarnOnPartialResponse", "alertstate": "firing", "replica": "2", }, @@ -97,17 +137,27 @@ func testRuleComponent(t *testing.T, conf testConfig) { expAlertLabels := []model.LabelSet{ { "severity": "page", - "alertname": "AlwaysFiring", + "alertname": "TestAlert_AbortOnPartialResponse", + "replica": "1", + }, + { + "severity": "page", + "alertname": "TestAlert_AbortOnPartialResponse", + "replica": "2", + }, + { + "severity": "page", + "alertname": "TestAlert_WarnOnPartialResponse", "replica": "1", }, { "severity": "page", - "alertname": "AlwaysFiring", + "alertname": "TestAlert_WarnOnPartialResponse", "replica": "2", }, } - testutil.Ok(t, runutil.Retry(5*time.Second, ctx.Done(), func() error { + testutil.Ok(t, runutil.Retry(5*time.Second, ctx.Done(), func() (err error) { select { case <-exit: cancel() @@ -118,13 +168,22 @@ func testRuleComponent(t *testing.T, conf testConfig) { qtime := time.Now() // The time series written for the firing alerting rule must be queryable. - res, err := promclient.QueryInstant(ctx, nil, urlParse(t, "http://"+queryHTTP(1)), "ALERTS", time.Now(), false) + res, warnings, err := promclient.QueryInstant(ctx, nil, urlParse(t, "http://"+queryHTTP(1)), "ALERTS", time.Now(), promclient.QueryOptions{ + Deduplicate: false, + }) if err != nil { return err } - if len(res) != 2 { + + if len(warnings) > 0 { + // we don't expect warnings. + return errors.Errorf("unexpected warnings %s", warnings) + } + + if len(res) != len(expMetrics) { return errors.Errorf("unexpected result length %d", len(res)) } + for i, r := range res { if !r.Metric.Equal(expMetrics[i]) { return errors.Errorf("unexpected metric %s", r.Metric) @@ -136,12 +195,13 @@ func testRuleComponent(t *testing.T, conf testConfig) { return errors.Errorf("unexpected value %f", r.Value) } } + // A notification must be sent to Alertmanager. alrts, err := queryAlertmanagerAlerts(ctx, "http://localhost:29093") if err != nil { return err } - if len(alrts) != 2 { + if len(alrts) != len(expAlertLabels) { return errors.Errorf("unexpected alerts length %d", len(alrts)) } for i, a := range alrts { @@ -151,6 +211,233 @@ func testRuleComponent(t *testing.T, conf testConfig) { } return nil })) + + // checks counter ensures we are not missing metrics. 
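+ // Each strategy (abort and warn) loads one group with a single rule here, so exactly two prometheus_rule_group_rules samples, each equal to 1, are expected below.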
+ checks := 0
+ // Check metrics to make sure we report the correct ones, so we can tell when an alert is not triggered because of a query issue.
+ testutil.Ok(t, promclient.MetricValues(ctx, nil, urlParse(t, "http://"+rulerHTTP(1)), func(lset labels.Labels, val float64) error {
+ switch lset.Get("__name__") {
+ case "prometheus_rule_group_rules":
+ checks++
+ if val != 1 {
+ return errors.Errorf("expected 1 loaded groups for strategy %s but found %v", lset.Get("strategy"), val)
+ }
+ }
+
+ return nil
+ }))
+ testutil.Equals(t, 2, checks)
+}
+
+type failingStoreAPI struct{}
+
+func (a *failingStoreAPI) Info(context.Context, *storepb.InfoRequest) (*storepb.InfoResponse, error) {
+ return &storepb.InfoResponse{
+ MinTime: math.MinInt64,
+ MaxTime: math.MaxInt64,
+ Labels: []storepb.Label{
+ {
+ Name: "magic",
+ Value: "store_api",
+ },
+ },
+ }, nil
+}
+
+func (a *failingStoreAPI) Series(_ *storepb.SeriesRequest, _ storepb.Store_SeriesServer) error {
+ return errors.New("I always fail. No reason. I am just offended StoreAPI. Don't touch me")
+}
+
+func (a *failingStoreAPI) LabelNames(context.Context, *storepb.LabelNamesRequest) (*storepb.LabelNamesResponse, error) {
+ return &storepb.LabelNamesResponse{}, nil
+}
+
+func (a *failingStoreAPI) LabelValues(context.Context, *storepb.LabelValuesRequest) (*storepb.LabelValuesResponse, error) {
+ return &storepb.LabelValuesResponse{}, nil
+}
+
+// Test Ruler behaviour on different storepb.PartialResponseStrategy values when getting a partial response from a single `failingStoreAPI`.
+func TestRulePartialResponse(t *testing.T) {
+ const expectedWarning = "receive series from Addr: 127.0.0.1:21091 Labels: [{magic store_api {} [] 0}] Mint: -9223372036854775808 Maxt: 9223372036854775807: rpc error: code = Unknown desc = I always fail. No reason. I am just offended StoreAPI. Don't touch me"
+
+ dir, err := ioutil.TempDir("", "test_rulepartial_respn")
+ testutil.Ok(t, err)
+ defer func() { testutil.Ok(t, os.RemoveAll(dir)) }()
+
+ suite := newSpinupSuite().
+ Add(querierWithStoreFlags(1, "", rulerGRPC(1), fakeStoreAPIGRPC(1))).
+ Add(rulerWithDir(1, dir, queryHTTP(1))).
+ Add(fakeStoreAPI(1, &failingStoreAPI{})).
+ Add(alertManager(1))
+
+ ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
+
+ exit, err := suite.Exec(t, ctx, "test_rule_partial_response_component")
+ if err != nil {
+ t.Errorf("spinup failed: %v", err)
+ cancel()
+ return
+ }
+
+ defer func() {
+ cancel()
+ <-exit
+ }()
+
+ testutil.Ok(t, runutil.Retry(5*time.Second, ctx.Done(), func() (err error) {
+ select {
+ case <-exit:
+ cancel()
+ return nil
+ default:
+ }
+
+ // Before the rules are loaded, the ALERTS query should return no series, only the warning from the failing StoreAPI.
+ res, warnings, err := promclient.QueryInstant(ctx, nil, urlParse(t, "http://"+queryHTTP(1)), "ALERTS", time.Now(), promclient.QueryOptions{
+ Deduplicate: false,
+ })
+ if err != nil {
+ return err
+ }
+
+ if len(warnings) != 1 {
+ // We do expect warnings.
+ return errors.Errorf("unexpected number of warnings, expected 1, got %s", warnings)
+ }
+
+ // No rules are loaded yet, so we expect an empty result.
+ if len(res) != 0 {
+ return errors.Errorf("unexpected result length. expected %v, got %v", 0, res)
+ }
+ return nil
+ }))
+
+ // Add alerts to the ruler. We want to add them only once the Querier is ready, otherwise we would get "no store match the query".
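+ // The rule files are written into the directory the ruler loads rules from, then picked up via the /-/reload endpoint below.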
+ for i, rule := range alertsToTest {
+ testutil.Ok(t, ioutil.WriteFile(path.Join(dir, fmt.Sprintf("rules-%d.yaml", i)), []byte(rule), 0666))
+ }
+
+ resp, err := http.Post("http://"+rulerHTTP(1)+"/-/reload", "", nil)
+ testutil.Ok(t, err)
+ defer func() { _, _ = ioutil.ReadAll(resp.Body); _ = resp.Body.Close() }()
+ testutil.Equals(t, http.StatusOK, resp.StatusCode)
+
+ // We don't expect the ALERTS series for `TestAlert_AbortOnPartialResponse` as that rule does NOT allow partial response, so its evaluation fails and increments `prometheus_rule_evaluation_failures_total` instead.
+ expMetrics := []model.Metric{
+ {
+ "__name__": "ALERTS",
+ "severity": "page",
+ "alertname": "TestAlert_WarnOnPartialResponse",
+ "alertstate": "firing",
+ "replica": "1",
+ },
+ }
+ expAlertLabels := []model.LabelSet{
+ {
+ "severity": "page",
+ "alertname": "TestAlert_WarnOnPartialResponse",
+ "replica": "1",
+ },
+ }
+
+ testutil.Ok(t, runutil.Retry(5*time.Second, ctx.Done(), func() (err error) {
+ select {
+ case <-exit:
+ cancel()
+ return nil
+ default:
+ }
+
+ qtime := time.Now()
+
+ // The time series written for the firing alerting rule must be queryable.
+ res, warnings, err := promclient.QueryInstant(ctx, nil, urlParse(t, "http://"+queryHTTP(1)), "ALERTS", time.Now(), promclient.QueryOptions{
+ Deduplicate: false,
+ })
+ if err != nil {
+ return err
+ }
+
+ if len(warnings) != 1 {
+ // We do expect warnings.
+ return errors.Errorf("unexpected number of warnings, expected 1, got %s", warnings)
+ }
+
+ if warnings[0] != expectedWarning {
+ return errors.Errorf("unexpected warning, expected %s, got %s", expectedWarning, warnings[0])
+ }
+
+ // Only the warn-strategy alert should be present; the abort-strategy rule fails its evaluation because of the failing StoreAPI.
+ if len(res) != len(expMetrics) {
+ return errors.Errorf("unexpected result length. expected %v, got %v", len(expMetrics), res)
+ }
+
+ for i, r := range res {
+ if !r.Metric.Equal(expMetrics[i]) {
+ return errors.Errorf("unexpected metric %s, expected %s", r.Metric, expMetrics[i])
+ }
+ if int64(r.Timestamp) != timestamp.FromTime(qtime) {
+ return errors.Errorf("unexpected timestamp %d", r.Timestamp)
+ }
+ if r.Value != 1 {
+ return errors.Errorf("unexpected value %f", r.Value)
+ }
+ }
+
+ // A notification must be sent to Alertmanager.
+ alrts, err := queryAlertmanagerAlerts(ctx, "http://localhost:29093")
+ if err != nil {
+ return err
+ }
+ if len(alrts) != len(expAlertLabels) {
+ return errors.Errorf("unexpected alerts length %d", len(alrts))
+ }
+ for i, a := range alrts {
+ if !a.Labels.Equal(expAlertLabels[i]) {
+ return errors.Errorf("unexpected labels %s", a.Labels)
+ }
+ }
+ return nil
+ }))
+
+ // checks counter ensures we are not missing metrics.
+ checks := 0
+ // Check metrics to make sure we report the correct ones, so we can tell when an alert is not triggered because of a query issue.
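+ // Six checks are expected in total: prometheus_rule_group_rules for each strategy, evaluation failures only for the abort strategy, and evaluations with warnings only for the warn strategy.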
+ testutil.Ok(t, promclient.MetricValues(ctx, nil, urlParse(t, "http://"+rulerHTTP(1)), func(lset labels.Labels, val float64) error { + switch lset.Get("__name__") { + case "prometheus_rule_group_rules": + checks++ + if val != 1 { + return errors.Errorf("expected 1 loaded groups for strategy %s but found %v", lset.Get("strategy"), val) + } + case "prometheus_rule_evaluation_failures_total": + if lset.Get("strategy") == "abort" { + checks++ + if val <= 0 { + return errors.Errorf("expected rule eval failures for abort strategy rule as we have failing storeAPI but found %v", val) + } + } else if lset.Get("strategy") == "warn" { + checks++ + if val > 0 { + return errors.Errorf("expected no rule eval failures for warm strategy rule but found %v", val) + } + } + case "thanos_rule_evaluation_with_warnings_total": + if lset.Get("strategy") == "warn" { + checks++ + if val <= 0 { + return errors.Errorf("expected rule eval with warnings for warn strategy rule as we have failing storeAPI but found %v", val) + } + } else if lset.Get("strategy") == "abort" { + checks++ + if val > 0 { + return errors.Errorf("expected rule eval with warnings 0 for abort strategy rule but found %v", val) + } + } + } + return nil + })) + testutil.Equals(t, 6, checks) } // TODO(bwplotka): Move to promclient. diff --git a/test/e2e/spinup_test.go b/test/e2e/spinup_test.go index 592c1e7e69..34bd40f695 100644 --- a/test/e2e/spinup_test.go +++ b/test/e2e/spinup_test.go @@ -4,7 +4,9 @@ import ( "bytes" "context" "fmt" + "io" "io/ioutil" + "net" "os" "os/exec" "path" @@ -14,11 +16,11 @@ import ( "github.com/improbable-eng/thanos/pkg/objstore/s3" "github.com/improbable-eng/thanos/pkg/runutil" - + "github.com/improbable-eng/thanos/pkg/store/storepb" "github.com/improbable-eng/thanos/pkg/testutil" - "github.com/oklog/run" "github.com/pkg/errors" + "google.golang.org/grpc" ) var ( @@ -45,9 +47,37 @@ var ( storeGatewayHTTP = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 20190+i) } minioHTTP = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 20290+i) } + + fakeStoreAPIGRPC = func(i int) string { return fmt.Sprintf("127.0.0.1:%d", 21090+i) } ) -type cmdScheduleFunc func(workDir string) ([]*exec.Cmd, error) +type Exec interface { + Start(stdout io.Writer, stderr io.Writer) error + Wait() error + Kill() error + + String() string +} + +type cmdExec struct { + *exec.Cmd +} + +func newCmdExec(cmd *exec.Cmd) *cmdExec { + return &cmdExec{Cmd: cmd} +} + +func (c *cmdExec) Start(stdout io.Writer, stderr io.Writer) error { + c.Stderr = stderr + c.Stdout = stdout + return c.Cmd.Start() +} + +func (c *cmdExec) Kill() error { return c.Process.Signal(syscall.SIGKILL) } + +func (c *cmdExec) String() string { return fmt.Sprintf("%s %s", c.Path, c.Args[1]) } + +type cmdScheduleFunc func(workDir string) ([]Exec, error) type spinupSuite struct { cmdScheduleFuncs []cmdScheduleFunc @@ -64,7 +94,7 @@ func (s *spinupSuite) Add(cmdSchedule cmdScheduleFunc) *spinupSuite { } func scraper(i int, config string) cmdScheduleFunc { - return func(workDir string) ([]*exec.Cmd, error) { + return func(workDir string) ([]Exec, error) { promDir := fmt.Sprintf("%s/data/prom%d", workDir, i) if err := os.MkdirAll(promDir, 0777); err != nil { return nil, errors.Wrap(err, "create prom dir failed") @@ -74,26 +104,26 @@ func scraper(i int, config string) cmdScheduleFunc { return nil, errors.Wrap(err, "creating prom config failed") } - var cmds []*exec.Cmd - cmds = append(cmds, exec.Command(testutil.PrometheusBinary(), + var cmds []Exec + cmds = append(cmds, 
newCmdExec(exec.Command(testutil.PrometheusBinary(), "--config.file", promDir+"/prometheus.yml", "--storage.tsdb.path", promDir, "--log.level", "info", "--web.listen-address", promHTTP(i), - )) - return append(cmds, exec.Command("thanos", "sidecar", + ))) + return append(cmds, newCmdExec(exec.Command("thanos", "sidecar", "--debug.name", fmt.Sprintf("sidecar-%d", i), "--grpc-address", sidecarGRPC(i), "--http-address", sidecarHTTP(i), "--prometheus.url", fmt.Sprintf("http://%s", promHTTP(i)), "--tsdb.path", promDir, "--cluster.disable", - "--log.level", "debug")), nil + "--log.level", "debug"))), nil } } func receiver(i int, config string) cmdScheduleFunc { - return func(workDir string) ([]*exec.Cmd, error) { + return func(workDir string) ([]Exec, error) { promDir := fmt.Sprintf("%s/data/remote-write-prom%d", workDir, i) if err := os.MkdirAll(promDir, 0777); err != nil { return nil, errors.Wrap(err, "create prom dir failed") @@ -103,48 +133,48 @@ func receiver(i int, config string) cmdScheduleFunc { return nil, errors.Wrap(err, "creating prom config failed") } - var cmds []*exec.Cmd - cmds = append(cmds, exec.Command(testutil.PrometheusBinary(), + var cmds []Exec + cmds = append(cmds, newCmdExec(exec.Command(testutil.PrometheusBinary(), "--config.file", promDir+"/prometheus.yml", "--storage.tsdb.path", promDir, "--log.level", "info", "--web.listen-address", promRemoteWriteHTTP(i), - )) - return append(cmds, exec.Command("thanos", "receive", + ))) + return append(cmds, newCmdExec(exec.Command("thanos", "receive", "--debug.name", fmt.Sprintf("remote-write-receive-%d", i), "--grpc-address", remoteWriteReceiveGRPC(i), "--http-address", remoteWriteReceiveMetricHTTP(i), "--remote-write.address", remoteWriteReceiveHTTP(i), "--tsdb.path", promDir, - "--log.level", "debug")), nil + "--log.level", "debug"))), nil } } func querier(i int, replicaLabel string, staticStores ...string) cmdScheduleFunc { - return func(_ string) ([]*exec.Cmd, error) { + return func(_ string) ([]Exec, error) { args := append(defaultQuerierFlags(i, replicaLabel), "--cluster.gossip-interval", "200ms", "--cluster.pushpull-interval", "200ms") for _, s := range staticStores { args = append(args, "--store", s) } - return []*exec.Cmd{exec.Command("thanos", args...)}, nil + return []Exec{newCmdExec(exec.Command("thanos", args...))}, nil } } func querierWithStoreFlags(i int, replicaLabel string, storesAddresses ...string) cmdScheduleFunc { - return func(_ string) ([]*exec.Cmd, error) { + return func(_ string) ([]Exec, error) { args := defaultQuerierFlags(i, replicaLabel) for _, addr := range storesAddresses { args = append(args, "--store", addr) } - return []*exec.Cmd{exec.Command("thanos", args...)}, nil + return []Exec{newCmdExec(exec.Command("thanos", args...))}, nil } } func querierWithFileSD(i int, replicaLabel string, storesAddresses ...string) cmdScheduleFunc { - return func(workDir string) ([]*exec.Cmd, error) { + return func(workDir string) ([]Exec, error) { queryFileSDDir := fmt.Sprintf("%s/data/queryFileSd%d", workDir, i) if err := os.MkdirAll(queryFileSDDir, 0777); err != nil { return nil, errors.Wrap(err, "create prom dir failed") @@ -160,19 +190,19 @@ func querierWithFileSD(i int, replicaLabel string, storesAddresses ...string) cm "--store.sd-interval", "5s", ) - return []*exec.Cmd{exec.Command("thanos", args...)}, nil + return []Exec{newCmdExec(exec.Command("thanos", args...))}, nil } } func storeGateway(i int, bucketConfig []byte) cmdScheduleFunc { - return func(workDir string) ([]*exec.Cmd, error) { + return 
func(workDir string) ([]Exec, error) { dbDir := fmt.Sprintf("%s/data/store-gateway%d", workDir, i) if err := os.MkdirAll(dbDir, 0777); err != nil { return nil, errors.Wrap(err, "creating store gateway dir failed") } - return []*exec.Cmd{exec.Command("thanos", + return []Exec{newCmdExec(exec.Command("thanos", "store", "--debug.name", fmt.Sprintf("store-%d", i), "--data-dir", dbDir, @@ -182,12 +212,12 @@ func storeGateway(i int, bucketConfig []byte) cmdScheduleFunc { "--objstore.config", string(bucketConfig), // Accelerated sync time for quicker test (3m by default) "--sync-block-duration", "5s", - )}, nil + ))}, nil } } func alertManager(i int) cmdScheduleFunc { - return func(workDir string) ([]*exec.Cmd, error) { + return func(workDir string) ([]Exec, error) { dir := fmt.Sprintf("%s/data/alertmanager%d", workDir, i) if err := os.MkdirAll(dir, 0777); err != nil { @@ -205,85 +235,150 @@ receivers: if err := ioutil.WriteFile(dir+"/config.yaml", []byte(config), 0666); err != nil { return nil, errors.Wrap(err, "creating alertmanager config file failed") } - return []*exec.Cmd{exec.Command(testutil.AlertmanagerBinary(), + return []Exec{newCmdExec(exec.Command(testutil.AlertmanagerBinary(), "--config.file", dir+"/config.yaml", "--web.listen-address", "127.0.0.1:29093", "--log.level", "debug", - )}, nil + ))}, nil } } -func ruler(i int, rules string) cmdScheduleFunc { - return func(workDir string) ([]*exec.Cmd, error) { +func rulerWithQueryFlags(i int, rules []string, queryAddresses ...string) cmdScheduleFunc { + return func(workDir string) ([]Exec, error) { dbDir := fmt.Sprintf("%s/data/rule%d", workDir, i) if err := os.MkdirAll(dbDir, 0777); err != nil { - return nil, errors.Wrap(err, "creating ruler dir failed") + return nil, errors.Wrap(err, "creating ruler dir") } - err := ioutil.WriteFile(dbDir+"/rules.yaml", []byte(rules), 0666) - if err != nil { - return nil, errors.Wrap(err, "creating ruler file failed") + for i, rule := range rules { + if err := ioutil.WriteFile(path.Join(dbDir, fmt.Sprintf("/rules-%d.yaml", i)), []byte(rule), 0666); err != nil { + return nil, errors.Wrapf(err, "writing rule %s", path.Join(dbDir, fmt.Sprintf("/rules-%d.yaml", i))) + } } - args := append(defaultRulerFlags(i, dbDir), - "--cluster.gossip-interval", "200ms", - "--cluster.pushpull-interval", "200ms") - return []*exec.Cmd{exec.Command("thanos", args...)}, nil + args := defaultRulerFlags(i, dbDir, dbDir) + + for _, addr := range queryAddresses { + args = append(args, "--query", addr) + } + return []Exec{newCmdExec(exec.Command("thanos", args...))}, nil } } -func rulerWithQueryFlags(i int, rules string, queryAddresses ...string) cmdScheduleFunc { - return func(workDir string) ([]*exec.Cmd, error) { +func rulerWithDir(i int, ruleDir string, queryAddresses ...string) cmdScheduleFunc { + return func(workDir string) ([]Exec, error) { dbDir := fmt.Sprintf("%s/data/rule%d", workDir, i) if err := os.MkdirAll(dbDir, 0777); err != nil { - return nil, errors.Wrap(err, "creating ruler dir failed") - } - err := ioutil.WriteFile(dbDir+"/rules.yaml", []byte(rules), 0666) - if err != nil { - return nil, errors.Wrap(err, "creating ruler file failed") + return nil, errors.Wrap(err, "creating ruler dir") } - args := defaultRulerFlags(i, dbDir) + args := defaultRulerFlags(i, dbDir, ruleDir) for _, addr := range queryAddresses { args = append(args, "--query", addr) } - return []*exec.Cmd{exec.Command("thanos", args...)}, nil + return []Exec{newCmdExec(exec.Command("thanos", args...))}, nil } } -func rulerWithFileSD(i int, rules 
string, queryAddresses ...string) cmdScheduleFunc { - return func(workDir string) ([]*exec.Cmd, error) { +func rulerWithFileSD(i int, rules []string, queryAddresses ...string) cmdScheduleFunc { + return func(workDir string) ([]Exec, error) { dbDir := fmt.Sprintf("%s/data/rule%d", workDir, i) if err := os.MkdirAll(dbDir, 0777); err != nil { - return nil, errors.Wrap(err, "creating ruler dir failed") + return nil, errors.Wrap(err, "creating ruler dir") } - err := ioutil.WriteFile(dbDir+"/rules.yaml", []byte(rules), 0666) - if err != nil { - return nil, errors.Wrap(err, "creating ruler file failed") + for i, rule := range rules { + if err := ioutil.WriteFile(path.Join(dbDir, fmt.Sprintf("/rules-%d.yaml", i)), []byte(rule), 0666); err != nil { + return nil, errors.Wrapf(err, "writing rule %s", path.Join(dbDir, fmt.Sprintf("/rules-%d.yaml", i))) + } } ruleFileSDDir := fmt.Sprintf("%s/data/ruleFileSd%d", workDir, i) if err := os.MkdirAll(ruleFileSDDir, 0777); err != nil { - return nil, errors.Wrap(err, "create ruler filesd dir failed") + return nil, errors.Wrap(err, "create ruler filesd dir") } if err := ioutil.WriteFile(ruleFileSDDir+"/filesd.json", []byte(generateFileSD(queryAddresses)), 0666); err != nil { - return nil, errors.Wrap(err, "creating ruler filesd config failed") + return nil, errors.Wrap(err, "creating ruler filesd config") } - args := append(defaultRulerFlags(i, dbDir), + args := append(defaultRulerFlags(i, dbDir, dbDir), "--query.sd-files", path.Join(ruleFileSDDir, "filesd.json"), "--query.sd-interval", "5s") - return []*exec.Cmd{exec.Command("thanos", args...)}, nil + return []Exec{newCmdExec(exec.Command("thanos", args...))}, nil + } +} + +type sameProcessGRPCServiceExec struct { + i int + stdout io.Writer + stderr io.Writer + + ctx context.Context + cancel context.CancelFunc + srvChan <-chan error + srv *grpc.Server +} + +func (c *sameProcessGRPCServiceExec) Start(stdout io.Writer, stderr io.Writer) error { + c.stderr = stderr + c.stdout = stdout + + if c.ctx != nil { + return errors.New("process already started") + } + c.ctx, c.cancel = context.WithCancel(context.Background()) + + l, err := net.Listen("tcp", fakeStoreAPIGRPC(c.i)) + if err != nil { + return errors.Wrap(err, "listen API address") + } + + srvChan := make(chan error) + go func() { + defer close(srvChan) + if err := c.srv.Serve(l); err != nil { + srvChan <- err + _, _ = c.stderr.Write([]byte(fmt.Sprintf("server failed: %s", err))) + } + + }() + c.srvChan = srvChan + return nil +} + +func (c *sameProcessGRPCServiceExec) Wait() error { + err := <-c.srvChan + if c.ctx.Err() == nil && err != nil { + return err + } + return err +} +func (c *sameProcessGRPCServiceExec) Kill() error { + c.cancel() + c.srv.Stop() + + return nil +} + +func (c *sameProcessGRPCServiceExec) String() string { + return fmt.Sprintf("gRPC service %v on %v", c.i, fakeStoreAPIGRPC(c.i)) +} + +func fakeStoreAPI(i int, svc storepb.StoreServer) cmdScheduleFunc { + return func(_ string) ([]Exec, error) { + srv := grpc.NewServer() + storepb.RegisterStoreServer(srv, svc) + + return []Exec{&sameProcessGRPCServiceExec{i: i, srv: srv}}, nil } } func minio(accessKey string, secretKey string) cmdScheduleFunc { - return func(workDir string) ([]*exec.Cmd, error) { + return func(workDir string) ([]Exec, error) { dbDir := fmt.Sprintf("%s/data/minio", workDir) if err := os.MkdirAll(dbDir, 0777); err != nil { @@ -299,7 +394,7 @@ func minio(accessKey string, secretKey string) cmdScheduleFunc { fmt.Sprintf("MINIO_ACCESS_KEY=%s", accessKey), 
fmt.Sprintf("MINIO_SECRET_KEY=%s", secretKey)) - return []*exec.Cmd{cmd}, nil + return []Exec{newCmdExec(cmd)}, nil } } @@ -384,7 +479,7 @@ func (s *spinupSuite) Exec(t testing.TB, ctx context.Context, testName string) ( }) } - var commands []*exec.Cmd + var commands []Exec for _, cmdFunc := range s.cmdScheduleFuncs { cmds, err := cmdFunc(dir) @@ -398,11 +493,7 @@ func (s *spinupSuite) Exec(t testing.TB, ctx context.Context, testName string) ( // Run go routine for each command. for _, c := range commands { var stderr, stdout bytes.Buffer - c.Stderr = &stderr - c.Stdout = &stdout - - err := c.Start() - if err != nil { + if err := c.Start(&stdout, &stderr); err != nil { // Let already started commands finish. go func() { _ = g.Run() }() return nil, errors.Wrap(err, "failed to start") @@ -410,7 +501,7 @@ func (s *spinupSuite) Exec(t testing.TB, ctx context.Context, testName string) ( cmd := c g.Add(func() error { - id := fmt.Sprintf("%s %s", cmd.Path, cmd.Args[1]) + id := c.String() err := cmd.Wait() @@ -424,7 +515,7 @@ func (s *spinupSuite) Exec(t testing.TB, ctx context.Context, testName string) ( return errors.Wrap(err, id) }, func(error) { // This's accepted scenario to kill a process immediately for sure and run tests as fast as possible. - _ = cmd.Process.Signal(syscall.SIGKILL) + _ = cmd.Kill() }) } @@ -467,12 +558,12 @@ func defaultQuerierFlags(i int, replicaLabel string) []string { } } -func defaultRulerFlags(i int, dbDir string) []string { +func defaultRulerFlags(i int, dbDir string, ruleDir string) []string { return []string{"rule", "--debug.name", fmt.Sprintf("rule-%d", i), "--label", fmt.Sprintf(`replica="%d"`, i), "--data-dir", dbDir, - "--rule-file", path.Join(dbDir, "*.yaml"), + "--rule-file", path.Join(ruleDir, "*.yaml"), "--eval-interval", "1s", "--alertmanagers.url", "http://127.0.0.1:29093", "--grpc-address", rulerGRPC(i), diff --git a/test/e2e/store_gateway_test.go b/test/e2e/store_gateway_test.go index 6c36a963c8..8648dc5b3d 100644 --- a/test/e2e/store_gateway_test.go +++ b/test/e2e/store_gateway_test.go @@ -93,11 +93,22 @@ func TestStoreGatewayQuery(t *testing.T) { default: } - var err error - res, err = promclient.QueryInstant(ctx, nil, urlParse(t, "http://"+queryHTTP(1)), "{a=\"1\"}", time.Now(), false) + var ( + err error + warnings []string + ) + res, warnings, err = promclient.QueryInstant(ctx, nil, urlParse(t, "http://"+queryHTTP(1)), "{a=\"1\"}", time.Now(), promclient.QueryOptions{ + Deduplicate: false, + }) if err != nil { return err } + + if len(warnings) > 0 { + // we don't expect warnings. + return errors.Errorf("unexpected warnings %s", warnings) + } + if len(res) != 2 { return errors.Errorf("unexpected result size %d", len(res)) } @@ -127,11 +138,22 @@ func TestStoreGatewayQuery(t *testing.T) { default: } - var err error - res, err = promclient.QueryInstant(ctx, nil, urlParse(t, "http://"+queryHTTP(1)), "{a=\"1\"}", time.Now(), true) + var ( + err error + warnings []string + ) + res, warnings, err = promclient.QueryInstant(ctx, nil, urlParse(t, "http://"+queryHTTP(1)), "{a=\"1\"}", time.Now(), promclient.QueryOptions{ + Deduplicate: true, + }) if err != nil { return err } + + if len(warnings) > 0 { + // we don't expect warnings. 
+ return errors.Errorf("unexpected warnings %s", warnings) + } + if len(res) != 1 { return errors.Errorf("unexpected result size %d", len(res)) } From a606f2d62c3a81be51e0a179cd199ca8a6848560 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Tue, 2 Apr 2019 15:26:39 +0100 Subject: [PATCH 40/43] makefile: Excluded tencent doc check. (#1004) Signed-off-by: Bartek Plotka --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ac9c472017..88b9ebb2ff 100644 --- a/Makefile +++ b/Makefile @@ -118,7 +118,7 @@ docs: $(EMBEDMD) build .PHONY: check-docs check-docs: $(EMBEDMD) $(LICHE) build @EMBEDMD_BIN="$(EMBEDMD)" scripts/genflagdocs.sh check - @$(LICHE) --recursive docs --document-root . + @$(LICHE) --recursive docs --exclude "cloud.tencent.com" --document-root . # errcheck performs static analysis and returns error if any of the errors is not checked. .PHONY: errcheck From 6a60f4b4de030ad3f5cde0e149bc3794ed88c0ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Wed, 3 Apr 2019 09:20:29 +0100 Subject: [PATCH 41/43] rule: Made thanos rules generatable; made generated struct Prometheus compatible. (#1005) Signed-off-by: Bartek Plotka --- pkg/rule/rule.go | 33 ++++++++++++++++++++-------- pkg/rule/rule_test.go | 50 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 9 deletions(-) diff --git a/pkg/rule/rule.go b/pkg/rule/rule.go index 11d988a897..3b70ed1f80 100644 --- a/pkg/rule/rule.go +++ b/pkg/rule/rule.go @@ -21,13 +21,11 @@ const tmpRuleDir = ".tmp-rules" type Group struct { *rules.Group - PartialResponseStrategy storepb.PartialResponseStrategy } type AlertingRule struct { *rules.AlertingRule - PartialResponseStrategy storepb.PartialResponseStrategy } @@ -37,8 +35,7 @@ type RuleGroups struct { type RuleGroup struct { rulefmt.RuleGroup - - PartialResponseStrategy storepb.PartialResponseStrategy `yaml:"partial_response_strategy"` + PartialResponseStrategy *storepb.PartialResponseStrategy } type Managers map[storepb.PartialResponseStrategy]*rules.Manager @@ -88,11 +85,29 @@ func (r *RuleGroup) UnmarshalYAML(unmarshal func(interface{}) error) error { p = storepb.PartialResponseStrategy_value[storepb.PartialResponseStrategy_ABORT.String()] } + ps := storepb.PartialResponseStrategy(p) r.RuleGroup = rg - r.PartialResponseStrategy = storepb.PartialResponseStrategy(p) + r.PartialResponseStrategy = &ps return nil } +func (r RuleGroup) MarshalYAML() (interface{}, error) { + var ps *string + if r.PartialResponseStrategy != nil { + str := r.PartialResponseStrategy.String() + ps = &str + } + + rs := struct { + RuleGroup rulefmt.RuleGroup `yaml:",inline"` + PartialResponseStrategy *string `yaml:"partial_response_strategy,omitempty"` + }{ + RuleGroup: r.RuleGroup, + PartialResponseStrategy: ps, + } + return rs, nil +} + // Update updates rules from given files to all managers we hold. We decide which groups should go where, based on // special field in RuleGroup file. func (m *Managers) Update(dataDir string, evalInterval time.Duration, files []string) error { @@ -125,12 +140,12 @@ func (m *Managers) Update(dataDir string, evalInterval time.Duration, files []st // rules.Manager. The problem is that it uses yaml.UnmarshalStrict for some reasons. 
mapped := map[storepb.PartialResponseStrategy]*rulefmt.RuleGroups{} for _, rg := range rg.Groups { - if _, ok := mapped[rg.PartialResponseStrategy]; !ok { - mapped[rg.PartialResponseStrategy] = &rulefmt.RuleGroups{} + if _, ok := mapped[*rg.PartialResponseStrategy]; !ok { + mapped[*rg.PartialResponseStrategy] = &rulefmt.RuleGroups{} } - mapped[rg.PartialResponseStrategy].Groups = append( - mapped[rg.PartialResponseStrategy].Groups, + mapped[*rg.PartialResponseStrategy].Groups = append( + mapped[*rg.PartialResponseStrategy].Groups, rg.RuleGroup, ) } diff --git a/pkg/rule/rule_test.go b/pkg/rule/rule_test.go index 932a102d15..d469941f84 100644 --- a/pkg/rule/rule_test.go +++ b/pkg/rule/rule_test.go @@ -12,7 +12,9 @@ import ( "github.com/go-kit/kit/log" "github.com/improbable-eng/thanos/pkg/store/storepb" "github.com/improbable-eng/thanos/pkg/testutil" + "github.com/prometheus/prometheus/pkg/rulefmt" "github.com/prometheus/prometheus/rules" + yaml "gopkg.in/yaml.v2" ) func TestUpdate(t *testing.T) { @@ -122,3 +124,51 @@ groups: testutil.Equals(t, "something6", g[2].Name()) testutil.Equals(t, "something7", g[3].Name()) } + +func TestRuleGroupMarshalYAML(t *testing.T) { + const expected = `groups: +- name: something1 + rules: + - alert: some + expr: up +- name: something2 + rules: + - alert: some + expr: up + partial_response_strategy: ABORT +` + + a := storepb.PartialResponseStrategy_ABORT + var input = RuleGroups{ + Groups: []RuleGroup{ + { + RuleGroup: rulefmt.RuleGroup{ + Name: "something1", + Rules: []rulefmt.Rule{ + { + Alert: "some", + Expr: "up", + }, + }, + }, + }, + { + RuleGroup: rulefmt.RuleGroup{ + Name: "something2", + Rules: []rulefmt.Rule{ + { + Alert: "some", + Expr: "up", + }, + }, + }, + PartialResponseStrategy: &a, + }, + }, + } + + b, err := yaml.Marshal(input) + testutil.Ok(t, err) + + testutil.Equals(t, expected, string(b)) +} From 4fd0adca6686fae17768ad099bd143df61014825 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Wed, 3 Apr 2019 09:21:04 +0100 Subject: [PATCH 42/43] Bump github.com/NYTimes/gziphandler from 1.0.1 to 1.1.1 (#1000) Bumps [github.com/NYTimes/gziphandler](https://github.com/NYTimes/gziphandler) from 1.0.1 to 1.1.1. 
- [Release notes](https://github.com/NYTimes/gziphandler/releases) - [Commits](https://github.com/NYTimes/gziphandler/compare/v1.0.1...v1.1.1) Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 01b4034b4b..b0e865d101 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/improbable-eng/thanos require ( cloud.google.com/go v0.34.0 github.com/Azure/azure-storage-blob-go v0.0.0-20181022225951-5152f14ace1c - github.com/NYTimes/gziphandler v1.0.1 + github.com/NYTimes/gziphandler v1.1.1 github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da github.com/fatih/structtag v1.0.0 github.com/fortytw2/leaktest v1.2.0 diff --git a/go.sum b/go.sum index 645506c5e0..237900b401 100644 --- a/go.sum +++ b/go.sum @@ -13,6 +13,8 @@ github.com/Azure/go-autorest v10.8.1+incompatible h1:u0jVQf+a6k6x8A+sT60l6EY9XZu github.com/Azure/go-autorest v10.8.1+incompatible/go.mod h1:r+4oMnoxhatjLLJ6zxSWATqVooLgysK6ZNox3g/xq24= github.com/NYTimes/gziphandler v1.0.1 h1:iLrQrdwjDd52kHDA5op2UBJFjmOb9g+7scBan4RN8F0= github.com/NYTimes/gziphandler v1.0.1/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= +github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I= +github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c= github.com/OneOfOne/xxhash v1.2.2 h1:KMrpdQIwFcEqXDklaen+P1axHaj9BSKzvpUUfnHldSE= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/StackExchange/wmi v0.0.0-20180725035823-b12b22c5341f/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg= @@ -39,6 +41,7 @@ github.com/cockroachdb/cockroach v0.0.0-20170608034007-84bc9597164f/go.mod h1:xe github.com/cockroachdb/cockroach-go v0.0.0-20181001143604-e0a95dfd547c/go.mod h1:XGLbWH/ujMcbPbhZq52Nv6UrCghb1yGn//133kEsvDk= github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd/go.mod h1:sE/e/2PUdi/liOCUjSTXgM1o87ZssimdTWN964YiIeI= github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgrijalva/jwt-go v0.0.0-20161101193935-9ed569b5d1ac h1:xrQJVwQCGqDvOO7/0+RyIq5J2M3Q4ZF7Ug/BMQtML1E= @@ -266,8 +269,10 @@ github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72 h1:qLC7fQah7D6K1 github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/spf13/pflag v1.0.3 h1:zPAT6CGy6wXeQ7NtTnaTerfKOsV6V6F8agHXFiazDkg= github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= go.opencensus.io v0.18.0/go.mod h1:vKdFvxhtzZ9onBp9VKHK8z/sRpBMnKAsufL7wlDrCOA= go.opencensus.io v0.19.0 h1:+jrnNy8MR4GZXvwF9PEuSyHxA4NaTf6601oNRwCSXq0= go.opencensus.io v0.19.0/go.mod h1:AYeH0+ZxYyghG8diqaaIq/9P3VgCCt5GF2ldCY4dkFg= From c0bfe35a9c4ae006e0da40d43b5cf34065afc2d6 Mon Sep 17 00:00:00 2001 From: Adrien F Date: 
Mon, 8 Apr 2019 16:02:50 +0200 Subject: [PATCH 43/43] query: cleanup store statuses as they come and go (#910) Signed-off-by: Adrien Fillon --- CHANGELOG.md | 1 + cmd/thanos/query.go | 7 ++- docs/components/query.md | 3 ++ pkg/query/storeset.go | 71 +++++++++++++++++++++---------- pkg/query/storeset_test.go | 8 ++-- pkg/ui/bindata.go | 82 ++++++++++++++++++------------------ pkg/ui/query.go | 24 ++++++++++- pkg/ui/templates/stores.html | 6 +-- 8 files changed, 129 insertions(+), 73 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 03627e3283..84514b9c83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel ### Added - [#811](https://github.com/improbable-eng/thanos/pull/811) Remote write receiver - [#798](https://github.com/improbable-eng/thanos/pull/798) Ability to limit the maximum concurrent about of Series() calls in Thanos Store and the maximum amount of samples. +- [#910](https://github.com/improbable-eng/thanos/pull/910) Query's stores UI page is now sorted by type and old DNS or File SD stores are removed after 5 minutes (configurable via the new `--store.unhealthy-timeout=5m` flag). New options: diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index fd360e60af..00501bf17d 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -85,7 +85,9 @@ func registerQuery(m map[string]setupFunc, app *kingpin.Application, name string dnsSDInterval := modelDuration(cmd.Flag("store.sd-dns-interval", "Interval between DNS resolutions."). Default("30s")) - enableAutodownsampling := cmd.Flag("query.auto-downsampling", "Enable automatic adjustment (step / 5) to what source of data should be used in store gateways if no max_source_resolution param is specified. "). + unhealthyStoreTimeout := modelDuration(cmd.Flag("store.unhealthy-timeout", "Timeout before an unhealthy store is cleaned from the store UI page.").Default("5m")) + + enableAutodownsampling := cmd.Flag("query.auto-downsampling", "Enable automatic adjustment (step / 5) to what source of data should be used in store gateways if no max_source_resolution param is specified."). Default("false").Bool() enablePartialResponse := cmd.Flag("query.partial-response", "Enable partial response for queries if no partial_response param is specified."). @@ -150,6 +152,7 @@ func registerQuery(m map[string]setupFunc, app *kingpin.Application, name string *enablePartialResponse, fileSD, time.Duration(*dnsSDInterval), + time.Duration(*unhealthyStoreTimeout), ) } } @@ -266,6 +269,7 @@ func runQuery( enablePartialResponse bool, fileSD *file.Discovery, dnsSDInterval time.Duration, + unhealthyStoreTimeout time.Duration, ) error { // TODO(bplotka in PR #513 review): Move arguments into struct. duplicatedStores := prometheus.NewCounter(prometheus.CounterOpts{ @@ -310,6 +314,7 @@ func runQuery( return specs }, dialOpts, + unhealthyStoreTimeout, ) proxy = store.NewProxyStore(logger, stores.Get, component.Query, selectorLset, storeResponseTimeout) queryableCreator = query.NewQueryableCreator(logger, proxy, replicaLabel) diff --git a/docs/components/query.md b/docs/components/query.md index cd38da1053..0558f47587 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -293,6 +293,9 @@ Flags: is used as a resync fallback. --store.sd-dns-interval=30s Interval between DNS resolutions. + --store.unhealthy-timeout=5m + Timeout before an unhealthy store is cleaned + from the store UI page. 
--query.auto-downsampling Enable automatic adjustment (step / 5) to what source of data should be used in store gateways if no max_source_resolution param is specified. diff --git a/pkg/query/storeset.go b/pkg/query/storeset.go index a660e6566e..7f8344116d 100644 --- a/pkg/query/storeset.go +++ b/pkg/query/storeset.go @@ -81,12 +81,13 @@ type StoreSet struct { dialOpts []grpc.DialOption gRPCInfoCallTimeout time.Duration - mtx sync.RWMutex - storesStatusesMtx sync.RWMutex - stores map[string]*storeRef - storeNodeConnections prometheus.Gauge - externalLabelStores map[string]int - storeStatuses map[string]*StoreStatus + mtx sync.RWMutex + storesStatusesMtx sync.RWMutex + stores map[string]*storeRef + storeNodeConnections prometheus.Gauge + externalLabelStores map[string]int + storeStatuses map[string]*StoreStatus + unhealthyStoreTimeout time.Duration } type storeSetNodeCollector struct { @@ -118,6 +119,7 @@ func NewStoreSet( reg *prometheus.Registry, storeSpecs func() []StoreSpec, dialOpts []grpc.DialOption, + unhealthyStoreTimeout time.Duration, ) *StoreSet { storeNodeConnections := prometheus.NewGauge(prometheus.GaugeOpts{ Name: "thanos_store_nodes_grpc_connections", @@ -135,14 +137,15 @@ func NewStoreSet( } ss := &StoreSet{ - logger: log.With(logger, "component", "storeset"), - storeSpecs: storeSpecs, - dialOpts: dialOpts, - storeNodeConnections: storeNodeConnections, - gRPCInfoCallTimeout: 10 * time.Second, - externalLabelStores: map[string]int{}, - stores: make(map[string]*storeRef), - storeStatuses: make(map[string]*StoreStatus), + logger: log.With(logger, "component", "storeset"), + storeSpecs: storeSpecs, + dialOpts: dialOpts, + storeNodeConnections: storeNodeConnections, + gRPCInfoCallTimeout: 10 * time.Second, + externalLabelStores: map[string]int{}, + stores: make(map[string]*storeRef), + storeStatuses: make(map[string]*StoreStatus), + unhealthyStoreTimeout: unhealthyStoreTimeout, } storeNodeCollector := &storeSetNodeCollector{externalLabelOccurrences: ss.externalLabelOccurrences} @@ -255,6 +258,7 @@ func (s *StoreSet) Update(ctx context.Context) { } s.externalLabelStores = externalLabelStores s.storeNodeConnections.Set(float64(len(s.stores))) + s.cleanUpStoreStatuses() } func (s *StoreSet) getHealthyStores(ctx context.Context) map[string]*storeRef { @@ -345,16 +349,23 @@ func (s *StoreSet) updateStoreStatus(store *storeRef, err error) { s.storesStatusesMtx.Lock() defer s.storesStatusesMtx.Unlock() - now := time.Now() - s.storeStatuses[store.addr] = &StoreStatus{ - Name: store.addr, - LastError: err, - LastCheck: now, - Labels: store.labels, - StoreType: store.storeType, - MinTime: store.minTime, - MaxTime: store.maxTime, + status := StoreStatus{Name: store.addr} + prev, ok := s.storeStatuses[store.addr] + if ok { + status = *prev + } + + status.LastError = err + status.LastCheck = time.Now() + + if err == nil { + status.Labels = store.labels + status.StoreType = store.storeType + status.MinTime = store.minTime + status.MaxTime = store.maxTime } + + s.storeStatuses[store.addr] = &status } func (s *StoreSet) GetStoreStatus() []StoreStatus { @@ -401,3 +412,17 @@ func (s *StoreSet) Close() { st.close() } } + +func (s *StoreSet) cleanUpStoreStatuses() { + s.storesStatusesMtx.Lock() + defer s.storesStatusesMtx.Unlock() + + now := time.Now() + for addr, status := range s.storeStatuses { + if _, ok := s.stores[addr]; !ok { + if now.Sub(status.LastCheck) >= s.unhealthyStoreTimeout { + delete(s.storeStatuses, addr) + } + } + } +} diff --git a/pkg/query/storeset_test.go 
b/pkg/query/storeset_test.go index 1e2058d52f..bced0d2424 100644 --- a/pkg/query/storeset_test.go +++ b/pkg/query/storeset_test.go @@ -142,7 +142,7 @@ func TestStoreSet_AllAvailable_ThenDown(t *testing.T) { // Testing if duplicates can cause weird results. initialStoreAddr = append(initialStoreAddr, initialStoreAddr[0]) - storeSet := NewStoreSet(nil, nil, specsFromAddrFunc(initialStoreAddr), testGRPCOpts) + storeSet := NewStoreSet(nil, nil, specsFromAddrFunc(initialStoreAddr), testGRPCOpts, time.Minute) storeSet.gRPCInfoCallTimeout = 2 * time.Second defer storeSet.Close() @@ -185,7 +185,7 @@ func TestStoreSet_StaticStores_OneAvailable(t *testing.T) { initialStoreAddr := st.StoreAddresses() st.CloseOne(initialStoreAddr[0]) - storeSet := NewStoreSet(nil, nil, specsFromAddrFunc(initialStoreAddr), testGRPCOpts) + storeSet := NewStoreSet(nil, nil, specsFromAddrFunc(initialStoreAddr), testGRPCOpts, time.Minute) storeSet.gRPCInfoCallTimeout = 2 * time.Second defer storeSet.Close() @@ -215,7 +215,7 @@ func TestStoreSet_StaticStores_NoneAvailable(t *testing.T) { st.CloseOne(initialStoreAddr[0]) st.CloseOne(initialStoreAddr[1]) - storeSet := NewStoreSet(nil, nil, specsFromAddrFunc(initialStoreAddr), testGRPCOpts) + storeSet := NewStoreSet(nil, nil, specsFromAddrFunc(initialStoreAddr), testGRPCOpts, time.Minute) storeSet.gRPCInfoCallTimeout = 2 * time.Second // Should not matter how many of these we run. @@ -259,7 +259,7 @@ func TestStoreSet_AllAvailable_BlockExtLsetDuplicates(t *testing.T) { initialStoreAddr := st.StoreAddresses() - storeSet := NewStoreSet(nil, nil, specsFromAddrFunc(initialStoreAddr), testGRPCOpts) + storeSet := NewStoreSet(nil, nil, specsFromAddrFunc(initialStoreAddr), testGRPCOpts, time.Minute) storeSet.gRPCInfoCallTimeout = 2 * time.Second defer storeSet.Close() diff --git a/pkg/ui/bindata.go b/pkg/ui/bindata.go index 5d3444887d..f245abf856 100644 --- a/pkg/ui/bindata.go +++ b/pkg/ui/bindata.go @@ -122,7 +122,7 @@ func pkgUiTemplates_baseHtml() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/templates/_base.html", size: 1065, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/templates/_base.html", size: 1065, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -142,7 +142,7 @@ func pkgUiTemplatesAlertsHtml() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/templates/alerts.html", size: 2534, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/templates/alerts.html", size: 2534, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -162,7 +162,7 @@ func pkgUiTemplatesFlagsHtml() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/templates/flags.html", size: 433, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/templates/flags.html", size: 433, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -182,7 +182,7 @@ func pkgUiTemplatesGraphHtml() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/templates/graph.html", size: 2061, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/templates/graph.html", size: 2061, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: 
info} return a, nil } @@ -202,7 +202,7 @@ func pkgUiTemplatesQuery_menuHtml() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/templates/query_menu.html", size: 1479, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/templates/query_menu.html", size: 1479, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -222,7 +222,7 @@ func pkgUiTemplatesRule_menuHtml() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/templates/rule_menu.html", size: 1021, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/templates/rule_menu.html", size: 1021, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -242,7 +242,7 @@ func pkgUiTemplatesRulesHtml() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/templates/rules.html", size: 1103, mode: os.FileMode(420), modTime: time.Unix(1551875947, 0)} + info := bindataFileInfo{name: "pkg/ui/templates/rules.html", size: 1103, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -262,12 +262,12 @@ func pkgUiTemplatesStatusHtml() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/templates/status.html", size: 1286, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/templates/status.html", size: 1286, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } -var _pkgUiTemplatesStoresHtml = []byte("\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\xff\x9c\x55\x41\x6b\xe3\x3c\x10\xbd\xe7\x57\x0c\xe2\x3b\x7e\x89\xa1\x97\x85\xc5\xc9\xb2\x2c\x85\x3d\xb4\x65\xa1\xdd\x5e\x17\xc5\x9a\xc4\xa2\x8a\x64\x34\xe3\x36\x41\xe8\xbf\x2f\x72\xec\xc4\x6e\x9c\x34\xdd\x8b\xb1\x66\x9e\x34\x6f\x66\x9e\x46\x21\x28\x5c\x69\x8b\x20\x4a\x94\x4a\xc4\x38\xc9\x8d\xb6\x2f\xc0\xbb\x0a\xe7\x82\x71\xcb\x59\x41\x24\xc0\xa3\x99\x0b\xe2\x9d\x41\x2a\x11\x59\x40\xe9\x71\x35\x17\x21\x40\x25\xb9\xfc\xe5\x71\xa5\xb7\x10\x63\x46\x2c\x59\x17\x69\x4f\xe6\x6b\x83\x34\x2b\x88\xbe\xbd\xce\x43\x80\x65\xad\x8d\x7a\x46\x4f\xda\x59\x88\x51\x2c\x26\x21\xa0\x55\x31\x4e\x26\x47\x12\x85\xb3\x8c\x96\x1b\x1e\x4a\xbf\x42\x61\x24\xd1\xbc\x31\x4b\x6d\xd1\x4f\x57\xa6\xd6\x4a\x2c\x26\x00\x00\x21\x78\x69\xd7\x08\xff\x11\x3b\x8f\x4f\xbb\x0a\xff\x6f\xff\x09\xbe\xce\x61\x16\x63\x0b\xd3\xab\x1e\xa6\xb5\xe6\xe5\xcd\x22\x84\xa3\x79\xf6\xc8\x5e\xdb\x75\x8c\x79\x56\xde\x74\xe7\xa3\xa1\x3e\xfe\xb7\x7d\xb1\xee\xcd\x42\xc2\x0f\x60\x4d\x1a\x0d\x8a\xe5\xd2\x60\x47\x7b\xbf\x68\xbe\xd3\xa5\xf3\x0a\x3d\x76\xdc\xf7\xe0\x54\xf3\xfe\xda\x1f\x17\x2d\x60\x71\x6b\x55\xe5\xb4\xe5\x3c\xe3\xf2\xd4\xfb\xc8\x92\x6b\x1a\xf7\x7d\xb7\xd6\xd5\xb6\x40\x05\x77\x72\x89\xe6\x0c\xea\x5e\x5b\x78\xd2\x1b\x3c\xe3\x95\xdb\x0b\xde\x3b\x49\x0c\x3f\x51\x1a\x2e\xe1\x47\x89\xc5\xcb\x05\xd8\x3d\x12\xc9\xf5\xbb\x83\xf2\xac\x9f\x72\xf2\xbd\x2b\xc8\xd2\xa9\xdd\x71\x3d\x6c\x78\xea\x71\xdb\xee\xb6\xfa\x67\x8a\xa8\x0e\x8d\x9e\x3d\xc8\x0d\xa6\x16\xb3\x3a\x01\x75\x4d\x4b\x0a\x46\x31\x74\x43\x27\x23\xeb\xb8\x8d\x39\x4b\x59\xdd\x7a\xef\x7c\x2f\xf8\xe1\x38\xaa\xa4\xed\x0e\x94\x06\x3d\x43\xf3\x9d\x52\x5d\x14\x48\x04\x4d\x90\x3f\xda\x2a\x5d\x48\x76\x1e\xd2\x45\x9b\xd6\x55\x85\xbe\x90\x34\x16\xbd\xae\x4e\x83\x64\x29\xca\x18\xd1\x9e\x6c\xaf\x62\xa5\x52\x55\xfd\xe7\x49\x29\xf7\x66\x3f\x43\xeb\x70\x4d\x8e\xd8\x91\x46\x0c\x0d
\x87\x9e\x9b\x24\xe2\x63\xcf\x67\x7b\x51\x7f\x94\xe6\x7e\x57\xf3\x9d\x56\x5e\x6f\xa4\xdf\x89\x24\x87\xc6\xd2\xca\x21\x8d\xb1\xd6\xf0\x2c\x4d\x8d\x31\x8a\xb1\x24\xae\x4f\x20\x84\x95\xf3\x1b\xc9\xe9\xe6\x10\xcb\x4d\xd5\x71\xbe\xd7\x36\xd9\xce\x28\xf0\xc2\x3e\xb9\xbd\xbc\x8f\xb4\x2d\xb0\xaf\xcc\xe6\x3a\xc6\x08\x72\xed\xae\x28\x32\x0c\xc7\xe4\x45\x6d\x9f\x94\xf8\x63\x25\x8d\x48\x67\x1f\xf1\xda\x70\xff\x2c\xa9\xe1\x80\x39\xb9\x19\x63\xc3\x02\x0a\x67\x52\xb8\xb9\xf8\x32\xc2\xfb\xc1\x41\xfb\xc2\x78\x5c\x6b\xe2\x34\xd2\x3f\x13\x7f\xc0\x37\xcf\x7a\x03\x2e\xcf\x9a\x87\x62\xe4\xe9\x69\xbc\xcb\xfe\xa0\xec\x3d\x8d\xfd\xea\xbf\x49\x6f\xb5\x5d\x8b\xc5\x18\xcb\x3c\x53\xfa\x75\xf8\x62\xb5\xa6\x6e\xf9\x37\x00\x00\xff\xff\x71\x7e\x93\x06\x0c\x08\x00\x00") +var _pkgUiTemplatesStoresHtml = []byte("\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\xff\x9c\x55\x51\x6b\xeb\x3a\x0c\x7e\xef\xaf\x10\x61\xaf\x6d\x60\x2f\x17\x2e\x69\x2f\x97\xcb\xe0\x3e\x6c\xe3\x40\x77\xf6\x7a\x70\x6d\xb5\x31\x73\xed\x60\x29\x5b\x8b\xf1\x7f\x3f\x38\x4d\xda\x64\x4d\xbb\xee\xbc\x04\x2c\x7d\x8a\x3e\x49\x9f\xe5\x10\x14\xae\xb5\x45\xc8\x4a\x14\x2a\x8b\x71\x52\x18\x6d\xdf\x80\xf7\x15\xce\x33\xc6\x1d\xe7\x92\x28\x03\x8f\x66\x9e\x11\xef\x0d\x52\x89\xc8\x19\x94\x1e\xd7\xf3\x2c\x04\xa8\x04\x97\x3f\x3c\xae\xf5\x0e\x62\xcc\x89\x05\x6b\x99\x62\x72\x5f\x1b\xa4\x99\x24\xfa\xe7\x7d\x1e\x02\xac\x6a\x6d\xd4\x2b\x7a\xd2\xce\x42\x8c\xd9\x62\x12\x02\x5a\x15\xe3\x64\x72\x22\x21\x9d\x65\xb4\xdc\xf0\x50\xfa\x1d\xa4\x11\x44\xf3\xc6\x2c\xb4\x45\x3f\x5d\x9b\x5a\xab\x6c\x31\x01\x00\x08\xc1\x0b\xbb\x41\xb8\x23\x76\x1e\x5f\xf6\x15\xc2\xdf\x73\x98\x2d\x5d\xed\x25\x52\x8c\x2d\x48\xaf\x7b\x88\xd6\x5a\x94\xf7\x8b\x10\x58\xb3\xe9\x87\xcf\x96\xec\xb5\xdd\xc4\x58\xe4\xe5\x7d\x97\x03\x0d\xf5\xa3\x7e\xda\x37\xeb\x3e\x2c\x24\xfc\x00\xd6\x94\xd2\xa0\x58\xac\x0c\x76\xd4\x0f\x87\xe6\x3b\x5d\x39\xaf\xd0\x63\xc7\xff\x00\x4e\x7d\xef\x9f\xfd\xe9\xd0\x02\x16\x0f\x56\x55\x4e\x5b\x2e\x72\x2e\xcf\xbd\x4b\x16\x5c\xd3\xb8\xef\x5f\x6b\x5d\x6d\x25\x2a\x78\x14\x2b\x34\x17\x50\x4f\xda\xc2\x8b\xde\xe2\x05\xaf\xd8\x5d\xf1\x3e\x0a\x62\xf8\x1f\x85\xe1\x12\xfe\x2b\x51\xbe\x5d\x81\x3d\x21\x91\xd8\x7c\xfa\x51\x91\xf7\x4b\x4e\xbe\x4f\x0d\x59\x39\xb5\x3f\x9d\x87\x43\x4f\x03\xd7\x56\xe1\x0e\xee\x66\xcb\x64\xa0\xf3\x59\x5f\x68\xab\x5a\x84\x70\xc0\xce\x9e\xc5\x16\xd3\xd0\x59\x9d\x81\xba\x31\x26\x5d\x63\x36\x74\x43\x27\x2f\xeb\xb8\x4d\x3b\x4b\x75\x3e\x78\xef\x7c\x2f\xf9\xf1\x77\x54\x09\xdb\xfd\x50\x18\xf4\x0c\xcd\x77\x4a\xb5\x94\x48\x04\x4d\x92\x5f\xda\x2a\x2d\x05\x3b\x0f\xe9\xfa\x4d\xeb\xaa\x42\x2f\x05\x8d\x65\xaf\xab\xf3\x24\x79\xca\x32\x46\xb4\x27\xe4\x9b\x58\xa9\xd4\x67\xff\x7d\x52\xca\x7d\xd8\xef\xd0\x3a\x5e\x9c\x13\x76\x64\x10\x43\xc3\x51\x05\x26\xc9\x3a\xa9\xe0\xd8\xff\x24\xf3\xaf\xca\x3c\x44\x35\xdf\x69\xe5\xf5\x56\xf8\x7d\x96\xe4\xd0\x58\x5a\x39\xa4\xe5\xd6\x1a\x5e\x85\xa9\x31\xc6\x6c\xac\x88\xdb\x0b\x08\x61\xed\xfc\x56\x70\xba\x4b\xc4\x62\x5b\x75\x9c\x9f\xb4\x4d\xb6\x0b\x0a\xbc\x12\x27\x76\xd7\xe3\x48\x5b\x89\x7d\x65\x36\x17\x34\x46\x10\x1b\x77\x43\x93\x61\xb8\x3e\xaf\x6a\xfb\xac\xc5\x5f\x2b\x69\x44\x3a\x87\x8c\xb7\xa6\xfb\x63\x49\x0d\x57\xce\xd9\xcd\x18\x5b\x16\x20\x9d\x49\xe9\xe6\xd9\x5f\x23\xbc\x9f\x1d\xd0\x61\xfb\x78\xdc\x68\xe2\xb4\xe4\xbf\x93\x7f\xc0\xb7\xc8\x7b\x2b\xaf\xc8\x9b\xa7\x63\xe4\x31\x6a\xbc\xab\xfe\xea\xec\x3d\x98\xfd\xee\x7f\x08\x6f\xb5\xdd\x64\x8b\x31\x96\x45\xae\xf4\xfb\xf0\x0d\x6b\x4d\xdd\xf1\x77\x00\x00\x00\xff\xff\x68\x27\x86\x45\x22\x08\x00\x00") func pkgUiTemplatesStoresHtmlBytes() ([]byte, error) { return bindataRead( @@ -282,7 +282,7 @@ func pkgUiTemplatesStoresHtml() (*asset, 
error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/templates/stores.html", size: 2060, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/templates/stores.html", size: 2082, mode: os.FileMode(420), modTime: time.Unix(1552832748, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -302,7 +302,7 @@ func pkgUiStaticCssAlertsCss() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/css/alerts.css", size: 383, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/css/alerts.css", size: 383, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -322,7 +322,7 @@ func pkgUiStaticCssGraphCss() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/css/graph.css", size: 3363, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/css/graph.css", size: 3363, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -342,7 +342,7 @@ func pkgUiStaticCssPrometheusCss() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/css/prometheus.css", size: 322, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/css/prometheus.css", size: 322, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -362,7 +362,7 @@ func pkgUiStaticCssRulesCss() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/css/rules.css", size: 190, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/css/rules.css", size: 190, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -382,7 +382,7 @@ func pkgUiStaticImgAjaxLoaderGif() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/img/ajax-loader.gif", size: 847, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/img/ajax-loader.gif", size: 847, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -402,7 +402,7 @@ func pkgUiStaticImgFaviconIco() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/img/favicon.ico", size: 15886, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/img/favicon.ico", size: 15886, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -422,7 +422,7 @@ func pkgUiStaticJsAlertsJs() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/js/alerts.js", size: 1152, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/js/alerts.js", size: 1152, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -442,7 +442,7 @@ func pkgUiStaticJsGraphJs() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/js/graph.js", size: 32282, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/js/graph.js", size: 32282, mode: os.FileMode(420), 
modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -462,7 +462,7 @@ func pkgUiStaticJsGraph_templateHandlebar() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/js/graph_template.handlebar", size: 7611, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/js/graph_template.handlebar", size: 7611, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -482,7 +482,7 @@ func pkgUiStaticVendorBootstrap331CssBootstrapThemeMinCss() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap-3.3.1/css/bootstrap-theme.min.css", size: 19835, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap-3.3.1/css/bootstrap-theme.min.css", size: 19835, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -502,7 +502,7 @@ func pkgUiStaticVendorBootstrap331CssBootstrapMinCss() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap-3.3.1/css/bootstrap.min.css", size: 113498, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap-3.3.1/css/bootstrap.min.css", size: 113498, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -522,7 +522,7 @@ func pkgUiStaticVendorBootstrap331FontsGlyphiconsHalflingsRegularEot() (*asset, return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap-3.3.1/fonts/glyphicons-halflings-regular.eot", size: 20335, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap-3.3.1/fonts/glyphicons-halflings-regular.eot", size: 20335, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -542,7 +542,7 @@ func pkgUiStaticVendorBootstrap331FontsGlyphiconsHalflingsRegularSvg() (*asset, return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap-3.3.1/fonts/glyphicons-halflings-regular.svg", size: 62926, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap-3.3.1/fonts/glyphicons-halflings-regular.svg", size: 62926, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -562,7 +562,7 @@ func pkgUiStaticVendorBootstrap331FontsGlyphiconsHalflingsRegularTtf() (*asset, return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap-3.3.1/fonts/glyphicons-halflings-regular.ttf", size: 41280, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap-3.3.1/fonts/glyphicons-halflings-regular.ttf", size: 41280, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -582,7 +582,7 @@ func pkgUiStaticVendorBootstrap331FontsGlyphiconsHalflingsRegularWoff() (*asset, return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap-3.3.1/fonts/glyphicons-halflings-regular.woff", size: 23320, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: 
"pkg/ui/static/vendor/bootstrap-3.3.1/fonts/glyphicons-halflings-regular.woff", size: 23320, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -602,7 +602,7 @@ func pkgUiStaticVendorBootstrap331JsBootstrapMinJs() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap-3.3.1/js/bootstrap.min.js", size: 35601, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap-3.3.1/js/bootstrap.min.js", size: 35601, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -622,7 +622,7 @@ func pkgUiStaticVendorBootstrap331JsNpmJs() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap-3.3.1/js/npm.js", size: 484, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap-3.3.1/js/npm.js", size: 484, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -642,7 +642,7 @@ func pkgUiStaticVendorBootstrap3TypeaheadBootstrap3TypeaheadMinJs() (*asset, err return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap3-typeahead/bootstrap3-typeahead.min.js", size: 7856, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/bootstrap3-typeahead/bootstrap3-typeahead.min.js", size: 7856, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -662,7 +662,7 @@ func pkgUiStaticVendorEonasdanBootstrapDatetimepickerBootstrapDatetimepickerMinC return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/eonasdan-bootstrap-datetimepicker/bootstrap-datetimepicker.min.css", size: 7771, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/eonasdan-bootstrap-datetimepicker/bootstrap-datetimepicker.min.css", size: 7771, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -682,7 +682,7 @@ func pkgUiStaticVendorEonasdanBootstrapDatetimepickerBootstrapDatetimepickerMinJ return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/eonasdan-bootstrap-datetimepicker/bootstrap-datetimepicker.min.js", size: 48881, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/eonasdan-bootstrap-datetimepicker/bootstrap-datetimepicker.min.js", size: 48881, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -702,7 +702,7 @@ func pkgUiStaticVendorFuzzyFuzzyJs() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/fuzzy/fuzzy.js", size: 5669, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/fuzzy/fuzzy.js", size: 5669, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -722,7 +722,7 @@ func pkgUiStaticVendorJsJqueryHotkeysJs() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/js/jquery.hotkeys.js", size: 4490, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: 
"pkg/ui/static/vendor/js/jquery.hotkeys.js", size: 4490, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -742,7 +742,7 @@ func pkgUiStaticVendorJsJqueryMinJs() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/js/jquery.min.js", size: 86671, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/js/jquery.min.js", size: 86671, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -762,7 +762,7 @@ func pkgUiStaticVendorJsJquerySelectionJs() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/js/jquery.selection.js", size: 12881, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/js/jquery.selection.js", size: 12881, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -782,7 +782,7 @@ func pkgUiStaticVendorMomentMomentTimezoneWithDataMinJs() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/moment/moment-timezone-with-data.min.js", size: 184190, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/moment/moment-timezone-with-data.min.js", size: 184190, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -802,7 +802,7 @@ func pkgUiStaticVendorMomentMomentMinJs() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/moment/moment.min.js", size: 61281, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/moment/moment.min.js", size: 61281, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -822,7 +822,7 @@ func pkgUiStaticVendorMustacheMustacheMinJs() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/mustache/mustache.min.js", size: 9528, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/mustache/mustache.min.js", size: 9528, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -842,7 +842,7 @@ func pkgUiStaticVendorRickshawRickshawMinCss() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/rickshaw/rickshaw.min.css", size: 6102, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/rickshaw/rickshaw.min.css", size: 6102, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -862,7 +862,7 @@ func pkgUiStaticVendorRickshawRickshawMinJs() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/rickshaw/rickshaw.min.js", size: 76322, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/rickshaw/rickshaw.min.js", size: 76322, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -882,7 +882,7 @@ func pkgUiStaticVendorRickshawVendorD3LayoutMinJs() (*asset, error) { return nil, err } - info := bindataFileInfo{name: 
"pkg/ui/static/vendor/rickshaw/vendor/d3.layout.min.js", size: 17514, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/rickshaw/vendor/d3.layout.min.js", size: 17514, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -902,7 +902,7 @@ func pkgUiStaticVendorRickshawVendorD3V3Js() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "pkg/ui/static/vendor/rickshaw/vendor/d3.v3.js", size: 144718, mode: os.FileMode(420), modTime: time.Unix(1551875090, 0)} + info := bindataFileInfo{name: "pkg/ui/static/vendor/rickshaw/vendor/d3.v3.js", size: 144718, mode: os.FileMode(420), modTime: time.Unix(1552065161, 0)} a := &asset{bytes: bytes, info: info} return a, nil } diff --git a/pkg/ui/query.go b/pkg/ui/query.go index 969f07e596..fdf9d750b5 100644 --- a/pkg/ui/query.go +++ b/pkg/ui/query.go @@ -5,6 +5,7 @@ import ( "net/http" "os" "path" + "sort" "strings" "time" @@ -122,7 +123,28 @@ func (q *Query) stores(w http.ResponseWriter, r *http.Request) { for _, status := range q.storeSet.GetStoreStatus() { statuses[status.StoreType] = append(statuses[status.StoreType], status) } - q.executeTemplate(w, "stores.html", prefix, statuses) + + sources := make([]component.StoreAPI, 0, len(statuses)) + for k := range statuses { + sources = append(sources, k) + } + sort.Slice(sources, func(i int, j int) bool { + if sources[i] == nil { + return false + } + if sources[j] == nil { + return true + } + return sources[i].String() < sources[j].String() + }) + + q.executeTemplate(w, "stores.html", prefix, struct { + Stores map[component.StoreAPI][]query.StoreStatus + Sources []component.StoreAPI + }{ + Stores: statuses, + Sources: sources, + }) } func (q *Query) flags(w http.ResponseWriter, r *http.Request) { diff --git a/pkg/ui/templates/stores.html b/pkg/ui/templates/stores.html index 7750a1d1a0..a499cc16b7 100644 --- a/pkg/ui/templates/stores.html +++ b/pkg/ui/templates/stores.html @@ -4,9 +4,9 @@ {{define "content"}}
- {{range $storeType, $stores := .}} + {{range $storeType := .Sources}} {{if $storeType}} -

{{$storeType.String}}

+

{{title $storeType.String}}

{{else}}

Unknown Type

{{end}} @@ -23,7 +23,7 @@

Unknown Type

- {{range $store := $stores}} + {{range $store := index $.Stores $storeType}} {{$store.Name}}
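The stores-page change above follows one pattern: the handler collects the map keys into a slice, sorts them with unknown (nil) store types last, and the template then ranges over that slice while indexing back into the map (`index $.Stores $storeType`). Below is a minimal, self-contained Go sketch of the same pattern; the string-keyed map and the names are illustrative stand-ins for `component.StoreAPI` and `query.StoreStatus`, not the actual types from the patch.

package main

import (
    "fmt"
    "sort"
)

func main() {
    // Illustrative stand-in for the store-type -> statuses map held by the UI;
    // the empty key plays the role of a nil (unknown) store type.
    stores := map[string][]string{
        "sidecar": {"127.0.0.1:10901", "127.0.0.1:10911"},
        "store":   {"127.0.0.1:10905"},
        "":        {"127.0.0.1:10999"},
    }

    // Collect the keys, then sort them so the page is stable and unknown types come last.
    sources := make([]string, 0, len(stores))
    for k := range stores {
        sources = append(sources, k)
    }
    sort.Slice(sources, func(i, j int) bool {
        if sources[i] == "" {
            return false
        }
        if sources[j] == "" {
            return true
        }
        return sources[i] < sources[j]
    })

    // Range over the sorted keys and index the map, mirroring
    // {{range $storeType := .Sources}} ... {{range $store := index $.Stores $storeType}}.
    for _, storeType := range sources {
        fmt.Println(storeType, stores[storeType])
    }
}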
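The other half of this patch, the `--store.unhealthy-timeout` cleanup, reduces to a time-based eviction rule: a status entry whose store is no longer configured is kept for display until its last check is older than the timeout, then dropped. The sketch below illustrates that rule only; the function and field names are hypothetical, not the ones in `pkg/query/storeset.go`.

package main

import (
    "fmt"
    "time"
)

// cleanUp drops status entries for stores that are no longer configured and
// whose last check is older than the unhealthy timeout. Names are illustrative.
func cleanUp(lastCheck map[string]time.Time, active map[string]bool, timeout time.Duration) {
    now := time.Now()
    for addr, t := range lastCheck {
        if active[addr] {
            continue // Still configured: keep showing its (possibly failing) status.
        }
        if now.Sub(t) >= timeout {
            delete(lastCheck, addr) // Gone and stale: remove it from the UI.
        }
    }
}

func main() {
    statuses := map[string]time.Time{
        "127.0.0.1:10901": time.Now(),                        // healthy, still configured
        "127.0.0.1:10999": time.Now().Add(-10 * time.Minute), // removed from SD 10m ago
    }
    active := map[string]bool{"127.0.0.1:10901": true}

    cleanUp(statuses, active, 5*time.Minute)
    fmt.Println(len(statuses)) // 1: only the configured store's status remains.
}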