Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

querier: Allows single store in case of duplicates. Drop others. #1338

Merged
merged 2 commits into from
Jul 18, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ We use *breaking* word for marking changes that are not backward compatible (rel

## Unreleased.

- [#1338](https://github.com/improbable-eng/thanos/pull/1338) Querier still warns on store API duplicate, but allows a single one from duplicated set. This is gracefully warn about the problematic logic and not disrupt immediately.

### Fixed

- [#1327](https://github.com/improbable-eng/thanos/pull/1327) `/series` API end-point now properly returns an empty array just like Prometheus if there are no results
Expand Down Expand Up @@ -42,6 +44,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel
To migrate over the old `--gcloudtrace.*` configuration, your tracing configuration should look like this:

```yaml

---
type: STACKDRIVER
config:
Expand All @@ -54,7 +57,9 @@ The other `type` you can use is `JAEGER` now. The `config` keys and values are J

### Changed

- [#1284](https://github.com/improbable-eng/thanos/pull/1284) Add support for multiple label-sets in Info gRPC service. This deprecates the single `Labels` slice of the `InfoResponse`, in a future release backward compatible handling for the single set of Labels will be removed. Upgrading to v0.6.0 or higher is advised.
- [#1284](https://github.com/improbable-eng/thanos/pull/1284) Add support for multiple label-sets in Info gRPC service.
This deprecates the single `Labels` slice of the `InfoResponse`, in a future release backward compatible handling for the single set of Labels will be removed. Upgrading to v0.6.0 or higher is advised.
*breaking* If you run have duplicate queries in your Querier configuration with hierarchical federation of multiple Queries this PR makes Thanos Querier to detect this case and block all duplicates. Refer to 0.6.1 which at least allows for single replica to work.

- [#1314](https://github.com/improbable-eng/thanos/pull/1314) Removes `http_request_duration_microseconds` (Summary) and adds `http_request_duration_seconds` (Histogram) from http server instrumentation used in Thanos APIs and UIs.

Expand All @@ -70,7 +75,7 @@ The other `type` you can use is `JAEGER` now. The `config` keys and values are J

- [#1227](https://github.com/improbable-eng/thanos/pull/1227) Some context handling issues were fixed in Thanos Compact; some unnecessary memory allocations were removed in the hot path of Thanos Store.

- [#1183](https://github.com/improbable-eng/thanos/pull/1183) Compactor now correctly propogates retriable/haltable errors which means that it will not unnecessarily restart if such an error occurs
- [#1183](https://github.com/improbable-eng/thanos/pull/1183) Compactor now correctly propagates retriable/haltable errors which means that it will not unnecessarily restart if such an error occurs

- [#1231](https://github.com/improbable-eng/thanos/pull/1231) Receive now correctly handles SIGINT and closes without deadlocking

Expand Down
26 changes: 15 additions & 11 deletions pkg/query/storeset.go
Original file line number Diff line number Diff line change
Expand Up @@ -240,28 +240,32 @@ func (s *StoreSet) Update(ctx context.Context) {

// Add stores that are not yet in s.stores.
for addr, store := range healthyStores {
// Check if it has some external labels specified.
// No external labels means strictly store gateway or ruler and it is fine to have access to multiple instances of them.
if _, ok := s.stores[addr]; ok {
s.updateStoreStatus(store, nil)
continue
}

// Check if the store has some external labels specified and if any if there are duplicates.
// We warn and exclude duplicates because it unnecessarily puts additional load on querier, network and underlying StoreAPIs and
// it indicates misconfiguration.
//
// Sidecar will error out if it will be configured with empty external labels.
// Note: No external labels means strictly store gateway or ruler and it is fine to have access to multiple instances of them.
// Any other component will error out if it will be configured with empty external labels.
externalLabels := externalLabelsFromStore(store)
storesWithExternalLabels := externalLabelOccurrencesInStores[externalLabels]
if len(store.LabelSets()) > 0 && storesWithExternalLabels != 1 {
if len(store.LabelSets()) > 0 && externalLabelOccurrencesInStores[externalLabels] != 1 {
store.close()
s.updateStoreStatus(store, errors.New(droppingStoreMessage))
level.Warn(s.logger).Log("msg", droppingStoreMessage, "address", addr, "extLset", externalLabels, "duplicates", storesWithExternalLabels)
continue
}

if _, ok := s.stores[addr]; ok {
s.updateStoreStatus(store, nil)
level.Warn(s.logger).Log("msg", droppingStoreMessage, "address", addr, "extLset", externalLabels, "duplicates", externalLabelOccurrencesInStores[externalLabels])
// We don't want to block all of them. Leave one to not disrupt in terms of migration.
externalLabelOccurrencesInStores[externalLabels]--
continue
}

s.stores[addr] = store
s.updateStoreStatus(store, nil)
level.Info(s.logger).Log("msg", "adding new store to query storeset", "address", addr)
}

s.externalLabelOccurrencesInStores = externalLabelOccurrencesInStores
s.storeNodeConnections.Set(float64(len(s.stores)))
s.cleanUpStoreStatuses()
Expand Down
55 changes: 24 additions & 31 deletions pkg/query/storeset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -241,38 +241,35 @@ func TestStoreSet_AllAvailable_BlockExtLsetDuplicates(t *testing.T) {

storeExtLabels := [][]storepb.Label{
{
{
Name: "l1",
Value: "v1",
},
{Name: "l1", Value: "v1"},
},
{
{
Name: "l1",
Value: "v2",
},
{Name: "l1", Value: "v2"},
{Name: "l2", Value: "v3"},
},
{
{
// Duplicate with above.
Name: "l1",
Value: "v2",
},
// Duplicate with above.
{Name: "l1", Value: "v2"},
{Name: "l2", Value: "v3"},
},
// Two store nodes, they don't have ext labels set.
nil,
nil,
{
// Duplicate with two others.
{Name: "l1", Value: "v2"},
{Name: "l2", Value: "v3"},
},
}
st, err := newTestStores(5, storeExtLabels...)

st, err := newTestStores(6, storeExtLabels...)
testutil.Ok(t, err)
defer st.Close()

initialStoreAddr := st.StoreAddresses()

logger := log.NewLogfmtLogger(log.NewSyncWriter(os.Stderr))
logger = level.NewFilter(logger, level.AllowDebug())
logger = log.With(logger, "ts", log.DefaultTimestampUTC, "caller", log.DefaultCaller)
storeSet := NewStoreSet(logger, nil, specsFromAddrFunc(initialStoreAddr), testGRPCOpts, time.Minute)
storeSet := NewStoreSet(logger, nil, specsFromAddrFunc(st.StoreAddresses()), testGRPCOpts, time.Minute)
storeSet.gRPCInfoCallTimeout = 2 * time.Second
defer storeSet.Close()

Expand All @@ -282,12 +279,9 @@ func TestStoreSet_AllAvailable_BlockExtLsetDuplicates(t *testing.T) {
storeSet.Update(context.Background())
storeSet.Update(context.Background())

storeNum := len(storeSet.stores)
expectedStoreNum := 5 - 2
testutil.Assert(t, storeNum == expectedStoreNum, fmt.Sprintf("all services should respond just fine, but we expect duplicates being blocked. Expected %d stores, got %d", expectedStoreNum, storeNum))
testutil.Assert(t, len(storeSet.stores) == 4, fmt.Sprintf("all services should respond just fine, but we expect duplicates being blocked. Expected %d stores, got %d", 4, len(storeSet.stores)))

// Sort result to be able to compare.

var existingStoreLabels [][]storepb.Label
for _, store := range storeSet.stores {
for _, ls := range store.LabelSets() {
Expand All @@ -298,14 +292,13 @@ func TestStoreSet_AllAvailable_BlockExtLsetDuplicates(t *testing.T) {
return len(existingStoreLabels[i]) > len(existingStoreLabels[j])
})

var i int
for _, lset := range existingStoreLabels {
testutil.Equals(t, storeExtLabels[i], lset)

i++
if i == 1 {
// Store 1 and 2 should be blocked, so fast forward.
i += 2
}
}
testutil.Equals(t, [][]storepb.Label{
{
{Name: "l1", Value: "v2"},
{Name: "l2", Value: "v3"},
},
{
{Name: "l1", Value: "v1"},
},
}, existingStoreLabels)
}