Skip to content

Commit

Permalink
Improve Querier's error message
Browse files Browse the repository at this point in the history
  • Loading branch information
hczhu-db committed Nov 1, 2024
1 parent a80f9a3 commit 9128246
Showing 1 changed file with 20 additions and 9 deletions.
29 changes: 20 additions & 9 deletions pkg/store/proxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -338,20 +338,26 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb.

checkGroupReplicaErrors := func(st Client, err error) error {
if len(failedStores[st.GroupKey()]) > 1 {
msg := "Multiple replicas have failures for the same group"
group := st.GroupKey()
replicas := fmt.Sprintf("%+v", failedStores[group])
level.Error(reqLogger).Log(
"msg", "Multipel replicas have failures for the same group",
"group", st.GroupKey(),
"replicas", fmt.Sprintf("%+v", failedStores[st.GroupKey()]),
"msg", msg,
"group", group,
"replicas", replicas,
)
return err
return fmt.Errorf("%s group=%s replicas=%s: %w", msg, group, replicas, err)
}
if len(groupReplicaStores[st.GroupKey()]) == 1 && failedStores[st.GroupKey()][st.ReplicaKey()] > 1 {
msg := "A group with single replica has multiple failures"
group := st.GroupKey()
replicas := fmt.Sprintf("%+v", failedStores[group])
level.Error(reqLogger).Log(
"msg", "A single replica group has multiple failures",
"group", st.GroupKey(),
"replicas", fmt.Sprintf("%+v", failedStores[st.GroupKey()]),
"msg", msg,
"group", group,
"replicas", replicas,
)
return err
return fmt.Errorf("%s group=%s replicas=%s: %w", msg, group, replicas, err)
}
return nil
}
Expand Down Expand Up @@ -404,6 +410,7 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb.
}

i := 0
var firstWarning *string
for respHeap.Next() {
i++
if r.Limit > 0 && i > int(r.Limit) {
Expand All @@ -422,10 +429,14 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb.
// TODO: attribute the warning to the store(group key and replica key) that produced it.
// Each client streams a sequence of time series, so it's not trivial to attribute the warning to a specific client.
if totalFailedStores > 1 {
level.Error(reqLogger).Log("msg", "more than one stores have failed")
level.Error(reqLogger).Log("msg", "more than one stores had warnings")
// If we don't know which store has failed, we can tolerate at most one failed store.
if firstWarning != nil {
warning += "; " + *firstWarning
}
return status.Error(codes.Aborted, warning)
}
firstWarning = &warning
} else if r.PartialResponseDisabled || r.PartialResponseStrategy == storepb.PartialResponseStrategy_ABORT {
return status.Error(codes.Aborted, resp.GetWarning())
}
Expand Down

0 comments on commit 9128246

Please sign in to comment.