From 91282463a3798ed2842a870282aa5d4fac8a23fa Mon Sep 17 00:00:00 2001 From: "HC Zhu (DB)" Date: Thu, 31 Oct 2024 21:34:05 -0700 Subject: [PATCH] Improve Querier's error message --- pkg/store/proxy.go | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/pkg/store/proxy.go b/pkg/store/proxy.go index 3babd93ee2..28db933d33 100644 --- a/pkg/store/proxy.go +++ b/pkg/store/proxy.go @@ -338,20 +338,26 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb. checkGroupReplicaErrors := func(st Client, err error) error { if len(failedStores[st.GroupKey()]) > 1 { + msg := "Multiple replicas have failures for the same group" + group := st.GroupKey() + replicas := fmt.Sprintf("%+v", failedStores[group]) level.Error(reqLogger).Log( - "msg", "Multipel replicas have failures for the same group", - "group", st.GroupKey(), - "replicas", fmt.Sprintf("%+v", failedStores[st.GroupKey()]), + "msg", msg, + "group", group, + "replicas", replicas, ) - return err + return fmt.Errorf("%s group=%s replicas=%s: %w", msg, group, replicas, err) } if len(groupReplicaStores[st.GroupKey()]) == 1 && failedStores[st.GroupKey()][st.ReplicaKey()] > 1 { + msg := "A group with single replica has multiple failures" + group := st.GroupKey() + replicas := fmt.Sprintf("%+v", failedStores[group]) level.Error(reqLogger).Log( - "msg", "A single replica group has multiple failures", - "group", st.GroupKey(), - "replicas", fmt.Sprintf("%+v", failedStores[st.GroupKey()]), + "msg", msg, + "group", group, + "replicas", replicas, ) - return err + return fmt.Errorf("%s group=%s replicas=%s: %w", msg, group, replicas, err) } return nil } @@ -404,6 +410,7 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb. } i := 0 + var firstWarning *string for respHeap.Next() { i++ if r.Limit > 0 && i > int(r.Limit) { @@ -422,10 +429,14 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb. // TODO: attribute the warning to the store(group key and replica key) that produced it. // Each client streams a sequence of time series, so it's not trivial to attribute the warning to a specific client. if totalFailedStores > 1 { - level.Error(reqLogger).Log("msg", "more than one stores have failed") + level.Error(reqLogger).Log("msg", "more than one stores had warnings") // If we don't know which store has failed, we can tolerate at most one failed store. + if firstWarning != nil { + warning += "; " + *firstWarning + } return status.Error(codes.Aborted, warning) } + firstWarning = &warning } else if r.PartialResponseDisabled || r.PartialResponseStrategy == storepb.PartialResponseStrategy_ABORT { return status.Error(codes.Aborted, resp.GetWarning()) }