From 42ede1543cd123a6ec191b0d65bd7c3bb136761a Mon Sep 17 00:00:00 2001 From: Alex Bublichenko <46664526+abliqo@users.noreply.github.com> Date: Wed, 30 Dec 2020 19:36:20 -0500 Subject: [PATCH] [aggregator/client] Metric for dropped metrics (#3054) Problem: When collector fails to write a metric to aggregator, it logs an error but in practice it is almost impossible to tell whether it failed to write to both peers owning a shard or only one of them, i.e. whether data is lost or not. Solution: Emit a metric indicating data loss. --- src/aggregator/client/tcp_client.go | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/aggregator/client/tcp_client.go b/src/aggregator/client/tcp_client.go index 725894049b..40a519457c 100644 --- a/src/aggregator/client/tcp_client.go +++ b/src/aggregator/client/tcp_client.go @@ -268,9 +268,10 @@ func (c *TCPClient) write( return err } var ( - shardID = c.shardFn(metricID, uint32(placement.NumShards())) - instances = placement.InstancesForShard(shardID) - multiErr = xerrors.NewMultiError() + shardID = c.shardFn(metricID, uint32(placement.NumShards())) + instances = placement.InstancesForShard(shardID) + multiErr = xerrors.NewMultiError() + oneOrMoreSucceeded = false ) for _, instance := range instances { // NB(xichen): the shard should technically always be found because the instances @@ -288,7 +289,15 @@ func (c *TCPClient) write( } if err = c.writerMgr.Write(instance, shardID, payload); err != nil { multiErr = multiErr.Add(err) + continue } + + oneOrMoreSucceeded = true + } + + if !oneOrMoreSucceeded { + // unrectifiable loss + c.metrics.dropped.Inc(1) } onPlacementDoneFn() @@ -329,6 +338,7 @@ type tcpClientMetrics struct { flush tally.Counter shardNotOwned tally.Counter shardNotWriteable tally.Counter + dropped tally.Counter } func newTCPClientMetrics( @@ -343,5 +353,6 @@ func newTCPClientMetrics( flush: scope.Counter("flush"), shardNotOwned: scope.Counter("shard-not-owned"), shardNotWriteable: 
scope.Counter("shard-not-writeable"), + dropped: scope.Counter("dropped"), } }