kvcoord: improve some DistSender metric help texts
Release justification: documentation-only changes
Release note: None
tbg committed Sep 13, 2021
1 parent a93a568 commit d8a217b
Showing 2 changed files with 63 additions and 18 deletions.
34 changes: 24 additions & 10 deletions pkg/kv/kvclient/kvcoord/dist_sender.go
@@ -73,25 +73,25 @@ var (
 	}
 	metaTransportSentCount = metric.Metadata{
 		Name:        "distsender.rpc.sent",
-		Help:        "Number of RPCs sent",
+		Help:        "Number of replica-addressed RPCs sent",
 		Measurement: "RPCs",
 		Unit:        metric.Unit_COUNT,
 	}
 	metaTransportLocalSentCount = metric.Metadata{
 		Name:        "distsender.rpc.sent.local",
-		Help:        "Number of local RPCs sent",
+		Help:        "Number of replica-addressed RPCs sent through the local-server optimization",
 		Measurement: "RPCs",
 		Unit:        metric.Unit_COUNT,
 	}
 	metaTransportSenderNextReplicaErrCount = metric.Metadata{
 		Name:        "distsender.rpc.sent.nextreplicaerror",
-		Help:        "Number of RPCs sent due to per-replica errors",
+		Help:        "Number of replica-addressed RPCs sent due to per-replica errors",
 		Measurement: "RPCs",
 		Unit:        metric.Unit_COUNT,
 	}
 	metaDistSenderNotLeaseHolderErrCount = metric.Metadata{
 		Name:        "distsender.errors.notleaseholder",
-		Help:        "Number of NotLeaseHolderErrors encountered",
+		Help:        "Number of NotLeaseHolderErrors encountered from replica-addressed RPCs",
 		Measurement: "Errors",
 		Unit:        metric.Unit_COUNT,
 	}
@@ -108,20 +108,34 @@ var (
 		Unit:        metric.Unit_COUNT,
 	}
 	metaDistSenderSlowRPCs = metric.Metadata{
-		Name:        "requests.slow.distsender",
-		Help:        "Number of RPCs stuck or retrying for a long time",
+		Name: "requests.slow.distsender",
+		Help: `Number of replica-bound RPCs currently stuck or retrying for a long time.
+
+Note that this is not a good signal for KV health. The remote side of the
+RPCs tracked here may experience contention, so an end user can easily
+cause values for this metric to be emitted by leaving a transaction open
+for a long time and contending with it using a second transaction.`,
 		Measurement: "Requests",
 		Unit:        metric.Unit_COUNT,
 	}
 	metaDistSenderMethodCountTmpl = metric.Metadata{
-		Name:        "distsender.rpc.%s.sent",
-		Help:        "Number of %s requests sent",
+		Name: "distsender.rpc.%s.sent",
+		Help: `Number of %s requests processed.
+
+This counts the requests in batches handed to DistSender, not the RPCs
+sent to individual Ranges as a result.`,
 		Measurement: "RPCs",
 		Unit:        metric.Unit_COUNT,
 	}
 	metaDistSenderErrCountTmpl = metric.Metadata{
-		Name:        "distsender.rpc.err.%s",
-		Help:        "Number of %s errors received",
+		Name: "distsender.rpc.err.%s",
+		Help: `Number of %s errors received for replica-bound RPCs.
+
+This counts how often an error of the specified type was received back from replicas
+as part of executing possibly range-spanning requests. Failures to reach the target
+replica will be accounted for as 'roachpb.CommunicationErrType' and unclassified
+errors as 'roachpb.InternalErrType'.
+`,
 		Measurement: "Errors",
 		Unit:        metric.Unit_COUNT,
 	}
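
The two *Tmpl entries above are templates rather than finished metrics: the %s in both Name and Help is filled in once per request method (or error type) when the concrete DistSender metrics are constructed. Below is a minimal, self-contained sketch of that instantiation pattern; the local Metadata struct and the forMethod helper are illustrative stand-ins, not the actual CockroachDB code.

package main

import "fmt"

// Metadata mirrors the fields shown in the diff above. It is a local
// stand-in for the real metric.Metadata type so the sketch stays
// self-contained.
type Metadata struct {
	Name        string
	Help        string
	Measurement string
}

// forMethod is a hypothetical helper showing how a templated entry such as
// metaDistSenderMethodCountTmpl could be specialized for one request method.
func forMethod(tmpl Metadata, method string) Metadata {
	out := tmpl
	out.Name = fmt.Sprintf(tmpl.Name, method) // e.g. "distsender.rpc.get.sent"
	out.Help = fmt.Sprintf(tmpl.Help, method) // fills the "%s" in the help text
	return out
}

func main() {
	tmpl := Metadata{
		Name:        "distsender.rpc.%s.sent",
		Help:        "Number of %s requests processed.",
		Measurement: "RPCs",
	}
	fmt.Println(forMethod(tmpl, "get").Name) // distsender.rpc.get.sent
}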
47 changes: 39 additions & 8 deletions pkg/kv/kvserver/metrics.go
@@ -999,28 +999,59 @@ var (

 	// Slow request metrics.
 	metaLatchRequests = metric.Metadata{
-		Name:        "requests.slow.latch",
-		Help:        "Number of requests that have been stuck for a long time acquiring latches",
+		Name: "requests.slow.latch",
+		Help: `Number of requests that have been stuck for a long time acquiring latches.
+
+Latches moderate access to the KV keyspace for the purpose of evaluating and
+replicating commands. A slow latch acquisition attempt is often caused by
+another request holding and not releasing its latches in a timely manner. This
+in turn can either be caused by a long delay in evaluation (for example, under
+severe system overload) or by delays at the replication layer.
+
+This gauge registering a nonzero value usually indicates a serious problem and
+should be investigated.
+`,
 		Measurement: "Requests",
 		Unit:        metric.Unit_COUNT,
 	}
 	metaSlowLeaseRequests = metric.Metadata{
-		Name:        "requests.slow.lease",
-		Help:        "Number of requests that have been stuck for a long time acquiring a lease",
+		Name: "requests.slow.lease",
+		Help: `Number of requests that have been stuck for a long time acquiring a lease.
+
+This gauge registering a nonzero value usually indicates range or replica
+unavailability, and should be investigated. In the common case, we also
+expect to see 'requests.slow.raft' to register a nonzero value, indicating
+that the lease requests are not getting a timely response from the replication
+layer.
+`,
 		Measurement: "Requests",
 		Unit:        metric.Unit_COUNT,
 	}
 	metaSlowRaftRequests = metric.Metadata{
-		Name:        "requests.slow.raft",
-		Help:        "Number of requests that have been stuck for a long time in raft",
+		Name: "requests.slow.raft",
+		Help: `Number of requests that have been stuck for a long time in the replication layer.
+
+An (evaluated) request has to pass through the replication layer, notably the
+quota pool and raft. If it fails to do so within a highly permissive duration,
+the gauge is incremented (and decremented again once the request is either
+applied or returns an error).
+
+A nonzero value indicates range or replica unavailability, and should be investigated.
+`,
 		Measurement: "Requests",
 		Unit:        metric.Unit_COUNT,
 	}
 
 	// Backpressure metrics.
 	metaBackpressuredOnSplitRequests = metric.Metadata{
-		Name:        "requests.backpressure.split",
-		Help:        "Number of backpressured writes waiting on a Range split",
+		Name: "requests.backpressure.split",
+		Help: `Number of backpressured writes waiting on a Range split.
+
+A Range will backpressure (roughly) non-system traffic when the range is above
+the configured size until the range splits. When the rate of this metric is
+nonzero over extended periods of time, it should be investigated why splits are
+not occurring.
+`,
 		Measurement: "Writes",
 		Unit:        metric.Unit_COUNT,
 	}
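
The requests.slow.* entries above describe gauges of currently slow requests rather than cumulative counters: the value rises once a request has been waiting longer than a permissive threshold and falls again when the request finally applies or returns an error. Below is a minimal sketch of that increment/decrement pattern, assuming a plain atomic int64 as the gauge and a caller-supplied callback standing in for evaluation and replication; it is an illustration of the pattern, not the kvserver implementation.

package main

import (
	"errors"
	"fmt"
	"sync/atomic"
	"time"
)

// slowRaftRequests is a stand-in for the requests.slow.raft gauge.
var slowRaftRequests int64

// runReplicated sketches the pattern described above: if the request has not
// completed within a permissive threshold, bump the gauge, and drop it again
// once the request either applies or returns an error.
func runReplicated(do func() error, slowThreshold time.Duration) error {
	done := make(chan error, 1)
	go func() { done <- do() }()

	select {
	case err := <-done:
		return err // finished quickly; gauge never touched
	case <-time.After(slowThreshold):
		atomic.AddInt64(&slowRaftRequests, 1)
		defer atomic.AddInt64(&slowRaftRequests, -1) // decrement on apply or error
		return <-done
	}
}

func main() {
	err := runReplicated(func() error {
		time.Sleep(50 * time.Millisecond) // pretend replication is slow
		return errors.New("example error")
	}, 10*time.Millisecond)
	fmt.Println(err, atomic.LoadInt64(&slowRaftRequests)) // gauge is back to 0
}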
