From 12d656a72cde48f7b40ce6c20bdf057d22458084 Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Sat, 27 May 2023 20:15:18 +0000
Subject: [PATCH 1/2] kvserver: add `leases.liveness` metric

This patch adds the metric `leases.liveness` tracking the number of
liveness range leases per node (generally 1 or 0). This is useful to
find out which node had the liveness lease at a particular time.

I ran a 10k-range cluster to look at the CPU cost of the key
comparisons; they didn't show up on CPU profiles.

Epic: none

Release note (ops change): added the metric `leases.liveness` showing
the number of liveness range leases per node (generally 1 or 0), to
track the liveness range leaseholder.
---
 pkg/kv/kvserver/metrics.go         |  8 ++++++++
 pkg/kv/kvserver/replica_metrics.go | 14 +++++++++-----
 pkg/kv/kvserver/store.go           |  5 +++++
 pkg/ts/catalog/chart_catalog.go    |  1 +
 4 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go
index 48e9ba9a1d89..0ac65b7c33fc 100644
--- a/pkg/kv/kvserver/metrics.go
+++ b/pkg/kv/kvserver/metrics.go
@@ -154,6 +154,12 @@ var (
 		Measurement: "Replicas",
 		Unit:        metric.Unit_COUNT,
 	}
+	metaLeaseLivenessCount = metric.Metadata{
+		Name:        "leases.liveness",
+		Help:        "Number of replica leaseholders for the liveness range(s)",
+		Measurement: "Replicas",
+		Unit:        metric.Unit_COUNT,
+	}
 
 	// Storage metrics.
 	metaLiveBytes = metric.Metadata{
@@ -1674,6 +1680,7 @@ type StoreMetrics struct {
 	LeaseTransferErrorCount *metric.Counter
 	LeaseExpirationCount    *metric.Gauge
 	LeaseEpochCount         *metric.Gauge
+	LeaseLivenessCount      *metric.Gauge
 
 	// Storage metrics.
 	ResolveCommitCount *metric.Counter
@@ -2208,6 +2215,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
 		LeaseTransferErrorCount: metric.NewCounter(metaLeaseTransferErrorCount),
 		LeaseExpirationCount:    metric.NewGauge(metaLeaseExpirationCount),
 		LeaseEpochCount:         metric.NewGauge(metaLeaseEpochCount),
+		LeaseLivenessCount:      metric.NewGauge(metaLeaseLivenessCount),
 
 		// Intent resolution metrics.
 		ResolveCommitCount: metric.NewCounter(metaResolveCommit),
diff --git a/pkg/kv/kvserver/replica_metrics.go b/pkg/kv/kvserver/replica_metrics.go
index c5a85062f95e..f3d386f86116 100644
--- a/pkg/kv/kvserver/replica_metrics.go
+++ b/pkg/kv/kvserver/replica_metrics.go
@@ -16,6 +16,7 @@ import (
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/base"
+	"github.com/cockroachdb/cockroach/pkg/keys"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/allocatorimpl"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
@@ -27,11 +28,12 @@
 
 // ReplicaMetrics contains details on the current status of the replica.
 type ReplicaMetrics struct {
-	Leader      bool
-	LeaseValid  bool
-	Leaseholder bool
-	LeaseType   roachpb.LeaseType
-	LeaseStatus kvserverpb.LeaseStatus
+	Leader        bool
+	LeaseValid    bool
+	Leaseholder   bool
+	LeaseType     roachpb.LeaseType
+	LeaseStatus   kvserverpb.LeaseStatus
+	LivenessLease bool
 
 	// Quiescent indicates whether the replica believes itself to be quiesced.
 	Quiescent bool
@@ -135,6 +137,8 @@ func calcReplicaMetrics(
 		m.LeaseValid = true
 		leaseOwner = leaseStatus.Lease.OwnedBy(storeID)
 		m.LeaseType = leaseStatus.Lease.Type()
+		m.LivenessLease = leaseOwner &&
+			keys.NodeLivenessSpan.Overlaps(desc.RSpan().AsRawSpanWithNoLocals())
 	}
 	m.Leaseholder = m.LeaseValid && leaseOwner
 	m.Leader = isRaftLeader(raftStatus)
diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go
index 295092ad6dae..d853176d0a22 100644
--- a/pkg/kv/kvserver/store.go
+++ b/pkg/kv/kvserver/store.go
@@ -3264,6 +3264,7 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
 		leaseHolderCount              int64
 		leaseExpirationCount          int64
 		leaseEpochCount               int64
+		leaseLivenessCount            int64
 		raftLeaderNotLeaseHolderCount int64
 		raftLeaderInvalidLeaseCount   int64
 		quiescentCount                int64
@@ -3333,6 +3334,9 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
 			case roachpb.LeaseEpoch:
 				leaseEpochCount++
 			}
+			if metrics.LivenessLease {
+				leaseLivenessCount++
+			}
 		}
 		if metrics.Quiescent {
 			quiescentCount++
@@ -3396,6 +3400,7 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
 	s.metrics.LeaseHolderCount.Update(leaseHolderCount)
 	s.metrics.LeaseExpirationCount.Update(leaseExpirationCount)
 	s.metrics.LeaseEpochCount.Update(leaseEpochCount)
+	s.metrics.LeaseLivenessCount.Update(leaseLivenessCount)
 	s.metrics.QuiescentCount.Update(quiescentCount)
 	s.metrics.UninitializedCount.Update(uninitializedCount)
 	s.metrics.AverageQueriesPerSecond.Update(averageQueriesPerSecond)
diff --git a/pkg/ts/catalog/chart_catalog.go b/pkg/ts/catalog/chart_catalog.go
index a7b2736085b4..84c499c86ec8 100644
--- a/pkg/ts/catalog/chart_catalog.go
+++ b/pkg/ts/catalog/chart_catalog.go
@@ -1692,6 +1692,7 @@ var charts = []sectionDescription{
 			Metrics: []string{
 				"leases.epoch",
 				"leases.expiration",
+				"leases.liveness",
 				"replicas.leaseholders",
 				"replicas.leaders_not_leaseholders",
 				"replicas.leaders_invalid_lease",

From 4f274ed21815432817b9e112f7ff3da82c6e13c7 Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Sun, 28 May 2023 08:10:28 +0000
Subject: [PATCH 2/2] kvserver: log system range lease acquisition

This patch logs acquisition of meta/liveness range leases to the health
log. These leases are critical to cluster health, and during debugging
it's useful to know their location over time.

Epic: none

Release note: None
---
 pkg/kv/kvserver/replica_proposal.go | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pkg/kv/kvserver/replica_proposal.go b/pkg/kv/kvserver/replica_proposal.go
index f66a1a8466bb..9ee38e5bb22c 100644
--- a/pkg/kv/kvserver/replica_proposal.go
+++ b/pkg/kv/kvserver/replica_proposal.go
@@ -368,6 +368,17 @@ func (r *Replica) leasePostApplyLocked(
 		r.gossipFirstRangeLocked(ctx)
 	}
 
+	// Log acquisition of meta and liveness range leases. These are critical to
+	// cluster health, so it's useful to know their location over time.
+	if leaseChangingHands && iAmTheLeaseHolder &&
+		r.descRLocked().StartKey.Less(roachpb.RKey(keys.NodeLivenessKeyMax)) {
+		if r.ownsValidLeaseRLocked(ctx, now) {
+			log.Health.Infof(ctx, "acquired system range lease: %s", newLease)
+		} else {
+			log.Health.Warningf(ctx, "applied system range lease after it expired: %s", newLease)
+		}
+	}
+
 	if (leaseChangingHands || maybeSplit) && iAmTheLeaseHolder && hasExpirationBasedLease {
 		if requiresExpirationBasedLease {
 			// Whenever we first acquire an expiration-based lease for a range that
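
For context beyond the patch itself: the LivenessLease flag set in
calcReplicaMetrics boils down to a span-overlap test against
keys.NodeLivenessSpan. Below is a minimal standalone sketch of that check,
not part of the patch; the helper name isLivenessRange and the example
descriptor are illustrative only.

// Standalone sketch, not part of the patch: classify a range as a liveness
// range the same way calcReplicaMetrics does above. The helper name
// isLivenessRange is hypothetical.
package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
)

// isLivenessRange reports whether the descriptor's span overlaps the node
// liveness span, i.e. whether this range holds node liveness records.
func isLivenessRange(desc *roachpb.RangeDescriptor) bool {
	return keys.NodeLivenessSpan.Overlaps(desc.RSpan().AsRawSpanWithNoLocals())
}

func main() {
	// Example descriptor covering exactly the node liveness key span.
	desc := &roachpb.RangeDescriptor{
		StartKey: roachpb.RKey(keys.NodeLivenessPrefix),
		EndKey:   roachpb.RKey(keys.NodeLivenessKeyMax),
	}
	fmt.Println(isLivenessRange(desc)) // prints: true
}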