Skip to content

Commit

Permalink
Merge pull request #104077 from erikgrinaker/backport22.2-104008
Browse files: browse the repository at this point in the history
release-22.2: kvserver: improve system lease observability
  • Loading branch information
erikgrinaker authored Jun 6, 2023
2 parents 5b83009 + 4f274ed commit 0e56ed0
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 5 deletions.
8 changes: 8 additions & 0 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,12 @@ var (
Measurement: "Replicas",
Unit: metric.Unit_COUNT,
}
// metaLeaseLivenessCount is the metadata for the "leases.liveness" metric,
// which reports the number of replicas that are leaseholders for the
// liveness range(s). It is measured in replicas and expressed as a count.
metaLeaseLivenessCount = metric.Metadata{
Name: "leases.liveness",
Help: "Number of replica leaseholders for the liveness range(s)",
Measurement: "Replicas",
Unit: metric.Unit_COUNT,
}

// Storage metrics.
metaLiveBytes = metric.Metadata{
Expand Down Expand Up @@ -1674,6 +1680,7 @@ type StoreMetrics struct {
LeaseTransferErrorCount *metric.Counter
LeaseExpirationCount *metric.Gauge
LeaseEpochCount *metric.Gauge
LeaseLivenessCount *metric.Gauge

// Storage metrics.
ResolveCommitCount *metric.Counter
Expand Down Expand Up @@ -2208,6 +2215,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
LeaseTransferErrorCount: metric.NewCounter(metaLeaseTransferErrorCount),
LeaseExpirationCount: metric.NewGauge(metaLeaseExpirationCount),
LeaseEpochCount: metric.NewGauge(metaLeaseEpochCount),
LeaseLivenessCount: metric.NewGauge(metaLeaseLivenessCount),

// Intent resolution metrics.
ResolveCommitCount: metric.NewCounter(metaResolveCommit),
Expand Down
14 changes: 9 additions & 5 deletions pkg/kv/kvserver/replica_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/allocatorimpl"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
Expand All @@ -27,11 +28,12 @@ import (

// ReplicaMetrics contains details on the current status of the replica.
type ReplicaMetrics struct {
Leader bool
LeaseValid bool
Leaseholder bool
LeaseType roachpb.LeaseType
LeaseStatus kvserverpb.LeaseStatus
Leader bool
LeaseValid bool
Leaseholder bool
LeaseType roachpb.LeaseType
LeaseStatus kvserverpb.LeaseStatus
LivenessLease bool

// Quiescent indicates whether the replica believes itself to be quiesced.
Quiescent bool
Expand Down Expand Up @@ -135,6 +137,8 @@ func calcReplicaMetrics(
m.LeaseValid = true
leaseOwner = leaseStatus.Lease.OwnedBy(storeID)
m.LeaseType = leaseStatus.Lease.Type()
m.LivenessLease = leaseOwner &&
keys.NodeLivenessSpan.Overlaps(desc.RSpan().AsRawSpanWithNoLocals())
}
m.Leaseholder = m.LeaseValid && leaseOwner
m.Leader = isRaftLeader(raftStatus)
Expand Down
11 changes: 11 additions & 0 deletions pkg/kv/kvserver/replica_proposal.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,17 @@ func (r *Replica) leasePostApplyLocked(
r.gossipFirstRangeLocked(ctx)
}

// Log acquisition of meta and liveness range leases. These are critical to
// cluster health, so it's useful to know their location over time.
if leaseChangingHands && iAmTheLeaseHolder &&
r.descRLocked().StartKey.Less(roachpb.RKey(keys.NodeLivenessKeyMax)) {
if r.ownsValidLeaseRLocked(ctx, now) {
log.Health.Infof(ctx, "acquired system range lease: %s", newLease)
} else {
log.Health.Warningf(ctx, "applied system range lease after it expired: %s", newLease)
}
}

if (leaseChangingHands || maybeSplit) && iAmTheLeaseHolder && hasExpirationBasedLease {
if requiresExpirationBasedLease {
// Whenever we first acquire an expiration-based lease for a range that
Expand Down
5 changes: 5 additions & 0 deletions pkg/kv/kvserver/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -3264,6 +3264,7 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
leaseHolderCount int64
leaseExpirationCount int64
leaseEpochCount int64
leaseLivenessCount int64
raftLeaderNotLeaseHolderCount int64
raftLeaderInvalidLeaseCount int64
quiescentCount int64
Expand Down Expand Up @@ -3333,6 +3334,9 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
case roachpb.LeaseEpoch:
leaseEpochCount++
}
if metrics.LivenessLease {
leaseLivenessCount++
}
}
if metrics.Quiescent {
quiescentCount++
Expand Down Expand Up @@ -3396,6 +3400,7 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
s.metrics.LeaseHolderCount.Update(leaseHolderCount)
s.metrics.LeaseExpirationCount.Update(leaseExpirationCount)
s.metrics.LeaseEpochCount.Update(leaseEpochCount)
s.metrics.LeaseLivenessCount.Update(leaseLivenessCount)
s.metrics.QuiescentCount.Update(quiescentCount)
s.metrics.UninitializedCount.Update(uninitializedCount)
s.metrics.AverageQueriesPerSecond.Update(averageQueriesPerSecond)
Expand Down
1 change: 1 addition & 0 deletions pkg/ts/catalog/chart_catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -1692,6 +1692,7 @@ var charts = []sectionDescription{
Metrics: []string{
"leases.epoch",
"leases.expiration",
"leases.liveness",
"replicas.leaseholders",
"replicas.leaders_not_leaseholders",
"replicas.leaders_invalid_lease",
Expand Down

0 comments on commit 0e56ed0

Please sign in to comment.