From 8ef4e36a2f12dde227624c9717376fe7785dea74 Mon Sep 17 00:00:00 2001 From: Austen McClernon Date: Fri, 21 Jul 2023 20:16:14 +0000 Subject: [PATCH] kvserver: add lease preference metrics There were no existing metrics to monitor lease preference conformance. This commit adds two metric gauges: `leases.preferences.violating` and `leases.preferences.less-preferred`. These metrics are reported by the store. `leases.preferences.violating` indicates the number of valid leases a store owns, which satisfy none of the preferences applied. `leases.preferences.less-preferred` indicates the number of valid leases a store owns, which satisfy some of the preferences applied, but not the first one. For example, with a lease preference `'[[+zone=a],[+zone=b]]'`, the metric values with different leaseholders are: ``` leaseholder_locality="zone=c" leases.preferences.less-preferred: 0 leases.preferences.violating: 1 ``` ``` leaseholder_locality="zone=b" leases.preferences.less-preferred: 1 leases.preferences.violating: 0 ``` ``` leaseholder_locality="zone=a" leases.preferences.less-preferred: 0 leases.preferences.violating: 0 ``` When no preferences are applied, the lease is not counted in either metric. Epic: none Informs: #106100 Release note (ops change): Introduce two new metrics to monitor range lease preference conformance. `leases.preferences.violating` indicates the number of valid leases a store owns, which satisfy none of the preferences applied. `leases.preferences.less-preferred` indicates the number of valid leases a store owns, which satisfy some of the preferences applied, but not the first one. 
--- pkg/kv/kvserver/metrics.go | 43 +++++++++++----- pkg/kv/kvserver/replica_metrics.go | 64 +++++++++++++++-------- pkg/kv/kvserver/replica_range_lease.go | 70 ++++++++++++++++++++------ pkg/kv/kvserver/store.go | 43 ++++++++++------ 4 files changed, 155 insertions(+), 65 deletions(-) diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go index 2e16c4b66f26..5730d55de188 100644 --- a/pkg/kv/kvserver/metrics.go +++ b/pkg/kv/kvserver/metrics.go @@ -170,6 +170,19 @@ var ( Measurement: "Replicas", Unit: metric.Unit_COUNT, } + metaLeaseViolatingPreferencesCount = metric.Metadata{ + Name: "leases.preferences.violating", + Help: "Number of replica leaseholders which violate lease preferences", + Measurement: "Replicas", + Unit: metric.Unit_COUNT, + } + metaLeaseLessPreferredCount = metric.Metadata{ + Name: "leases.preferences.less-preferred", + Help: "Number of replica leaseholders which satisfy a lease " + + "preference which is not the most preferred", + Measurement: "Replicas", + Unit: metric.Unit_COUNT, + } // Storage metrics. metaLiveBytes = metric.Metadata{ @@ -2174,14 +2187,16 @@ type StoreMetrics struct { // Lease request metrics for successful and failed lease requests. These // count proposals (i.e. it does not matter how many replicas apply the // lease). 
- LeaseRequestSuccessCount *metric.Counter - LeaseRequestErrorCount *metric.Counter - LeaseRequestLatency metric.IHistogram - LeaseTransferSuccessCount *metric.Counter - LeaseTransferErrorCount *metric.Counter - LeaseExpirationCount *metric.Gauge - LeaseEpochCount *metric.Gauge - LeaseLivenessCount *metric.Gauge + LeaseRequestSuccessCount *metric.Counter + LeaseRequestErrorCount *metric.Counter + LeaseRequestLatency metric.IHistogram + LeaseTransferSuccessCount *metric.Counter + LeaseTransferErrorCount *metric.Counter + LeaseExpirationCount *metric.Gauge + LeaseEpochCount *metric.Gauge + LeaseLivenessCount *metric.Gauge + LeaseViolatingPreferencesCount *metric.Gauge + LeaseLessPreferredCount *metric.Gauge // Storage metrics. ResolveCommitCount *metric.Counter @@ -2828,11 +2843,13 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { Duration: histogramWindow, Buckets: metric.IOLatencyBuckets, }), - LeaseTransferSuccessCount: metric.NewCounter(metaLeaseTransferSuccessCount), - LeaseTransferErrorCount: metric.NewCounter(metaLeaseTransferErrorCount), - LeaseExpirationCount: metric.NewGauge(metaLeaseExpirationCount), - LeaseEpochCount: metric.NewGauge(metaLeaseEpochCount), - LeaseLivenessCount: metric.NewGauge(metaLeaseLivenessCount), + LeaseTransferSuccessCount: metric.NewCounter(metaLeaseTransferSuccessCount), + LeaseTransferErrorCount: metric.NewCounter(metaLeaseTransferErrorCount), + LeaseExpirationCount: metric.NewGauge(metaLeaseExpirationCount), + LeaseEpochCount: metric.NewGauge(metaLeaseEpochCount), + LeaseLivenessCount: metric.NewGauge(metaLeaseLivenessCount), + LeaseViolatingPreferencesCount: metric.NewGauge(metaLeaseViolatingPreferencesCount), + LeaseLessPreferredCount: metric.NewGauge(metaLeaseLessPreferredCount), // Intent resolution metrics. 
ResolveCommitCount: metric.NewCounter(metaResolveCommit), diff --git a/pkg/kv/kvserver/replica_metrics.go b/pkg/kv/kvserver/replica_metrics.go index e273f3916d1b..070c37977c59 100644 --- a/pkg/kv/kvserver/replica_metrics.go +++ b/pkg/kv/kvserver/replica_metrics.go @@ -28,12 +28,14 @@ import ( // ReplicaMetrics contains details on the current status of the replica. type ReplicaMetrics struct { - Leader bool - LeaseValid bool - Leaseholder bool - LeaseType roachpb.LeaseType - LeaseStatus kvserverpb.LeaseStatus - LivenessLease bool + Leader bool + LeaseValid bool + Leaseholder bool + LeaseType roachpb.LeaseType + LeaseStatus kvserverpb.LeaseStatus + LivenessLease bool + ViolatingLeasePreferences bool + LessPreferredLease bool // Quiescent indicates whether the replica believes itself to be quiesced. Quiescent bool @@ -74,6 +76,10 @@ func (r *Replica) Metrics( latchMetrics := r.concMgr.LatchMetrics() lockTableMetrics := r.concMgr.LockTableMetrics() + storeAttrs := r.store.Attrs() + nodeAttrs := r.store.nodeDesc.Attrs + nodeLocality := r.store.nodeDesc.Locality + r.mu.RLock() var qpUsed, qpCap int64 @@ -92,6 +98,9 @@ func (r *Replica) Metrics( raftStatus: r.raftSparseStatusRLocked(), leaseStatus: r.leaseStatusAtRLocked(ctx, now), storeID: r.store.StoreID(), + storeAttrs: storeAttrs, + nodeAttrs: nodeAttrs, + nodeLocality: nodeLocality, quiescent: r.mu.quiescent, ticking: ticking, latchMetrics: latchMetrics, @@ -118,6 +127,8 @@ type calcReplicaMetricsInput struct { raftStatus *raftSparseStatus leaseStatus kvserverpb.LeaseStatus storeID roachpb.StoreID + storeAttrs, nodeAttrs roachpb.Attributes + nodeLocality roachpb.Locality quiescent bool ticking bool latchMetrics concurrency.LatchMetrics @@ -130,14 +141,23 @@ type calcReplicaMetricsInput struct { } func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics { - var validLease, validLeaseOwner, livenessLease bool + var validLease, validLeaseOwner, livenessLease, violatingLeasePreferences, lessPreferredLease bool 
var validLeaseType roachpb.LeaseType if d.leaseStatus.IsValid() { validLease = true validLeaseOwner = d.leaseStatus.Lease.OwnedBy(d.storeID) validLeaseType = d.leaseStatus.Lease.Type() - livenessLease = validLeaseOwner && - keys.NodeLivenessSpan.Overlaps(d.desc.RSpan().AsRawSpanWithNoLocals()) + if validLeaseOwner { + livenessLease = keys.NodeLivenessSpan.Overlaps(d.desc.RSpan().AsRawSpanWithNoLocals()) + switch makeLeasePreferenceStatus( + d.leaseStatus, d.storeID, d.storeAttrs, d.nodeAttrs, + d.nodeLocality, d.conf.LeasePreferences) { + case leasePreferencesViolating: + violatingLeasePreferences = true + case leasePreferencesLessPreferred: + lessPreferredLease = true + } + } } rangeCounter, unavailable, underreplicated, overreplicated := calcRangeCounter( @@ -154,18 +174,20 @@ func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics { const raftLogTooLargeMultiple = 4 return ReplicaMetrics{ - Leader: leader, - LeaseValid: validLease, - Leaseholder: validLeaseOwner, - LeaseType: validLeaseType, - LeaseStatus: d.leaseStatus, - LivenessLease: livenessLease, - Quiescent: d.quiescent, - Ticking: d.ticking, - RangeCounter: rangeCounter, - Unavailable: unavailable, - Underreplicated: underreplicated, - Overreplicated: overreplicated, + Leader: leader, + LeaseValid: validLease, + Leaseholder: validLeaseOwner, + LeaseType: validLeaseType, + LeaseStatus: d.leaseStatus, + LivenessLease: livenessLease, + ViolatingLeasePreferences: violatingLeasePreferences, + LessPreferredLease: lessPreferredLease, + Quiescent: d.quiescent, + Ticking: d.ticking, + RangeCounter: rangeCounter, + Unavailable: unavailable, + Underreplicated: underreplicated, + Overreplicated: overreplicated, RaftLogTooLarge: d.raftLogSizeTrusted && d.raftLogSize > raftLogTooLargeMultiple*d.raftCfg.RaftLogTruncationThreshold, BehindCount: leaderBehindCount, diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go index 71208e126083..83d77161ad14 100644 --- 
a/pkg/kv/kvserver/replica_range_lease.go +++ b/pkg/kv/kvserver/replica_range_lease.go @@ -1536,25 +1536,65 @@ func (r *Replica) hasCorrectLeaseTypeRLocked(lease roachpb.Lease) bool { return hasExpirationLease == r.shouldUseExpirationLeaseRLocked() } -// LeaseViolatesPreferences checks if current replica owns the lease and if it -// violates the lease preferences defined in the span config. If there is an -// error or no preferences defined then it will return false and consider that -// to be in-conformance. +// leasePreferencesStatus represents the state of satisfying lease preferences. +type leasePreferencesStatus int + +const ( + // leasePreferencesViolating indicates the leaseholder does not + // satisfy any lease preference applied. + leasePreferencesViolating leasePreferencesStatus = iota + // leasePreferencesLessPreferred indicates the leaseholder satisfies _some_ + // preference, however not the most preferred. + leasePreferencesLessPreferred + // leasePreferencesOK indicates the lease satisfies the first + // preference, or no lease preferences are applied. + leasePreferencesOK +) + +// LeaseViolatesPreferences checks if this replica owns the lease and if it +// violates the lease preferences defined in the span config. If no preferences +// are defined then it will return false and consider it to be in conformance. 
func (r *Replica) LeaseViolatesPreferences(ctx context.Context) bool { - storeDesc, err := r.store.Descriptor(ctx, true /* useCached */) - if err != nil { - log.Infof(ctx, "Unable to load the descriptor %v: cannot check if lease violates preference", err) + storeID := r.store.StoreID() + now := r.Clock().NowAsClockTimestamp() + r.mu.RLock() + leaseStatus := r.leaseStatusAtRLocked(ctx, now) + preferences := r.mu.conf.LeasePreferences + r.mu.RUnlock() + + if !leaseStatus.IsValid() || !leaseStatus.Lease.OwnedBy(storeID) { return false } - conf := r.SpanConfig() - if len(conf.LeasePreferences) == 0 { - return false + + storeAttrs := r.store.Attrs() + nodeAttrs := r.store.nodeDesc.Attrs + nodeLocality := r.store.nodeDesc.Locality + preferenceStatus := makeLeasePreferenceStatus( + leaseStatus, storeID, storeAttrs, nodeAttrs, nodeLocality, preferences) + + return preferenceStatus == leasePreferencesViolating +} + +func makeLeasePreferenceStatus( + leaseStatus kvserverpb.LeaseStatus, + storeID roachpb.StoreID, + storeAttrs, nodeAttrs roachpb.Attributes, + nodeLocality roachpb.Locality, + preferences []roachpb.LeasePreference, +) leasePreferencesStatus { + if !leaseStatus.IsValid() || !leaseStatus.Lease.OwnedBy(storeID) { + return leasePreferencesOK + } + if len(preferences) == 0 { + return leasePreferencesOK } - for _, preference := range conf.LeasePreferences { - if constraint.CheckStoreConjunction(*storeDesc, preference.Constraints) { - return false + for i, preference := range preferences { + if constraint.CheckConjunction(storeAttrs, nodeAttrs, nodeLocality, preference.Constraints) { + if i > 0 { + return leasePreferencesLessPreferred + } + return leasePreferencesOK } } - // We have at lease one preference set up, but we don't satisfy any. 
- return true + return leasePreferencesViolating } diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go index a14054ea19ca..d08dbc8ef370 100644 --- a/pkg/kv/kvserver/store.go +++ b/pkg/kv/kvserver/store.go @@ -2926,22 +2926,24 @@ func (s *Store) RangeFeed( // whenever availability changes. func (s *Store) updateReplicationGauges(ctx context.Context) error { var ( - raftLeaderCount int64 - leaseHolderCount int64 - leaseExpirationCount int64 - leaseEpochCount int64 - leaseLivenessCount int64 - raftLeaderNotLeaseHolderCount int64 - raftLeaderInvalidLeaseCount int64 - quiescentCount int64 - uninitializedCount int64 - averageQueriesPerSecond float64 - averageRequestsPerSecond float64 - averageReadsPerSecond float64 - averageWritesPerSecond float64 - averageReadBytesPerSecond float64 - averageWriteBytesPerSecond float64 - averageCPUNanosPerSecond float64 + raftLeaderCount int64 + leaseHolderCount int64 + leaseExpirationCount int64 + leaseEpochCount int64 + leaseLivenessCount int64 + leaseViolatingPreferencesCount int64 + leaseLessPreferredCount int64 + raftLeaderNotLeaseHolderCount int64 + raftLeaderInvalidLeaseCount int64 + quiescentCount int64 + uninitializedCount int64 + averageQueriesPerSecond float64 + averageRequestsPerSecond float64 + averageReadsPerSecond float64 + averageWritesPerSecond float64 + averageReadBytesPerSecond float64 + averageWriteBytesPerSecond float64 + averageCPUNanosPerSecond float64 rangeCount int64 unavailableRangeCount int64 @@ -3004,6 +3006,13 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error { if metrics.LivenessLease { leaseLivenessCount++ } + // NB: Can't be satisfying a less preferred preference, and also + // satisfying no preferences. 
+ if metrics.ViolatingLeasePreferences { + leaseViolatingPreferencesCount++ + } else if metrics.LessPreferredLease { + leaseLessPreferredCount++ + } } if metrics.Quiescent { quiescentCount++ @@ -3062,6 +3071,8 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error { s.metrics.LeaseHolderCount.Update(leaseHolderCount) s.metrics.LeaseExpirationCount.Update(leaseExpirationCount) s.metrics.LeaseEpochCount.Update(leaseEpochCount) + s.metrics.LeaseViolatingPreferencesCount.Update(leaseViolatingPreferencesCount) + s.metrics.LeaseLessPreferredCount.Update(leaseLessPreferredCount) s.metrics.LeaseLivenessCount.Update(leaseLivenessCount) s.metrics.QuiescentCount.Update(quiescentCount) s.metrics.UninitializedCount.Update(uninitializedCount)