Skip to content

Commit

Permalink
kvserver: add lease preference metrics
Browse files Browse the repository at this point in the history
There were no existing metrics to monitor lease preference
conformance. This commit adds two metric gauges:
`leases.preferences.violating` and `leases.preferences.less-preferred`.
These metrics are reported by the store.

`leases.preferences.violating` indicates the number of valid leases a
store owns, which satisfy none of the preferences applied.

`leases.preferences.less-preferred` indicates the number of valid leases
a store owns, which satisfy some of the preferences applied, but not the
first one.

For example, with a lease preference `'[[+zone=a],[+zone=b]]'`, the
metric values with different leaseholders are:

```
leaseholder_locality="zone=c"
leases.preferences.less-preferred: 0
leases.preferences.violating: 1
```

```
leaseholder_locality="zone=b"
leases.preferences.less-preferred: 1
leases.preferences.violating: 0
```

```
leaseholder_locality="zone=a"
leases.preferences.less-preferred: 0
leases.preferences.violating: 0
```

When no preferences are applied, the lease is not counted in either
metric.

Epic: none

Informs: #106100

Release note (ops change): Introduce two new metrics to monitor lease
preference conformance. `leases.preferences.violating` indicates
the number of valid leases a store owns that satisfy none of the
applied preferences. `leases.preferences.less-preferred` indicates the
number of valid leases a store owns that satisfy some of the applied
preferences, but not the first one.
  • Loading branch information
kvoli committed Jul 24, 2023
1 parent f748f00 commit 8ef4e36
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 65 deletions.
43 changes: 30 additions & 13 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,19 @@ var (
Measurement: "Replicas",
Unit: metric.Unit_COUNT,
}
metaLeaseViolatingPreferencesCount = metric.Metadata{
Name: "leases.preferences.violating",
Help: "Number of replica leaseholders which violate lease preferences",
Measurement: "Replicas",
Unit: metric.Unit_COUNT,
}
metaLeaseLessPreferredCount = metric.Metadata{
Name: "leases.preferences.less-preferred",
Help: "Number of replica leaseholders which satisfy a lease " +
"preference which is not the most preferred",
Measurement: "Replicas",
Unit: metric.Unit_COUNT,
}

// Storage metrics.
metaLiveBytes = metric.Metadata{
Expand Down Expand Up @@ -2174,14 +2187,16 @@ type StoreMetrics struct {
// Lease request metrics for successful and failed lease requests. These
// count proposals (i.e. it does not matter how many replicas apply the
// lease).
LeaseRequestSuccessCount *metric.Counter
LeaseRequestErrorCount *metric.Counter
LeaseRequestLatency metric.IHistogram
LeaseTransferSuccessCount *metric.Counter
LeaseTransferErrorCount *metric.Counter
LeaseExpirationCount *metric.Gauge
LeaseEpochCount *metric.Gauge
LeaseLivenessCount *metric.Gauge
LeaseRequestSuccessCount *metric.Counter
LeaseRequestErrorCount *metric.Counter
LeaseRequestLatency metric.IHistogram
LeaseTransferSuccessCount *metric.Counter
LeaseTransferErrorCount *metric.Counter
LeaseExpirationCount *metric.Gauge
LeaseEpochCount *metric.Gauge
LeaseLivenessCount *metric.Gauge
LeaseViolatingPreferencesCount *metric.Gauge
LeaseLessPreferredCount *metric.Gauge

// Storage metrics.
ResolveCommitCount *metric.Counter
Expand Down Expand Up @@ -2828,11 +2843,13 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
Duration: histogramWindow,
Buckets: metric.IOLatencyBuckets,
}),
LeaseTransferSuccessCount: metric.NewCounter(metaLeaseTransferSuccessCount),
LeaseTransferErrorCount: metric.NewCounter(metaLeaseTransferErrorCount),
LeaseExpirationCount: metric.NewGauge(metaLeaseExpirationCount),
LeaseEpochCount: metric.NewGauge(metaLeaseEpochCount),
LeaseLivenessCount: metric.NewGauge(metaLeaseLivenessCount),
LeaseTransferSuccessCount: metric.NewCounter(metaLeaseTransferSuccessCount),
LeaseTransferErrorCount: metric.NewCounter(metaLeaseTransferErrorCount),
LeaseExpirationCount: metric.NewGauge(metaLeaseExpirationCount),
LeaseEpochCount: metric.NewGauge(metaLeaseEpochCount),
LeaseLivenessCount: metric.NewGauge(metaLeaseLivenessCount),
LeaseViolatingPreferencesCount: metric.NewGauge(metaLeaseViolatingPreferencesCount),
LeaseLessPreferredCount: metric.NewGauge(metaLeaseLessPreferredCount),

// Intent resolution metrics.
ResolveCommitCount: metric.NewCounter(metaResolveCommit),
Expand Down
64 changes: 43 additions & 21 deletions pkg/kv/kvserver/replica_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,14 @@ import (

// ReplicaMetrics contains details on the current status of the replica.
type ReplicaMetrics struct {
Leader bool
LeaseValid bool
Leaseholder bool
LeaseType roachpb.LeaseType
LeaseStatus kvserverpb.LeaseStatus
LivenessLease bool
Leader bool
LeaseValid bool
Leaseholder bool
LeaseType roachpb.LeaseType
LeaseStatus kvserverpb.LeaseStatus
LivenessLease bool
ViolatingLeasePreferences bool
LessPreferredLease bool

// Quiescent indicates whether the replica believes itself to be quiesced.
Quiescent bool
Expand Down Expand Up @@ -74,6 +76,10 @@ func (r *Replica) Metrics(
latchMetrics := r.concMgr.LatchMetrics()
lockTableMetrics := r.concMgr.LockTableMetrics()

storeAttrs := r.store.Attrs()
nodeAttrs := r.store.nodeDesc.Attrs
nodeLocality := r.store.nodeDesc.Locality

r.mu.RLock()

var qpUsed, qpCap int64
Expand All @@ -92,6 +98,9 @@ func (r *Replica) Metrics(
raftStatus: r.raftSparseStatusRLocked(),
leaseStatus: r.leaseStatusAtRLocked(ctx, now),
storeID: r.store.StoreID(),
storeAttrs: storeAttrs,
nodeAttrs: nodeAttrs,
nodeLocality: nodeLocality,
quiescent: r.mu.quiescent,
ticking: ticking,
latchMetrics: latchMetrics,
Expand All @@ -118,6 +127,8 @@ type calcReplicaMetricsInput struct {
raftStatus *raftSparseStatus
leaseStatus kvserverpb.LeaseStatus
storeID roachpb.StoreID
storeAttrs, nodeAttrs roachpb.Attributes
nodeLocality roachpb.Locality
quiescent bool
ticking bool
latchMetrics concurrency.LatchMetrics
Expand All @@ -130,14 +141,23 @@ type calcReplicaMetricsInput struct {
}

func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics {
var validLease, validLeaseOwner, livenessLease bool
var validLease, validLeaseOwner, livenessLease, violatingLeasePreferences, lessPreferredLease bool
var validLeaseType roachpb.LeaseType
if d.leaseStatus.IsValid() {
validLease = true
validLeaseOwner = d.leaseStatus.Lease.OwnedBy(d.storeID)
validLeaseType = d.leaseStatus.Lease.Type()
livenessLease = validLeaseOwner &&
keys.NodeLivenessSpan.Overlaps(d.desc.RSpan().AsRawSpanWithNoLocals())
if validLeaseOwner {
livenessLease = keys.NodeLivenessSpan.Overlaps(d.desc.RSpan().AsRawSpanWithNoLocals())
switch makeLeasePreferenceStatus(
d.leaseStatus, d.storeID, d.storeAttrs, d.nodeAttrs,
d.nodeLocality, d.conf.LeasePreferences) {
case leasePreferencesViolating:
violatingLeasePreferences = true
case leasePreferencesLessPreferred:
lessPreferredLease = true
}
}
}

rangeCounter, unavailable, underreplicated, overreplicated := calcRangeCounter(
Expand All @@ -154,18 +174,20 @@ func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics {

const raftLogTooLargeMultiple = 4
return ReplicaMetrics{
Leader: leader,
LeaseValid: validLease,
Leaseholder: validLeaseOwner,
LeaseType: validLeaseType,
LeaseStatus: d.leaseStatus,
LivenessLease: livenessLease,
Quiescent: d.quiescent,
Ticking: d.ticking,
RangeCounter: rangeCounter,
Unavailable: unavailable,
Underreplicated: underreplicated,
Overreplicated: overreplicated,
Leader: leader,
LeaseValid: validLease,
Leaseholder: validLeaseOwner,
LeaseType: validLeaseType,
LeaseStatus: d.leaseStatus,
LivenessLease: livenessLease,
ViolatingLeasePreferences: violatingLeasePreferences,
LessPreferredLease: lessPreferredLease,
Quiescent: d.quiescent,
Ticking: d.ticking,
RangeCounter: rangeCounter,
Unavailable: unavailable,
Underreplicated: underreplicated,
Overreplicated: overreplicated,
RaftLogTooLarge: d.raftLogSizeTrusted &&
d.raftLogSize > raftLogTooLargeMultiple*d.raftCfg.RaftLogTruncationThreshold,
BehindCount: leaderBehindCount,
Expand Down
70 changes: 55 additions & 15 deletions pkg/kv/kvserver/replica_range_lease.go
Original file line number Diff line number Diff line change
Expand Up @@ -1536,25 +1536,65 @@ func (r *Replica) hasCorrectLeaseTypeRLocked(lease roachpb.Lease) bool {
return hasExpirationLease == r.shouldUseExpirationLeaseRLocked()
}

// LeaseViolatesPreferences checks if current replica owns the lease and if it
// violates the lease preferences defined in the span config. If there is an
// error or no preferences defined then it will return false and consider that
// to be in-conformance.
// leasePreferencesStatus describes how well a leaseholder conforms to the
// lease preferences applied to it.
type leasePreferencesStatus int

const (
	// leasePreferencesViolating means the leaseholder satisfies none of the
	// applied lease preferences. NB: this is the zero value of the type.
	leasePreferencesViolating leasePreferencesStatus = iota
	// leasePreferencesLessPreferred means the leaseholder satisfies at least
	// one applied preference, but not the most preferred one.
	leasePreferencesLessPreferred
	// leasePreferencesOK means the lease satisfies the first preference, or
	// that no lease preferences are applied at all.
	leasePreferencesOK
)

// LeaseViolatesPreferences checks if this replica owns the lease and if it
// violates the lease preferences defined in the span config. If no preferences
// are defined then it will return false and consider it to be in conformance.
func (r *Replica) LeaseViolatesPreferences(ctx context.Context) bool {
storeDesc, err := r.store.Descriptor(ctx, true /* useCached */)
if err != nil {
log.Infof(ctx, "Unable to load the descriptor %v: cannot check if lease violates preference", err)
storeID := r.store.StoreID()
now := r.Clock().NowAsClockTimestamp()
r.mu.RLock()
leaseStatus := r.leaseStatusAtRLocked(ctx, now)
preferences := r.mu.conf.LeasePreferences
r.mu.RUnlock()

if !leaseStatus.IsValid() || !leaseStatus.Lease.OwnedBy(storeID) {
return false
}
conf := r.SpanConfig()
if len(conf.LeasePreferences) == 0 {
return false

storeAttrs := r.store.Attrs()
nodeAttrs := r.store.nodeDesc.Attrs
nodeLocality := r.store.nodeDesc.Locality
preferenceStatus := makeLeasePreferenceStatus(
leaseStatus, storeID, storeAttrs, nodeAttrs, nodeLocality, preferences)

return preferenceStatus == leasePreferencesViolating
}

func makeLeasePreferenceStatus(
leaseStatus kvserverpb.LeaseStatus,
storeID roachpb.StoreID,
storeAttrs, nodeAttrs roachpb.Attributes,
nodeLocality roachpb.Locality,
preferences []roachpb.LeasePreference,
) leasePreferencesStatus {
if !leaseStatus.IsValid() || !leaseStatus.Lease.OwnedBy(storeID) {
return leasePreferencesOK
}
if len(preferences) == 0 {
return leasePreferencesOK
}
for _, preference := range conf.LeasePreferences {
if constraint.CheckStoreConjunction(*storeDesc, preference.Constraints) {
return false
for i, preference := range preferences {
if constraint.CheckConjunction(storeAttrs, nodeAttrs, nodeLocality, preference.Constraints) {
if i > 0 {
return leasePreferencesLessPreferred
}
return leasePreferencesOK
}
}
// We have at least one preference set up, but we don't satisfy any.
return true
return leasePreferencesViolating
}
43 changes: 27 additions & 16 deletions pkg/kv/kvserver/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -2926,22 +2926,24 @@ func (s *Store) RangeFeed(
// whenever availability changes.
func (s *Store) updateReplicationGauges(ctx context.Context) error {
var (
raftLeaderCount int64
leaseHolderCount int64
leaseExpirationCount int64
leaseEpochCount int64
leaseLivenessCount int64
raftLeaderNotLeaseHolderCount int64
raftLeaderInvalidLeaseCount int64
quiescentCount int64
uninitializedCount int64
averageQueriesPerSecond float64
averageRequestsPerSecond float64
averageReadsPerSecond float64
averageWritesPerSecond float64
averageReadBytesPerSecond float64
averageWriteBytesPerSecond float64
averageCPUNanosPerSecond float64
raftLeaderCount int64
leaseHolderCount int64
leaseExpirationCount int64
leaseEpochCount int64
leaseLivenessCount int64
leaseViolatingPreferencesCount int64
leaseLessPreferredCount int64
raftLeaderNotLeaseHolderCount int64
raftLeaderInvalidLeaseCount int64
quiescentCount int64
uninitializedCount int64
averageQueriesPerSecond float64
averageRequestsPerSecond float64
averageReadsPerSecond float64
averageWritesPerSecond float64
averageReadBytesPerSecond float64
averageWriteBytesPerSecond float64
averageCPUNanosPerSecond float64

rangeCount int64
unavailableRangeCount int64
Expand Down Expand Up @@ -3004,6 +3006,13 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
if metrics.LivenessLease {
leaseLivenessCount++
}
// NB: Can't be satisfying a less preferred preference, and also
// satisfying no preferences.
if metrics.ViolatingLeasePreferences {
leaseViolatingPreferencesCount++
} else if metrics.LessPreferredLease {
leaseLessPreferredCount++
}
}
if metrics.Quiescent {
quiescentCount++
Expand Down Expand Up @@ -3062,6 +3071,8 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
s.metrics.LeaseHolderCount.Update(leaseHolderCount)
s.metrics.LeaseExpirationCount.Update(leaseExpirationCount)
s.metrics.LeaseEpochCount.Update(leaseEpochCount)
s.metrics.LeaseViolatingPreferencesCount.Update(leaseViolatingPreferencesCount)
s.metrics.LeaseLessPreferredCount.Update(leaseLessPreferredCount)
s.metrics.LeaseLivenessCount.Update(leaseLivenessCount)
s.metrics.QuiescentCount.Update(quiescentCount)
s.metrics.UninitializedCount.Update(uninitializedCount)
Expand Down

0 comments on commit 8ef4e36

Please sign in to comment.