Skip to content

Commit

Permalink
storage: aggregate iterator stats
Browse files Browse the repository at this point in the history
Aggregate the iterator stats across all of an engine's iterators. Expose seven
new timeseries metrics for visibility into the behavior of storage engine iterators:

  - storage.iterator.block-load.bytes
  - storage.iterator.block-load.cached-bytes
  - storage.iterator.block-load.read-duration
  - storage.iterator.external.seeks
  - storage.iterator.external.steps
  - storage.iterator.internal.seeks
  - storage.iterator.internal.steps

Close #95790.
Epic: None
Release note (ops change): Introduces seven new timeseries metrics for better
visibility into the behavior of storage engine iterators and their internals.
  • Loading branch information
jbowens committed Apr 2, 2023
1 parent efb9314 commit e1254a8
Show file tree
Hide file tree
Showing 11 changed files with 276 additions and 52 deletions.
98 changes: 95 additions & 3 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ var (
Unit: metric.Unit_COUNT,
}

// RocksDB/Pebble metrics.
// Pebble metrics.
metaRdbBlockCacheHits = metric.Metadata{
Name: "rocksdb.block.cache.hits",
Help: "Count of block cache hits",
Expand Down Expand Up @@ -616,6 +616,60 @@ Raft applied index at which this checkpoint was taken.`,
Unit: metric.Unit_COUNT,
}

metaBlockBytes = metric.Metadata{
Name: "storage.iterator.block-load.bytes",
Help: "Bytes loaded by storage engine iterators (possibly cached). See storage.AggregatedIteratorStats for details.",
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}
metaBlockBytesInCache = metric.Metadata{
Name: "storage.iterator.block-load.cached-bytes",
Help: "Bytes loaded by storage engine iterators from the block cache. See storage.AggregatedIteratorStats for details.",
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}
metaBlockReadDuration = metric.Metadata{
Name: "storage.iterator.block-load.read-duration",
Help: "Cumulative time storage engine iterators spent loading blocks from durable storage. See storage.AggregatedIteratorStats for details.",
Measurement: "Nanoseconds",
Unit: metric.Unit_NANOSECONDS,
}
metaIterExternalSeeks = metric.Metadata{
Name: "storage.iterator.external.seeks",
Help: "Cumulative count of seeks performed on storage engine iterators. See storage.AggregatedIteratorStats for details.",
Measurement: "Iterator Ops",
Unit: metric.Unit_COUNT,
}
metaIterExternalSteps = metric.Metadata{
Name: "storage.iterator.external.steps",
Help: "Cumulative count of steps performed on storage engine iterators. See storage.AggregatedIteratorStats for details.",
Measurement: "Iterator Ops",
Unit: metric.Unit_COUNT,
}
metaIterInternalSeeks = metric.Metadata{
Name: "storage.iterator.internal.seeks",
Help: `Cumulative count of seeks performed internally within storage engine iterators.
A value high relative to 'storage.iterator.external.seeks'
is a good indication that there's an accumulation of garbage
internally within the storage engine.
See storage.AggregatedIteratorStats for details.`,
Measurement: "Iterator Ops",
Unit: metric.Unit_COUNT,
}
metaIterInternalSteps = metric.Metadata{
Name: "storage.iterator.internal.steps",
Help: `Cumulative count of steps performed internally within storage engine iterators.
A value high relative to 'storage.iterator.external.steps'
is a good indication that there's an accumulation of garbage
internally within the storage engine.
See storage.AggregatedIteratorStats for more details.`,
Measurement: "Iterator Ops",
Unit: metric.Unit_COUNT,
}
metaSharedStorageBytesWritten = metric.Metadata{
Name: "storage.shared-storage.write",
Help: "Bytes written to external storage",
Expand Down Expand Up @@ -1843,9 +1897,17 @@ type StoreMetrics struct {
// before pebble, and this name is kept for backwards compatibility despite
// the backing metrics now originating from pebble.
//
// All of these are cumulative values. They are maintained by pebble and
// All of these are cumulative values. Most are maintained by pebble and
// so we have to expose them as gauges (lest we start tracking deltas from
// the respective last stats we got from pebble).
//
// There's a bit of a semantic mismatch here because the mechanism of
// updating these metrics is a gauge (eg, we're reading the current value,
// not incrementing) but semantically some of them are monotonically
// increasing counters.
//
// TODO(jackson): Reconcile this mismatch so that metrics that are
// semantically counters are exported as such to Prometheus. See #99922.
RdbBlockCacheHits *metric.Gauge
RdbBlockCacheMisses *metric.Gauge
RdbBlockCacheUsage *metric.Gauge
Expand Down Expand Up @@ -1876,6 +1938,13 @@ type StoreMetrics struct {
RdbWriteStallNanos *metric.Gauge
SharedStorageBytesRead *metric.Gauge
SharedStorageBytesWritten *metric.Gauge
IterBlockBytes *metric.Gauge
IterBlockBytesInCache *metric.Gauge
IterBlockReadDuration *metric.Gauge
IterExternalSeeks *metric.Gauge
IterExternalSteps *metric.Gauge
IterInternalSeeks *metric.Gauge
IterInternalSteps *metric.Gauge
FlushableIngestCount *metric.Gauge
FlushableIngestTableCount *metric.Gauge
FlushableIngestTableSize *metric.Gauge
Expand Down Expand Up @@ -2393,7 +2462,16 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
ReadWithinUncertaintyIntervalErrorServerSideRetrySuccess: metric.NewCounter(metaReadWithinUncertaintyIntervalErrorServerSideRetrySuccess),
ReadWithinUncertaintyIntervalErrorServerSideRetryFailure: metric.NewCounter(metaReadWithinUncertaintyIntervalErrorServerSideRetryFailure),

// RocksDB/Pebble metrics.
// Pebble metrics.
//
// These are all gauges today because almost all of them are cumulative
// metrics recorded within Pebble. Internally within Pebble some of
// these are monotonically increasing counters. There's a bit of a
// semantic mismatch here because the mechanism of updating the metric
// is a gauge (eg, we're reading the current value, not incrementing)
// but the meaning of the metric itself is a counter.
// TODO(jackson): Reconcile this mismatch so that metrics that are
// semantically counters are exported as such to Prometheus. See #99922.
RdbBlockCacheHits: metric.NewGauge(metaRdbBlockCacheHits),
RdbBlockCacheMisses: metric.NewGauge(metaRdbBlockCacheMisses),
RdbBlockCacheUsage: metric.NewGauge(metaRdbBlockCacheUsage),
Expand Down Expand Up @@ -2422,6 +2500,13 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
RdbLevelScore: rdbLevelScore,
RdbWriteStalls: metric.NewGauge(metaRdbWriteStalls),
RdbWriteStallNanos: metric.NewGauge(metaRdbWriteStallNanos),
IterBlockBytes: metric.NewGauge(metaBlockBytes),
IterBlockBytesInCache: metric.NewGauge(metaBlockBytesInCache),
IterBlockReadDuration: metric.NewGauge(metaBlockReadDuration),
IterExternalSeeks: metric.NewGauge(metaIterExternalSeeks),
IterExternalSteps: metric.NewGauge(metaIterExternalSteps),
IterInternalSeeks: metric.NewGauge(metaIterInternalSeeks),
IterInternalSteps: metric.NewGauge(metaIterInternalSteps),
SharedStorageBytesRead: metric.NewGauge(metaSharedStorageBytesRead),
SharedStorageBytesWritten: metric.NewGauge(metaSharedStorageBytesWritten),
FlushableIngestCount: metric.NewGauge(metaFlushableIngestCount),
Expand Down Expand Up @@ -2755,6 +2840,13 @@ func (sm *StoreMetrics) updateEngineMetrics(m storage.Metrics) {
sm.RdbWriteStallNanos.Update(m.WriteStallDuration.Nanoseconds())
sm.DiskSlow.Update(m.DiskSlowCount)
sm.DiskStalled.Update(m.DiskStallCount)
sm.IterBlockBytes.Update(int64(m.Iterator.BlockBytes))
sm.IterBlockBytesInCache.Update(int64(m.Iterator.BlockBytesInCache))
sm.IterBlockReadDuration.Update(int64(m.Iterator.BlockReadDuration))
sm.IterExternalSeeks.Update(int64(m.Iterator.ExternalSeeks))
sm.IterExternalSteps.Update(int64(m.Iterator.ExternalSteps))
sm.IterInternalSeeks.Update(int64(m.Iterator.InternalSeeks))
sm.IterInternalSteps.Update(int64(m.Iterator.InternalSteps))
sm.SharedStorageBytesRead.Update(m.SharedStorageReadBytes)
sm.SharedStorageBytesWritten.Update(m.SharedStorageWriteBytes)
sm.RdbL0Sublevels.Update(int64(m.Levels[0].Sublevels))
Expand Down
1 change: 0 additions & 1 deletion pkg/kv/kvserver/spanset/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ go_library(
"//pkg/keys",
"//pkg/roachpb",
"//pkg/storage",
"//pkg/storage/pebbleiter",
"//pkg/util/hlc",
"//pkg/util/log",
"//pkg/util/protoutil",
Expand Down
4 changes: 2 additions & 2 deletions pkg/storage/bench_pebble_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -223,11 +223,11 @@ func BenchmarkMVCCGet_Pebble(b *testing.B) {
ctx := context.Background()
for _, batch := range []bool{false, true} {
b.Run(fmt.Sprintf("batch=%t", batch), func(b *testing.B) {
for _, numVersions := range []int{1, 10, 100} {
for _, numVersions := range []int{10} {
b.Run(fmt.Sprintf("versions=%d", numVersions), func(b *testing.B) {
for _, valueSize := range []int{8} {
b.Run(fmt.Sprintf("valueSize=%d", valueSize), func(b *testing.B) {
for _, numRangeKeys := range []int{0, 1, 100} {
for _, numRangeKeys := range []int{0} {
b.Run(fmt.Sprintf("numRangeKeys=%d", numRangeKeys), func(b *testing.B) {
runMVCCGet(ctx, b, mvccBenchData{
numVersions: numVersions,
Expand Down
64 changes: 54 additions & 10 deletions pkg/storage/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,8 @@ type EngineIterator interface {
// CloneContext is an opaque type encapsulating sufficient context to construct
// a clone of an existing iterator.
type CloneContext struct {
rawIter pebbleiter.Iterator
rawIter pebbleiter.Iterator
statsReporter statsReporter
}

// IterOptions contains options used to create an {MVCC,Engine}Iterator.
Expand Down Expand Up @@ -468,6 +469,7 @@ type IterOptions struct {
// Range keys themselves are not affected by the masking, and will be
// emitted as normal.
RangeKeyMaskingBelow hlc.Timestamp

// useL6Filters allows the caller to opt into reading filter blocks for
// L6 sstables. Only for use with Prefix = true. Helpful if a lot of prefix
// Seeks are expected in quick succession, that are also likely to not
Expand Down Expand Up @@ -1019,15 +1021,7 @@ type WriteBatch interface {
// *pebble.Metrics struct, which has its own documentation.
type Metrics struct {
*pebble.Metrics
// WriteStallCount counts the number of times Pebble intentionally delayed
// incoming writes. Currently, the only two reasons for this to happen are:
// - "memtable count limit reached"
// - "L0 file count limit exceeded"
//
// We do not split this metric across these two reasons, but they can be
// distinguished in the pebble logs.
WriteStallCount int64
WriteStallDuration time.Duration
Iterator AggregatedIteratorStats
// DiskSlowCount counts the number of times Pebble records disk slowness.
DiskSlowCount int64
// DiskStallCount counts the number of times Pebble observes slow writes
Expand All @@ -1037,6 +1031,56 @@ type Metrics struct {
SharedStorageWriteBytes int64
// SharedStorageReadBytes counts the number of bytes read from shared storage.
SharedStorageReadBytes int64
// WriteStallCount counts the number of times Pebble intentionally delayed
// incoming writes. Currently, the only two reasons for this to happen are:
// - "memtable count limit reached"
// - "L0 file count limit exceeded"
//
// We do not split this metric across these two reasons, but they can be
// distinguished in the pebble logs.
WriteStallCount int64
WriteStallDuration time.Duration
}

// AggregatedIteratorStats holds cumulative stats, collected and summed over all
// of an engine's iterators.
type AggregatedIteratorStats struct {
// BlockBytes holds the sum of sizes of all loaded blocks. If the block was
// compressed, this is the compressed bytes. This value includes blocks that
// were loaded from the cache, and bytes that needed to be read from
// persistent storage.
//
// Currently, there may be some gaps in coverage. (At the time of writing,
// 2nd-level index blocks are excluded.)
BlockBytes uint64
// BlockBytesInCache holds the subset of BlockBytes that were already in the
// block cache, requiring no I/O.
BlockBytesInCache uint64
// BlockReadDuration accumulates the duration spent fetching blocks due to
// block cache misses.
//
// Currently, there may be some gaps in coverage. (At the time of writing,
// range deletion and range key blocks, meta index blocks and properties
// blocks are all excluded.)
BlockReadDuration time.Duration
// ExternalSeeks is the total count of seeks in forward and backward
// directions performed on pebble.Iterators.
ExternalSeeks int
// ExternalSteps is the total count of relative positioning operations (eg,
// Nexts, Prevs, NextPrefix, NextWithLimit, etc) in forward and backward
// directions performed on pebble.Iterators.
ExternalSteps int
// InternalSeeks is the total count of steps in forward and backward
// directions performed on Pebble's internal iterator. If this is high
// relative to ExternalSeeks, it's a good indication that there's an
// accumulation of garbage within the LSM (NOT MVCC garbage).
InternalSeeks int
// InternalSteps is the total count of relative positioning operations (eg,
// Nexts, Prevs, NextPrefix, etc) in forward and backward directions
// performed on pebble's internal iterator. If this is high relative to
// ExternalSteps, it's a good indication that there's an accumulation of
// garbage within the LSM (NOT MVCC garbage).
InternalSteps int
}

// MetricsForInterval is a set of pebble.Metrics that need to be saved in order to
Expand Down
3 changes: 1 addition & 2 deletions pkg/storage/intent_interleaving_iter.go
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,7 @@ func newIntentInterleavingIterator(reader Reader, opts IterOptions) MVCCIterator
if reader.ConsistentIterators() {
iter = maybeUnwrapUnsafeIter(reader.NewMVCCIterator(MVCCKeyIterKind, opts)).(*pebbleIterator)
} else {
cloneCtx := intentIter.CloneContext()
iter = newPebbleIteratorByCloning(cloneCtx.rawIter, opts, StandardDurability)
iter = newPebbleIteratorByCloning(intentIter.CloneContext(), opts, StandardDurability)
}

*iiIter = intentInterleavingIter{
Expand Down
3 changes: 1 addition & 2 deletions pkg/storage/mvcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -5052,8 +5052,7 @@ func MVCCResolveWriteIntentRange(
mvccIter = rw.NewMVCCIterator(MVCCKeyIterKind, iterOpts)
} else {
// For correctness, we need mvccIter to be consistent with engineIter.
cloneCtx := engineIter.CloneContext()
mvccIter = newPebbleIteratorByCloning(cloneCtx.rawIter, iterOpts, StandardDurability)
mvccIter = newPebbleIteratorByCloning(engineIter.CloneContext(), iterOpts, StandardDurability)
}
iterAndBuf := GetBufUsingIter(mvccIter)
defer func() {
Expand Down
Loading

0 comments on commit e1254a8

Please sign in to comment.