Merge #99726
99726: storage: aggregate iterator stats r=jbowens a=jbowens

Aggregate the iterator stats across all of an engine's iterators. Expose seven
new timeseries metrics for visibility into the behavior of storage engine iterators:

  - storage.iterator.block-load.bytes
  - storage.iterator.block-load.cached-bytes
  - storage.iterator.block-load.read-duration
  - storage.iterator.external.seeks
  - storage.iterator.external.steps
  - storage.iterator.internal.seeks
  - storage.iterator.internal.steps

Close #95790.
Epic: None
Release note (ops change): Introduces seven new timeseries metrics for better
visibility into the behavior of storage engine iterators and their internals.

Co-authored-by: Jackson Owens <[email protected]>
craig[bot] and jbowens committed Apr 3, 2023
2 parents 1e27c65 + e1254a8 commit bebd2dd
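
The seven metrics above surface the fields of the new storage.AggregatedIteratorStats struct (added in pkg/storage/engine.go below). As a rough illustration of how the counters relate, here is a small, self-contained Go sketch; the struct fields are copied from this change, while the sample values and the ratio helper are purely illustrative:

```go
package main

import (
	"fmt"
	"time"
)

// AggregatedIteratorStats mirrors the fields of the struct this change adds to
// pkg/storage/engine.go.
type AggregatedIteratorStats struct {
	BlockBytes        uint64
	BlockBytesInCache uint64
	BlockReadDuration time.Duration
	ExternalSeeks     int
	ExternalSteps     int
	InternalSeeks     int
	InternalSteps     int
}

// ratio divides num by den, treating a zero denominator as 1 to avoid NaNs.
func ratio(num, den float64) float64 {
	if den == 0 {
		den = 1
	}
	return num / den
}

func main() {
	// Illustrative sample values; in practice these come from the new
	// storage.iterator.* timeseries.
	s := AggregatedIteratorStats{
		BlockBytes:        64 << 20,
		BlockBytesInCache: 48 << 20,
		BlockReadDuration: 1200 * time.Millisecond,
		ExternalSeeks:     10_000,
		ExternalSteps:     250_000,
		InternalSeeks:     42_000,
		InternalSteps:     900_000,
	}
	// Per the metric help text, internal seek/step counts that are high
	// relative to their external counterparts suggest garbage accumulating
	// inside the LSM (not MVCC garbage).
	fmt.Printf("internal/external seeks: %.2f\n", ratio(float64(s.InternalSeeks), float64(s.ExternalSeeks)))
	fmt.Printf("internal/external steps: %.2f\n", ratio(float64(s.InternalSteps), float64(s.ExternalSteps)))
	fmt.Printf("block cache hit (bytes): %.0f%%\n", 100*ratio(float64(s.BlockBytesInCache), float64(s.BlockBytes)))
	fmt.Printf("time loading blocks:     %s\n", s.BlockReadDuration)
}
```

The same ratios can of course be computed directly from the timeseries in a dashboard; the sketch only shows how the seven counters fit together.
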
Showing 12 changed files with 289 additions and 58 deletions.
98 changes: 95 additions & 3 deletions pkg/kv/kvserver/metrics.go
@@ -416,7 +416,7 @@ var (
Unit: metric.Unit_COUNT,
}

// RocksDB/Pebble metrics.
// Pebble metrics.
metaRdbBlockCacheHits = metric.Metadata{
Name: "rocksdb.block.cache.hits",
Help: "Count of block cache hits",
@@ -616,6 +616,60 @@ Raft applied index at which this checkpoint was taken.`,
Unit: metric.Unit_COUNT,
}

metaBlockBytes = metric.Metadata{
Name: "storage.iterator.block-load.bytes",
Help: "Bytes loaded by storage engine iterators (possibly cached). See storage.AggregatedIteratorStats for details.",
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}
metaBlockBytesInCache = metric.Metadata{
Name: "storage.iterator.block-load.cached-bytes",
Help: "Bytes loaded by storage engine iterators from the block cache. See storage.AggregatedIteratorStats for details.",
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}
metaBlockReadDuration = metric.Metadata{
Name: "storage.iterator.block-load.read-duration",
Help: "Cumulative time storage engine iterators spent loading blocks from durable storage. See storage.AggregatedIteratorStats for details.",
Measurement: "Nanoseconds",
Unit: metric.Unit_NANOSECONDS,
}
metaIterExternalSeeks = metric.Metadata{
Name: "storage.iterator.external.seeks",
Help: "Cumulative count of seeks performed on storage engine iterators. See storage.AggregatedIteratorStats for details.",
Measurement: "Iterator Ops",
Unit: metric.Unit_COUNT,
}
metaIterExternalSteps = metric.Metadata{
Name: "storage.iterator.external.steps",
Help: "Cumulative count of steps performed on storage engine iterators. See storage.AggregatedIteratorStats for details.",
Measurement: "Iterator Ops",
Unit: metric.Unit_COUNT,
}
metaIterInternalSeeks = metric.Metadata{
Name: "storage.iterator.internal.seeks",
Help: `Cumulative count of seeks performed internally within storage engine iterators.
A value high relative to 'storage.iterator.external.seeks'
is a good indication that there's an accumulation of garbage
internally within the storage engine.
See storage.AggregatedIteratorStats for details.`,
Measurement: "Iterator Ops",
Unit: metric.Unit_COUNT,
}
metaIterInternalSteps = metric.Metadata{
Name: "storage.iterator.internal.steps",
Help: `Cumulative count of steps performed internally within storage engine iterators.
A value high relative to 'storage.iterator.external.steps'
is a good indication that there's an accumulation of garbage
internally within the storage engine.
See storage.AggregatedIteratorStats for more details.`,
Measurement: "Iterator Ops",
Unit: metric.Unit_COUNT,
}
metaSharedStorageBytesWritten = metric.Metadata{
Name: "storage.shared-storage.write",
Help: "Bytes written to external storage",
@@ -1843,9 +1897,17 @@ type StoreMetrics struct {
// before pebble, and this name is kept for backwards compatibility despite
// the backing metrics now originating from pebble.
//
// All of these are cumulative values. They are maintained by pebble and
// All of these are cumulative values. Most are maintained by pebble and
// so we have to expose them as gauges (lest we start tracking deltas from
// the respective last stats we got from pebble).
//
// There's a bit of a semantic mismatch here because the mechanism of
// updating these metrics is a gauge (eg, we're reading the current value,
// not incrementing) but semantically some of them are monotonically
// increasing counters.
//
// TODO(jackson): Reconcile this mismatch so that metrics that are
// semantically counters are exported as such to Prometheus. See #99922.
RdbBlockCacheHits *metric.Gauge
RdbBlockCacheMisses *metric.Gauge
RdbBlockCacheUsage *metric.Gauge
@@ -1876,6 +1938,13 @@ type StoreMetrics struct {
RdbWriteStallNanos *metric.Gauge
SharedStorageBytesRead *metric.Gauge
SharedStorageBytesWritten *metric.Gauge
IterBlockBytes *metric.Gauge
IterBlockBytesInCache *metric.Gauge
IterBlockReadDuration *metric.Gauge
IterExternalSeeks *metric.Gauge
IterExternalSteps *metric.Gauge
IterInternalSeeks *metric.Gauge
IterInternalSteps *metric.Gauge
FlushableIngestCount *metric.Gauge
FlushableIngestTableCount *metric.Gauge
FlushableIngestTableSize *metric.Gauge
@@ -2393,7 +2462,16 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
ReadWithinUncertaintyIntervalErrorServerSideRetrySuccess: metric.NewCounter(metaReadWithinUncertaintyIntervalErrorServerSideRetrySuccess),
ReadWithinUncertaintyIntervalErrorServerSideRetryFailure: metric.NewCounter(metaReadWithinUncertaintyIntervalErrorServerSideRetryFailure),

// RocksDB/Pebble metrics.
// Pebble metrics.
//
// These are all gauges today because almost all of them are cumulative
// metrics recorded within Pebble. Internally within Pebble some of
// these are monotonically increasing counters. There's a bit of a
// semantic mismatch here because the mechanism of updating the metric
// is a gauge (eg, we're reading the current value, not incrementing)
// but the meaning of the metric itself is a counter.
// TODO(jackson): Reconcile this mismatch so that metrics that are
// semantically counters are exported as such to Prometheus. See #99922.
RdbBlockCacheHits: metric.NewGauge(metaRdbBlockCacheHits),
RdbBlockCacheMisses: metric.NewGauge(metaRdbBlockCacheMisses),
RdbBlockCacheUsage: metric.NewGauge(metaRdbBlockCacheUsage),
@@ -2422,6 +2500,13 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
RdbLevelScore: rdbLevelScore,
RdbWriteStalls: metric.NewGauge(metaRdbWriteStalls),
RdbWriteStallNanos: metric.NewGauge(metaRdbWriteStallNanos),
IterBlockBytes: metric.NewGauge(metaBlockBytes),
IterBlockBytesInCache: metric.NewGauge(metaBlockBytesInCache),
IterBlockReadDuration: metric.NewGauge(metaBlockReadDuration),
IterExternalSeeks: metric.NewGauge(metaIterExternalSeeks),
IterExternalSteps: metric.NewGauge(metaIterExternalSteps),
IterInternalSeeks: metric.NewGauge(metaIterInternalSeeks),
IterInternalSteps: metric.NewGauge(metaIterInternalSteps),
SharedStorageBytesRead: metric.NewGauge(metaSharedStorageBytesRead),
SharedStorageBytesWritten: metric.NewGauge(metaSharedStorageBytesWritten),
FlushableIngestCount: metric.NewGauge(metaFlushableIngestCount),
@@ -2755,6 +2840,13 @@ func (sm *StoreMetrics) updateEngineMetrics(m storage.Metrics) {
sm.RdbWriteStallNanos.Update(m.WriteStallDuration.Nanoseconds())
sm.DiskSlow.Update(m.DiskSlowCount)
sm.DiskStalled.Update(m.DiskStallCount)
sm.IterBlockBytes.Update(int64(m.Iterator.BlockBytes))
sm.IterBlockBytesInCache.Update(int64(m.Iterator.BlockBytesInCache))
sm.IterBlockReadDuration.Update(int64(m.Iterator.BlockReadDuration))
sm.IterExternalSeeks.Update(int64(m.Iterator.ExternalSeeks))
sm.IterExternalSteps.Update(int64(m.Iterator.ExternalSteps))
sm.IterInternalSeeks.Update(int64(m.Iterator.InternalSeeks))
sm.IterInternalSteps.Update(int64(m.Iterator.InternalSteps))
sm.SharedStorageBytesRead.Update(m.SharedStorageReadBytes)
sm.SharedStorageBytesWritten.Update(m.SharedStorageWriteBytes)
sm.RdbL0Sublevels.Update(int64(m.Levels[0].Sublevels))
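
The comments in this file stress that these values are cumulative totals maintained by Pebble, so the store overwrites gauge values instead of incrementing counters. The sketch below illustrates that update pattern; the polling loop, interval, and the `read` callback are hypothetical stand-ins rather than the store's actual plumbing, which calls updateEngineMetrics from its existing metrics-collection path:

```go
package kvserver

import (
	"context"
	"time"

	"github.com/cockroachdb/cockroach/pkg/storage"
)

// pollEngineMetrics is an illustrative sketch of the gauge-from-cumulative
// pattern described in the comments above: periodically read the engine's
// cumulative metrics and overwrite each gauge with the latest total, rather
// than tracking deltas between readings.
func pollEngineMetrics(
	ctx context.Context, sm *StoreMetrics, read func() storage.Metrics, every time.Duration,
) {
	t := time.NewTicker(every)
	defer t.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-t.C:
			m := read() // cumulative values maintained by Pebble
			// Each gauge is overwritten with the new total via Update, e.g.
			// sm.IterBlockBytes.Update(int64(m.Iterator.BlockBytes)).
			sm.updateEngineMetrics(m)
		}
	}
}
```
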
1 change: 0 additions & 1 deletion pkg/kv/kvserver/spanset/BUILD.bazel
@@ -14,7 +14,6 @@ go_library(
"//pkg/keys",
"//pkg/roachpb",
"//pkg/storage",
"//pkg/storage/pebbleiter",
"//pkg/util/hlc",
"//pkg/util/log",
"//pkg/util/protoutil",
7 changes: 3 additions & 4 deletions pkg/kv/kvserver/spanset/batch.go
@@ -18,7 +18,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/storage/pebbleiter"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
@@ -421,9 +420,9 @@ func (i *EngineIterator) UnsafeRawEngineKey() []byte {
return i.i.UnsafeRawEngineKey()
}

// GetRawIter is part of the storage.EngineIterator interface.
func (i *EngineIterator) GetRawIter() pebbleiter.Iterator {
return i.i.GetRawIter()
// CloneContext is part of the storage.EngineIterator interface.
func (i *EngineIterator) CloneContext() storage.CloneContext {
return i.i.CloneContext()
}

// Stats is part of the storage.EngineIterator interface.
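
The GetRawIter-to-CloneContext swap above is mechanical for wrappers like spanset's EngineIterator: they simply forward the call. The motivation, judging from the CloneContext type added in pkg/storage/engine.go further down, is that cloning now needs more than the raw pebble iterator — the context also carries a stats reporter, presumably so that iterators cloned from a wrapped iterator keep feeding the engine-wide aggregate. A hypothetical wrapper outside this change would follow the same forwarding pattern:

```go
package wrappersketch

import "github.com/cockroachdb/cockroach/pkg/storage"

// tracingEngineIterator is a hypothetical wrapper (not part of this change)
// showing the delegation pattern: an EngineIterator wrapper forwards
// CloneContext to the wrapped iterator, since only the underlying iterator
// holds the state needed to build a consistent clone.
type tracingEngineIterator struct {
	i storage.EngineIterator
	// ... wrapper-specific state would live here ...
}

// CloneContext is part of the storage.EngineIterator interface.
func (t *tracingEngineIterator) CloneContext() storage.CloneContext {
	return t.i.CloneContext()
}
```
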
4 changes: 2 additions & 2 deletions pkg/storage/bench_pebble_test.go
@@ -223,11 +223,11 @@ func BenchmarkMVCCGet_Pebble(b *testing.B) {
ctx := context.Background()
for _, batch := range []bool{false, true} {
b.Run(fmt.Sprintf("batch=%t", batch), func(b *testing.B) {
for _, numVersions := range []int{1, 10, 100} {
for _, numVersions := range []int{10} {
b.Run(fmt.Sprintf("versions=%d", numVersions), func(b *testing.B) {
for _, valueSize := range []int{8} {
b.Run(fmt.Sprintf("valueSize=%d", valueSize), func(b *testing.B) {
for _, numRangeKeys := range []int{0, 1, 100} {
for _, numRangeKeys := range []int{0} {
b.Run(fmt.Sprintf("numRangeKeys=%d", numRangeKeys), func(b *testing.B) {
runMVCCGet(ctx, b, mvccBenchData{
numVersions: numVersions,
74 changes: 62 additions & 12 deletions pkg/storage/engine.go
@@ -361,9 +361,9 @@ type EngineIterator interface {
// Value returns the current value as a byte slice.
// REQUIRES: latest positioning function returned valid=true.
Value() ([]byte, error)
// GetRawIter is a low-level method only for use in the storage package,
// that returns the underlying pebble Iterator.
GetRawIter() pebbleiter.Iterator
// CloneContext is a low-level method only for use in the storage package,
// that provides sufficient context that the iterator may be cloned.
CloneContext() CloneContext
// SeekEngineKeyGEWithLimit is similar to SeekEngineKeyGE, but takes an
// additional exclusive upper limit parameter. The limit is semantically
// best-effort, and is an optimization to avoid O(n^2) iteration behavior in
@@ -388,6 +388,13 @@
Stats() IteratorStats
}

// CloneContext is an opaque type encapsulating sufficient context to construct
// a clone of an existing iterator.
type CloneContext struct {
rawIter pebbleiter.Iterator
statsReporter statsReporter
}

// IterOptions contains options used to create an {MVCC,Engine}Iterator.
//
// For performance, every {MVCC,Engine}Iterator must specify either Prefix or
@@ -462,6 +469,7 @@ type IterOptions struct {
// Range keys themselves are not affected by the masking, and will be
// emitted as normal.
RangeKeyMaskingBelow hlc.Timestamp

// useL6Filters allows the caller to opt into reading filter blocks for
// L6 sstables. Only for use with Prefix = true. Helpful if a lot of prefix
// Seeks are expected in quick succession, that are also likely to not
@@ -1013,15 +1021,7 @@ type WriteBatch interface {
// *pebble.Metrics struct, which has its own documentation.
type Metrics struct {
*pebble.Metrics
// WriteStallCount counts the number of times Pebble intentionally delayed
// incoming writes. Currently, the only two reasons for this to happen are:
// - "memtable count limit reached"
// - "L0 file count limit exceeded"
//
// We do not split this metric across these two reasons, but they can be
// distinguished in the pebble logs.
WriteStallCount int64
WriteStallDuration time.Duration
Iterator AggregatedIteratorStats
// DiskSlowCount counts the number of times Pebble records disk slowness.
DiskSlowCount int64
// DiskStallCount counts the number of times Pebble observes slow writes
@@ -1031,6 +1031,56 @@
SharedStorageWriteBytes int64
// SharedStorageReadBytes counts the number of bytes read from shared storage.
SharedStorageReadBytes int64
// WriteStallCount counts the number of times Pebble intentionally delayed
// incoming writes. Currently, the only two reasons for this to happen are:
// - "memtable count limit reached"
// - "L0 file count limit exceeded"
//
// We do not split this metric across these two reasons, but they can be
// distinguished in the pebble logs.
WriteStallCount int64
WriteStallDuration time.Duration
}

// AggregatedIteratorStats holds cumulative stats, collected and summed over all
// of an engine's iterators.
type AggregatedIteratorStats struct {
// BlockBytes holds the sum of sizes of all loaded blocks. If the block was
// compressed, this is the compressed bytes. This value includes blocks that
// were loaded from the cache, and bytes that needed to be read from
// persistent storage.
//
// Currently, there may be some gaps in coverage. (At the time of writing,
// 2nd-level index blocks are excluded.)
BlockBytes uint64
// BlockBytesInCache holds the subset of BlockBytes that were already in the
// block cache, requiring no I/O.
BlockBytesInCache uint64
// BlockReadDuration accumulates the duration spent fetching blocks due to
// block cache misses.
//
// Currently, there may be some gaps in coverage. (At the time of writing,
// range deletion and range key blocks, meta index blocks and properties
// blocks are all excluded.)
BlockReadDuration time.Duration
// ExternalSeeks is the total count of seeks in forward and backward
// directions performed on pebble.Iterators.
ExternalSeeks int
// ExternalSteps is the total count of relative positioning operations (eg,
// Nexts, Prevs, NextPrefix, NextWithLimit, etc) in forward and backward
// directions performed on pebble.Iterators.
ExternalSteps int
// InternalSeeks is the total count of steps in forward and backward
// directions performed on Pebble's internal iterator. If this is high
// relative to ExternalSeeks, it's a good indication that there's an
// accumulation of garbage within the LSM (NOT MVCC garbage).
InternalSeeks int
// InternalSteps is the total count of relative positioning operations (eg,
// Nexts, Prevs, NextPrefix, etc) in forward and backward directions
// performed on pebble's internal iterator. If this is high relative to
// ExternalSteps, it's a good indication that there's an accumulation of
// garbage within the LSM (NOT MVCC garbage).
InternalSteps int
}

// MetricsForInterval is a set of pebble.Metrics that need to be saved in order to
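
The AggregatedIteratorStats comments describe engine-wide cumulative totals summed over every iterator. The actual accumulation lives in the Pebble engine implementation, which is not shown in this excerpt; the sketch below is a hypothetical rendering of the idea — each closing iterator folds its lifetime totals into a mutex-protected aggregate, a snapshot of which is what Metrics.Iterator reports:

```go
package storage

import "sync"

// iterStatsAggregator is a hypothetical stand-in (names and layout are
// illustrative, not the real implementation) for the engine-side sink that
// iterators report into when they close.
type iterStatsAggregator struct {
	mu  sync.Mutex
	agg AggregatedIteratorStats // the struct defined in engine.go above
}

// report folds one iterator's lifetime totals into the engine-wide aggregate.
func (a *iterStatsAggregator) report(s AggregatedIteratorStats) {
	a.mu.Lock()
	defer a.mu.Unlock()
	a.agg.BlockBytes += s.BlockBytes
	a.agg.BlockBytesInCache += s.BlockBytesInCache
	a.agg.BlockReadDuration += s.BlockReadDuration
	a.agg.ExternalSeeks += s.ExternalSeeks
	a.agg.ExternalSteps += s.ExternalSteps
	a.agg.InternalSeeks += s.InternalSeeks
	a.agg.InternalSteps += s.InternalSteps
}

// snapshot returns the cumulative totals, e.g. to populate Metrics.Iterator.
func (a *iterStatsAggregator) snapshot() AggregatedIteratorStats {
	a.mu.Lock()
	defer a.mu.Unlock()
	return a.agg
}
```
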
2 changes: 1 addition & 1 deletion pkg/storage/intent_interleaving_iter.go
@@ -267,7 +267,7 @@ func newIntentInterleavingIterator(reader Reader, opts IterOptions) MVCCIterator
if reader.ConsistentIterators() {
iter = maybeUnwrapUnsafeIter(reader.NewMVCCIterator(MVCCKeyIterKind, opts)).(*pebbleIterator)
} else {
iter = newPebbleIteratorByCloning(intentIter.GetRawIter(), opts, StandardDurability)
iter = newPebbleIteratorByCloning(intentIter.CloneContext(), opts, StandardDurability)
}

*iiIter = intentInterleavingIter{
2 changes: 1 addition & 1 deletion pkg/storage/mvcc.go
@@ -5052,7 +5052,7 @@ func MVCCResolveWriteIntentRange(
mvccIter = rw.NewMVCCIterator(MVCCKeyIterKind, iterOpts)
} else {
// For correctness, we need mvccIter to be consistent with engineIter.
mvccIter = newPebbleIteratorByCloning(engineIter.GetRawIter(), iterOpts, StandardDurability)
mvccIter = newPebbleIteratorByCloning(engineIter.CloneContext(), iterOpts, StandardDurability)
}
iterAndBuf := GetBufUsingIter(mvccIter)
defer func() {