Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

storage: aggregate iterator stats #99726

Merged
merged 2 commits into from
Apr 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 95 additions & 3 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ var (
Unit: metric.Unit_COUNT,
}

// RocksDB/Pebble metrics.
// Pebble metrics.
metaRdbBlockCacheHits = metric.Metadata{
Name: "rocksdb.block.cache.hits",
Help: "Count of block cache hits",
Expand Down Expand Up @@ -616,6 +616,60 @@ Raft applied index at which this checkpoint was taken.`,
Unit: metric.Unit_COUNT,
}

metaBlockBytes = metric.Metadata{
Name: "storage.iterator.block-load.bytes",
Help: "Bytes loaded by storage engine iterators (possibly cached). See storage.AggregatedIteratorStats for details.",
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}
metaBlockBytesInCache = metric.Metadata{
Name: "storage.iterator.block-load.cached-bytes",
Help: "Bytes loaded by storage engine iterators from the block cache. See storage.AggregatedIteratorStats for details.",
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}
metaBlockReadDuration = metric.Metadata{
Name: "storage.iterator.block-load.read-duration",
Help: "Cumulative time storage engine iterators spent loading blocks from durable storage. See storage.AggregatedIteratorStats for details.",
Measurement: "Nanoseconds",
Unit: metric.Unit_NANOSECONDS,
}
metaIterExternalSeeks = metric.Metadata{
Name: "storage.iterator.external.seeks",
Help: "Cumulative count of seeks performed on storage engine iterators. See storage.AggregatedIteratorStats for details.",
Measurement: "Iterator Ops",
Unit: metric.Unit_COUNT,
}
metaIterExternalSteps = metric.Metadata{
Name: "storage.iterator.external.steps",
Help: "Cumulative count of steps performed on storage engine iterators. See storage.AggregatedIteratorStats for details.",
Measurement: "Iterator Ops",
Unit: metric.Unit_COUNT,
}
metaIterInternalSeeks = metric.Metadata{
Name: "storage.iterator.internal.seeks",
Help: `Cumulative count of seeks performed internally within storage engine iterators.

A value high relative to 'storage.iterator.external.seeks'
is a good indication that there's an accumulation of garbage
internally within the storage engine.

See storage.AggregatedIteratorStats for details.`,
Measurement: "Iterator Ops",
Unit: metric.Unit_COUNT,
}
metaIterInternalSteps = metric.Metadata{
Name: "storage.iterator.internal.steps",
Help: `Cumulative count of steps performed internally within storage engine iterators.

A value high relative to 'storage.iterator.external.steps'
is a good indication that there's an accumulation of garbage
internally within the storage engine.

See storage.AggregatedIteratorStats for more details.`,
Measurement: "Iterator Ops",
Unit: metric.Unit_COUNT,
}
metaSharedStorageBytesWritten = metric.Metadata{
Name: "storage.shared-storage.write",
Help: "Bytes written to external storage",
Expand Down Expand Up @@ -1843,9 +1897,17 @@ type StoreMetrics struct {
// before pebble, and this name is kept for backwards compatibility despite
// the backing metrics now originating from pebble.
//
// All of these are cumulative values. They are maintained by pebble and
// All of these are cumulative values. Most are maintained by pebble and
// so we have to expose them as gauges (lest we start tracking deltas from
// the respective last stats we got from pebble).
//
// There's a bit of a semantic mismatch here because the mechanism of
// updating these metrics is a gauge (eg, we're reading the current value,
// not incrementing) but semantically some of them are monotonically
// increasing counters.
//
// TODO(jackson): Reconcile this mismatch so that metrics that are
// semantically counters are exported as such to Prometheus. See #99922.
RdbBlockCacheHits *metric.Gauge
RdbBlockCacheMisses *metric.Gauge
RdbBlockCacheUsage *metric.Gauge
Expand Down Expand Up @@ -1876,6 +1938,13 @@ type StoreMetrics struct {
RdbWriteStallNanos *metric.Gauge
SharedStorageBytesRead *metric.Gauge
SharedStorageBytesWritten *metric.Gauge
IterBlockBytes *metric.Gauge
IterBlockBytesInCache *metric.Gauge
IterBlockReadDuration *metric.Gauge
IterExternalSeeks *metric.Gauge
IterExternalSteps *metric.Gauge
IterInternalSeeks *metric.Gauge
IterInternalSteps *metric.Gauge
FlushableIngestCount *metric.Gauge
FlushableIngestTableCount *metric.Gauge
FlushableIngestTableSize *metric.Gauge
Expand Down Expand Up @@ -2393,7 +2462,16 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
ReadWithinUncertaintyIntervalErrorServerSideRetrySuccess: metric.NewCounter(metaReadWithinUncertaintyIntervalErrorServerSideRetrySuccess),
ReadWithinUncertaintyIntervalErrorServerSideRetryFailure: metric.NewCounter(metaReadWithinUncertaintyIntervalErrorServerSideRetryFailure),

// RocksDB/Pebble metrics.
// Pebble metrics.
//
// These are all gauges today because almost all of them are cumulative
// metrics recorded within Pebble. Internally within Pebble some of
// these are monotonically increasing counters. There's a bit of a
// semantic mismatch here because the mechanism of updating the metric
// is a gauge (eg, we're reading the current value, not incrementing)
// but the meaning of the metric itself is a counter.
// TODO(jackson): Reconcile this mismatch so that metrics that are
// semantically counters are exported as such to Prometheus. See #99922.
RdbBlockCacheHits: metric.NewGauge(metaRdbBlockCacheHits),
RdbBlockCacheMisses: metric.NewGauge(metaRdbBlockCacheMisses),
RdbBlockCacheUsage: metric.NewGauge(metaRdbBlockCacheUsage),
Expand Down Expand Up @@ -2422,6 +2500,13 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
RdbLevelScore: rdbLevelScore,
RdbWriteStalls: metric.NewGauge(metaRdbWriteStalls),
RdbWriteStallNanos: metric.NewGauge(metaRdbWriteStallNanos),
IterBlockBytes: metric.NewGauge(metaBlockBytes),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was expecting them all to be counters. They are all gauges. What's going on?

IterBlockBytesInCache: metric.NewGauge(metaBlockBytesInCache),
IterBlockReadDuration: metric.NewGauge(metaBlockReadDuration),
IterExternalSeeks: metric.NewGauge(metaIterExternalSeeks),
IterExternalSteps: metric.NewGauge(metaIterExternalSteps),
IterInternalSeeks: metric.NewGauge(metaIterInternalSeeks),
IterInternalSteps: metric.NewGauge(metaIterInternalSteps),
SharedStorageBytesRead: metric.NewGauge(metaSharedStorageBytesRead),
SharedStorageBytesWritten: metric.NewGauge(metaSharedStorageBytesWritten),
FlushableIngestCount: metric.NewGauge(metaFlushableIngestCount),
Expand Down Expand Up @@ -2755,6 +2840,13 @@ func (sm *StoreMetrics) updateEngineMetrics(m storage.Metrics) {
sm.RdbWriteStallNanos.Update(m.WriteStallDuration.Nanoseconds())
sm.DiskSlow.Update(m.DiskSlowCount)
sm.DiskStalled.Update(m.DiskStallCount)
sm.IterBlockBytes.Update(int64(m.Iterator.BlockBytes))
sm.IterBlockBytesInCache.Update(int64(m.Iterator.BlockBytesInCache))
sm.IterBlockReadDuration.Update(int64(m.Iterator.BlockReadDuration))
sm.IterExternalSeeks.Update(int64(m.Iterator.ExternalSeeks))
sm.IterExternalSteps.Update(int64(m.Iterator.ExternalSteps))
sm.IterInternalSeeks.Update(int64(m.Iterator.InternalSeeks))
sm.IterInternalSteps.Update(int64(m.Iterator.InternalSteps))
sm.SharedStorageBytesRead.Update(m.SharedStorageReadBytes)
sm.SharedStorageBytesWritten.Update(m.SharedStorageWriteBytes)
sm.RdbL0Sublevels.Update(int64(m.Levels[0].Sublevels))
Expand Down
1 change: 0 additions & 1 deletion pkg/kv/kvserver/spanset/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ go_library(
"//pkg/keys",
"//pkg/roachpb",
"//pkg/storage",
"//pkg/storage/pebbleiter",
"//pkg/util/hlc",
"//pkg/util/log",
"//pkg/util/protoutil",
Expand Down
7 changes: 3 additions & 4 deletions pkg/kv/kvserver/spanset/batch.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/storage/pebbleiter"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
Expand Down Expand Up @@ -421,9 +420,9 @@ func (i *EngineIterator) UnsafeRawEngineKey() []byte {
return i.i.UnsafeRawEngineKey()
}

// GetRawIter is part of the storage.EngineIterator interface.
func (i *EngineIterator) GetRawIter() pebbleiter.Iterator {
return i.i.GetRawIter()
// CloneContext is part of the storage.EngineIterator interface.
func (i *EngineIterator) CloneContext() storage.CloneContext {
return i.i.CloneContext()
}

// Stats is part of the storage.EngineIterator interface.
Expand Down
4 changes: 2 additions & 2 deletions pkg/storage/bench_pebble_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -223,11 +223,11 @@ func BenchmarkMVCCGet_Pebble(b *testing.B) {
ctx := context.Background()
for _, batch := range []bool{false, true} {
b.Run(fmt.Sprintf("batch=%t", batch), func(b *testing.B) {
for _, numVersions := range []int{1, 10, 100} {
for _, numVersions := range []int{10} {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jbowens did you intend to merge this change to the benchmark?

b.Run(fmt.Sprintf("versions=%d", numVersions), func(b *testing.B) {
for _, valueSize := range []int{8} {
b.Run(fmt.Sprintf("valueSize=%d", valueSize), func(b *testing.B) {
for _, numRangeKeys := range []int{0, 1, 100} {
for _, numRangeKeys := range []int{0} {
b.Run(fmt.Sprintf("numRangeKeys=%d", numRangeKeys), func(b *testing.B) {
runMVCCGet(ctx, b, mvccBenchData{
numVersions: numVersions,
Expand Down
74 changes: 62 additions & 12 deletions pkg/storage/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -361,9 +361,9 @@ type EngineIterator interface {
// Value returns the current value as a byte slice.
// REQUIRES: latest positioning function returned valid=true.
Value() ([]byte, error)
// GetRawIter is a low-level method only for use in the storage package,
// that returns the underlying pebble Iterator.
GetRawIter() pebbleiter.Iterator
// CloneContext is a low-level method only for use in the storage package,
// that provides sufficient context that the iterator may be cloned.
CloneContext() CloneContext
// SeekEngineKeyGEWithLimit is similar to SeekEngineKeyGE, but takes an
// additional exclusive upper limit parameter. The limit is semantically
// best-effort, and is an optimization to avoid O(n^2) iteration behavior in
Expand All @@ -388,6 +388,13 @@ type EngineIterator interface {
Stats() IteratorStats
}

// CloneContext is an opaque type encapsulating sufficient context to construct
// a clone of an existing iterator.
type CloneContext struct {
rawIter pebbleiter.Iterator
statsReporter statsReporter
}

// IterOptions contains options used to create an {MVCC,Engine}Iterator.
//
// For performance, every {MVCC,Engine}Iterator must specify either Prefix or
Expand Down Expand Up @@ -462,6 +469,7 @@ type IterOptions struct {
// Range keys themselves are not affected by the masking, and will be
// emitted as normal.
RangeKeyMaskingBelow hlc.Timestamp

// useL6Filters allows the caller to opt into reading filter blocks for
// L6 sstables. Only for use with Prefix = true. Helpful if a lot of prefix
// Seeks are expected in quick succession, that are also likely to not
Expand Down Expand Up @@ -1013,15 +1021,7 @@ type WriteBatch interface {
// *pebble.Metrics struct, which has its own documentation.
type Metrics struct {
*pebble.Metrics
// WriteStallCount counts the number of times Pebble intentionally delayed
// incoming writes. Currently, the only two reasons for this to happen are:
// - "memtable count limit reached"
// - "L0 file count limit exceeded"
//
// We do not split this metric across these two reasons, but they can be
// distinguished in the pebble logs.
WriteStallCount int64
WriteStallDuration time.Duration
Iterator AggregatedIteratorStats
// DiskSlowCount counts the number of times Pebble records disk slowness.
DiskSlowCount int64
// DiskStallCount counts the number of times Pebble observes slow writes
Expand All @@ -1031,6 +1031,56 @@ type Metrics struct {
SharedStorageWriteBytes int64
// SharedStorageReadBytes counts the number of bytes read from shared storage.
SharedStorageReadBytes int64
// WriteStallCount counts the number of times Pebble intentionally delayed
// incoming writes. Currently, the only two reasons for this to happen are:
// - "memtable count limit reached"
// - "L0 file count limit exceeded"
//
// We do not split this metric across these two reasons, but they can be
// distinguished in the pebble logs.
WriteStallCount int64
WriteStallDuration time.Duration
}

// AggregatedIteratorStats holds cumulative stats, collected and summed over all
// of an engine's iterators.
type AggregatedIteratorStats struct {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be helpful to reference this type in the help text for each of the metrics as a pointer for more detailed semantics of what's tracked within each.

// BlockBytes holds the sum of sizes of all loaded blocks. If the block was
// compressed, this is the compressed bytes. This value includes blocks that
// were loaded from the cache, and bytes that needed to be read from
// persistent storage.
//
// Currently, there may be some gaps in coverage. (At the time of writing,
// 2nd-level index blocks are excluded.)
BlockBytes uint64
// BlockBytesInCache holds the subset of BlockBytes that were already in the
// block cache, requiring no I/O.
BlockBytesInCache uint64
// BlockReadDuration accumulates the duration spent fetching blocks due to
// block cache misses.
//
// Currently, there may be some gaps in coverage. (At the time of writing,
// range deletion and range key blocks, meta index blocks and properties
// blocks are all excluded.)
BlockReadDuration time.Duration
// ExternalSeeks is the total count of seeks in forward and backward
// directions performed on pebble.Iterators.
ExternalSeeks int
// ExternalSteps is the total count of relative positioning operations (eg,
// Nexts, Prevs, NextPrefix, NextWithLimit, etc) in forward and backward
// directions performed on pebble.Iterators.
ExternalSteps int
// InternalSeeks is the total count of steps in forward and backward
// directions performed on Pebble's internal iterator. If this is high
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"If this is high..." could you copy this to the metric help text?

// relative to ExternalSeeks, it's a good indication that there's an
// accumulation of garbage within the LSM (NOT MVCC garbage).
InternalSeeks int
// InternalSteps is the total count of relative positioning operations (eg,
// Nexts, Prevs, NextPrefix, etc) in forward and backward directions
// performed on pebble's internal iterator. If this is high relative to
// ExternalSteps, it's a good indication that there's an accumulation of
// garbage within the LSM (NOT MVCC garbage).
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto about "If this is high..."

InternalSteps int
}

// MetricsForInterval is a set of pebble.Metrics that need to be saved in order to
Expand Down
2 changes: 1 addition & 1 deletion pkg/storage/intent_interleaving_iter.go
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ func newIntentInterleavingIterator(reader Reader, opts IterOptions) MVCCIterator
if reader.ConsistentIterators() {
iter = maybeUnwrapUnsafeIter(reader.NewMVCCIterator(MVCCKeyIterKind, opts)).(*pebbleIterator)
} else {
iter = newPebbleIteratorByCloning(intentIter.GetRawIter(), opts, StandardDurability)
iter = newPebbleIteratorByCloning(intentIter.CloneContext(), opts, StandardDurability)
}

*iiIter = intentInterleavingIter{
Expand Down
2 changes: 1 addition & 1 deletion pkg/storage/mvcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -5052,7 +5052,7 @@ func MVCCResolveWriteIntentRange(
mvccIter = rw.NewMVCCIterator(MVCCKeyIterKind, iterOpts)
} else {
// For correctness, we need mvccIter to be consistent with engineIter.
mvccIter = newPebbleIteratorByCloning(engineIter.GetRawIter(), iterOpts, StandardDurability)
mvccIter = newPebbleIteratorByCloning(engineIter.CloneContext(), iterOpts, StandardDurability)
}
iterAndBuf := GetBufUsingIter(mvccIter)
defer func() {
Expand Down
Loading