-
Notifications
You must be signed in to change notification settings - Fork 3.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
105286: kvserver: introduce raft memory tracking r=nvanbenschoten a=pav-kv This PR introduces bytes tracking/limiting for raft entries. At the moment, it only tracks entries returned by raft `Ready()`. This only includes committed entries pulled in order to be applied to the state machine. The lifetime of these entries is limited to `handleRaftReadyRaftMuLocked` duration, so the acquired bytes quota is released at the end of this call. Currently, `Ready()` calls `Storage.Entries()` only to fetch committed entries to be applied. In the future, it will also fetch entries when sending `MsgApp` messages to followers. The tracking will be extended correspondingly. The limit is controlled by `COCKROACH_RAFT_ENTRIES_MEMORY_LIMIT` env variable, which is currently disabled by default. ### Testing The effect has been demonstrated on a test #120886 which creates a large backlog of unapplied committed entries across multiple ranges. When the server is started, it eagerly tries to apply all these entries. See the screenshots of a few metrics during the unconstrained run, and a run with memory limiting. The memory footprint has dropped from 10GB to 6GB on this test, without negatively impacting the throughput (in fact, there is throughput improvement; this can be accidental, but I observed it multiple times). <details> <summary>Before</summary> <img width="1062" alt="Screenshot 2024-03-22 at 13 09 56" src="https://github.com/cockroachdb/cockroach/assets/3757441/09cd14c0-54ab-4585-8983-5562d5a5b49e"> <img width="1050" alt="Screenshot 2024-03-22 at 13 10 10" src="https://github.com/cockroachdb/cockroach/assets/3757441/93d89241-5f51-41d8-9b81-c52fe428f869"> <img width="1061" alt="Screenshot 2024-03-22 at 13 10 22" src="https://github.com/cockroachdb/cockroach/assets/3757441/446e589b-8cef-455e-bd53-75fc98191791"> </details> <details> <summary>After (with COCKROACH_RAFT_ENTRIES_MEMORY_LIMIT=24M)</summary> <img width="1053" alt="Screenshot 2024-03-22 at 13 13 30" src="https://github.com/cockroachdb/cockroach/assets/3757441/9b1e7d67-0d46-48b4-af5c-f1f381b208f1"> <img width="1048" alt="Screenshot 2024-03-22 at 13 13 38" src="https://github.com/cockroachdb/cockroach/assets/3757441/79a91937-46a7-4461-aca2-27ed6387f44c"> <img width="1051" alt="Screenshot 2024-03-22 at 13 13 48" src="https://github.com/cockroachdb/cockroach/assets/3757441/7b02344f-5494-4e3a-9d04-48c6bbedd097"> </details> --- Part of #102840 Release note (performance improvement): This change introduced pacing for pulling raft entries from storage when applying them to the state machine. This helps avoiding OOMs or excessive resource usage when a node catches up on many committed entries, which ultimately affects availability and performance (e.g., tail latencies). New metrics `raft.loaded_entries.bytes` and `raft.loaded_entries.reserved.bytes` gauge the total size of log entries pulled from raft for applying to the state machine, and can be used for figuring out a reasonable soft limit. The soft limit can be set via `COCKROACH_RAFT_ENTRIES_MEMORY_LIMIT` env var which is currently disabled by default. Co-authored-by: Pavel Kalinnikov <[email protected]>
- Loading branch information
Showing
12 changed files
with
395 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,189 @@ | ||
// Copyright 2024 The Cockroach Authors. | ||
// | ||
// Use of this software is governed by the Business Source License | ||
// included in the file licenses/BSL.txt. | ||
// | ||
// As of the Change Date specified in that file, in accordance with | ||
// the Business Source License, use of this software will be governed | ||
// by the Apache License, Version 2.0, included in the file | ||
// licenses/APL.txt. | ||
|
||
package logstore | ||
|
||
import ( | ||
"github.com/cockroachdb/cockroach/pkg/util/envutil" | ||
"github.com/cockroachdb/cockroach/pkg/util/metric" | ||
) | ||
|
||
// raftEntriesMemoryLimit is the "global" soft limit for the total size of raft | ||
// log entries pulled into memory simultaneously. Currently, this only includes | ||
// the entries pulled as part of the local state machine application flow. | ||
// | ||
// No limit if <= 0. | ||
var raftEntriesMemoryLimit = envutil.EnvOrDefaultBytes( | ||
"COCKROACH_RAFT_ENTRIES_MEMORY_LIMIT", 0) | ||
|
||
// NewRaftEntriesSoftLimit returns the SoftLimit configured with the default | ||
// memory limit. | ||
func NewRaftEntriesSoftLimit() *SoftLimit { | ||
reservedBytesMetric := metric.NewGauge(metric.Metadata{ | ||
Name: "raft.loaded_entries.reserved.bytes", | ||
Help: "Bytes allocated by raft Storage.Entries calls that are still kept in memory", | ||
Measurement: "Memory", | ||
Unit: metric.Unit_BYTES, | ||
}) | ||
return &SoftLimit{Metric: reservedBytesMetric, Limit: raftEntriesMemoryLimit} | ||
} | ||
|
||
// The reservation sizes are optimized with the following constraints in mind: | ||
// | ||
// - Per kvserver.defaultRaftSchedulerConcurrency, there are <= 128 raft | ||
// scheduler workers. | ||
// - Per bulk.IngestBatchSize and kvserverbase.MaxCommandSize, raft entries are | ||
// up to 16 MB in size. Entries are typically small, but can reach this | ||
// limit, e.g. if these are AddSSTable commands or large batch INSERTs. | ||
// Typically, compression reduces the max size to < 8MB. | ||
const ( | ||
// minReserveSize is the granularity at which bytes are reserved from the | ||
// SoftLimit for small allocations (below smallReserveSize). Batching small | ||
// allocations this way amortizes the runtime cost of SoftLimit which is | ||
// typically shared by many goroutines. | ||
minReserveSize = 256 << 10 | ||
// smallReserveSize is the threshold below which the size is considered | ||
// "small", and causes reserving in multiples of minReserveSize at a time. | ||
smallReserveSize = 4 << 20 | ||
) | ||
|
||
// SoftLimit is a byte size limit with weak guarantees. It tracks the global | ||
// usage in a metric, and gives the user a hint when the usage has reached the | ||
// soft limit. | ||
// | ||
// When at the limit, the user is supposed to back off acquiring resources, but | ||
// is not strictly required to. There can be overflows when the user acquires | ||
// resources optimistically. It is recommended to make sure that the possible | ||
// overflows are bounded (e.g. there is a bounded number of workers, and each | ||
// can optimistically acquire a bounded amount), and to account for these | ||
// overflows when picking the soft limit. | ||
// | ||
// TODO(pav-kv): integrate with mon.BytesMonitor. | ||
type SoftLimit struct { | ||
Metric *metric.Gauge // the "global" usage metric | ||
Limit int64 // the soft limit | ||
} | ||
|
||
// acquire accepts the given number of bytes for tracking. | ||
func (s *SoftLimit) acquire(x uint64) { | ||
s.Metric.Inc(int64(x)) | ||
} | ||
|
||
// withinLimit returns true iff the bytes usage hasn't reached the soft limit. | ||
func (s *SoftLimit) withinLimit() bool { | ||
return s.Limit <= 0 || s.Metric.Value() < s.Limit | ||
} | ||
|
||
// release removes the given number of bytes from tracking. | ||
func (s *SoftLimit) release(x uint64) { | ||
s.Metric.Dec(int64(x)) | ||
} | ||
|
||
// BytesAccount acquires bytes from SoftLimit, and releases them at the end of | ||
// the lifetime. | ||
type BytesAccount struct { | ||
lim *SoftLimit | ||
metric *metric.Gauge // the "local" usage metric | ||
used uint64 | ||
reserved uint64 // reserved bytes are not used yet but are accounted for in lim | ||
} | ||
|
||
// NewAccount creates a BytesAccount consuming from this SoftLimit. | ||
func (s *SoftLimit) NewAccount(metric *metric.Gauge) BytesAccount { | ||
return BytesAccount{lim: s, metric: metric} | ||
} | ||
|
||
// Initialized returns true iff this BytesAccount is usable. | ||
func (b *BytesAccount) Initialized() bool { | ||
return b != nil && b.lim != nil | ||
} | ||
|
||
// Grow optimistically acquires and accounts for the given number of bytes. | ||
// Returns false when this leads to the SoftLimit overflow, in which case the | ||
// user should back off acquiring any more resources. | ||
func (b *BytesAccount) Grow(x uint64) (withinLimit bool) { | ||
if !b.Initialized() { | ||
return true | ||
} | ||
withinLimit = true | ||
if x > b.reserved { | ||
need := roundSize(x - b.reserved) | ||
b.lim.acquire(need) | ||
b.reserved += need | ||
if withinLimit = b.lim.withinLimit(); !withinLimit && b.reserved > x { | ||
// If we reached the soft limit, drain the remainder of the reserved bytes | ||
// to the limiter. We will not use it - the client typically stops calling | ||
// Grow after this. | ||
b.lim.release(b.reserved - x) | ||
b.reserved = x | ||
} | ||
} | ||
b.reserved -= x | ||
b.used += x | ||
if b.metric != nil { | ||
b.metric.Inc(int64(x)) | ||
} | ||
return withinLimit | ||
} | ||
|
||
// Clear returns all the reserved bytes into the SoftLimit. | ||
func (b *BytesAccount) Clear() { | ||
if !b.Initialized() { | ||
return | ||
} | ||
if b.metric != nil { | ||
b.metric.Dec(int64(b.used)) | ||
} | ||
b.lim.release(b.used + b.reserved) | ||
b.used, b.reserved = 0, 0 | ||
} | ||
|
||
func roundSize(size uint64) uint64 { | ||
if size >= smallReserveSize { | ||
// Don't round the size up if the allocation is large. This also avoids edge | ||
// cases in the math below if size == math.MaxInt64. | ||
return size | ||
} | ||
return (size + minReserveSize - 1) / minReserveSize * minReserveSize | ||
} | ||
|
||
// sizeHelper helps to build a batch of entries with the total size not | ||
// exceeding the static and dynamic byte limits. The user calls the add() method | ||
// until sizeHelper.done becomes true. For all add() calls that returned true, | ||
// the corresponding data can be added to the batch. | ||
// | ||
// In some exceptional cases, the size of the batch can exceed the limits: | ||
// - maxBytes can be exceeded by the first added entry, | ||
// - the soft limit can be exceeded by the last added entry. | ||
type sizeHelper struct { | ||
bytes uint64 | ||
maxBytes uint64 | ||
account *BytesAccount | ||
done bool | ||
} | ||
|
||
// add returns true if the given number of bytes can be added to the batch | ||
// without exceeding the maxBytes limit, or overflowing the bytes account. The | ||
// first add call always returns true. | ||
// | ||
// Must not be called after sizeHelper.done becomes true. | ||
func (s *sizeHelper) add(bytes uint64) bool { | ||
if s.bytes == 0 { // this is the first entry, always take it | ||
s.bytes += bytes | ||
s.done = !s.account.Grow(bytes) || s.bytes > s.maxBytes | ||
return true | ||
} else if s.bytes+bytes > s.maxBytes { | ||
s.done = true | ||
return false | ||
} | ||
s.bytes += bytes | ||
s.done = !s.account.Grow(bytes) | ||
return true | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
// Copyright 2024 The Cockroach Authors. | ||
// | ||
// Use of this software is governed by the Business Source License | ||
// included in the file licenses/BSL.txt. | ||
// | ||
// As of the Change Date specified in that file, in accordance with | ||
// the Business Source License, use of this software will be governed | ||
// by the Apache License, Version 2.0, included in the file | ||
// licenses/APL.txt. | ||
|
||
package logstore | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/util/metric" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func TestSoftLimiter(t *testing.T) { | ||
lim := SoftLimit{Metric: metric.NewGauge(metric.Metadata{}), Limit: 1000} | ||
require.Zero(t, lim.Metric.Value()) | ||
lim.acquire(100) | ||
require.Equal(t, int64(100), lim.Metric.Value()) | ||
require.True(t, lim.withinLimit()) | ||
lim.acquire(900) | ||
require.Equal(t, int64(1000), lim.Metric.Value()) | ||
require.False(t, lim.withinLimit()) | ||
lim.release(100) | ||
require.Equal(t, int64(900), lim.Metric.Value()) | ||
require.True(t, lim.withinLimit()) | ||
lim.acquire(10000) | ||
require.Equal(t, int64(10900), lim.Metric.Value()) | ||
require.False(t, lim.withinLimit()) | ||
lim.release(900) | ||
require.Equal(t, int64(10000), lim.Metric.Value()) | ||
require.False(t, lim.withinLimit()) | ||
lim.release(10000) | ||
require.Zero(t, lim.Metric.Value()) | ||
require.True(t, lim.withinLimit()) | ||
|
||
lim.Limit = 0 // no limit | ||
lim.acquire(100) | ||
require.True(t, lim.withinLimit()) | ||
lim.acquire(1000000) | ||
require.True(t, lim.withinLimit()) | ||
} | ||
|
||
func TestBytesAccount(t *testing.T) { | ||
lim := SoftLimit{Metric: metric.NewGauge(metric.Metadata{}), Limit: 1 << 20} | ||
a1, a2 := lim.NewAccount(nil), lim.NewAccount(nil) | ||
require.True(t, a1.Grow(256<<10)) | ||
|
||
require.True(t, a2.Grow(128<<10)) | ||
require.True(t, a2.Grow(256<<10)) | ||
require.False(t, a2.Grow(512<<10)) | ||
require.Equal(t, uint64(512+256+128)<<10, a2.used) | ||
require.Zero(t, a2.reserved) | ||
// a2 returns all the reserved bytes to the limiter. | ||
require.Equal(t, int64(a1.used+a1.reserved+a2.used), lim.Metric.Value()) | ||
|
||
require.False(t, a1.Grow(10)) | ||
require.Equal(t, uint64(10+256<<10), a1.used) | ||
require.Zero(t, a1.reserved) | ||
// a1 returns all the reserved bytes to the limiter. | ||
require.Equal(t, int64(a1.used+a2.used), lim.Metric.Value()) | ||
|
||
require.False(t, lim.withinLimit()) | ||
a2.Clear() | ||
require.True(t, lim.withinLimit()) | ||
require.True(t, a1.Grow(1000)) | ||
a1.Clear() | ||
|
||
require.Zero(t, lim.Metric.Value()) | ||
} | ||
|
||
func TestSizeHelper(t *testing.T) { | ||
lim := SoftLimit{Metric: metric.NewGauge(metric.Metadata{}), Limit: 1 << 20} | ||
for _, tt := range []struct { | ||
max uint64 | ||
sizes []uint64 | ||
take int | ||
}{ | ||
// Limits are not reached. | ||
{max: 1023, sizes: []uint64{10}, take: 1}, | ||
{max: 100, sizes: []uint64{10, 10, 20, 10}, take: 4}, | ||
// Max size limit is reached. | ||
{max: 100, sizes: []uint64{30, 30, 30, 30}, take: 3}, | ||
{max: 100, sizes: []uint64{100, 1}, take: 1}, | ||
{max: 100, sizes: []uint64{200, 1}, take: 1}, | ||
{max: 1000, sizes: []uint64{100, 900, 100}, take: 2}, | ||
{max: 1000, sizes: []uint64{100, 500, 10000}, take: 2}, | ||
{max: 1000, sizes: []uint64{100, 1000}, take: 1}, | ||
// Soft limiter kicks in. | ||
{max: 64 << 20, sizes: []uint64{8 << 20}, take: 1}, | ||
{max: 64 << 20, sizes: []uint64{4 << 20, 4 << 20}, take: 1}, | ||
{max: 64 << 20, sizes: []uint64{1 << 20, 1 << 20, 1 << 20}, take: 1}, | ||
{max: 64 << 20, sizes: []uint64{100, 1 << 20, 1 << 20}, take: 2}, | ||
{max: 64 << 20, sizes: []uint64{100, 1000, 1234, 1 << 20, 1 << 20}, take: 4}, | ||
} { | ||
t.Run("", func(t *testing.T) { | ||
acc := lim.NewAccount(nil) | ||
defer acc.Clear() | ||
sh := sizeHelper{maxBytes: tt.max, account: &acc} | ||
|
||
took := 0 | ||
for ln := len(tt.sizes); took < ln && !sh.done; took++ { | ||
if size := tt.sizes[took]; !sh.add(size) { | ||
require.True(t, sh.done) | ||
break | ||
} | ||
} | ||
require.Equal(t, tt.take, took) | ||
|
||
var want uint64 | ||
for _, size := range tt.sizes[:tt.take] { | ||
want += size | ||
} | ||
require.Equal(t, want, sh.bytes) | ||
}) | ||
} | ||
} |
Oops, something went wrong.