kv: integrate raft async storage writes

Fixes cockroachdb#17500.
Waiting on github.com/cockroachdb/pebble/pull/2117.

This commit integrates with the `AsyncStorageWrites` functionality that
we added to Raft in github.com/etcd-io/raft/pull/8.

## Approach

The commit makes the minimal changes needed to integrate with async
storage writes and pull fsyncs out of the raft state machine loop. It
does not make an effort to extract the non-durable portion of raft log
writes or raft log application onto separate goroutine pools, as was
described in cockroachdb#17500. Those changes will also be impactful, but they're
non-trivial and bump into a pipelining vs. batching trade-off, so they
are left as future work items (TODO(nvanbenschoten): open new issues).

With this change, asynchronous Raft log syncs are enabled by the new
`DB.ApplyNoSyncWait` Pebble API introduced in github.com/cockroachdb/pebble/pull/2117.
The `handleRaftReady` state machine loop continues to initiate Raft log
writes, but it uses the Pebble API to offload waiting on durability to a
separate goroutine. This separate goroutine then sends the corresponding
`MsgStorageAppend`'s response messages where they need to go (locally
and/or to the Raft leader) when the fsync completes. The async storage
writes functionality in Raft makes this all safe.
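
Sketched below (in Go) is the general shape of that hand-off, assuming a
hypothetical `syncBatch` interface, `fakeBatch`, and `syncWaiterLoop` type;
they illustrate the pattern but are not the actual CockroachDB or Pebble APIs.
In the real commit this role is played by `LogStore.StoreEntries` and the new
`SyncWaiterLoop` in `pkg/kv/kvserver/logstore`.

```go
// Minimal sketch of the non-blocking log sync pattern: commit the batch
// without waiting for the fsync, then hand the batch to a dedicated waiter
// goroutine that blocks on durability and runs a callback (e.g. delivering
// MsgStorageAppend responses). All types here are illustrative stand-ins.
package main

import (
	"fmt"
	"time"
)

// syncBatch is the subset of a storage batch this pattern needs.
type syncBatch interface {
	CommitNoSyncWait() error // apply to the engine and initiate the fsync
	SyncWait() error         // block until the write is durable
	Close()
}

// fakeBatch stands in for a real batch; a timer simulates the fsync.
type fakeBatch struct{ done chan struct{} }

func (b *fakeBatch) CommitNoSyncWait() error {
	go func() { time.Sleep(2 * time.Millisecond); close(b.done) }()
	return nil
}
func (b *fakeBatch) SyncWait() error { <-b.done; return nil }
func (b *fakeBatch) Close()          {}

// waitItem pairs an in-flight batch with the callback to run once durable.
type waitItem struct {
	batch  syncBatch
	onSync func()
}

// syncWaiterLoop waits on log syncs off the raft state machine goroutine.
type syncWaiterLoop struct{ q chan waitItem }

func newSyncWaiterLoop() *syncWaiterLoop {
	w := &syncWaiterLoop{q: make(chan waitItem, 512)}
	go func() {
		for it := range w.q {
			_ = it.batch.SyncWait() // block here, not in handleRaftReady
			it.onSync()             // e.g. send MsgStorageAppend responses
			it.batch.Close()        // the waiter owns the batch once enqueued
		}
	}()
	return w
}

// enqueue hands ownership of the batch to the waiter loop.
func (w *syncWaiterLoop) enqueue(b syncBatch, onSync func()) {
	w.q <- waitItem{batch: b, onSync: onSync}
}

func main() {
	waiter := newSyncWaiterLoop()
	b := &fakeBatch{done: make(chan struct{})}
	if err := b.CommitNoSyncWait(); err != nil { // write visible, not yet durable
		panic(err)
	}
	synced := make(chan struct{})
	waiter.enqueue(b, func() {
		fmt.Println("log write durable; responses can be delivered")
		close(synced)
	})
	<-synced // demo only; the raft loop itself would not wait here
}
```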

## Benchmark Results

This change reduces interference between Raft proposals and, as a result,
reduces end-to-end commit latency.

github.com/etcd-io/raft/pull/8 presented a collection of benchmark
results captured from integrating async storage writes with rafttoy.

When integrated into CockroachDB, we see similar improvements to average
and tail latency. However, the change does not provide the same throughput
improvements at the top end, because log appends and state machine
application have not yet been extracted onto separate goroutine pools,
which would create more opportunity for batching.

TODO: add images

----

Release note (performance improvement): The Raft proposal pipeline
has been optimized to reduce interference between Raft proposals.
This improves average and tail write latency at high concurrency.
nvanbenschoten committed Jan 6, 2023
1 parent 063f37e commit 7f77d1a
Showing 10 changed files with 630 additions and 356 deletions.
5 changes: 5 additions & 0 deletions pkg/kv/kvserver/logstore/BUILD.bazel
@@ -8,6 +8,7 @@ go_library(
"sideload.go",
"sideload_disk.go",
"stateloader.go",
"sync_waiter.go",
],
importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvserver/logstore",
visibility = ["//visibility:public"],
@@ -28,9 +29,11 @@ go_library(
"//pkg/util/log",
"//pkg/util/metric",
"//pkg/util/protoutil",
"//pkg/util/stop",
"//pkg/util/timeutil",
"@com_github_cockroachdb_errors//:errors",
"@com_github_cockroachdb_errors//oserror",
"@com_github_cockroachdb_pebble//record",
"@com_github_cockroachdb_redact//:redact",
"@io_etcd_go_raft_v3//:raft",
"@io_etcd_go_raft_v3//raftpb",
@@ -43,6 +46,7 @@ go_test(
srcs = [
"logstore_bench_test.go",
"sideload_test.go",
"sync_waiter_test.go",
],
args = ["-test.timeout=295s"],
embed = [":logstore"],
@@ -60,6 +64,7 @@ go_test(
"//pkg/util/log",
"//pkg/util/metric",
"//pkg/util/protoutil",
"//pkg/util/stop",
"//pkg/util/tracing",
"@com_github_cockroachdb_errors//:errors",
"@com_github_cockroachdb_errors//oserror",
150 changes: 108 additions & 42 deletions pkg/kv/kvserver/logstore/logstore.go
@@ -13,6 +13,7 @@ package logstore

import (
"context"
"fmt"
"sync"
"time"

@@ -44,25 +45,24 @@ var disableSyncRaftLog = settings.RegisterBoolSetting(
envutil.EnvOrDefaultBool("COCKROACH_DISABLE_RAFT_LOG_SYNCHRONIZATION_UNSAFE", false),
)

// Ready contains the log entries and state to be saved to stable storage. This
// is a subset of raft.Ready relevant to log storage. All fields are read-only.
type Ready struct {
// The current state of a replica to be saved to stable storage. Empty if
// there is no update.
raftpb.HardState
var enableNonBlockingRaftLogSync = settings.RegisterBoolSetting(
settings.TenantWritable,
"kv.raft_log.non_blocking_synchronization.enabled",
"set to true to enable non-blocking synchronization on Raft log writes to "+
"persistent storage. Setting to true does not risk data loss or data corruption "+
"on server crashes, but can reduce write latency.",
envutil.EnvOrDefaultBool("COCKROACH_ENABLE_RAFT_LOG_NON_BLOCKING_SYNCHRONIZATION", true),
)

// Entries specifies entries to be saved to stable storage. Empty if there is
// no update.
Entries []raftpb.Entry
// MsgStorageAppend is a raftpb.Message with type MsgStorageAppend.
type MsgStorageAppend raftpb.Message

// MustSync indicates whether the HardState and Entries must be synchronously
// written to disk, or if an asynchronous write is permissible.
MustSync bool
}

// MakeReady constructs a Ready struct from raft.Ready.
func MakeReady(from raft.Ready) Ready {
return Ready{HardState: from.HardState, Entries: from.Entries, MustSync: from.MustSync}
// MakeMsgStorageAppend constructs a MsgStorageAppend from a raftpb.Message.
func MakeMsgStorageAppend(m raftpb.Message) MsgStorageAppend {
if m.Type != raftpb.MsgStorageAppend {
panic(fmt.Sprintf("unexpected message type %s", m.Type))
}
return MsgStorageAppend(m)
}

// RaftState stores information about the last entry and the size of the log.
@@ -87,6 +87,8 @@ type AppendStats struct {
PebbleBytes int64

Sync bool
// If true, PebbleEnd-PebbleBegin does not include the sync time.
NonBlocking bool
}

// Metrics contains metrics specific to the log storage.
@@ -100,37 +102,67 @@ type LogStore struct {
Engine storage.Engine
Sideload SideloadStorage
StateLoader StateLoader
SyncWaiter *SyncWaiterLoop
EntryCache *raftentry.Cache
Settings *cluster.Settings
Metrics Metrics
}

// SyncCallback is a callback that is notified when a raft log write has been
// durably committed to disk. The function is handed the response messages that
// are associated with the MsgStorageAppend that triggered the fsync.
type SyncCallback interface {
OnLogSync(context.Context, []raftpb.Message)
}

func newStoreEntriesBatch(eng storage.Engine) storage.Batch {
// Use an unindexed batch because we don't need to read our writes, and
// it is more efficient.
return eng.NewUnindexedBatch(false /* writeOnly */)
}

// StoreEntries persists newly appended Raft log Entries to the log storage.
// StoreEntries persists newly appended Raft log Entries to the log storage,
// then calls the provided callback with the input's response messages (if any)
// once the entries are durable. The durable log write may or may not be
// blocking (and therefore the callback may or may not be called synchronously),
// depending on the kv.raft_log.non_blocking_synchronization.enabled cluster
// setting. Either way, the effects of the log append will be immediately
// visible to readers of the Engine.
//
// Accepts the state of the log before the operation, returns the state after.
// Persists HardState atomically with, or strictly after Entries.
func (s *LogStore) StoreEntries(
ctx context.Context, state RaftState, rd Ready, stats *AppendStats,
ctx context.Context, state RaftState, m MsgStorageAppend, cb SyncCallback, stats *AppendStats,
) (RaftState, error) {
batch := newStoreEntriesBatch(s.Engine)
defer batch.Close()
return s.storeEntriesAndCommitBatch(ctx, state, rd, stats, batch)
return s.storeEntriesAndCommitBatch(ctx, state, m, cb, stats, batch)
}

// storeEntriesAndCommitBatch is like StoreEntries, but it accepts a
// storage.Batch, which it takes responsibility for committing and closing.
func (s *LogStore) storeEntriesAndCommitBatch(
ctx context.Context, state RaftState, rd Ready, stats *AppendStats, batch storage.Batch,
ctx context.Context,
state RaftState,
m MsgStorageAppend,
cb SyncCallback,
stats *AppendStats,
batch storage.Batch,
) (RaftState, error) {
// Before returning, Close the batch if we haven't handed ownership of it to a
// SyncWaiterLoop. If batch == nil, SyncWaiterLoop is responsible for closing
// it once the in-progress disk writes complete.
defer func() {
if batch != nil {
defer batch.Close()
}
}()

prevLastIndex := state.LastIndex
if len(rd.Entries) > 0 {
if len(m.Entries) > 0 {
stats.Begin = timeutil.Now()
// All of the entries are appended to distinct keys, returning a new
// last index.
thinEntries, numSideloaded, sideLoadedEntriesSize, otherEntriesSize, err := MaybeSideloadEntries(ctx, rd.Entries, s.Sideload)
thinEntries, numSideloaded, sideLoadedEntriesSize, otherEntriesSize, err := MaybeSideloadEntries(ctx, m.Entries, s.Sideload)
if err != nil {
const expl = "during sideloading"
return RaftState{}, errors.Wrap(err, expl)
@@ -149,16 +181,21 @@ func (s *LogStore) storeEntriesAndCommitBatch(
stats.End = timeutil.Now()
}

if !raft.IsEmptyHardState(rd.HardState) {
hs := raftpb.HardState{
Term: m.Term,
Vote: m.Vote,
Commit: m.Commit,
}
if !raft.IsEmptyHardState(hs) {
// NB: Note that without additional safeguards, it's incorrect to write
// the HardState before appending rd.Entries. When catching up, a follower
// the HardState before appending m.Entries. When catching up, a follower
// will receive Entries that are immediately Committed in the same
// Ready. If we persist the HardState but happen to lose the Entries,
// assertions can be tripped.
//
// We have both in the same batch, so there's no problem. If that ever
// changes, we must write and sync the Entries before the HardState.
if err := s.StateLoader.SetHardState(ctx, batch, rd.HardState); err != nil {
if err := s.StateLoader.SetHardState(ctx, batch, hs); err != nil {
const expl = "during setHardState"
return RaftState{}, errors.Wrap(err, expl)
}
@@ -168,9 +205,9 @@ func (s *LogStore) storeEntriesAndCommitBatch(
//
// Note that the data is visible to other goroutines before it is synced to
// disk. This is fine. The important constraints are that these syncs happen
// before Raft messages are sent and before the call to RawNode.Advance. Our
// regular locking is sufficient for this and if other goroutines can see the
// data early, that's fine. In particular, snapshots are not a problem (I
// before the MsgStorageAppend's responses are delivered back to the RawNode.
// Our regular locking is sufficient for this and if other goroutines can see
// the data early, that's fine. In particular, snapshots are not a problem (I
// think they're the only thing that might access log entries or HardState
// from other goroutines). Snapshots do not include either the HardState or
// uncommitted log entries, and even if they did include log entries that
@@ -183,25 +220,54 @@ func (s *LogStore) storeEntriesAndCommitBatch(
// (Replica), so this comment might need to move.
stats.PebbleBegin = timeutil.Now()
stats.PebbleBytes = int64(batch.Len())
sync := rd.MustSync && !disableSyncRaftLog.Get(&s.Settings.SV)
if err := batch.Commit(sync); err != nil {
const expl = "while committing batch"
return RaftState{}, errors.Wrap(err, expl)
mustSync := len(m.Responses) > 0
sync := mustSync && !disableSyncRaftLog.Get(&s.Settings.SV)
nonBlockingSync := sync && enableNonBlockingRaftLogSync.Get(&s.Settings.SV)
if nonBlockingSync {
// If non-blocking synchronization is enabled, apply the batched updates to
// the engine and initiate a synchronous disk write, but don't wait for the
// write to complete. Instead, enqueue that waiting on the SyncWaiterLoop,
// which will signal the callback when the write completes.
if err := batch.CommitNoSyncWait(); err != nil {
const expl = "while committing batch without sync wait"
return RaftState{}, errors.Wrap(err, expl)
}
stats.PebbleEnd = timeutil.Now()
s.SyncWaiter.enqueue(ctx, batch, func() {
// NOTE: run on the SyncWaiterLoop goroutine.
logCommitEnd := timeutil.Now()
s.Metrics.RaftLogCommitLatency.RecordValue(logCommitEnd.Sub(stats.PebbleBegin).Nanoseconds())
cb.OnLogSync(ctx, m.Responses)
})
// Do not Close batch on return. Will be Closed by SyncWaiterLoop.
batch = nil
} else {
if err := batch.Commit(sync); err != nil {
const expl = "while committing batch"
return RaftState{}, errors.Wrap(err, expl)
}
stats.PebbleEnd = timeutil.Now()
if mustSync {
logCommitEnd := stats.PebbleEnd
s.Metrics.RaftLogCommitLatency.RecordValue(logCommitEnd.Sub(stats.PebbleBegin).Nanoseconds())
cb.OnLogSync(ctx, m.Responses)
}
}
stats.Sync = sync
stats.PebbleEnd = timeutil.Now()
if rd.MustSync {
s.Metrics.RaftLogCommitLatency.RecordValue(stats.PebbleEnd.Sub(stats.PebbleBegin).Nanoseconds())
}
stats.NonBlocking = nonBlockingSync

if len(rd.Entries) > 0 {
// TODO BEFORE MERGE: how does this work with CommitNoSyncWait()? Do we need
// to wait for the sync to finish before purging sideloaded entries? We don't
// want to end up with log entries without their corresponding sideloaded SSTables.
// What do we do with stats?
if len(m.Entries) > 0 {
// We may have just overwritten parts of the log which contain
// sideloaded SSTables from a previous term (and perhaps discarded some
// entries that we didn't overwrite). Remove any such leftover on-disk
// payloads (we can do that now because we've committed the deletion
// just above).
firstPurge := rd.Entries[0].Index // first new entry written
purgeTerm := rd.Entries[0].Term - 1
firstPurge := m.Entries[0].Index // first new entry written
purgeTerm := m.Entries[0].Term - 1
lastPurge := prevLastIndex // old end of the log, include in deletion
purgedSize, err := maybePurgeSideloaded(ctx, s.Sideload, firstPurge, lastPurge, purgeTerm)
if err != nil {
@@ -217,7 +283,7 @@ func (s *LogStore) storeEntriesAndCommitBatch(

// Update raft log entry cache. We clear any older, uncommitted log entries
// and cache the latest ones.
s.EntryCache.Add(s.RangeID, rd.Entries, true /* truncate */)
s.EntryCache.Add(s.RangeID, m.Entries, true /* truncate */)

return state, nil
}
20 changes: 11 additions & 9 deletions pkg/kv/kvserver/logstore/logstore_bench_test.go
@@ -36,6 +36,10 @@ func (b *discardBatch) Commit(bool) error {
return nil
}

type noopSyncCallback struct{}

func (noopSyncCallback) OnLogSync(context.Context, []raftpb.Message) {}

func BenchmarkLogStore_StoreEntries(b *testing.B) {
defer log.Scope(b).Close(b)
const kb = 1 << 10
@@ -48,23 +52,25 @@ func BenchmarkLogStore_StoreEntries(b *testing.B) {
}

func runBenchmarkLogStore_StoreEntries(b *testing.B, bytes int64) {
ctx := context.Background()
const tenMB = 10 * 1 << 20
ec := raftentry.NewCache(tenMB)
const rangeID = 1
eng := storage.NewDefaultInMemForTesting()
defer eng.Close()
st := cluster.MakeTestingClusterSettings()
enableNonBlockingRaftLogSync.Override(ctx, &st.SV, false)
s := LogStore{
RangeID: rangeID,
Engine: eng,
StateLoader: NewStateLoader(rangeID),
EntryCache: ec,
Settings: cluster.MakeTestingClusterSettings(),
Settings: st,
Metrics: Metrics{
RaftLogCommitLatency: metric.NewHistogram(metric.Metadata{}, 10*time.Second, metric.IOLatencyBuckets),
},
}

ctx := context.Background()
rs := RaftState{
LastTerm: 1,
ByteSize: 0,
@@ -89,17 +95,13 @@ func runBenchmarkLogStore_StoreEntries(b *testing.B, bytes int64) {
batch := &discardBatch{}
for i := 0; i < b.N; i++ {
batch.Batch = newStoreEntriesBatch(eng)
rd := Ready{
HardState: raftpb.HardState{},
Entries: ents,
MustSync: true,
}
m := MsgStorageAppend{Entries: ents}
cb := noopSyncCallback{}
var err error
rs, err = s.storeEntriesAndCommitBatch(ctx, rs, rd, stats, batch)
rs, err = s.storeEntriesAndCommitBatch(ctx, rs, m, cb, stats, batch)
if err != nil {
b.Fatal(err)
}
batch.Batch.Close()
ents[0].Index++
}
require.EqualValues(b, b.N, rs.LastIndex)