
kvserver: e2e flow control for raft messages #79755

Closed
tbg opened this issue Apr 11, 2022 · 1 comment
Labels
A-admission-control C-enhancement Solution expected to add code/behavior + preserve backward-compat (pg compat issues are exception)

Comments

@tbg
Member

tbg commented Apr 11, 2022

Is your feature request related to a problem? Please describe.

The flow control for raft messages is currently overly simplistic. A high-level overview of the current state follows.

Sending

Raft messages originate in a raft.Ready:

```go
if hasReady = raftGroup.HasReady(); hasReady {
	rd = raftGroup.Ready()
}
```

and are passed to the RaftTransport queues here:

```go
if r.maybeCoalesceHeartbeat(ctx, msg, toReplica, fromReplica, false, nil) {
	return
}
req := newRaftMessageRequest()
*req = kvserverpb.RaftMessageRequest{
	RangeID:       r.RangeID,
	ToReplica:     toReplica,
	FromReplica:   fromReplica,
	Message:       msg,
	RangeStartKey: startKey, // usually nil
}
if !r.sendRaftMessageRequest(ctx, req) {
	if err := r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) {
		r.mu.droppedMessages++
		raftGroup.ReportUnreachable(msg.To)
		return true, nil
	}); err != nil && !errors.Is(err, errRemoved) {
		log.Fatalf(ctx, "%v", err)
	}
}
```

There is a single queue for messages to each destination store (irrespective of rangeID), which is essentially a 10k-buffered channel:

```go
ch := make(chan *kvserverpb.RaftMessageRequest, raftSendBufferSize)
value, ok = queuesMap.LoadOrStore(int64(nodeID), unsafe.Pointer(&ch))
```

from which batches of messages are put on the wire:

```go
case req := <-ch:
	budget := targetRaftOutgoingBatchSize.Get(&t.st.SV) - int64(req.Size())
	batch.Requests = append(batch.Requests, *req)
	releaseRaftMessageRequest(req)
	// Pull off as many queued requests as possible, within reason.
	for budget > 0 {
		select {
		case req = <-ch:
			budget -= int64(req.Size())
			batch.Requests = append(batch.Requests, *req)
			releaseRaftMessageRequest(req)
		default:
			budget = -1
		}
	}
	err := stream.Send(batch)
	if err != nil {
		return err
	}
```

Receiving

Messages arrive here:

```go
for i := range batch.Requests {
	req := &batch.Requests[i]
	atomic.AddInt64(&stats.serverRecv, 1)
	if pErr := t.handleRaftRequest(ctx, req, stream); pErr != nil {
		atomic.AddInt64(&stats.serverSent, 1)
		if err := stream.Send(newRaftMessageResponse(req, pErr)); err != nil {
			return err
		}
	}
}
```

and are put on the raft receive queue:

```go
s.metrics.RaftRcvdMessages[req.Message.Type].Inc(1)
value, ok := s.replicaQueues.Load(int64(req.RangeID))
if !ok {
	value, _ = s.replicaQueues.LoadOrStore(int64(req.RangeID), unsafe.Pointer(&raftRequestQueue{}))
}
q := (*raftRequestQueue)(value)
q.Lock()
defer q.Unlock()
if len(q.infos) >= replicaRequestQueueSize {
	// TODO(peter): Return an error indicating the request was dropped. Note
	// that dropping the request is safe. Raft will retry.
	s.metrics.RaftRcvdMsgDropped.Inc(1)
	return false
}
q.infos = append(q.infos, raftRequestInfo{
	req:        req,
	respStream: respStream,
})
```

The next available raft scheduler goroutine will pick up the nonempty queue in

```go
func (s *Store) processRequestQueue(ctx context.Context, rangeID roachpb.RangeID) bool {
	value, ok := s.replicaQueues.Load(int64(rangeID))
	if !ok {
		return false
	}
	q := (*raftRequestQueue)(value)
	infos, ok := q.drain()
	if !ok {
		return false
	}
	defer q.recycle(infos)
	var hadError bool
	for i := range infos {
		info := &infos[i]
		if pErr := s.withReplicaForRequest(
			ctx, info.req, func(_ context.Context, r *Replica) *roachpb.Error {
				return s.processRaftRequestWithReplica(r.raftCtx, r, info.req)
			},
			// ...
```

and hands it to the raft group (which will not perform work yet, but instead stages everything for the next Ready):

```go
if err := r.stepRaftGroup(req); err != nil {
	return roachpb.NewError(err)
}
```

Finally, a scheduler goroutine will actually process the replica for raft ready handling:

```go
func (s *Store) processReady(rangeID roachpb.RangeID) {
	r, ok := s.mu.replicasByRangeID.Load(rangeID)
	if !ok {
		return
	}
	ctx := r.raftCtx
	start := timeutil.Now()
	stats, expl, err := r.handleRaftReady(ctx, noSnap)
	maybeFatalOnRaftReadyErr(ctx, expl, err)
	// ...
```

which is where I/O happens and the memory is flushed to disk.

Describe the solution you'd like

Introduce a model that more deliberately handles the case in which a follower receives more load than it can handle: control memory usage while prioritizing heartbeats (and, more generally, allowing QoS), without excessive performance cliffs, and integrate with admission control.

See also #79215.

Jira issue: CRDB-15932

Epic CRDB-15069

@tbg tbg added the C-enhancement Solution expected to add code/behavior + preserve backward-compat (pg compat issues are exception) label Apr 11, 2022
@jlinder jlinder added sync-me and removed sync-me labels May 20, 2022
tbg added a commit to tbg/cockroach that referenced this issue Aug 23, 2022
Inspired by cockroachlabs/support#1770.

If either the raft send or receive queue fills up, widespread outages
can occur as replication progress stalls. We have metrics that can
indicate this, but straightforward logging is also appropriate to direct
attention to the problem, which this commit achieves.

Touches cockroachdb#79755

Release justification: important logging improvement
Release note: None
craig bot pushed a commit that referenced this issue Aug 25, 2022
86608: batcheval: add latch key protecting range key stats update r=erikgrinaker a=aliher1911

Previously GC needed to get a read latch with max timestamp to
ensure that range tombstones are not modified during GC. This
is causing all writers to get stuck in queue while GC is validating
request and removing range tombstone.
This commit adds a dedicated latch key
LocalRangeRangeTombstoneStatsUpdateLockSuffix to address the problem.
All range tombstone writers obtain this read latch on top of the write
latches for the ranges they are interested to update.
GC on the other hand will obtain write latch on that key. This
approach allows point writers to proceed during GC, but will block new
range tombstones from being written. Non conflicting writes of range
tombstones could still proceed since their write latch ranges don't
overlap.

Release justification: this is a safe change as range tombstone
behaviour is not enabled yet and the change is needed to address
potential performance regressions.

Release note: None

86645: kvserver: log when raft send/recv queue fills up r=pavelkalinnikov a=tbg

Inspired by https://github.com/cockroachlabs/support/issues/1770.

If either the raft send or receive queue fills up, widespread outages
can occur as replication progress stalls. We have metrics that can
indicate this, but straightforward logging is also appropriate to direct
attention to the problem, which this commit achieves.

Touches #79755

Release justification: important logging improvement
Release note: None


86679: server,ui: show internal stats with new cluster setting r=maryliag a=maryliag

Previously, we did not show internal results among the fingerprint
options on the SQL Activity page.
A new cluster setting, `sql.stats.response.show_internal`, can be set
to `true` to display internal statistics on the SQL Activity page.

Fixes #79547

https://www.loom.com/share/1b89ba99a7c247edadb5c8b0d127755c

Release justification: low risk, high benefit change
Release note (sql change): The new cluster setting
`sql.stats.response.show_internal` (default `false`) can be set to
`true` to display information about internal stats on the SQL Activity
page, with a fingerprint option.

86748: storage: rename `MVCCRangeKeyStack.FirstAbove/Below` r=tbg a=erikgrinaker

This patch renames `FirstAbove/Below` to `FirstAtOrAbove/Below`, for
clarity.

Release justification: bug fixes and low-risk updates to new functionality

Release note: None

86809: backupccl: set kv.bulkio.write_metadata_sst.enabled to default false r=stevendanna a=msbutler

This patch sets the write_metadata_sst cluster setting to default false in prep
for the 22.2 branch cut, as there's additional work required before this feature
gets used in production.

Setting this to false will also stop the roachtest in #86289 from consistently
failing due to #86806.

Fixes #86289

Release note: none

Release justification: prevents using an experimental feature by default

Co-authored-by: Oleg Afanasyev <[email protected]>
Co-authored-by: Tobias Grieger <[email protected]>
Co-authored-by: Marylia Gutierrez <[email protected]>
Co-authored-by: Erik Grinaker <[email protected]>
Co-authored-by: Michael Butler <[email protected]>
tbg added a commit to tbg/cockroach that referenced this issue Sep 20, 2022
tbg added a commit to tbg/cockroach that referenced this issue Sep 20, 2022
@irfansharif
Contributor

I read some of the discussion above and linked issues and PRs. I think this is going to be entirely encompassed by #95563.

@irfansharif irfansharif closed this as not planned Jan 20, 2023