Skip to content

Commit

Permalink
raft: store Vote in MsgStorageAppend, not *HardState
Browse files Browse the repository at this point in the history
This commit replaces the HardState field in Message with a Vote field. For
MsgStorageAppend messages, the term, vote, and commit fields are either all
set (so that a HardState can be constructed from them) when any of them has
changed, or all unset when none of them has changed.

Signed-off-by: Nathan VanBenschoten <[email protected]>
  • Loading branch information
nvanbenschoten committed Dec 21, 2022
1 parent 2e0653d commit 09c91d8
Show file tree
Hide file tree
Showing 9 changed files with 203 additions and 224 deletions.
12 changes: 6 additions & 6 deletions raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,12 +167,12 @@ type Config struct {
// messages over the network) and those targeted at the same thread can't be
// reordered. Messages to different targets can be processed in any order.
//
// MsgStorageAppend carries Raft log entries to append, election votes to
// persist, and snapshots to apply. All writes performed in service of a
// MsgStorageAppend must be durable before response messages are delivered.
// However, if the MsgStorageAppend carries no response messages, durability
// is not required. The message assumes the role of the Entries, HardState,
// and Snapshot fields in Ready.
// MsgStorageAppend carries Raft log entries to append, election votes /
// term changes / updated commit indexes to persist, and snapshots to apply.
// All writes performed in service of a MsgStorageAppend must be durable
// before response messages are delivered. However, if the MsgStorageAppend
// carries no response messages, durability is not required. The message
// assumes the role of the Entries, HardState, and Snapshot fields in Ready.
//
// MsgStorageApply carries committed entries to apply. Writes performed in
// service of a MsgStorageApply need not be durable before response messages
Expand Down
198 changes: 87 additions & 111 deletions raftpb/raft.pb.go

Large diffs are not rendered by default.

16 changes: 9 additions & 7 deletions raftpb/raft.proto
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,12 @@ option (gogoproto.goproto_unrecognized_all) = false;
option (gogoproto.goproto_sizecache_all) = false;

// EntryType enumerates the kinds of Entry; it is the type of the Entry
// message's Type field.
enum EntryType {

EntryNormal = 0; // the zero value, used when no other type is set
EntryConfChange = 1; // corresponds to pb.ConfChange
EntryConfChangeV2 = 2; // corresponds to pb.ConfChangeV2
}

message Entry {

optional uint64 Term = 2 [(gogoproto.nullable) = false]; // must be 64-bit aligned for atomic operations
optional uint64 Index = 3 [(gogoproto.nullable) = false]; // must be 64-bit aligned for atomic operations
optional EntryType Type = 1 [(gogoproto.nullable) = false];
Expand Down Expand Up @@ -81,14 +79,19 @@ message Message {
// entries from its leader as it already has an entry with term 5 at index 100.
// (type=MsgStorageAppendResp,index=100,logTerm=5) means the local node wrote
// entries up to index=100 in stable storage, and the term of the entry at index
// 100 was 5.
// 100 was 5. This doesn't always mean that the corresponding MsgStorageAppend
// message was the one that carried these entries, just that those entries were
// stable at the time of processing the corresponding MsgStorageAppend.
optional uint64 logTerm = 5 [(gogoproto.nullable) = false];
optional uint64 index = 6 [(gogoproto.nullable) = false];
repeated Entry entries = 7 [(gogoproto.nullable) = false];
optional uint64 commit = 8 [(gogoproto.nullable) = false];
// hardState can be non-nil for MsgStorageAppend messages and is nil for all
// other message types.
optional HardState hardState = 13 [(gogoproto.nullable) = true];
// (type=MsgStorageAppend,vote=5,term=10) means the local node is voting for
// peer 5 in term 10. For MsgStorageAppend messages, the term, vote, and commit
// fields are either all set (so that a HardState can be constructed from them)
// when any of them has changed, or all unset when none of them has changed.
optional uint64 vote = 13 [(gogoproto.nullable) = false];
// snapshot is non-nil and non-empty for MsgSnap messages and nil for all other
// message types. However, peer nodes running older binary versions may send a
// non-nil, empty value for the snapshot field of non-MsgSnap messages. Code
Expand Down Expand Up @@ -204,7 +207,6 @@ message ConfChangeSingle {
//
// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
message ConfChangeV2 {

optional ConfChangeTransition transition = 1 [(gogoproto.nullable) = false];
repeated ConfChangeSingle changes = 2 [(gogoproto.nullable) = false];
optional bytes context = 3;
Expand Down
2 changes: 1 addition & 1 deletion raftpb/raft_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ func TestProtoMemorySizes(t *testing.T) {
assert(unsafe.Sizeof(s), if64Bit(144, 80), "Snapshot")

var m Message
assert(unsafe.Sizeof(m), if64Bit(160, 108), "Message")
assert(unsafe.Sizeof(m), if64Bit(160, 112), "Message")

var hs HardState
assert(unsafe.Sizeof(hs), 24, "HardState")
Expand Down
7 changes: 4 additions & 3 deletions rafttest/interaction_env_handler_process_append_thread.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,10 @@ func (env *InteractionEnv) ProcessAppendThread(idx int) error {
m.Responses = nil
env.Output.WriteString("Processing:\n")
env.Output.WriteString(raft.DescribeMessage(m, defaultEntryFormatter) + "\n")
var st raftpb.HardState
if m.HardState != nil {
st = *m.HardState
st := raftpb.HardState{
Term: m.Term,
Vote: m.Vote,
Commit: m.Commit,
}
var snap raftpb.Snapshot
if m.Snapshot != nil {
Expand Down
24 changes: 12 additions & 12 deletions rawnode.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,19 +222,19 @@ func newStorageAppendMsg(r *raft, rd Ready) pb.Message {
Type: pb.MsgStorageAppend,
To: LocalAppendThread,
From: r.id,
Term: r.Term,
Entries: rd.Entries,
}
if !IsEmptyHardState(rd.HardState) {
// TODO(nvanbenschoten): we could avoid this heap allocation by
// replacing the pb.Message.HardState field with a Vote uint64 field. We
// would then need to teach apps to construct a HardState from these
// three fields, or supply a function/method that does so.
// m.Term = rd.Term
// m.Vote = rd.Vote
// m.Commit = rd.Commit
hs := rd.HardState
m.HardState = &hs
// If the Ready includes a HardState update, assign each of its fields
// to the corresponding fields in the Message. This allows clients to
// reconstruct the HardState and save it to stable storage.
//
// If the Ready does not include a HardState update, make sure to not
// assign a value to any of the fields so that a HardState reconstructed
// from them will be empty (return true from raft.IsEmptyHardState).
m.Term = rd.Term
m.Vote = rd.Vote
m.Commit = rd.Commit
}
if !IsEmptySnap(rd.Snapshot) {
snap := rd.Snapshot
Expand Down Expand Up @@ -340,8 +340,8 @@ func newStorageAppendRespMsg(r *raft, rd Ready) pb.Message {
// MsgStorageAppend that contained the last entry in the unstable slice carried
// an earlier term and was dropped.
//
// A MsgStorageAppend with a new HardState is emitted on each term change. This
// is the same condition that causes MsgStorageAppendResp messages with earlier
// A MsgStorageAppend with a new term is emitted on each term change. This is
// the same condition that causes MsgStorageAppendResp messages with earlier
// terms to be ignored. As a result, we are guaranteed that, assuming a bounded
// number of term changes, there will eventually be a MsgStorageAppendResp
// message that is not ignored. This means that entries in the unstable log
Expand Down
Loading

0 comments on commit 09c91d8

Please sign in to comment.