This repository has been archived by the owner on Mar 4, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 137
/
Copy pathraft.h
1202 lines (1089 loc) · 41.3 KB
/
raft.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#ifndef RAFT_H
#define RAFT_H
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
/**
 * Marker placed on every public symbol of the library.
 *
 * It can be overridden by defining RAFT_API before including this header; the
 * default requests default (exported) ELF visibility.
 */
#ifndef RAFT_API
#define RAFT_API __attribute__((visibility("default")))
#endif
/**
 * Version.
 *
 * RAFT_VERSION_NUMBER folds the three components into a single integer as
 * MAJOR * 10000 + MINOR * 100 + RELEASE, so versions can be compared
 * numerically.
 */
#define RAFT_VERSION_MAJOR 0
#define RAFT_VERSION_MINOR 18
#define RAFT_VERSION_RELEASE 1
#define RAFT_VERSION_NUMBER                                      \
    (RAFT_VERSION_MAJOR * 100 * 100 + RAFT_VERSION_MINOR * 100 + \
     RAFT_VERSION_RELEASE)
/**
 * Return the version number of the library.
 *
 * NOTE(review): presumably this is the RAFT_VERSION_NUMBER the library was
 * built with, which lets callers compare it against the header they compiled
 * against -- confirm against the implementation.
 *
 * Marked RAFT_API like the other public entry points of this header, so that
 * the symbol stays exported when the library is built with hidden default
 * visibility.
 */
RAFT_API int raft_version_number(void);
/**
 * Error codes.
 *
 * Codes start at 1 and are consecutive. Every value is written out so that
 * inserting or reordering an entry cannot silently renumber the others.
 * Entries without a description are not documented in this header.
 */
enum {
    RAFT_NOMEM = 1,            /* Out of memory */
    RAFT_BADID = 2,            /* Server ID is not valid */
    RAFT_DUPLICATEID = 3,      /* Server ID already in use */
    RAFT_DUPLICATEADDRESS = 4, /* Server address already in use */
    RAFT_BADROLE = 5,          /* Server role is not valid */
    RAFT_MALFORMED = 6,
    RAFT_NOTLEADER = 7,
    RAFT_LEADERSHIPLOST = 8,
    RAFT_SHUTDOWN = 9,
    RAFT_CANTBOOTSTRAP = 10,   /* See raft_bootstrap() */
    RAFT_CANTCHANGE = 11,
    RAFT_CORRUPT = 12,
    RAFT_CANCELED = 13,
    RAFT_NAMETOOLONG = 14,
    RAFT_TOOBIG = 15,
    RAFT_NOCONNECTION = 16,
    RAFT_BUSY = 17,
    RAFT_IOERR = 18,           /* File system or storage error */
    RAFT_NOTFOUND = 19,        /* Resource not found */
    RAFT_INVALID = 20,         /* Invalid parameter */
    RAFT_UNAUTHORIZED = 21,    /* No access to a resource */
    RAFT_NOSPACE = 22,         /* Not enough space on disk */
    RAFT_TOOMANY = 23          /* Some system or raft limit was hit */
};
/**
 * Size of human-readable error message buffers.
 *
 * This is the number of chars in the errmsg arrays embedded in struct raft_io
 * and struct raft.
 */
#define RAFT_ERRMSG_BUF_SIZE 256
/**
 * Return the error message describing the given error code.
 *
 * @errnum is expected to be one of the RAFT_* error codes declared in this
 * header.
 *
 * NOTE(review): the result for unknown codes and the lifetime of the returned
 * string are not visible from this header -- confirm against the
 * implementation before relying on either.
 */
RAFT_API const char *raft_strerror(int errnum);
/**
 * Hold the value of a raft server ID. Guaranteed to be at least 64-bit long.
 */
typedef unsigned long long raft_id;
/**
 * Hold the value of a raft term. Guaranteed to be at least 64-bit long.
 */
typedef unsigned long long raft_term;
/**
 * Hold the value of a raft entry index. Guaranteed to be at least 64-bit long.
 */
typedef unsigned long long raft_index;
/**
 * Hold a time value expressed in milliseconds since the epoch.
 */
typedef unsigned long long raft_time;
/**
 * Hold the features a raft node is capable of, as a 64-bit set of flags.
 */
typedef uint64_t raft_flags;
/**
 * A data buffer: a pointer to a region of memory together with its length.
 */
struct raft_buffer
{
void *base; /* Pointer to the buffer data. */
size_t len; /* Length of the buffer. */
};
/**
 * Server role codes.
 *
 * Values are written out explicitly so that they cannot shift by accident if
 * the list is ever edited.
 */
enum {
    RAFT_STANDBY = 0, /* Replicate log, does not participate in quorum. */
    RAFT_VOTER = 1,   /* Replicate log, does participate in quorum. */
    RAFT_SPARE = 2    /* Does not replicate log, or participate in quorum. */
};
/**
 * Hold information about a single server in the cluster configuration.
 * WARNING: This struct is encoded/decoded, be careful when adapting it.
 */
struct raft_server
{
raft_id id; /* Server ID, must be greater than zero. */
char *address; /* Server address. User defined. */
int role; /* Server role: #RAFT_STANDBY, #RAFT_VOTER or #RAFT_SPARE. */
};
/**
 * Hold information about all servers currently part of the cluster.
 * WARNING: This struct is encoded/decoded, be careful when adapting it.
 *
 * Use raft_configuration_init(), raft_configuration_add() and
 * raft_configuration_close() to manage objects of this type.
 */
struct raft_configuration
{
struct raft_server *servers; /* Array of servers member of the cluster. */
unsigned n; /* Number of servers in the array. */
};
/**
 * Initialize an empty raft configuration.
 *
 * Servers can then be added to @c with raft_configuration_add().
 */
RAFT_API void raft_configuration_init(struct raft_configuration *c);
/**
 * Release all memory used by the given configuration object.
 *
 * NOTE(review): whether @c itself may be reused afterwards without calling
 * raft_configuration_init() again is not stated here -- confirm.
 */
RAFT_API void raft_configuration_close(struct raft_configuration *c);
/**
 * Add a server to a raft configuration.
 *
 * The @id must be greater than zero and @address point to a valid string.
 *
 * The @role must be either #RAFT_VOTER, #RAFT_STANDBY, #RAFT_SPARE.
 *
 * If @id or @address are already in use by another server in the configuration,
 * an error is returned.
 *
 * The @address string will be copied and can be released after this function
 * returns.
 *
 * NOTE(review): the exact error codes are not listed here; by name
 * #RAFT_BADID, #RAFT_BADROLE, #RAFT_DUPLICATEID and #RAFT_DUPLICATEADDRESS
 * look like the relevant ones -- confirm against the implementation.
 */
RAFT_API int raft_configuration_add(struct raft_configuration *c,
raft_id id,
const char *address,
int role);
/**
 * Encode the given configuration object into @buf.
 *
 * The memory of the returned buffer is allocated using raft_malloc(), and
 * client code is responsible for releasing it when no longer needed.
 */
RAFT_API int raft_configuration_encode(const struct raft_configuration *c,
struct raft_buffer *buf);
/**
 * Hash function which outputs a 64-bit value based on a text and a number.
 *
 * This can be used to generate a unique ID for a new server being added, for
 * example based on its address (@text) and on the current time in milliseconds
 * since the Epoch (@n).
 *
 * It's internally implemented as a SHA1 where only the last 8 bytes of the hash
 * value are kept.
 */
RAFT_API unsigned long long raft_digest(const char *text, unsigned long long n);
/**
 * Log entry types.
 *
 * Values start at 1 and are written out explicitly so that they cannot shift
 * by accident if the list is ever edited.
 */
enum {
    RAFT_COMMAND = 1, /* Command for the application FSM. */
    RAFT_BARRIER = 2, /* Wait for all previous commands to be applied. */
    RAFT_CHANGE = 3   /* Raft configuration change. */
};
/**
 * A single entry in the raft log.
 *
 * An entry that originated from this raft instance while it was the leader
 * (typically via client calls to raft_apply()) should normally have a @buf
 * attribute referencing directly the memory that was originally allocated by
 * the client itself to contain the entry data, and the @batch attribute set to
 * #NULL.
 *
 * An entry that was received from the network as part of an AppendEntries RPC
 * or that was loaded from disk at startup should normally have a @batch
 * attribute that points to a contiguous chunk of memory that contains the data
 * of the entry itself plus possibly the data for other entries that were
 * received or loaded with it at the same time. In this case the @buf pointer
 * will be equal to the @batch pointer plus an offset, that locates the position
 * of the entry's data within the batch.
 *
 * When the @batch attribute is not #NULL the raft library will take care of
 * releasing that memory only once there are no more references to the
 * associated entries.
 *
 * This arrangement makes it possible to minimize the amount of memory-copying
 * when performing I/O.
 */
struct raft_entry
{
raft_term term; /* Term in which the entry was created. */
unsigned short type; /* Type: #RAFT_COMMAND, #RAFT_BARRIER or #RAFT_CHANGE. */
struct raft_buffer buf; /* Entry data. */
void *batch; /* Batch that buf's memory points to, if any. */
};
/**
 * Hold the arguments of a RequestVote RPC.
 *
 * The RequestVote RPC is invoked by candidates to gather votes.
 */
struct raft_request_vote
{
    int version;               /* Message version, see RAFT_REQUEST_VOTE_VERSION. */
    raft_term term;            /* Candidate's term. */
    raft_id candidate_id;      /* ID of the server requesting the vote. */
    raft_index last_log_index; /* Index of candidate's last log entry. */
    /* This is a term, so it is typed raft_term. raft_term and raft_index share
     * the same underlying type, so size and layout are unaffected. */
    raft_term last_log_term;   /* Term of log entry at last_log_index. */
    bool disrupt_leader;       /* True if current leader should be discarded. */
    bool pre_vote;             /* True if this is a pre-vote request. */
};
/* Current version of struct raft_request_vote; to be bumped whenever the
 * message is updated. */
#define RAFT_REQUEST_VOTE_VERSION 2
/**
 * Hold the result of a RequestVote RPC.
 */
struct raft_request_vote_result
{
int version; /* Message version, see RAFT_REQUEST_VOTE_RESULT_VERSION. */
raft_term term; /* Receiver's current term (candidate updates itself). */
bool vote_granted; /* True means candidate received vote. */
bool pre_vote; /* The response to a pre-vote RequestVote or not. */
};
/* Current version of struct raft_request_vote_result. */
#define RAFT_REQUEST_VOTE_RESULT_VERSION 2
/**
 * Hold the arguments of an AppendEntries RPC.
 *
 * The AppendEntries RPC is invoked by the leader to replicate log entries. It's
 * also used as heartbeat (figure 3.1).
 */
struct raft_append_entries
{
int version; /* Message version, see RAFT_APPEND_ENTRIES_VERSION. */
raft_term term; /* Leader's term. */
raft_index prev_log_index; /* Index of log entry preceding new ones. */
raft_term prev_log_term; /* Term of entry at prev_log_index. */
raft_index leader_commit; /* Leader's commit index. */
struct raft_entry *entries; /* Log entries to append. */
unsigned n_entries; /* Size of the log entries array. */
};
/* Current version of struct raft_append_entries. */
#define RAFT_APPEND_ENTRIES_VERSION 0
/**
 * Hold the result of an AppendEntries RPC (figure 3.1).
 */
struct raft_append_entries_result
{
int version; /* Message version, see RAFT_APPEND_ENTRIES_RESULT_VERSION. */
raft_term term; /* Receiver's current_term. */
raft_index rejected; /* If non-zero, the index that was rejected. */
raft_index last_log_index; /* Receiver's last log entry index, as hint. */
raft_flags features; /* Feature flags. */
};
/* Current version of struct raft_append_entries_result. */
#define RAFT_APPEND_ENTRIES_RESULT_VERSION 1
/**
 * Hold the arguments of an InstallSnapshot RPC (figure 5.3).
 */
struct raft_install_snapshot
{
int version; /* Message version, see RAFT_INSTALL_SNAPSHOT_VERSION. */
raft_term term; /* Leader's term. */
raft_index last_index; /* Index of last entry in the snapshot. */
raft_term last_term; /* Term of last_index. */
struct raft_configuration conf; /* Config as of last_index. */
raft_index conf_index; /* Commit index of conf. */
struct raft_buffer data; /* Raw snapshot data. */
};
/* Current version of struct raft_install_snapshot. */
#define RAFT_INSTALL_SNAPSHOT_VERSION 0
/**
 * Hold the arguments of a TimeoutNow RPC.
 *
 * The TimeoutNow RPC is invoked by leaders to transfer leadership to a
 * follower.
 */
struct raft_timeout_now
{
    int version;               /* Message version, see RAFT_TIMEOUT_NOW_VERSION. */
    raft_term term;            /* Leader's term. */
    raft_index last_log_index; /* Index of leader's last log entry. */
    /* This is a term, so it is typed raft_term. raft_term and raft_index share
     * the same underlying type, so size and layout are unaffected. */
    raft_term last_log_term;   /* Term of log entry at last_log_index. */
};
/* Current version of struct raft_timeout_now; to be bumped whenever the
 * message is updated. */
#define RAFT_TIMEOUT_NOW_VERSION 0
/**
 * Type codes for RPC messages.
 *
 * These are the values stored in the type field of struct raft_message.
 * Values start at 1 and are written out explicitly so that they cannot shift
 * by accident if the list is ever edited.
 */
enum {
    RAFT_IO_APPEND_ENTRIES = 1,
    RAFT_IO_APPEND_ENTRIES_RESULT = 2,
    RAFT_IO_REQUEST_VOTE = 3,
    RAFT_IO_REQUEST_VOTE_RESULT = 4,
    RAFT_IO_INSTALL_SNAPSHOT = 5,
    RAFT_IO_TIMEOUT_NOW = 6
};
/**
 * A single RPC message that can be sent or received over the network.
 *
 * The RPC message types all have a `version` field.
 * In the libuv io implementation, `version` is filled out during decoding
 * and is based on the size of the message on the wire, see e.g.
 * `sizeofRequestVoteV1`. The version number in the matching RAFT_XXX_VERSION
 * macro (e.g. RAFT_REQUEST_VOTE_VERSION) needs to be bumped every time the
 * message is updated.
 *
 * Notes when adding a new message type to raft:
 *
 * - raft_io implementations compiled against old versions of raft don't know
 *   the new message type and possibly have not allocated enough space for it.
 *   When such an application receives a new message over the wire, the raft_io
 *   implementation will err out or drop the message, because it doesn't know
 *   how to decode it based on its type.
 *
 * - raft_io implementations compiled against versions of raft that know the
 *   new message type but at runtime are linked against an older raft lib, will
 *   pass the message to raft, where raft will drop it.
 *
 * - When raft receives a message and accesses a field of a new message type,
 *   the raft_io implementation must have known about the new message type,
 *   so it was compiled against a modern enough version of raft, and memory
 *   accesses should be safe.
 *
 * Sending a new message type with a raft_io implementation that doesn't know
 * the type is safe, the implementation should drop the message based on its
 * type and will not try to access fields it doesn't know the existence of.
 */
struct raft_message
{
unsigned short type; /* RPC type code (one of the RAFT_IO_* codes). */
raft_id server_id; /* ID of sending or destination server. */
const char *server_address; /* Address of sending or destination server. */
union { /* Type-specific data */
struct raft_request_vote request_vote;
struct raft_request_vote_result request_vote_result;
struct raft_append_entries append_entries;
struct raft_append_entries_result append_entries_result;
struct raft_install_snapshot install_snapshot;
struct raft_timeout_now timeout_now;
};
};
/**
 * Hold the details of a snapshot.
 *
 * The user-provided raft_buffer structs should provide the user with enough
 * flexibility to adapt/evolve snapshot formats.
 * If this struct ever NEEDS to be adapted in the future, raft can always move
 * to a new struct with a new name and a new raft_io version.
 */
struct raft_snapshot
{
/* Index and term of last entry included in the snapshot. */
raft_index index;
raft_term term;
/* Last committed configuration included in the snapshot, along with the
 * index it was committed at. */
struct raft_configuration configuration;
raft_index configuration_index;
/* Content of the snapshot. When a snapshot is taken, the user FSM can fill
 * the bufs array with more than one buffer. When a snapshot is restored,
 * there will always be a single buffer. */
struct raft_buffer *bufs;
unsigned n_bufs; /* Number of buffers in the bufs array. */
};
/**
 * Asynchronous request to send an RPC message.
 *
 * The request object is handed to the send method of struct raft_io together
 * with a callback of type raft_io_send_cb, which receives the request back
 * along with a status code.
 */
struct raft_io_send;
typedef void (*raft_io_send_cb)(struct raft_io_send *req, int status);
struct raft_io_send
{
void *data; /* User data */
raft_io_send_cb cb; /* Request callback */
};
/**
 * Asynchronous request to store new log entries.
 *
 * The request object is handed to the append method of struct raft_io together
 * with a callback of type raft_io_append_cb, which receives the request back
 * along with a status code.
 */
struct raft_io_append;
typedef void (*raft_io_append_cb)(struct raft_io_append *req, int status);
struct raft_io_append
{
void *data; /* User data */
raft_io_append_cb cb; /* Request callback */
};
/**
 * Asynchronous request to store a new snapshot.
 *
 * The request object is handed to the snapshot_put method of struct raft_io
 * together with a callback of type raft_io_snapshot_put_cb, which receives the
 * request back along with a status code.
 */
struct raft_io_snapshot_put;
typedef void (*raft_io_snapshot_put_cb)(struct raft_io_snapshot_put *req,
int status);
struct raft_io_snapshot_put
{
void *data; /* User data */
raft_io_snapshot_put_cb cb; /* Request callback */
};
/**
 * Asynchronous request to load the most recent snapshot available.
 *
 * The request object is handed to the snapshot_get method of struct raft_io
 * together with a callback of type raft_io_snapshot_get_cb, which receives the
 * request back along with the loaded snapshot and a status code.
 *
 * NOTE(review): ownership of the snapshot passed to the callback, and its
 * value when the status reports a failure, are not stated here -- confirm.
 */
struct raft_io_snapshot_get;
typedef void (*raft_io_snapshot_get_cb)(struct raft_io_snapshot_get *req,
struct raft_snapshot *snapshot,
int status);
struct raft_io_snapshot_get
{
void *data; /* User data */
raft_io_snapshot_get_cb cb; /* Request callback */
};
/**
 * Asynchronous work request.
 *
 * The request carries a work function to be run asynchronously from the main
 * loop. It is handed to the async_work method of struct raft_io together with
 * a callback of type raft_io_async_work_cb, which receives the request back
 * along with a status code.
 */
struct raft_io_async_work;
typedef int (*raft_io_async_work_fn)(struct raft_io_async_work *req);
typedef void (*raft_io_async_work_cb)(struct raft_io_async_work *req,
int status);
struct raft_io_async_work
{
void *data; /* User data */
raft_io_async_work_fn work; /* Function to run async from the main loop */
raft_io_async_work_cb cb; /* Request callback */
};
/**
 * Customizable tracer, for debugging purposes.
 */
struct raft_tracer
{
/**
 * Implementation-defined state object.
 */
void *impl;
/**
 * Whether this tracer should emit messages.
 */
bool enabled;
/**
 * Trace level.
 */
unsigned level;
/**
 * Emit the given trace message, possibly decorating it with the provided
 * metadata: the source @file, @line and @func the message originates from,
 * and the @level of the message.
 */
void (*emit)(struct raft_tracer *t,
const char *file,
unsigned int line,
const char *func,
unsigned int level,
const char *message);
};
struct raft_io; /* Forward declaration. */
/**
 * Callback invoked by the I/O implementation at regular intervals.
 */
typedef void (*raft_io_tick_cb)(struct raft_io *io);
/**
 * Callback invoked by the I/O implementation when an RPC message is received.
 */
typedef void (*raft_io_recv_cb)(struct raft_io *io, struct raft_message *msg);
/**
 * Callback passed to the close method of struct raft_io.
 *
 * NOTE(review): presumably invoked once the I/O implementation has finished
 * closing -- confirm against the I/O implementation.
 */
typedef void (*raft_io_close_cb)(struct raft_io *io);
/**
 * I/O backend interface: a table of methods, supplied by the user, that raft
 * calls into. struct raft refers to it as its "disk and network I/O
 * implementation".
 *
 * version field MUST be filled out by user.
 * When moving to a new version, the user MUST implement the newly added
 * methods.
 *
 * NOTE(review): except where stated, the contracts of the individual methods
 * (return values, ownership of arguments, when callbacks fire) are not
 * documented in this header; the short notes below only restate what the
 * signatures and the request types show.
 */
struct raft_io
{
int version; /* 1 or 2 */
void *data; /* NOTE(review): presumably custom user data -- confirm. */
void *impl; /* NOTE(review): presumably implementation state -- confirm. */
char errmsg[RAFT_ERRMSG_BUF_SIZE]; /* Error message buffer. */
/* Takes the ID and address of the server. */
int (*init)(struct raft_io *io, raft_id id, const char *address);
/* Takes a callback of type raft_io_close_cb. */
void (*close)(struct raft_io *io, raft_io_close_cb cb);
/* All arguments after @io are output parameters. */
int (*load)(struct raft_io *io,
raft_term *term,
raft_id *voted_for,
struct raft_snapshot **snapshot,
raft_index *start_index,
struct raft_entry *entries[],
size_t *n_entries);
/* @tick is the callback to invoke at regular intervals and @recv the one to
 * invoke when an RPC message is received. NOTE(review): @msecs is presumably
 * the tick interval in milliseconds -- confirm. */
int (*start)(struct raft_io *io,
unsigned msecs,
raft_io_tick_cb tick,
raft_io_recv_cb recv);
int (*bootstrap)(struct raft_io *io, const struct raft_configuration *conf);
int (*recover)(struct raft_io *io, const struct raft_configuration *conf);
int (*set_term)(struct raft_io *io, raft_term term);
int (*set_vote)(struct raft_io *io, raft_id server_id);
/* Asynchronous request to send an RPC message. */
int (*send)(struct raft_io *io,
struct raft_io_send *req,
const struct raft_message *message,
raft_io_send_cb cb);
/* Asynchronous request to store @n new log entries. */
int (*append)(struct raft_io *io,
struct raft_io_append *req,
const struct raft_entry entries[],
unsigned n,
raft_io_append_cb cb);
int (*truncate)(struct raft_io *io, raft_index index);
/* Asynchronous request to store a new snapshot. */
int (*snapshot_put)(struct raft_io *io,
unsigned trailing,
struct raft_io_snapshot_put *req,
const struct raft_snapshot *snapshot,
raft_io_snapshot_put_cb cb);
/* Asynchronous request to load the most recent snapshot available. */
int (*snapshot_get)(struct raft_io *io,
struct raft_io_snapshot_get *req,
raft_io_snapshot_get_cb cb);
/* Returns a raft_time, i.e. a value in milliseconds since the epoch. */
raft_time (*time)(struct raft_io *io);
/* NOTE(review): presumably returns a random integer between @min and @max;
 * whether the bounds are inclusive is not stated -- confirm. */
int (*random)(struct raft_io *io, int min, int max);
/* Field(s) below added since version 2. */
/* Asynchronous work request, see struct raft_io_async_work. */
int (*async_work)(struct raft_io *io,
struct raft_io_async_work *req,
raft_io_async_work_cb cb);
};
/**
 * User-defined finite state machine that raft applies commands to.
 *
 * version field MUST be filled out by user.
 * When moving to a new version, the user MUST initialize the new methods,
 * either with an implementation or with NULL.
 *
 * version 2:
 * introduces `snapshot_finalize`, when this method is not NULL, it will
 * always run after a successful call to `snapshot`, whether the snapshot has
 * been successfully written to disk or not. If it is set, raft will
 * assume no ownership of any of the `raft_buffer`s and the responsibility to
 * clean up lies with the user of raft.
 * `snapshot_finalize` can be used to e.g. release a lock that was taken during
 * a call to `snapshot`. Until `snapshot_finalize` is called, raft can access
 * the data contained in the `raft_buffer`s.
 *
 * version 3:
 * Adds support for async snapshots through the `snapshot_async` function.
 * When this method is provided, raft will call `snapshot` in the main loop,
 * and when successful, will call `snapshot_async` using the `io->async_work`
 * method, so blocking I/O calls are allowed in the implementation. After the
 * `snapshot_async` completes, `snapshot_finalize` will be called in the main
 * loop, independent of the return value of `snapshot_async`.
 * An implementation that does not use asynchronous snapshots MUST set
 * `snapshot_async` to NULL.
 * All memory allocated by the snapshot routines MUST be freed by the snapshot
 * routines themselves.
 */
struct raft_fsm
{
int version; /* 1, 2 or 3 */
void *data; /* NOTE(review): presumably custom user data -- confirm. */
/* NOTE(review): presumably applies the command contained in @buf and stores
 * an optional result in @result -- confirm. */
int (*apply)(struct raft_fsm *fsm,
const struct raft_buffer *buf,
void **result);
/* Take a snapshot; @bufs and @n_bufs are output parameters. */
int (*snapshot)(struct raft_fsm *fsm,
struct raft_buffer *bufs[],
unsigned *n_bufs);
/* Restore a snapshot, which is always passed as a single buffer. */
int (*restore)(struct raft_fsm *fsm, struct raft_buffer *buf);
/* Fields below added since version 2. */
int (*snapshot_finalize)(struct raft_fsm *fsm,
struct raft_buffer *bufs[],
unsigned *n_bufs);
/* Fields below added since version 3. */
int (*snapshot_async)(struct raft_fsm *fsm,
struct raft_buffer *bufs[],
unsigned *n_bufs);
};
struct raft; /* Forward declaration. */
/**
 * State codes.
 *
 * These are the values held by the state field of struct raft and passed to
 * callbacks of type raft_state_cb.
 */
enum {
    RAFT_UNAVAILABLE = 0,
    RAFT_FOLLOWER = 1,
    RAFT_CANDIDATE = 2,
    RAFT_LEADER = 3
};
/**
 * State callback to invoke if raft's state changes.
 *
 * @old_state and @new_state are state codes (#RAFT_UNAVAILABLE,
 * #RAFT_FOLLOWER, #RAFT_CANDIDATE or #RAFT_LEADER). The callback is
 * registered with raft_register_state_cb().
 */
typedef void (*raft_state_cb)(struct raft *raft,
unsigned short old_state,
unsigned short new_state);
struct raft_progress; /* Forward declaration */
/**
 * Close callback, passed to raft_close().
 *
 * It's safe to release the memory of a raft instance only after this callback
 * has fired.
 */
typedef void (*raft_close_cb)(struct raft *raft);
struct raft_change; /* Forward declaration */
struct raft_transfer; /* Forward declaration */
struct raft_log; /* Forward declaration */
/**
 * Hold and drive the state of a single raft server in a cluster.
 *
 * When replacing reserved fields in the middle of this struct, you MUST use a
 * type with the same size and alignment requirements as the original type.
 */
struct raft
{
void *data; /* Custom user data. */
struct raft_tracer *tracer; /* Tracer implementation. */
struct raft_io *io; /* Disk and network I/O implementation. */
struct raft_fsm *fsm; /* User-defined FSM to apply commands to. */
raft_id id; /* Server ID of this raft instance. */
char *address; /* Server address of this raft instance. */
/*
 * Cache of the server's persistent state, updated on stable storage before
 * responding to RPCs (Figure 3.1).
 */
raft_term current_term; /* Latest term server has seen. */
raft_id voted_for; /* Candidate that received vote in current term. */
struct raft_log *log; /* Log entries. */
/*
 * Current membership configuration (Chapter 4).
 *
 * At any given moment the current configuration can be committed or
 * uncommitted.
 *
 * If a server is voting, the log entry with index 1 must always contain the
 * first committed configuration.
 *
 * At all times #configuration_committed_index is either zero or is the
 * index of the most recent log entry of type #RAFT_CHANGE that we know to
 * be committed. That means #configuration_committed_index is always equal
 * or lower than #commit_index.
 *
 * At all times #configuration_uncommitted_index is either zero or is the
 * index of an uncommitted log entry of type #RAFT_CHANGE. There can be at
 * most one uncommitted entry of type #RAFT_CHANGE because we allow only one
 * configuration change at a time.
 *
 * At all times #configuration_last_snapshot is a copy of the configuration
 * contained in the most recent snapshot, if any.
 *
 * The possible scenarios are:
 *
 * 1. #configuration_committed_index and #configuration_uncommitted_index
 * are both zero. This should only happen when a brand new server starts
 * joining a cluster and is waiting to receive log entries from the
 * current leader. In this case #configuration and
 * #configuration_last_snapshot must be empty and have no servers.
 *
 * 2. #configuration_committed_index is non-zero and
 * #configuration_uncommitted_index is zero. This means that
 * #configuration is committed and there is no pending configuration
 * change. The content of #configuration must match the one of the log
 * entry at #configuration_committed_index.
 *
 * 3. #configuration_committed_index and #configuration_uncommitted_index
 * are both non-zero, with the latter being greater than the former. This
 * means that #configuration is uncommitted and represents a pending
 * configuration change. The content of #configuration must match the one
 * of the log entry at #configuration_uncommitted_index.
 *
 * When a snapshot is taken, a copy of the most recent configuration known
 * to be committed (i.e. the configuration contained in the log entry at
 * #configuration_committed_index) is saved in #configuration_last_snapshot,
 * so it can be easily retrieved in case the log gets truncated because of
 * compaction and does not contain the entry at
 * #configuration_committed_index anymore. Likewise, if a snapshot is
 * restored its associated configuration is saved in
 * #configuration_last_snapshot.
 */
struct raft_configuration configuration;
struct raft_configuration configuration_last_snapshot;
raft_index configuration_committed_index;
raft_index configuration_uncommitted_index;
/*
 * Election timeout in milliseconds (default 1000).
 *
 * From 3.4:
 *
 * Raft uses a heartbeat mechanism to trigger leader election. When
 * servers start up, they begin as followers. A server remains in follower
 * state as long as it receives valid RPCs from a leader or
 * candidate. Leaders send periodic heartbeats (AppendEntries RPCs that
 * carry no log entries) to all followers in order to maintain their
 * authority. If a follower receives no communication over a period of
 * time called the election timeout, then it assumes there is no viable
 * leader and begins an election to choose a new leader.
 *
 * This is the baseline value and will be randomized between 1x and 2x.
 *
 * See raft_set_election_timeout() to customize the value of this
 * attribute.
 */
unsigned election_timeout;
/*
 * Heartbeat timeout in milliseconds (default 100). This is relevant only
 * for when the raft instance is in leader state: empty AppendEntries RPCs
 * will be sent if this amount of milliseconds elapses without any
 * user-triggered AppendEntries RPCs being sent.
 *
 * From Figure 3.1:
 *
 * [Leaders] Send empty AppendEntries RPC during idle periods to prevent
 * election timeouts.
 *
 * See raft_set_heartbeat_timeout() to customize the value of this
 * attribute.
 */
unsigned heartbeat_timeout;
/*
 * When the leader sends an InstallSnapshot RPC to a follower it will
 * consider the RPC as failed after this timeout and retry.
 *
 * See raft_set_install_snapshot_timeout() to customize the value of this
 * attribute.
 */
unsigned install_snapshot_timeout;
/*
 * The fields below hold the part of the server's volatile state which is
 * always applicable regardless of whether the server is follower,
 * candidate or leader (Figure 3.1). This state is rebuilt automatically
 * after a server restart.
 */
raft_index commit_index; /* Highest log entry known to be committed */
raft_index last_applied; /* Highest log entry applied to the FSM */
raft_index last_stored; /* Highest log entry persisted on disk */
/*
 * Current server state of this raft instance, along with a union defining
 * state-specific values.
 */
unsigned short state;
union {
struct /* Follower */
{
unsigned randomized_election_timeout; /* Timer expiration. */
struct /* Current leader info. */
{
raft_id id;
char *address;
} current_leader;
uint64_t append_in_flight_count;
uint64_t reserved[7]; /* Future use */
} follower_state;
struct /* Candidate */
{
unsigned randomized_election_timeout; /* Timer expiration. */
bool *votes; /* Vote results. */
bool disrupt_leader; /* For leadership transfer */
bool in_pre_vote; /* True in pre-vote phase. */
uint64_t reserved[8]; /* Future use */
} candidate_state;
struct /* Leader */
{
struct raft_progress *progress; /* Per-server replication state. */
struct raft_change *change; /* Pending membership change. */
raft_id promotee_id; /* ID of server being promoted. */
unsigned short round_number; /* Current sync round. */
raft_index round_index; /* Target of the current round. */
raft_time round_start; /* Start of current round. */
void *requests[2]; /* Outstanding client requests. */
uint32_t voter_contacts; /* Current number of voting nodes we are in contact with */
uint32_t reserved2; /* Future use */
uint64_t reserved[7]; /* Future use */
} leader_state;
};
/* Election timer start.
 *
 * This timer has different purposes depending on the state. Followers
 * convert to candidate after the randomized election timeout has elapsed
 * without leader contact. Candidates start a new election after the
 * randomized election timeout has elapsed without a winner. Leaders step
 * down after the election timeout has elapsed without contacting a majority
 * of voting servers. */
raft_time election_timer_start;
/* In-progress leadership transfer request, if any. */
struct raft_transfer *transfer;
/*
 * Information about the last snapshot that was taken (if any).
 */
struct
{
unsigned threshold; /* N. of entries before snapshot */
unsigned trailing; /* N. of trailing entries to retain */
struct raft_snapshot pending; /* In progress snapshot */
struct raft_io_snapshot_put put; /* Store snapshot request */
uint64_t reserved[8]; /* Future use */
} snapshot;
/*
 * Callback to invoke once a close request has completed.
 */
raft_close_cb close_cb;
/*
 * Human-readable message providing diagnostic information about the last
 * error that occurred.
 */
char errmsg[RAFT_ERRMSG_BUF_SIZE];
/* Whether to use pre-vote to avoid disconnected servers disrupting the
 * current leader, as described in 4.2.3 and 9.6. */
bool pre_vote;
/* Limit how long to wait for a stand-by to catch up with the log when it is
 * being promoted to voter: maximum number of catch-up rounds, and maximum
 * duration of a single round. See raft_set_max_catch_up_rounds() and
 * raft_set_max_catch_up_round_duration(). */
unsigned max_catch_up_rounds;
unsigned max_catch_up_round_duration;
/* uint64_t because we used a reserved field. In reality this is a pointer to
 * a `struct raft_callbacks` that can be used to store e.g. various
 * user-supplied callbacks. */
uint64_t callbacks;
/* Future extensions */
uint64_t reserved[31];
};
/**
 * Initialize a raft instance.
 *
 * @io is the disk and network I/O implementation and @fsm the user-defined
 * FSM to apply commands to; @id and @address are the server ID and server
 * address of this raft instance.
 *
 * NOTE(review): the return convention (presumably 0 on success, a RAFT_*
 * error code otherwise) and whether @address is copied are not stated in this
 * header -- confirm against the implementation.
 */
RAFT_API int raft_init(struct raft *r,
struct raft_io *io,
struct raft_fsm *fsm,
raft_id id,
const char *address);
/**
 * Request to close a raft instance.
 *
 * @cb is the callback to invoke once the close request has completed. It's
 * safe to release the memory of @r only after that callback has fired.
 */
RAFT_API void raft_close(struct raft *r, raft_close_cb cb);
/**
 * Register a callback to be notified of state changes.
 *
 * This function MUST be called after raft_init() and before raft_start().
 * @cb will be called every time the raft state changes, receiving the old and
 * the new state codes.
 */
RAFT_API void raft_register_state_cb(struct raft *r, raft_state_cb cb);
/**
 * Bootstrap this raft instance using the given configuration.
 *
 * The instance must not have been started yet and must be completely
 * pristine, otherwise #RAFT_CANTBOOTSTRAP will be returned.
 */
RAFT_API int raft_bootstrap(struct raft *r,
const struct raft_configuration *conf);
/**
 * Force a new configuration in order to recover from a loss of quorum where the
 * current configuration cannot be restored, such as when a majority of servers
 * die at the same time.
 *
 * This works by appending the new configuration directly to the log stored on
 * disk.
 *
 * In order for this operation to be safe you must follow these steps:
 *
 * 1. Make sure that no servers in the cluster are running, either because they
 *    died or because you manually stopped them.
 *
 * 2. Run raft_recover() exactly one time, on the non-dead server which has
 *    the highest term and the longest log.
 *
 * 3. Copy the data directory of the server you ran raft_recover() on to all
 *    other non-dead servers in the cluster, replacing their current data
 *    directory.
 *
 * 4. Restart all servers.
 */
RAFT_API int raft_recover(struct raft *r,
const struct raft_configuration *conf);
/**
 * Start this raft instance.
 *
 * State callbacks must be registered with raft_register_state_cb() before
 * calling this function.
 *
 * NOTE(review): the return convention (presumably 0 on success, a RAFT_*
 * error code otherwise) is not stated in this header -- confirm.
 */
RAFT_API int raft_start(struct raft *r);
/**
 * Set the election timeout, in milliseconds.
 *
 * Every raft instance is initialized with a default election timeout of 1000
 * milliseconds. If you wish to tweak it, call this function before starting
 * your event loop.
 *
 * From Chapter 9:
 *
 * We recommend a range that is 10-20 times the one-way network latency, which
 * keeps split votes rates under 40% in all cases for reasonably sized
 * clusters, and typically results in much lower rates.
 *
 * Note that the current random election timer will be reset and a new one
 * will be generated.
 */
RAFT_API void raft_set_election_timeout(struct raft *r, unsigned msecs);
/**
 * Set the heartbeat timeout, in milliseconds. The default is 100.
 *
 * This is relevant only while the instance is in leader state: empty
 * AppendEntries RPCs are sent if this amount of time elapses without any
 * user-triggered AppendEntries RPCs being sent.
 */
RAFT_API void raft_set_heartbeat_timeout(struct raft *r, unsigned msecs);
/**
 * Set the snapshot install timeout.
 *
 * When the leader sends an InstallSnapshot RPC to a follower it will consider
 * the RPC as failed after this timeout and retry.
 */
RAFT_API void raft_set_install_snapshot_timeout(struct raft *r, unsigned msecs);
/**
 * Set the number of outstanding log entries before starting a new snapshot.
 * The default is 1024.
 */
RAFT_API void raft_set_snapshot_threshold(struct raft *r, unsigned n);
/**
 * Enable or disable pre-vote support. Pre-vote is turned off by default.
 *
 * Pre-vote is used to avoid disconnected servers disrupting the current
 * leader.
 */
RAFT_API void raft_set_pre_vote(struct raft *r, bool enabled);
/**
 * Set the number of outstanding log entries to keep in the log after a
 * snapshot has been taken. This avoids sending snapshots when a follower is
 * behind by just a few entries. The default is 128.
 */
RAFT_API void raft_set_snapshot_trailing(struct raft *r, unsigned n);
/**
 * Set the maximum number of catch-up rounds to try when replicating entries
 * to a stand-by server that is being promoted to voter, before giving up and
 * failing the configuration change. The default is 10.
 */
RAFT_API void raft_set_max_catch_up_rounds(struct raft *r, unsigned n);
/**
 * Set the maximum duration, in milliseconds, of a catch-up round when
 * replicating entries to a stand-by server that is being promoted to voter.
 * The default is 5 seconds.
 */
RAFT_API void raft_set_max_catch_up_round_duration(struct raft *r,
unsigned msecs);
/**
 * Return a human-readable description of the last error that occurred.
 *
 * NOTE(review): presumably this returns the errmsg buffer embedded in @r, in
 * which case the string is only valid for as long as @r is and may be
 * overwritten by later errors -- confirm.
 */
RAFT_API const char *raft_errmsg(struct raft *r);
/**
 * Return the code of the current raft state: one of #RAFT_UNAVAILABLE,
 * #RAFT_FOLLOWER, #RAFT_CANDIDATE or #RAFT_LEADER.
 */
RAFT_API int raft_state(struct raft *r);
/**
 * Return the code of the current raft role (#RAFT_SPARE, #RAFT_STANDBY or
 * #RAFT_VOTER), or -1 if this server is not in the current configuration.
 */
RAFT_API int raft_role(struct raft *r);
/**
 * Return the ID and address of the current known leader, if any.
 *
 * The results are stored in @id and @address.
 *
 * NOTE(review): what is stored when no leader is known (presumably 0 and
 * NULL), and how long the returned address stays valid, are not stated here
 * -- confirm.
 */
RAFT_API void raft_leader(struct raft *r, raft_id *id, const char **address);
/**
 * Return the index of the last entry that was appended to the local log.
 *
 * Appended entries are not necessarily committed or applied yet; see
 * raft_last_applied() for the index of the last applied entry.
 */
RAFT_API raft_index raft_last_index(struct raft *r);
/**
 * Return the index of the last entry that was applied to the local FSM.
 */
RAFT_API raft_index raft_last_applied(struct raft *r);
/**
 * Return the number of voting servers that the leader has recently been in
 * contact with. This can be used to help determine whether the cluster may be
 * in a degraded/at risk state.
 *
 * Returns valid values >= 1, because a leader is always in contact with
 * itself.
 * Returns -1 if called on a follower.
 *
 * NOTE(review): the result for a candidate or unavailable server is not stated
 * (presumably also -1) -- confirm.
 *
 * Note that the value returned may be out of date, and so should not be relied
 * upon for absolute correctness.
 */
RAFT_API int raft_voter_contacts(struct raft *r);
/**