From 86915775f15bcb4c6f8f6dc73b82e1d320e70776 Mon Sep 17 00:00:00 2001 From: Josh Hershberg Date: Tue, 14 Nov 2023 13:35:44 +0200 Subject: [PATCH 01/15] Cluster refactor: rename cluster.c -> cluster_legacy.c Signed-off-by: Josh Hershberg --- src/Makefile | 2 +- src/{cluster.c => cluster_legacy.c} | 0 src/cluster_legacy.h | 4 ++++ 3 files changed, 5 insertions(+), 1 deletion(-) rename src/{cluster.c => cluster_legacy.c} (100%) create mode 100644 src/cluster_legacy.h diff --git a/src/Makefile b/src/Makefile index c28f6f2b9bc..75d18f5e6c6 100644 --- a/src/Makefile +++ b/src/Makefile @@ -345,7 +345,7 @@ endif REDIS_SERVER_NAME=redis-server$(PROG_SUFFIX) REDIS_SENTINEL_NAME=redis-sentinel$(PROG_SUFFIX) -REDIS_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o +REDIS_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o 
cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o REDIS_CLI_NAME=redis-cli$(PROG_SUFFIX) REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o redisassert.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o REDIS_BENCHMARK_NAME=redis-benchmark$(PROG_SUFFIX) diff --git a/src/cluster.c b/src/cluster_legacy.c similarity index 100% rename from src/cluster.c rename to src/cluster_legacy.c diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h new file mode 100644 index 00000000000..5683ee86a51 --- /dev/null +++ b/src/cluster_legacy.h @@ -0,0 +1,4 @@ +#ifndef CLUSTER_LEGACY_H +#define CLUSTER_LEGACY_H + +#endif //CLUSTER_LEGACY_H From 6a6ae6ffe803485573bb22b9f97e416fc0bbe09d Mon Sep 17 00:00:00 2001 From: Josh Hershberg Date: Tue, 14 Nov 2023 13:36:43 +0200 Subject: [PATCH 02/15] Cluster refactor: Create new cluster.c and include of cluster_legacy.h create new cluster.c Signed-off-by: Josh Hershberg forgot to #include cluster_legacy.h Signed-off-by: Josh Hershberg --- src/cluster.c | 0 src/cluster_legacy.c | 1 + 2 files changed, 1 insertion(+) create mode 100644 src/cluster.c diff --git a/src/cluster.c b/src/cluster.c new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index a4e9080b38b..aa1dc20c02c 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -30,6 +30,7 @@ 
#include "server.h" #include "cluster.h" +#include "cluster_legacy.h" #include "endianconv.h" #include "connection.h" From 5292adb9853ed366eb94ff465d6d4aceb5d1e6bc Mon Sep 17 00:00:00 2001 From: Josh Hershberg Date: Mon, 30 Oct 2023 09:45:59 +0200 Subject: [PATCH 03/15] Cluster refactor: Move trivial stuff into cluster_legacy.h Move some declarations from cluster.h to cluster_legacy.h. The items moved are specific to the legacy clustering implementation and DO NOT require any other refactoring other than moving them from one file to another. Signed-off-by: Josh Hershberg --- src/cluster.h | 209 ------------------------------------------ src/cluster_legacy.h | 210 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 210 insertions(+), 209 deletions(-) diff --git a/src/cluster.h b/src/cluster.h index 0340349b2f3..7f6687ae7f8 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -13,14 +13,6 @@ #define CLUSTER_NAMELEN 40 /* sha1 hex length */ #define CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */ -/* The following defines are amount of time, sometimes expressed as - * multiplicators of the node timeout value (when ending with MULT). */ -#define CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. */ -#define CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */ -#define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */ -#define CLUSTER_MF_PAUSE_MULT 2 /* Master pause manual failover mult. */ -#define CLUSTER_SLAVE_MIGRATION_DELAY 5000 /* Delay for slave migration. */ - /* Redirection errors returned by getNodeByQuery(). */ #define CLUSTER_REDIR_NONE 0 /* Node can serve the request. */ #define CLUSTER_REDIR_CROSS_SLOT 1 /* -CROSSSLOT request. */ @@ -69,21 +61,6 @@ typedef struct clusterLink { #define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL) #define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER) -/* Reasons why a slave is not able to failover. 
*/ -#define CLUSTER_CANT_FAILOVER_NONE 0 -#define CLUSTER_CANT_FAILOVER_DATA_AGE 1 -#define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2 -#define CLUSTER_CANT_FAILOVER_EXPIRED 3 -#define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4 -#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD (10) /* seconds. */ - -/* clusterState todo_before_sleep flags. */ -#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0) -#define CLUSTER_TODO_UPDATE_STATE (1<<1) -#define CLUSTER_TODO_SAVE_CONFIG (1<<2) -#define CLUSTER_TODO_FSYNC_CONFIG (1<<3) -#define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1<<4) - /* Message types. * * Note that the PING, PONG and MEET messages are actually the same exact @@ -201,192 +178,6 @@ typedef struct clusterState { unsigned char owner_not_claiming_slot[CLUSTER_SLOTS / 8]; } clusterState; -/* Redis cluster messages header */ - -/* Initially we don't know our "name", but we'll find it once we connect - * to the first node, using the getsockname() function. Then we'll use this - * address for all the next messages. */ -typedef struct { - char nodename[CLUSTER_NAMELEN]; - uint32_t ping_sent; - uint32_t pong_received; - char ip[NET_IP_STR_LEN]; /* IP address last time it was seen */ - uint16_t port; /* primary port last time it was seen */ - uint16_t cport; /* cluster port last time it was seen */ - uint16_t flags; /* node->flags copy */ - uint16_t pport; /* secondary port last time it was seen */ - uint16_t notused1; -} clusterMsgDataGossip; - -typedef struct { - char nodename[CLUSTER_NAMELEN]; -} clusterMsgDataFail; - -typedef struct { - uint32_t channel_len; - uint32_t message_len; - unsigned char bulk_data[8]; /* 8 bytes just as placeholder. */ -} clusterMsgDataPublish; - -typedef struct { - uint64_t configEpoch; /* Config epoch of the specified instance. */ - char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */ - unsigned char slots[CLUSTER_SLOTS/8]; /* Slots bitmap. */ -} clusterMsgDataUpdate; - -typedef struct { - uint64_t module_id; /* ID of the sender module. 
*/ - uint32_t len; /* ID of the sender module. */ - uint8_t type; /* Type from 0 to 255. */ - unsigned char bulk_data[3]; /* 3 bytes just as placeholder. */ -} clusterMsgModule; - -/* The cluster supports optional extension messages that can be sent - * along with ping/pong/meet messages to give additional info in a - * consistent manner. */ -typedef enum { - CLUSTERMSG_EXT_TYPE_HOSTNAME, - CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME, - CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE, - CLUSTERMSG_EXT_TYPE_SHARDID, -} clusterMsgPingtypes; - -/* Helper function for making sure extensions are eight byte aligned. */ -#define EIGHT_BYTE_ALIGN(size) ((((size) + 7) / 8) * 8) - -typedef struct { - char hostname[1]; /* The announced hostname, ends with \0. */ -} clusterMsgPingExtHostname; - -typedef struct { - char human_nodename[1]; /* The announced nodename, ends with \0. */ -} clusterMsgPingExtHumanNodename; - -typedef struct { - char name[CLUSTER_NAMELEN]; /* Node name. */ - uint64_t ttl; /* Remaining time to blacklist the node, in seconds. */ -} clusterMsgPingExtForgottenNode; - -static_assert(sizeof(clusterMsgPingExtForgottenNode) % 8 == 0, ""); - -typedef struct { - char shard_id[CLUSTER_NAMELEN]; /* The shard_id, 40 bytes fixed. */ -} clusterMsgPingExtShardId; - -typedef struct { - uint32_t length; /* Total length of this extension message (including this header) */ - uint16_t type; /* Type of this extension message (see clusterMsgPingExtTypes) */ - uint16_t unused; /* 16 bits of padding to make this structure 8 byte aligned. */ - union { - clusterMsgPingExtHostname hostname; - clusterMsgPingExtHumanNodename human_nodename; - clusterMsgPingExtForgottenNode forgotten_node; - clusterMsgPingExtShardId shard_id; - } ext[]; /* Actual extension information, formatted so that the data is 8 - * byte aligned, regardless of its content. 
*/ -} clusterMsgPingExt; - -union clusterMsgData { - /* PING, MEET and PONG */ - struct { - /* Array of N clusterMsgDataGossip structures */ - clusterMsgDataGossip gossip[1]; - /* Extension data that can optionally be sent for ping/meet/pong - * messages. We can't explicitly define them here though, since - * the gossip array isn't the real length of the gossip data. */ - } ping; - - /* FAIL */ - struct { - clusterMsgDataFail about; - } fail; - - /* PUBLISH */ - struct { - clusterMsgDataPublish msg; - } publish; - - /* UPDATE */ - struct { - clusterMsgDataUpdate nodecfg; - } update; - - /* MODULE */ - struct { - clusterMsgModule msg; - } module; -}; - -#define CLUSTER_PROTO_VER 1 /* Cluster bus protocol version. */ - -typedef struct { - char sig[4]; /* Signature "RCmb" (Redis Cluster message bus). */ - uint32_t totlen; /* Total length of this message */ - uint16_t ver; /* Protocol version, currently set to 1. */ - uint16_t port; /* Primary port number (TCP or TLS). */ - uint16_t type; /* Message type */ - uint16_t count; /* Only used for some kind of messages. */ - uint64_t currentEpoch; /* The epoch accordingly to the sending node. */ - uint64_t configEpoch; /* The config epoch if it's a master, or the last - epoch advertised by its master if it is a - slave. */ - uint64_t offset; /* Master replication offset if node is a master or - processed replication offset if node is a slave. */ - char sender[CLUSTER_NAMELEN]; /* Name of the sender node */ - unsigned char myslots[CLUSTER_SLOTS/8]; - char slaveof[CLUSTER_NAMELEN]; - char myip[NET_IP_STR_LEN]; /* Sender IP, if not all zeroed. */ - uint16_t extensions; /* Number of extensions sent along with this packet. */ - char notused1[30]; /* 30 bytes reserved for future usage. 
*/ - uint16_t pport; /* Secondary port number: if primary port is TCP port, this is - TLS port, and if primary port is TLS port, this is TCP port.*/ - uint16_t cport; /* Sender TCP cluster bus port */ - uint16_t flags; /* Sender node flags */ - unsigned char state; /* Cluster state from the POV of the sender */ - unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... */ - union clusterMsgData data; -} clusterMsg; - -/* clusterMsg defines the gossip wire protocol exchanged among Redis cluster - * members, which can be running different versions of redis-server bits, - * especially during cluster rolling upgrades. - * - * Therefore, fields in this struct should remain at the same offset from - * release to release. The static asserts below ensures that incompatible - * changes in clusterMsg be caught at compile time. - */ - -static_assert(offsetof(clusterMsg, sig) == 0, "unexpected field offset"); -static_assert(offsetof(clusterMsg, totlen) == 4, "unexpected field offset"); -static_assert(offsetof(clusterMsg, ver) == 8, "unexpected field offset"); -static_assert(offsetof(clusterMsg, port) == 10, "unexpected field offset"); -static_assert(offsetof(clusterMsg, type) == 12, "unexpected field offset"); -static_assert(offsetof(clusterMsg, count) == 14, "unexpected field offset"); -static_assert(offsetof(clusterMsg, currentEpoch) == 16, "unexpected field offset"); -static_assert(offsetof(clusterMsg, configEpoch) == 24, "unexpected field offset"); -static_assert(offsetof(clusterMsg, offset) == 32, "unexpected field offset"); -static_assert(offsetof(clusterMsg, sender) == 40, "unexpected field offset"); -static_assert(offsetof(clusterMsg, myslots) == 80, "unexpected field offset"); -static_assert(offsetof(clusterMsg, slaveof) == 2128, "unexpected field offset"); -static_assert(offsetof(clusterMsg, myip) == 2168, "unexpected field offset"); -static_assert(offsetof(clusterMsg, extensions) == 2214, "unexpected field offset"); -static_assert(offsetof(clusterMsg, 
notused1) == 2216, "unexpected field offset"); -static_assert(offsetof(clusterMsg, pport) == 2246, "unexpected field offset"); -static_assert(offsetof(clusterMsg, cport) == 2248, "unexpected field offset"); -static_assert(offsetof(clusterMsg, flags) == 2250, "unexpected field offset"); -static_assert(offsetof(clusterMsg, state) == 2252, "unexpected field offset"); -static_assert(offsetof(clusterMsg, mflags) == 2253, "unexpected field offset"); -static_assert(offsetof(clusterMsg, data) == 2256, "unexpected field offset"); - -#define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData)) - -/* Message flags better specify the packet content or are used to - * provide some information about the node state. */ -#define CLUSTERMSG_FLAG0_PAUSED (1<<0) /* Master paused for manual failover. */ -#define CLUSTERMSG_FLAG0_FORCEACK (1<<1) /* Give ACK to AUTH_REQUEST even if - master is up. */ -#define CLUSTERMSG_FLAG0_EXT_DATA (1<<2) /* Message contains extension data */ - /* ---------------------- API exported outside cluster.c -------------------- */ void clusterInit(void); void clusterInitListeners(void); diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 5683ee86a51..43234d88937 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -1,4 +1,214 @@ #ifndef CLUSTER_LEGACY_H #define CLUSTER_LEGACY_H +/* The following defines are amount of time, sometimes expressed as + * multiplicators of the node timeout value (when ending with MULT). */ +#define CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. */ +#define CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */ +#define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */ +#define CLUSTER_MF_PAUSE_MULT 2 /* Master pause manual failover mult. */ +#define CLUSTER_SLAVE_MIGRATION_DELAY 5000 /* Delay for slave migration. */ + +/* Reasons why a slave is not able to failover. 
*/ +#define CLUSTER_CANT_FAILOVER_NONE 0 +#define CLUSTER_CANT_FAILOVER_DATA_AGE 1 +#define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2 +#define CLUSTER_CANT_FAILOVER_EXPIRED 3 +#define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4 +#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD (10) /* seconds. */ + +/* clusterState todo_before_sleep flags. */ +#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0) +#define CLUSTER_TODO_UPDATE_STATE (1<<1) +#define CLUSTER_TODO_SAVE_CONFIG (1<<2) +#define CLUSTER_TODO_FSYNC_CONFIG (1<<3) +#define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1<<4) + + +/* Redis cluster messages header */ + +/* Initially we don't know our "name", but we'll find it once we connect + * to the first node, using the getsockname() function. Then we'll use this + * address for all the next messages. */ +typedef struct { + char nodename[CLUSTER_NAMELEN]; + uint32_t ping_sent; + uint32_t pong_received; + char ip[NET_IP_STR_LEN]; /* IP address last time it was seen */ + uint16_t port; /* primary port last time it was seen */ + uint16_t cport; /* cluster port last time it was seen */ + uint16_t flags; /* node->flags copy */ + uint16_t pport; /* secondary port last time it was seen */ + uint16_t notused1; +} clusterMsgDataGossip; + +typedef struct { + char nodename[CLUSTER_NAMELEN]; +} clusterMsgDataFail; + +typedef struct { + uint32_t channel_len; + uint32_t message_len; + unsigned char bulk_data[8]; /* 8 bytes just as placeholder. */ +} clusterMsgDataPublish; + +typedef struct { + uint64_t configEpoch; /* Config epoch of the specified instance. */ + char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */ + unsigned char slots[CLUSTER_SLOTS/8]; /* Slots bitmap. */ +} clusterMsgDataUpdate; + +typedef struct { + uint64_t module_id; /* ID of the sender module. */ + uint32_t len; /* ID of the sender module. */ + uint8_t type; /* Type from 0 to 255. */ + unsigned char bulk_data[3]; /* 3 bytes just as placeholder. 
*/ +} clusterMsgModule; + +/* The cluster supports optional extension messages that can be sent + * along with ping/pong/meet messages to give additional info in a + * consistent manner. */ +typedef enum { + CLUSTERMSG_EXT_TYPE_HOSTNAME, + CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME, + CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE, + CLUSTERMSG_EXT_TYPE_SHARDID, +} clusterMsgPingtypes; + +/* Helper function for making sure extensions are eight byte aligned. */ +#define EIGHT_BYTE_ALIGN(size) ((((size) + 7) / 8) * 8) + +typedef struct { + char hostname[1]; /* The announced hostname, ends with \0. */ +} clusterMsgPingExtHostname; + +typedef struct { + char human_nodename[1]; /* The announced nodename, ends with \0. */ +} clusterMsgPingExtHumanNodename; + +typedef struct { + char name[CLUSTER_NAMELEN]; /* Node name. */ + uint64_t ttl; /* Remaining time to blacklist the node, in seconds. */ +} clusterMsgPingExtForgottenNode; + +static_assert(sizeof(clusterMsgPingExtForgottenNode) % 8 == 0, ""); + +typedef struct { + char shard_id[CLUSTER_NAMELEN]; /* The shard_id, 40 bytes fixed. */ +} clusterMsgPingExtShardId; + +typedef struct { + uint32_t length; /* Total length of this extension message (including this header) */ + uint16_t type; /* Type of this extension message (see clusterMsgPingExtTypes) */ + uint16_t unused; /* 16 bits of padding to make this structure 8 byte aligned. */ + union { + clusterMsgPingExtHostname hostname; + clusterMsgPingExtHumanNodename human_nodename; + clusterMsgPingExtForgottenNode forgotten_node; + clusterMsgPingExtShardId shard_id; + } ext[]; /* Actual extension information, formatted so that the data is 8 + * byte aligned, regardless of its content. */ +} clusterMsgPingExt; + +union clusterMsgData { + /* PING, MEET and PONG */ + struct { + /* Array of N clusterMsgDataGossip structures */ + clusterMsgDataGossip gossip[1]; + /* Extension data that can optionally be sent for ping/meet/pong + * messages. 
We can't explicitly define them here though, since + * the gossip array isn't the real length of the gossip data. */ + } ping; + + /* FAIL */ + struct { + clusterMsgDataFail about; + } fail; + + /* PUBLISH */ + struct { + clusterMsgDataPublish msg; + } publish; + + /* UPDATE */ + struct { + clusterMsgDataUpdate nodecfg; + } update; + + /* MODULE */ + struct { + clusterMsgModule msg; + } module; +}; + +#define CLUSTER_PROTO_VER 1 /* Cluster bus protocol version. */ + +typedef struct { + char sig[4]; /* Signature "RCmb" (Redis Cluster message bus). */ + uint32_t totlen; /* Total length of this message */ + uint16_t ver; /* Protocol version, currently set to 1. */ + uint16_t port; /* Primary port number (TCP or TLS). */ + uint16_t type; /* Message type */ + uint16_t count; /* Only used for some kind of messages. */ + uint64_t currentEpoch; /* The epoch accordingly to the sending node. */ + uint64_t configEpoch; /* The config epoch if it's a master, or the last + epoch advertised by its master if it is a + slave. */ + uint64_t offset; /* Master replication offset if node is a master or + processed replication offset if node is a slave. */ + char sender[CLUSTER_NAMELEN]; /* Name of the sender node */ + unsigned char myslots[CLUSTER_SLOTS/8]; + char slaveof[CLUSTER_NAMELEN]; + char myip[NET_IP_STR_LEN]; /* Sender IP, if not all zeroed. */ + uint16_t extensions; /* Number of extensions sent along with this packet. */ + char notused1[30]; /* 30 bytes reserved for future usage. */ + uint16_t pport; /* Secondary port number: if primary port is TCP port, this is + TLS port, and if primary port is TLS port, this is TCP port.*/ + uint16_t cport; /* Sender TCP cluster bus port */ + uint16_t flags; /* Sender node flags */ + unsigned char state; /* Cluster state from the POV of the sender */ + unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... 
*/ + union clusterMsgData data; +} clusterMsg; + +/* clusterMsg defines the gossip wire protocol exchanged among Redis cluster + * members, which can be running different versions of redis-server bits, + * especially during cluster rolling upgrades. + * + * Therefore, fields in this struct should remain at the same offset from + * release to release. The static asserts below ensures that incompatible + * changes in clusterMsg be caught at compile time. + */ + +static_assert(offsetof(clusterMsg, sig) == 0, "unexpected field offset"); +static_assert(offsetof(clusterMsg, totlen) == 4, "unexpected field offset"); +static_assert(offsetof(clusterMsg, ver) == 8, "unexpected field offset"); +static_assert(offsetof(clusterMsg, port) == 10, "unexpected field offset"); +static_assert(offsetof(clusterMsg, type) == 12, "unexpected field offset"); +static_assert(offsetof(clusterMsg, count) == 14, "unexpected field offset"); +static_assert(offsetof(clusterMsg, currentEpoch) == 16, "unexpected field offset"); +static_assert(offsetof(clusterMsg, configEpoch) == 24, "unexpected field offset"); +static_assert(offsetof(clusterMsg, offset) == 32, "unexpected field offset"); +static_assert(offsetof(clusterMsg, sender) == 40, "unexpected field offset"); +static_assert(offsetof(clusterMsg, myslots) == 80, "unexpected field offset"); +static_assert(offsetof(clusterMsg, slaveof) == 2128, "unexpected field offset"); +static_assert(offsetof(clusterMsg, myip) == 2168, "unexpected field offset"); +static_assert(offsetof(clusterMsg, extensions) == 2214, "unexpected field offset"); +static_assert(offsetof(clusterMsg, notused1) == 2216, "unexpected field offset"); +static_assert(offsetof(clusterMsg, pport) == 2246, "unexpected field offset"); +static_assert(offsetof(clusterMsg, cport) == 2248, "unexpected field offset"); +static_assert(offsetof(clusterMsg, flags) == 2250, "unexpected field offset"); +static_assert(offsetof(clusterMsg, state) == 2252, "unexpected field offset"); 
+static_assert(offsetof(clusterMsg, mflags) == 2253, "unexpected field offset"); +static_assert(offsetof(clusterMsg, data) == 2256, "unexpected field offset"); + +#define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData)) + +/* Message flags better specify the packet content or are used to + * provide some information about the node state. */ +#define CLUSTERMSG_FLAG0_PAUSED (1<<0) /* Master paused for manual failover. */ +#define CLUSTERMSG_FLAG0_FORCEACK (1<<1) /* Give ACK to AUTH_REQUEST even if + master is up. */ +#define CLUSTERMSG_FLAG0_EXT_DATA (1<<2) /* Message contains extension data */ + #endif //CLUSTER_LEGACY_H From 98a6c44b751df2a79d55703a8ff7caa740a8f064 Mon Sep 17 00:00:00 2001 From: Josh Hershberg Date: Mon, 30 Oct 2023 12:38:43 +0200 Subject: [PATCH 04/15] Cluster refactor: Make clusterState private Move clusterState into cluster_legacy.h. In order to achieve this some "accessor" methods needed to be added to the cluster API and some other minor refactors. Signed-off-by: Josh Hershberg --- src/cluster.h | 54 ++++++-------------------------------------- src/cluster_legacy.c | 42 +++++++++++++++++++++++++++++++++- src/cluster_legacy.h | 50 ++++++++++++++++++++++++++++++++++++++++ src/db.c | 2 +- src/module.c | 24 ++++---------------- src/replication.c | 2 +- src/script.c | 2 +- src/server.c | 4 ++-- src/server.h | 1 + 9 files changed, 108 insertions(+), 73 deletions(-) diff --git a/src/cluster.h b/src/cluster.h index 7f6687ae7f8..757861588fa 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -129,54 +129,8 @@ typedef struct clusterNode { list *fail_reports; /* List of nodes signaling this as failing */ } clusterNode; -typedef struct clusterState { - clusterNode *myself; /* This node */ - uint64_t currentEpoch; - int state; /* CLUSTER_OK, CLUSTER_FAIL, ... 
*/ - int size; /* Num of master nodes with at least one slot */ - dict *nodes; /* Hash table of name -> clusterNode structures */ - dict *shards; /* Hash table of shard_id -> list (of nodes) structures */ - dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */ - clusterNode *migrating_slots_to[CLUSTER_SLOTS]; - clusterNode *importing_slots_from[CLUSTER_SLOTS]; - clusterNode *slots[CLUSTER_SLOTS]; - rax *slots_to_channels; - /* The following fields are used to take the slave state on elections. */ - mstime_t failover_auth_time; /* Time of previous or next election. */ - int failover_auth_count; /* Number of votes received so far. */ - int failover_auth_sent; /* True if we already asked for votes. */ - int failover_auth_rank; /* This slave rank for current auth request. */ - uint64_t failover_auth_epoch; /* Epoch of the current election. */ - int cant_failover_reason; /* Why a slave is currently not able to - failover. See the CANT_FAILOVER_* macros. */ - /* Manual failover state in common. */ - mstime_t mf_end; /* Manual failover time limit (ms unixtime). - It is zero if there is no MF in progress. */ - /* Manual failover state of master. */ - clusterNode *mf_slave; /* Slave performing the manual failover. */ - /* Manual failover state of slave. */ - long long mf_master_offset; /* Master offset the slave needs to start MF - or -1 if still not received. */ - int mf_can_start; /* If non-zero signal that the manual failover - can start requesting masters vote. */ - /* The following fields are used by masters to take state on elections. */ - uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */ - int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ - /* Stats */ - /* Messages received and sent by type. */ - long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT]; - long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT]; - long long stats_pfail_nodes; /* Number of nodes in PFAIL status, - excluding nodes without address. 
*/ - unsigned long long stat_cluster_links_buffer_limit_exceeded; /* Total number of cluster links freed due to exceeding buffer limit */ +struct clusterState; - /* Bit map for slots that are no longer claimed by the owner in cluster PING - * messages. During slot migration, the owner will stop claiming the slot after - * the ownership transfer. Set the bit corresponding to the slot when a node - * stops claiming the slot. This prevents spreading incorrect information (that - * source still owns the slot) using UPDATE messages. */ - unsigned char owner_not_claiming_slot[CLUSTER_SLOTS / 8]; -} clusterState; /* ---------------------- API exported outside cluster.c -------------------- */ void clusterInit(void); @@ -208,5 +162,11 @@ int clusterNodeGetSlotBit(clusterNode *n, int slot); void clusterUpdateMyselfHumanNodename(void); int isValidAuxString(char *s, unsigned int length); int getNodeDefaultClientPort(clusterNode *n); +int clusterNodeIsMyself(clusterNode *n); +clusterNode* getMyClusterNode(void); +int clusterManualFailoverTimeLimit(void); +char* getMyClusterId(void); +int getClusterSize(void); +char** getClusterNodesList(size_t *numnodes); #endif /* __CLUSTER_H */ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index aa1dc20c02c..a5cfcf7c540 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -985,7 +985,7 @@ void clusterUpdateMyselfHumanNodename(void) { void clusterInit(void) { int saveconf = 0; - server.cluster = zmalloc(sizeof(clusterState)); + server.cluster = zmalloc(sizeof(struct clusterState)); server.cluster->myself = NULL; server.cluster->currentEpoch = 0; server.cluster->state = CLUSTER_FAIL; @@ -7656,3 +7656,43 @@ unsigned int countChannelsInSlot(unsigned int hashslot) { raxStop(&iter); return j; } + +int clusterNodeIsMyself(clusterNode *n) { + return n == server.cluster->myself; +} + +clusterNode* getMyClusterNode(void) { + return server.cluster->myself; +} + +int clusterManualFailoverTimeLimit(void) { + return 
server.cluster->mf_end; +} + +char* getMyClusterId(void) { + return server.cluster->myself->name; +} + +int getClusterSize(void) { + return dictSize(server.cluster->nodes); +} + +char** getClusterNodesList(size_t *numnodes) { + size_t count = dictSize(server.cluster->nodes); + char **ids = zmalloc((count+1)*CLUSTER_NAMELEN); + dictIterator *di = dictGetIterator(server.cluster->nodes); + dictEntry *de; + int j = 0; + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + if (node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) continue; + ids[j] = zmalloc(CLUSTER_NAMELEN); + memcpy(ids[j],node->name,CLUSTER_NAMELEN); + j++; + } + *numnodes = j; + ids[j] = NULL; /* Null term so that FreeClusterNodesList does not need + * to also get the count argument. */ + dictReleaseIterator(di); + return ids; +} diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 43234d88937..3c2e148fb29 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -211,4 +211,54 @@ static_assert(offsetof(clusterMsg, data) == 2256, "unexpected field offset"); master is up. */ #define CLUSTERMSG_FLAG0_EXT_DATA (1<<2) /* Message contains extension data */ +struct clusterState { + clusterNode *myself; /* This node */ + uint64_t currentEpoch; + int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */ + int size; /* Num of master nodes with at least one slot */ + dict *nodes; /* Hash table of name -> clusterNode structures */ + dict *shards; /* Hash table of shard_id -> list (of nodes) structures */ + dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */ + clusterNode *migrating_slots_to[CLUSTER_SLOTS]; + clusterNode *importing_slots_from[CLUSTER_SLOTS]; + clusterNode *slots[CLUSTER_SLOTS]; + rax *slots_to_channels; + /* The following fields are used to take the slave state on elections. */ + mstime_t failover_auth_time; /* Time of previous or next election. */ + int failover_auth_count; /* Number of votes received so far. 
*/ + int failover_auth_sent; /* True if we already asked for votes. */ + int failover_auth_rank; /* This slave rank for current auth request. */ + uint64_t failover_auth_epoch; /* Epoch of the current election. */ + int cant_failover_reason; /* Why a slave is currently not able to + failover. See the CANT_FAILOVER_* macros. */ + /* Manual failover state in common. */ + mstime_t mf_end; /* Manual failover time limit (ms unixtime). + It is zero if there is no MF in progress. */ + /* Manual failover state of master. */ + clusterNode *mf_slave; /* Slave performing the manual failover. */ + /* Manual failover state of slave. */ + long long mf_master_offset; /* Master offset the slave needs to start MF + or -1 if still not received. */ + int mf_can_start; /* If non-zero signal that the manual failover + can start requesting masters vote. */ + /* The following fields are used by masters to take state on elections. */ + uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */ + int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ + /* Stats */ + /* Messages received and sent by type. */ + long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT]; + long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT]; + long long stats_pfail_nodes; /* Number of nodes in PFAIL status, + excluding nodes without address. */ + unsigned long long stat_cluster_links_buffer_limit_exceeded; /* Total number of cluster links freed due to exceeding buffer limit */ + + /* Bit map for slots that are no longer claimed by the owner in cluster PING + * messages. During slot migration, the owner will stop claiming the slot after + * the ownership transfer. Set the bit corresponding to the slot when a node + * stops claiming the slot. This prevents spreading incorrect information (that + * source still owns the slot) using UPDATE messages. 
*/ + unsigned char owner_not_claiming_slot[CLUSTER_SLOTS / 8]; +}; + + #endif //CLUSTER_LEGACY_H diff --git a/src/db.c b/src/db.c index c4c22907eb7..f77db3f885c 100644 --- a/src/db.c +++ b/src/db.c @@ -2197,7 +2197,7 @@ int dbExpand(const redisDb *db, uint64_t db_size, dbKeyType keyType, int try_exp dict *d; if (server.cluster_enabled) { for (int i = 0; i < CLUSTER_SLOTS; i++) { - if (clusterNodeGetSlotBit(server.cluster->myself, i)) { + if (clusterNodeGetSlotBit(getMyClusterNode(), i)) { /* We don't know exact number of keys that would fall into each slot, but we can approximate it, assuming even distribution. */ if (keyType == DB_MAIN) { d = db->dict[i]; diff --git a/src/module.c b/src/module.c index 0428ac59c0b..b24527fc192 100644 --- a/src/module.c +++ b/src/module.c @@ -6466,7 +6466,7 @@ RedisModuleCallReply *RM_Call(RedisModuleCtx *ctx, const char *cmdname, const ch c->flags &= ~(CLIENT_READONLY|CLIENT_ASKING); c->flags |= ctx->client->flags & (CLIENT_READONLY|CLIENT_ASKING); if (getNodeByQuery(c,c->cmd,c->argv,c->argc,NULL,&error_code) != - server.cluster->myself) + getMyClusterNode()) { sds msg = NULL; if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) { @@ -8917,23 +8917,7 @@ char **RM_GetClusterNodesList(RedisModuleCtx *ctx, size_t *numnodes) { UNUSED(ctx); if (!server.cluster_enabled) return NULL; - size_t count = dictSize(server.cluster->nodes); - char **ids = zmalloc((count+1)*REDISMODULE_NODE_ID_LEN); - dictIterator *di = dictGetIterator(server.cluster->nodes); - dictEntry *de; - int j = 0; - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - if (node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) continue; - ids[j] = zmalloc(REDISMODULE_NODE_ID_LEN); - memcpy(ids[j],node->name,REDISMODULE_NODE_ID_LEN); - j++; - } - *numnodes = j; - ids[j] = NULL; /* Null term so that FreeClusterNodesList does not need - * to also get the count argument. 
*/ - dictReleaseIterator(di); - return ids; + return getClusterNodesList(numnodes); } /* Free the node list obtained with RedisModule_GetClusterNodesList. */ @@ -8947,7 +8931,7 @@ void RM_FreeClusterNodesList(char **ids) { * is disabled. */ const char *RM_GetMyClusterID(void) { if (!server.cluster_enabled) return NULL; - return server.cluster->myself->name; + return getMyClusterId(); } /* Return the number of nodes in the cluster, regardless of their state @@ -8956,7 +8940,7 @@ const char *RM_GetMyClusterID(void) { * cluster mode, zero is returned. */ size_t RM_GetClusterSize(void) { if (!server.cluster_enabled) return 0; - return dictSize(server.cluster->nodes); + return getClusterSize(); } /* Populate the specified info for the node having as ID the specified 'id', diff --git a/src/replication.c b/src/replication.c index 313f69152c4..e64251663b4 100644 --- a/src/replication.c +++ b/src/replication.c @@ -3774,7 +3774,7 @@ void replicationCron(void) { * match the one stored into 'mf_master_offset' state. 
*/ int manual_failover_in_progress = ((server.cluster_enabled && - server.cluster->mf_end) || + clusterManualFailoverTimeLimit()) || server.failover_end_time) && isPausedActionsWithUpdate(PAUSE_ACTION_REPLICA); diff --git a/src/script.c b/src/script.c index d0b9b963500..678773d9680 100644 --- a/src/script.c +++ b/src/script.c @@ -429,7 +429,7 @@ static int scriptVerifyClusterState(scriptRunCtx *run_ctx, client *c, client *or c->flags &= ~(CLIENT_READONLY | CLIENT_ASKING); c->flags |= original_c->flags & (CLIENT_READONLY | CLIENT_ASKING); int hashslot = -1; - if (getNodeByQuery(c, c->cmd, c->argv, c->argc, &hashslot, &error_code) != server.cluster->myself) { + if (getNodeByQuery(c, c->cmd, c->argv, c->argc, &hashslot, &error_code) != getMyClusterNode()) { if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) { *err = sdsnew( "Script attempted to execute a write command while the " diff --git a/src/server.c b/src/server.c index e63a2ffff23..32767569054 100644 --- a/src/server.c +++ b/src/server.c @@ -4037,7 +4037,7 @@ int processCommand(client *c) { int error_code; clusterNode *n = getNodeByQuery(c,c->cmd,c->argv,c->argc, &c->slot,&error_code); - if (n == NULL || n != server.cluster->myself) { + if (n == NULL || !clusterNodeIsMyself(n)) { if (c->cmd->proc == execCommand) { discardTransaction(c); } else { @@ -6838,7 +6838,7 @@ int redisIsSupervised(int mode) { int iAmMaster(void) { return ((!server.cluster_enabled && server.masterhost == NULL) || - (server.cluster_enabled && nodeIsMaster(server.cluster->myself))); + (server.cluster_enabled && nodeIsMaster(getMyClusterNode()))); } #ifdef REDIS_TEST diff --git a/src/server.h b/src/server.h index 902050889cf..13486a54364 100644 --- a/src/server.h +++ b/src/server.h @@ -738,6 +738,7 @@ struct RedisModuleCtx; struct moduleLoadQueueEntry; struct RedisModuleKeyOptCtx; struct RedisModuleCommand; +struct clusterState; /* Each module type implementation should export a set of methods in order * to serialize and deserialize the value 
in the RDB file, rewrite the AOF From d9a0478599cb91d4cd38763ad09947d055bb21a9 Mon Sep 17 00:00:00 2001 From: Josh Hershberg Date: Mon, 30 Oct 2023 17:08:30 +0200 Subject: [PATCH 05/15] Cluster refactor: Make clusterNode private Move clusterNode into cluster_legacy.h. In order to achieve this some accessor methods were added and also a refactor of how debugCommand handles cluster related subcommands. Signed-off-by: Josh Hershberg --- src/cluster.h | 56 ++++++++---------------------- src/cluster_legacy.c | 81 ++++++++++++++++++++++++++++++++++++++++++++ src/cluster_legacy.h | 36 ++++++++++++++++++++ src/debug.c | 33 ++---------------- src/module.c | 21 ++++++------ src/networking.c | 26 +++++++++++--- src/server.h | 1 + 7 files changed, 166 insertions(+), 88 deletions(-) diff --git a/src/cluster.h b/src/cluster.h index 757861588fa..f9e53a06bf0 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -23,7 +23,8 @@ #define CLUSTER_REDIR_DOWN_UNBOUND 6 /* -CLUSTERDOWN, unbound slot. */ #define CLUSTER_REDIR_DOWN_RO_STATE 7 /* -CLUSTERDOWN, allow reads. */ -struct clusterNode; +typedef struct _clusterNode clusterNode; +struct clusterState; /* clusterLink encapsulates everything needed to talk with a remote node. */ typedef struct clusterLink { @@ -35,7 +36,7 @@ typedef struct clusterLink { char *rcvbuf; /* Packet reception buffer */ size_t rcvbuf_len; /* Used size of rcvbuf */ size_t rcvbuf_alloc; /* Allocated size of rcvbuf */ - struct clusterNode *node; /* Node related to this link. Initialized to NULL when unknown */ + clusterNode *node; /* Node related to this link. Initialized to NULL when unknown */ int inbound; /* 1 if this link is an inbound link accepted from the related node */ } clusterLink; @@ -52,7 +53,6 @@ typedef struct clusterLink { #define CLUSTER_NODE_NOFAILOVER 512 /* Slave will not try to failover. 
*/ #define CLUSTER_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" -#define nodeIsMaster(n) ((n)->flags & CLUSTER_NODE_MASTER) #define nodeIsSlave(n) ((n)->flags & CLUSTER_NODE_SLAVE) #define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE) #define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR)) @@ -89,47 +89,10 @@ typedef struct clusterLink { /* This structure represent elements of node->fail_reports. */ typedef struct clusterNodeFailReport { - struct clusterNode *node; /* Node reporting the failure condition. */ + clusterNode *node; /* Node reporting the failure condition. */ mstime_t time; /* Time of the last report from this node. */ } clusterNodeFailReport; -typedef struct clusterNode { - mstime_t ctime; /* Node object creation time. */ - char name[CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */ - char shard_id[CLUSTER_NAMELEN]; /* shard id, hex string, sha1-size */ - int flags; /* CLUSTER_NODE_... */ - uint64_t configEpoch; /* Last configEpoch observed for this node */ - unsigned char slots[CLUSTER_SLOTS/8]; /* slots handled by this node */ - uint16_t *slot_info_pairs; /* Slots info represented as (start/end) pair (consecutive index). */ - int slot_info_pairs_count; /* Used number of slots in slot_info_pairs */ - int numslots; /* Number of slots handled by this node */ - int numslaves; /* Number of slave nodes, if this is a master */ - struct clusterNode **slaves; /* pointers to slave nodes */ - struct clusterNode *slaveof; /* pointer to the master node. Note that it - may be NULL even if the node is a slave - if we don't have the master node in our - tables. 
*/ - unsigned long long last_in_ping_gossip; /* The number of the last carried in the ping gossip section */ - mstime_t ping_sent; /* Unix time we sent latest ping */ - mstime_t pong_received; /* Unix time we received the pong */ - mstime_t data_received; /* Unix time we received any data */ - mstime_t fail_time; /* Unix time when FAIL flag was set */ - mstime_t voted_time; /* Last time we voted for a slave of this master */ - mstime_t repl_offset_time; /* Unix time we received offset for this node */ - mstime_t orphaned_time; /* Starting time of orphaned master condition */ - long long repl_offset; /* Last known repl offset for this node. */ - char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */ - sds hostname; /* The known hostname for this node */ - sds human_nodename; /* The known human readable nodename for this node */ - int tcp_port; /* Latest known clients TCP port. */ - int tls_port; /* Latest known clients TLS port */ - int cport; /* Latest known cluster port of this node. 
*/ - clusterLink *link; /* TCP/IP link established toward this node */ - clusterLink *inbound_link; /* TCP/IP link accepted from this node */ - list *fail_reports; /* List of nodes signaling this as failing */ -} clusterNode; - -struct clusterState; /* ---------------------- API exported outside cluster.c -------------------- */ @@ -168,5 +131,16 @@ int clusterManualFailoverTimeLimit(void); char* getMyClusterId(void); int getClusterSize(void); char** getClusterNodesList(size_t *numnodes); +int nodeIsMaster(clusterNode *n); +int handleDebugClusterCommand(client *c); +int clusterNodeConfirmedReachable(clusterNode *node); +char* clusterNodeIp(clusterNode *node); +int clusterNodeIsSlave(clusterNode *node); +clusterNode *clusterNodeGetSlaveof(clusterNode *node); +char* clusterNodeGetName(clusterNode *node); +int clusterNodeTimedOut(clusterNode *node); +int clusterNodeIsFailing(clusterNode *node); +int clusterNodeIsNoFailover(clusterNode *node); +char **clusterDebugCommandHelp(void); #endif /* __CLUSTER_H */ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index a5cfcf7c540..6046caf1aa4 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -7696,3 +7696,84 @@ char** getClusterNodesList(size_t *numnodes) { dictReleaseIterator(di); return ids; } + +int nodeIsMaster(clusterNode *n) { + return n->flags & CLUSTER_NODE_MASTER; +} + +int handleDebugClusterCommand(client *c) { + if (strcasecmp(c->argv[1]->ptr, "CLUSTERLINK") || + strcasecmp(c->argv[2]->ptr, "KILL") || + c->argc != 5) { + return 0; + } + + if (!server.cluster_enabled) { + addReplyError(c, "Debug option only available for cluster mode enabled setup!"); + return 1; + } + + /* Find the node. */ + clusterNode *n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (!n) { + addReplyErrorFormat(c, "Unknown node %s", (char *) c->argv[4]->ptr); + return 1; + } + + /* Terminate the link based on the direction or all. 
*/ + if (!strcasecmp(c->argv[3]->ptr, "from")) { + freeClusterLink(n->inbound_link); + } else if (!strcasecmp(c->argv[3]->ptr, "to")) { + freeClusterLink(n->link); + } else if (!strcasecmp(c->argv[3]->ptr, "all")) { + freeClusterLink(n->link); + freeClusterLink(n->inbound_link); + } else { + addReplyErrorFormat(c, "Unknown direction %s", (char *) c->argv[3]->ptr); + } + addReply(c, shared.ok); + + return 1; +} + +int clusterNodeConfirmedReachable(clusterNode *node) { + return !(node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)); +} + +char* clusterNodeIp(clusterNode *node) { + return node->ip; +} + +int clusterNodeIsSlave(clusterNode *node) { + return !nodeIsMaster(node); +} + +clusterNode *clusterNodeGetSlaveof(clusterNode *node) { + return node->slaveof; +} + +char* clusterNodeGetName(clusterNode *node) { + return node->name; +} + +int clusterNodeTimedOut(clusterNode *node) { + return nodeTimedOut(node); +} + +int clusterNodeIsFailing(clusterNode *node) { + return nodeFailed(node); +} + +int clusterNodeIsNoFailover(clusterNode *node) { + return node->flags & CLUSTER_NODE_NOFAILOVER; +} + +char **clusterDebugCommandHelp(void) { + const char *help[] = { + "CLUSTERLINK KILL ", + " Kills the link based on the direction to/from (both) with the provided node." , + NULL + }; + + return help; +} diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 3c2e148fb29..a2f6edfb6fd 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -211,6 +211,42 @@ static_assert(offsetof(clusterMsg, data) == 2256, "unexpected field offset"); master is up. */ #define CLUSTERMSG_FLAG0_EXT_DATA (1<<2) /* Message contains extension data */ +struct _clusterNode { + mstime_t ctime; /* Node object creation time. */ + char name[CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */ + char shard_id[CLUSTER_NAMELEN]; /* shard id, hex string, sha1-size */ + int flags; /* CLUSTER_NODE_... 
*/ + uint64_t configEpoch; /* Last configEpoch observed for this node */ + unsigned char slots[CLUSTER_SLOTS/8]; /* slots handled by this node */ + uint16_t *slot_info_pairs; /* Slots info represented as (start/end) pair (consecutive index). */ + int slot_info_pairs_count; /* Used number of slots in slot_info_pairs */ + int numslots; /* Number of slots handled by this node */ + int numslaves; /* Number of slave nodes, if this is a master */ + clusterNode **slaves; /* pointers to slave nodes */ + clusterNode *slaveof; /* pointer to the master node. Note that it + may be NULL even if the node is a slave + if we don't have the master node in our + tables. */ + unsigned long long last_in_ping_gossip; /* The number of the last carried in the ping gossip section */ + mstime_t ping_sent; /* Unix time we sent latest ping */ + mstime_t pong_received; /* Unix time we received the pong */ + mstime_t data_received; /* Unix time we received any data */ + mstime_t fail_time; /* Unix time when FAIL flag was set */ + mstime_t voted_time; /* Last time we voted for a slave of this master */ + mstime_t repl_offset_time; /* Unix time we received offset for this node */ + mstime_t orphaned_time; /* Starting time of orphaned master condition */ + long long repl_offset; /* Last known repl offset for this node. */ + char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */ + sds hostname; /* The known hostname for this node */ + sds human_nodename; /* The known human readable nodename for this node */ + int tcp_port; /* Latest known clients TCP port. */ + int tls_port; /* Latest known clients TLS port */ + int cport; /* Latest known cluster port of this node. 
*/ + clusterLink *link; /* TCP/IP link established toward this node */ + clusterLink *inbound_link; /* TCP/IP link accepted from this node */ + list *fail_reports; /* List of nodes signaling this as failing */ +}; + struct clusterState { clusterNode *myself; /* This node */ uint64_t currentEpoch; diff --git a/src/debug.c b/src/debug.c index e537126f8da..b6ed93a79cd 100644 --- a/src/debug.c +++ b/src/debug.c @@ -496,11 +496,9 @@ void debugCommand(client *c) { " In case RESET is provided the peak reset time will be restored to the default value", "REPLYBUFFER RESIZING <0|1>", " Enable or disable the reply buffer resize cron job", -"CLUSTERLINK KILL ", -" Kills the link based on the direction to/from (both) with the provided node." , NULL }; - addReplyHelp(c, help); + addExtendedReplyHelp(c, help, clusterDebugCommandHelp()); } else if (!strcasecmp(c->argv[1]->ptr,"segfault")) { /* Compiler gives warnings about writing to a random address * e.g "*((char*)-1) = 'x';". As a workaround, we map a read-only area @@ -1018,34 +1016,7 @@ NULL return; } addReply(c, shared.ok); - } else if(!strcasecmp(c->argv[1]->ptr,"CLUSTERLINK") && - !strcasecmp(c->argv[2]->ptr,"KILL") && - c->argc == 5) { - if (!server.cluster_enabled) { - addReplyError(c, "Debug option only available for cluster mode enabled setup!"); - return; - } - - /* Find the node. */ - clusterNode *n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[4]->ptr); - return; - } - - /* Terminate the link based on the direction or all. 
*/ - if (!strcasecmp(c->argv[3]->ptr,"from")) { - freeClusterLink(n->inbound_link); - } else if (!strcasecmp(c->argv[3]->ptr,"to")) { - freeClusterLink(n->link); - } else if (!strcasecmp(c->argv[3]->ptr,"all")) { - freeClusterLink(n->link); - freeClusterLink(n->inbound_link); - } else { - addReplyErrorFormat(c, "Unknown direction %s", (char*) c->argv[3]->ptr); - } - addReply(c,shared.ok); - } else { + } else if(!handleDebugClusterCommand(c)) { addReplySubcommandSyntaxError(c); return; } diff --git a/src/module.c b/src/module.c index b24527fc192..115a7cbc438 100644 --- a/src/module.c +++ b/src/module.c @@ -8967,20 +8967,19 @@ int RM_GetClusterNodeInfo(RedisModuleCtx *ctx, const char *id, char *ip, char *m UNUSED(ctx); clusterNode *node = clusterLookupNode(id, strlen(id)); - if (node == NULL || - node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) + if (node == NULL || !clusterNodeConfirmedReachable(node)) { return REDISMODULE_ERR; } - if (ip) redis_strlcpy(ip,node->ip,NET_IP_STR_LEN); + if (ip) redis_strlcpy(ip, clusterNodeIp(node),NET_IP_STR_LEN); if (master_id) { /* If the information is not available, the function will set the * field to zero bytes, so that when the field can't be populated the * function kinda remains predictable. */ - if (node->flags & CLUSTER_NODE_SLAVE && node->slaveof) - memcpy(master_id,node->slaveof->name,REDISMODULE_NODE_ID_LEN); + if (clusterNodeIsSlave(node) && clusterNodeGetSlaveof(node)) + memcpy(master_id, clusterNodeGetName(clusterNodeGetSlaveof(node)) ,REDISMODULE_NODE_ID_LEN); else memset(master_id,0,REDISMODULE_NODE_ID_LEN); } @@ -8990,12 +8989,12 @@ int RM_GetClusterNodeInfo(RedisModuleCtx *ctx, const char *id, char *ip, char *m * we can provide binary compatibility. 
*/ if (flags) { *flags = 0; - if (node->flags & CLUSTER_NODE_MYSELF) *flags |= REDISMODULE_NODE_MYSELF; - if (node->flags & CLUSTER_NODE_MASTER) *flags |= REDISMODULE_NODE_MASTER; - if (node->flags & CLUSTER_NODE_SLAVE) *flags |= REDISMODULE_NODE_SLAVE; - if (node->flags & CLUSTER_NODE_PFAIL) *flags |= REDISMODULE_NODE_PFAIL; - if (node->flags & CLUSTER_NODE_FAIL) *flags |= REDISMODULE_NODE_FAIL; - if (node->flags & CLUSTER_NODE_NOFAILOVER) *flags |= REDISMODULE_NODE_NOFAILOVER; + if (clusterNodeIsMyself(node)) *flags |= REDISMODULE_NODE_MYSELF; + if (nodeIsMaster(node)) *flags |= REDISMODULE_NODE_MASTER; + if (clusterNodeIsSlave(node)) *flags |= REDISMODULE_NODE_SLAVE; + if (clusterNodeTimedOut(node)) *flags |= REDISMODULE_NODE_PFAIL; + if (clusterNodeIsFailing(node)) *flags |= REDISMODULE_NODE_FAIL; + if (clusterNodeIsNoFailover(node)) *flags |= REDISMODULE_NODE_NOFAILOVER; } return REDISMODULE_OK; } diff --git a/src/networking.c b/src/networking.c index 718271aec2a..847eee3d584 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1117,14 +1117,18 @@ void addReplyVerbatim(client *c, const char *s, size_t len, const char *ext) { } } -/* Add an array of C strings as status replies with a heading. - * This function is typically invoked by from commands that support - * subcommands in response to the 'help' subcommand. The help array - * is terminated by NULL sentinel. */ -void addReplyHelp(client *c, const char **help) { +/* This function is similar to the addReplyHelp function but adds the + * ability to pass in two arrays of strings. Some commands have + * some additional subcommands based on the specific feature implementation + * Redis is compiled with (currently just clustering). This function allows + * to pass is the common subcommands in `help` and any implementation + * specific subcommands in `extended_help`. 
+ */ +void addExtendedReplyHelp(client *c, const char **help, const char **extended_help) { sds cmd = sdsnew((char*) c->argv[0]->ptr); void *blenp = addReplyDeferredLen(c); int blen = 0; + int idx = 0; sdstoupper(cmd); addReplyStatusFormat(c, @@ -1132,6 +1136,10 @@ void addReplyHelp(client *c, const char **help) { sdsfree(cmd); while (help[blen]) addReplyStatus(c,help[blen++]); + if (extended_help) { + while (extended_help[idx]) addReplyStatus(c,extended_help[idx++]); + } + blen += idx; addReplyStatus(c,"HELP"); addReplyStatus(c," Print this help."); @@ -1141,6 +1149,14 @@ void addReplyHelp(client *c, const char **help) { setDeferredArrayLen(c,blenp,blen); } +/* Add an array of C strings as status replies with a heading. + * This function is typically invoked by commands that support + * subcommands in response to the 'help' subcommand. The help array + * is terminated by NULL sentinel. */ +void addReplyHelp(client *c, const char **help) { + addExtendedReplyHelp(c, help, NULL); +} + /* Add a suggestive error reply. * This function is typically invoked by from commands that support * subcommands in response to an unknown subcommand or argument error. 
*/ diff --git a/src/server.h b/src/server.h index 13486a54364..66fb288aa77 100644 --- a/src/server.h +++ b/src/server.h @@ -2627,6 +2627,7 @@ void addReplySetLen(client *c, long length); void addReplyAttributeLen(client *c, long length); void addReplyPushLen(client *c, long length); void addReplyHelp(client *c, const char **help); +void addExtendedReplyHelp(client *c, const char **help, const char **extended_help); void addReplySubcommandSyntaxError(client *c); void addReplyLoadedModules(client *c); void copyReplicaOutputBuffer(client *dst, client *src); From 4944eda696fa5d9e1b48dc2b544ea13d5ca2b5c5 Mon Sep 17 00:00:00 2001 From: Josh Hershberg Date: Mon, 30 Oct 2023 17:30:59 +0200 Subject: [PATCH 06/15] Cluster refactor: Move more stuff from cluster.h to cluster_legacy.h More declarations can be moved into cluster_legacy.h as they are not required for the cluster API. The code was simply moved, not changed in any way. Signed-off-by: Josh Hershberg --- src/cluster.h | 66 +------------------------------------------- src/cluster_legacy.c | 1 + src/cluster_legacy.h | 60 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 65 deletions(-) diff --git a/src/cluster.h b/src/cluster.h index f9e53a06bf0..b7320be1d51 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -2,7 +2,7 @@ #define __CLUSTER_H /*----------------------------------------------------------------------------- - * Redis cluster data structures, defines, exported API. + * Redis cluster exported API. *----------------------------------------------------------------------------*/ #define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. */ @@ -11,7 +11,6 @@ #define CLUSTER_OK 0 /* Everything looks ok */ #define CLUSTER_FAIL 1 /* The cluster can't work */ #define CLUSTER_NAMELEN 40 /* sha1 hex length */ -#define CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */ /* Redirection errors returned by getNodeByQuery().
*/ #define CLUSTER_REDIR_NONE 0 /* Node can serve the request. */ @@ -26,60 +25,6 @@ typedef struct _clusterNode clusterNode; struct clusterState; -/* clusterLink encapsulates everything needed to talk with a remote node. */ -typedef struct clusterLink { - mstime_t ctime; /* Link creation time */ - connection *conn; /* Connection to remote node */ - list *send_msg_queue; /* List of messages to be sent */ - size_t head_msg_send_offset; /* Number of bytes already sent of message at head of queue */ - unsigned long long send_msg_queue_mem; /* Memory in bytes used by message queue */ - char *rcvbuf; /* Packet reception buffer */ - size_t rcvbuf_len; /* Used size of rcvbuf */ - size_t rcvbuf_alloc; /* Allocated size of rcvbuf */ - clusterNode *node; /* Node related to this link. Initialized to NULL when unknown */ - int inbound; /* 1 if this link is an inbound link accepted from the related node */ -} clusterLink; - -/* Cluster node flags and macros. */ -#define CLUSTER_NODE_MASTER 1 /* The node is a master */ -#define CLUSTER_NODE_SLAVE 2 /* The node is a slave */ -#define CLUSTER_NODE_PFAIL 4 /* Failure? Need acknowledge */ -#define CLUSTER_NODE_FAIL 8 /* The node is believed to be malfunctioning */ -#define CLUSTER_NODE_MYSELF 16 /* This node is myself */ -#define CLUSTER_NODE_HANDSHAKE 32 /* We have still to exchange the first ping */ -#define CLUSTER_NODE_NOADDR 64 /* We don't know the address of this node */ -#define CLUSTER_NODE_MEET 128 /* Send a MEET message to this node */ -#define CLUSTER_NODE_MIGRATE_TO 256 /* Master eligible for replica migration. */ -#define CLUSTER_NODE_NOFAILOVER 512 /* Slave will not try to failover. 
*/ -#define CLUSTER_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" - -#define nodeIsSlave(n) ((n)->flags & CLUSTER_NODE_SLAVE) -#define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE) -#define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR)) -#define nodeWithoutAddr(n) ((n)->flags & CLUSTER_NODE_NOADDR) -#define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL) -#define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL) -#define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER) - -/* Message types. - * - * Note that the PING, PONG and MEET messages are actually the same exact - * kind of packet. PONG is the reply to ping, in the exact format as a PING, - * while MEET is a special PING that forces the receiver to add the sender - * as a node (if it is not already in the list). */ -#define CLUSTERMSG_TYPE_PING 0 /* Ping */ -#define CLUSTERMSG_TYPE_PONG 1 /* Pong (reply to Ping) */ -#define CLUSTERMSG_TYPE_MEET 2 /* Meet "let's join" message */ -#define CLUSTERMSG_TYPE_FAIL 3 /* Mark node xxx as failing */ -#define CLUSTERMSG_TYPE_PUBLISH 4 /* Pub/Sub Publish propagation */ -#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */ -#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you have my vote */ -#define CLUSTERMSG_TYPE_UPDATE 7 /* Another node slots configuration */ -#define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */ -#define CLUSTERMSG_TYPE_MODULE 9 /* Module cluster API message. */ -#define CLUSTERMSG_TYPE_PUBLISHSHARD 10 /* Pub/Sub Publish shard propagation */ -#define CLUSTERMSG_TYPE_COUNT 11 /* Total number of message types. */ - /* Flags that a module can set in order to prevent certain Redis Cluster * features to be enabled. Useful when implementing a different distributed * system on top of Redis Cluster message bus, using modules. 
*/ @@ -87,14 +32,6 @@ typedef struct clusterLink { #define CLUSTER_MODULE_FLAG_NO_FAILOVER (1<<1) #define CLUSTER_MODULE_FLAG_NO_REDIRECTION (1<<2) -/* This structure represent elements of node->fail_reports. */ -typedef struct clusterNodeFailReport { - clusterNode *node; /* Node reporting the failure condition. */ - mstime_t time; /* Time of the last report from this node. */ -} clusterNodeFailReport; - - - /* ---------------------- API exported outside cluster.c -------------------- */ void clusterInit(void); void clusterInitListeners(void); @@ -120,7 +57,6 @@ void clusterUpdateMyselfHostname(void); void clusterUpdateMyselfAnnouncedPorts(void); sds clusterGenNodesDescription(client *c, int filter, int tls_primary); sds genClusterInfoString(void); -void freeClusterLink(clusterLink *link); int clusterNodeGetSlotBit(clusterNode *n, int slot); void clusterUpdateMyselfHumanNodename(void); int isValidAuxString(char *s, unsigned int length); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 6046caf1aa4..556e5dcebdd 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -103,6 +103,7 @@ int auxTlsPortSetter(clusterNode *n, void *value, int length); sds auxTlsPortGetter(clusterNode *n, sds s); int auxTlsPortPresent(clusterNode *n); static void clusterBuildMessageHdr(clusterMsg *hdr, int type, size_t msglen); +void freeClusterLink(clusterLink *link); int getNodeDefaultClientPort(clusterNode *n) { return server.tls_cluster ? n->tls_port : n->tcp_port; diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index a2f6edfb6fd..578b46fc3ff 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -1,6 +1,8 @@ #ifndef CLUSTER_LEGACY_H #define CLUSTER_LEGACY_H +#define CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */ + /* The following defines are amount of time, sometimes expressed as * multiplicators of the node timeout value (when ending with MULT). */ #define CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. 
*/ @@ -24,9 +26,67 @@ #define CLUSTER_TODO_FSYNC_CONFIG (1<<3) #define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1<<4) +/* clusterLink encapsulates everything needed to talk with a remote node. */ +typedef struct clusterLink { + mstime_t ctime; /* Link creation time */ + connection *conn; /* Connection to remote node */ + list *send_msg_queue; /* List of messages to be sent */ + size_t head_msg_send_offset; /* Number of bytes already sent of message at head of queue */ + unsigned long long send_msg_queue_mem; /* Memory in bytes used by message queue */ + char *rcvbuf; /* Packet reception buffer */ + size_t rcvbuf_len; /* Used size of rcvbuf */ + size_t rcvbuf_alloc; /* Allocated size of rcvbuf */ + clusterNode *node; /* Node related to this link. Initialized to NULL when unknown */ + int inbound; /* 1 if this link is an inbound link accepted from the related node */ +} clusterLink; + +/* Cluster node flags and macros. */ +#define CLUSTER_NODE_MASTER 1 /* The node is a master */ +#define CLUSTER_NODE_SLAVE 2 /* The node is a slave */ +#define CLUSTER_NODE_PFAIL 4 /* Failure? Need acknowledge */ +#define CLUSTER_NODE_FAIL 8 /* The node is believed to be malfunctioning */ +#define CLUSTER_NODE_MYSELF 16 /* This node is myself */ +#define CLUSTER_NODE_HANDSHAKE 32 /* We have still to exchange the first ping */ +#define CLUSTER_NODE_NOADDR 64 /* We don't know the address of this node */ +#define CLUSTER_NODE_MEET 128 /* Send a MEET message to this node */ +#define CLUSTER_NODE_MIGRATE_TO 256 /* Master eligible for replica migration. */ +#define CLUSTER_NODE_NOFAILOVER 512 /* Slave will not try to failover. 
*/ +#define CLUSTER_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" + +#define nodeIsSlave(n) ((n)->flags & CLUSTER_NODE_SLAVE) +#define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE) +#define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR)) +#define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL) +#define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL) +#define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER) + +/* This structure represent elements of node->fail_reports. */ +typedef struct clusterNodeFailReport { + clusterNode *node; /* Node reporting the failure condition. */ + mstime_t time; /* Time of the last report from this node. */ +} clusterNodeFailReport; /* Redis cluster messages header */ +/* Message types. + * + * Note that the PING, PONG and MEET messages are actually the same exact + * kind of packet. PONG is the reply to ping, in the exact format as a PING, + * while MEET is a special PING that forces the receiver to add the sender + * as a node (if it is not already in the list). */ +#define CLUSTERMSG_TYPE_PING 0 /* Ping */ +#define CLUSTERMSG_TYPE_PONG 1 /* Pong (reply to Ping) */ +#define CLUSTERMSG_TYPE_MEET 2 /* Meet "let's join" message */ +#define CLUSTERMSG_TYPE_FAIL 3 /* Mark node xxx as failing */ +#define CLUSTERMSG_TYPE_PUBLISH 4 /* Pub/Sub Publish propagation */ +#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */ +#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you have my vote */ +#define CLUSTERMSG_TYPE_UPDATE 7 /* Another node slots configuration */ +#define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */ +#define CLUSTERMSG_TYPE_MODULE 9 /* Module cluster API message. */ +#define CLUSTERMSG_TYPE_PUBLISHSHARD 10 /* Pub/Sub Publish shard propagation */ +#define CLUSTERMSG_TYPE_COUNT 11 /* Total number of message types. 
*/ + /* Initially we don't know our "name", but we'll find it once we connect * to the first node, using the getsockname() function. Then we'll use this * address for all the next messages. */ From 040cb6a4aa4cee0529772fd979c654457f7e1628 Mon Sep 17 00:00:00 2001 From: Josh Hershberg Date: Tue, 31 Oct 2023 15:55:01 +0200 Subject: [PATCH 07/15] Cluster refactor: verifyClusterNodeId need not be 'public' Signed-off-by: Josh Hershberg --- src/cluster.h | 1 - src/cluster_legacy.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.h b/src/cluster.h index b7320be1d51..6a8b0b423e0 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -38,7 +38,6 @@ void clusterInitListeners(void); void clusterCron(void); void clusterBeforeSleep(void); clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask); -int verifyClusterNodeId(const char *name, int length); clusterNode *clusterLookupNode(const char *name, int length); int clusterRedirectBlockedClientIfNeeded(client *c); void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 556e5dcebdd..88c30f09584 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -104,6 +104,7 @@ sds auxTlsPortGetter(clusterNode *n, sds s); int auxTlsPortPresent(clusterNode *n); static void clusterBuildMessageHdr(clusterMsg *hdr, int type, size_t msglen); void freeClusterLink(clusterLink *link); +int verifyClusterNodeId(const char *name, int length); int getNodeDefaultClientPort(clusterNode *n) { return server.tls_cluster ? n->tls_port : n->tcp_port; From ac1513221bbb370d49e6dfc9904ab05db306a828 Mon Sep 17 00:00:00 2001 From: Josh Hershberg Date: Wed, 1 Nov 2023 09:44:11 +0200 Subject: [PATCH 08/15] Cluster refactor: Move items from cluster_legacy.c to cluster.c Move (but do not change) some items from cluster_legacy.c back into cluster.c.
These items are shared code that all clustering implementations will use. Signed-off-by: Josh Hershberg --- src/cluster.c | 746 ++++++++++++++++++++++++++++++++++++++++++ src/cluster.h | 1 + src/cluster_legacy.c | 750 ------------------------------------------- 3 files changed, 747 insertions(+), 750 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index e69de29bb2d..dd053de708a 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -0,0 +1,746 @@ +#include "server.h" +#include "cluster.h" + +#include + +/* ----------------------------------------------------------------------------- + * Key space handling + * -------------------------------------------------------------------------- */ + +/* We have 16384 hash slots. The hash slot of a given key is obtained + * as the least significant 14 bits of the crc16 of the key. + * + * However if the key contains the {...} pattern, only the part between + * { and } is hashed. This may be useful in the future to force certain + * keys to be in the same node (assuming no resharding is in progress). */ +unsigned int keyHashSlot(char *key, int keylen) { + int s, e; /* start-end indexes of { and } */ + + for (s = 0; s < keylen; s++) + if (key[s] == '{') break; + + /* No '{' ? Hash the whole key. This is the base case. */ + if (s == keylen) return crc16(key,keylen) & 0x3FFF; + + /* '{' found? Check if we have the corresponding '}'. */ + for (e = s+1; e < keylen; e++) + if (key[e] == '}') break; + + /* No '}' or nothing between {} ? Hash the whole key. */ + if (e == keylen || e == s+1) return crc16(key,keylen) & 0x3FFF; + + /* If we are here there is both a { and a } on its right. Hash + * what is in the middle between { and }. */ + return crc16(key+s+1,e-s-1) & 0x3FFF; +} + +/* If it can be inferred that the given glob-style pattern, as implemented in + * stringmatchlen() in util.c, only can match keys belonging to a single slot, + * that slot is returned. Otherwise -1 is returned. 
*/ +int patternHashSlot(char *pattern, int length) { + int s = -1; /* index of the first '{' */ + + for (int i = 0; i < length; i++) { + if (pattern[i] == '*' || pattern[i] == '?' || pattern[i] == '[') { + /* Wildcard or character class found. Keys can be in any slot. */ + return -1; + } else if (pattern[i] == '\\') { + /* Escaped character. Computing slot in this case is not + * implemented. We would need a temp buffer. */ + return -1; + } else if (s == -1 && pattern[i] == '{') { + /* Opening brace '{' found. */ + s = i; + } else if (s >= 0 && pattern[i] == '}' && i == s + 1) { + /* Empty tag '{}' found. The whole key is hashed. Ignore braces. */ + s = -2; + } else if (s >= 0 && pattern[i] == '}') { + /* Non-empty tag '{...}' found. Hash what's between braces. */ + return crc16(pattern + s + 1, i - s - 1) & 0x3FFF; + } + } + + /* The pattern matches a single key. Hash the whole pattern. */ + return crc16(pattern, length) & 0x3FFF; +} + +ConnectionType *connTypeOfCluster(void) { + if (server.tls_cluster) { + return connectionTypeTls(); + } + + return connectionTypeTcp(); +} + +/* ----------------------------------------------------------------------------- + * DUMP, RESTORE and MIGRATE commands + * -------------------------------------------------------------------------- */ + +/* Generates a DUMP-format representation of the object 'o', adding it to the + * io stream pointed by 'rio'. This function can't fail. */ +void createDumpPayload(rio *payload, robj *o, robj *key, int dbid) { + unsigned char buf[2]; + uint64_t crc; + + /* Serialize the object in an RDB-like format. It consist of an object type + * byte followed by the serialized object. This is understood by RESTORE. */ + rioInitWithBuffer(payload,sdsempty()); + serverAssert(rdbSaveObjectType(payload,o)); + serverAssert(rdbSaveObject(payload,o,key,dbid)); + + /* Write the footer, this is how it looks like: + * ----------------+---------------------+---------------+ + * ... 
RDB payload | 2 bytes RDB version | 8 bytes CRC64 | + * ----------------+---------------------+---------------+ + * RDB version and CRC are both in little endian. + */ + + /* RDB version */ + buf[0] = RDB_VERSION & 0xff; + buf[1] = (RDB_VERSION >> 8) & 0xff; + payload->io.buffer.ptr = sdscatlen(payload->io.buffer.ptr,buf,2); + + /* CRC64 */ + crc = crc64(0,(unsigned char*)payload->io.buffer.ptr, + sdslen(payload->io.buffer.ptr)); + memrev64ifbe(&crc); + payload->io.buffer.ptr = sdscatlen(payload->io.buffer.ptr,&crc,8); +} + +/* Verify that the RDB version of the dump payload matches the one of this Redis + * instance and that the checksum is ok. + * If the DUMP payload looks valid C_OK is returned, otherwise C_ERR + * is returned. If rdbver_ptr is not NULL, its populated with the value read + * from the input buffer. */ +int verifyDumpPayload(unsigned char *p, size_t len, uint16_t *rdbver_ptr) { + unsigned char *footer; + uint16_t rdbver; + uint64_t crc; + + /* At least 2 bytes of RDB version and 8 of CRC64 should be present. */ + if (len < 10) return C_ERR; + footer = p+(len-10); + + /* Set and verify RDB version. */ + rdbver = (footer[1] << 8) | footer[0]; + if (rdbver_ptr) { + *rdbver_ptr = rdbver; + } + if (rdbver > RDB_VERSION) return C_ERR; + + if (server.skip_checksum_validation) + return C_OK; + + /* Verify CRC64 */ + crc = crc64(0,p,len-8); + memrev64ifbe(&crc); + return (memcmp(&crc,footer+2,8) == 0) ? C_OK : C_ERR; +} + +/* DUMP keyname + * DUMP is actually not used by Redis Cluster but it is the obvious + * complement of RESTORE and can be useful for different applications. */ +void dumpCommand(client *c) { + robj *o; + rio payload; + + /* Check if the key is here. */ + if ((o = lookupKeyRead(c->db,c->argv[1])) == NULL) { + addReplyNull(c); + return; + } + + /* Create the DUMP encoded representation. 
*/ + createDumpPayload(&payload,o,c->argv[1],c->db->id); + + /* Transfer to the client */ + addReplyBulkSds(c,payload.io.buffer.ptr); + return; +} + +/* RESTORE key ttl serialized-value [REPLACE] [ABSTTL] [IDLETIME seconds] [FREQ frequency] */ +void restoreCommand(client *c) { + long long ttl, lfu_freq = -1, lru_idle = -1, lru_clock = -1; + rio payload; + int j, type, replace = 0, absttl = 0; + robj *obj; + + /* Parse additional options */ + for (j = 4; j < c->argc; j++) { + int additional = c->argc-j-1; + if (!strcasecmp(c->argv[j]->ptr,"replace")) { + replace = 1; + } else if (!strcasecmp(c->argv[j]->ptr,"absttl")) { + absttl = 1; + } else if (!strcasecmp(c->argv[j]->ptr,"idletime") && additional >= 1 && + lfu_freq == -1) + { + if (getLongLongFromObjectOrReply(c,c->argv[j+1],&lru_idle,NULL) + != C_OK) return; + if (lru_idle < 0) { + addReplyError(c,"Invalid IDLETIME value, must be >= 0"); + return; + } + lru_clock = LRU_CLOCK(); + j++; /* Consume additional arg. */ + } else if (!strcasecmp(c->argv[j]->ptr,"freq") && additional >= 1 && + lru_idle == -1) + { + if (getLongLongFromObjectOrReply(c,c->argv[j+1],&lfu_freq,NULL) + != C_OK) return; + if (lfu_freq < 0 || lfu_freq > 255) { + addReplyError(c,"Invalid FREQ value, must be >= 0 and <= 255"); + return; + } + j++; /* Consume additional arg. */ + } else { + addReplyErrorObject(c,shared.syntaxerr); + return; + } + } + + /* Make sure this key does not already exist here... */ + robj *key = c->argv[1]; + if (!replace && lookupKeyWrite(c->db,key) != NULL) { + addReplyErrorObject(c,shared.busykeyerr); + return; + } + + /* Check if the TTL value makes sense */ + if (getLongLongFromObjectOrReply(c,c->argv[2],&ttl,NULL) != C_OK) { + return; + } else if (ttl < 0) { + addReplyError(c,"Invalid TTL value, must be >= 0"); + return; + } + + /* Verify RDB version and data checksum. 
*/ + if (verifyDumpPayload(c->argv[3]->ptr,sdslen(c->argv[3]->ptr),NULL) == C_ERR) + { + addReplyError(c,"DUMP payload version or checksum are wrong"); + return; + } + + rioInitWithBuffer(&payload,c->argv[3]->ptr); + if (((type = rdbLoadObjectType(&payload)) == -1) || + ((obj = rdbLoadObject(type,&payload,key->ptr,c->db->id,NULL)) == NULL)) + { + addReplyError(c,"Bad data format"); + return; + } + + /* Remove the old key if needed. */ + int deleted = 0; + if (replace) + deleted = dbDelete(c->db,key); + + if (ttl && !absttl) ttl+=commandTimeSnapshot(); + if (ttl && checkAlreadyExpired(ttl)) { + if (deleted) { + robj *aux = server.lazyfree_lazy_server_del ? shared.unlink : shared.del; + rewriteClientCommandVector(c, 2, aux, key); + signalModifiedKey(c,c->db,key); + notifyKeyspaceEvent(NOTIFY_GENERIC,"del",key,c->db->id); + server.dirty++; + } + decrRefCount(obj); + addReply(c, shared.ok); + return; + } + + /* Create the key and set the TTL if any */ + dbAdd(c->db,key,obj); + if (ttl) { + setExpire(c,c->db,key,ttl); + if (!absttl) { + /* Propagate TTL as absolute timestamp */ + robj *ttl_obj = createStringObjectFromLongLong(ttl); + rewriteClientCommandArgument(c,2,ttl_obj); + decrRefCount(ttl_obj); + rewriteClientCommandArgument(c,c->argc,shared.absttl); + } + } + objectSetLRUOrLFU(obj,lfu_freq,lru_idle,lru_clock,1000); + signalModifiedKey(c,c->db,key); + notifyKeyspaceEvent(NOTIFY_GENERIC,"restore",key,c->db->id); + addReply(c,shared.ok); + server.dirty++; +} +/* MIGRATE socket cache implementation. + * + * We take a map between host:ip and a TCP socket that we used to connect + * to this instance in recent time. + * This sockets are closed when the max number we cache is reached, and also + * in serverCron() when they are around for more than a few seconds. */ +#define MIGRATE_SOCKET_CACHE_ITEMS 64 /* max num of items in the cache. */ +#define MIGRATE_SOCKET_CACHE_TTL 10 /* close cached sockets after 10 sec. 
*/ + +typedef struct migrateCachedSocket { + connection *conn; + long last_dbid; + time_t last_use_time; +} migrateCachedSocket; + +/* Return a migrateCachedSocket containing a TCP socket connected with the + * target instance, possibly returning a cached one. + * + * This function is responsible for sending errors to the client if a + * connection can't be established. In this case NULL is returned. + * Otherwise on success the socket is returned, and the caller should not + * attempt to free it after usage. + * + * If the caller detects an error while using the socket, migrateCloseSocket() + * should be called so that the connection will be created from scratch + * the next time. */ +migrateCachedSocket* migrateGetSocket(client *c, robj *host, robj *port, long timeout) { + connection *conn; + sds name = sdsempty(); + migrateCachedSocket *cs; + + /* Check if we have an already cached socket for this ip:port pair. */ + name = sdscatlen(name,host->ptr,sdslen(host->ptr)); + name = sdscatlen(name,":",1); + name = sdscatlen(name,port->ptr,sdslen(port->ptr)); + cs = dictFetchValue(server.migrate_cached_sockets,name); + if (cs) { + sdsfree(name); + cs->last_use_time = server.unixtime; + return cs; + } + + /* No cached socket, create one. */ + if (dictSize(server.migrate_cached_sockets) == MIGRATE_SOCKET_CACHE_ITEMS) { + /* Too many items, drop one at random. */ + dictEntry *de = dictGetRandomKey(server.migrate_cached_sockets); + cs = dictGetVal(de); + connClose(cs->conn); + zfree(cs); + dictDelete(server.migrate_cached_sockets,dictGetKey(de)); + } + + /* Create the connection */ + conn = connCreate(connTypeOfCluster()); + if (connBlockingConnect(conn, host->ptr, atoi(port->ptr), timeout) + != C_OK) { + addReplyError(c,"-IOERR error or timeout connecting to the client"); + connClose(conn); + sdsfree(name); + return NULL; + } + connEnableTcpNoDelay(conn); + + /* Add to the cache and return it to the caller.
*/ + cs = zmalloc(sizeof(*cs)); + cs->conn = conn; + + cs->last_dbid = -1; + cs->last_use_time = server.unixtime; + dictAdd(server.migrate_cached_sockets,name,cs); + return cs; +} + +/* Free a migrate cached connection. */ +void migrateCloseSocket(robj *host, robj *port) { + sds name = sdsempty(); + migrateCachedSocket *cs; + + name = sdscatlen(name,host->ptr,sdslen(host->ptr)); + name = sdscatlen(name,":",1); + name = sdscatlen(name,port->ptr,sdslen(port->ptr)); + cs = dictFetchValue(server.migrate_cached_sockets,name); + if (!cs) { + sdsfree(name); + return; + } + + connClose(cs->conn); + zfree(cs); + dictDelete(server.migrate_cached_sockets,name); + sdsfree(name); +} + +void migrateCloseTimedoutSockets(void) { + dictIterator *di = dictGetSafeIterator(server.migrate_cached_sockets); + dictEntry *de; + + while((de = dictNext(di)) != NULL) { + migrateCachedSocket *cs = dictGetVal(de); + + if ((server.unixtime - cs->last_use_time) > MIGRATE_SOCKET_CACHE_TTL) { + connClose(cs->conn); + zfree(cs); + dictDelete(server.migrate_cached_sockets,dictGetKey(de)); + } + } + dictReleaseIterator(di); +} + +/* MIGRATE host port key dbid timeout [COPY | REPLACE | AUTH password | + * AUTH2 username password] + * + * On in the multiple keys form: + * + * MIGRATE host port "" dbid timeout [COPY | REPLACE | AUTH password | + * AUTH2 username password] KEYS key1 key2 ... keyN */ +void migrateCommand(client *c) { + migrateCachedSocket *cs; + int copy = 0, replace = 0, j; + char *username = NULL; + char *password = NULL; + long timeout; + long dbid; + robj **ov = NULL; /* Objects to migrate. */ + robj **kv = NULL; /* Key names. */ + robj **newargv = NULL; /* Used to rewrite the command as DEL ... keys ... */ + rio cmd, payload; + int may_retry = 1; + int write_error = 0; + int argv_rewritten = 0; + + /* To support the KEYS option we need the following additional state. */ + int first_key = 3; /* Argument index of the first key. 
*/ + int num_keys = 1; /* By default only migrate the 'key' argument. */ + + /* Parse additional options */ + for (j = 6; j < c->argc; j++) { + int moreargs = (c->argc-1) - j; + if (!strcasecmp(c->argv[j]->ptr,"copy")) { + copy = 1; + } else if (!strcasecmp(c->argv[j]->ptr,"replace")) { + replace = 1; + } else if (!strcasecmp(c->argv[j]->ptr,"auth")) { + if (!moreargs) { + addReplyErrorObject(c,shared.syntaxerr); + return; + } + j++; + password = c->argv[j]->ptr; + redactClientCommandArgument(c,j); + } else if (!strcasecmp(c->argv[j]->ptr,"auth2")) { + if (moreargs < 2) { + addReplyErrorObject(c,shared.syntaxerr); + return; + } + username = c->argv[++j]->ptr; + redactClientCommandArgument(c,j); + password = c->argv[++j]->ptr; + redactClientCommandArgument(c,j); + } else if (!strcasecmp(c->argv[j]->ptr,"keys")) { + if (sdslen(c->argv[3]->ptr) != 0) { + addReplyError(c, + "When using MIGRATE KEYS option, the key argument" + " must be set to the empty string"); + return; + } + first_key = j+1; + num_keys = c->argc - j - 1; + break; /* All the remaining args are keys. */ + } else { + addReplyErrorObject(c,shared.syntaxerr); + return; + } + } + + /* Sanity check */ + if (getLongFromObjectOrReply(c,c->argv[5],&timeout,NULL) != C_OK || + getLongFromObjectOrReply(c,c->argv[4],&dbid,NULL) != C_OK) + { + return; + } + if (timeout <= 0) timeout = 1000; + + /* Check if the keys are here. If at least one key is to migrate, do it + * otherwise if all the keys are missing reply with "NOKEY" to signal + * the caller there was nothing to migrate. We don't return an error in + * this case, since often this is due to a normal condition like the key + * expiring in the meantime. 
*/ + ov = zrealloc(ov,sizeof(robj*)*num_keys); + kv = zrealloc(kv,sizeof(robj*)*num_keys); + int oi = 0; + + for (j = 0; j < num_keys; j++) { + if ((ov[oi] = lookupKeyRead(c->db,c->argv[first_key+j])) != NULL) { + kv[oi] = c->argv[first_key+j]; + oi++; + } + } + num_keys = oi; + if (num_keys == 0) { + zfree(ov); zfree(kv); + addReplySds(c,sdsnew("+NOKEY\r\n")); + return; + } + + try_again: + write_error = 0; + + /* Connect */ + cs = migrateGetSocket(c,c->argv[1],c->argv[2],timeout); + if (cs == NULL) { + zfree(ov); zfree(kv); + return; /* error sent to the client by migrateGetSocket() */ + } + + rioInitWithBuffer(&cmd,sdsempty()); + + /* Authentication */ + if (password) { + int arity = username ? 3 : 2; + serverAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',arity)); + serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"AUTH",4)); + if (username) { + serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,username, + sdslen(username))); + } + serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,password, + sdslen(password))); + } + + /* Send the SELECT command if the current DB is not already selected. */ + int select = cs->last_dbid != dbid; /* Should we emit SELECT? */ + if (select) { + serverAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',2)); + serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"SELECT",6)); + serverAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,dbid)); + } + + int non_expired = 0; /* Number of keys that we'll find non expired. + Note that serializing large keys may take some time + so certain keys that were found non expired by the + lookupKey() function, may be expired later. */ + + /* Create RESTORE payload and generate the protocol to call the command. 
*/ + for (j = 0; j < num_keys; j++) { + long long ttl = 0; + long long expireat = getExpire(c->db,kv[j]); + + if (expireat != -1) { + ttl = expireat-commandTimeSnapshot(); + if (ttl < 0) { + continue; + } + if (ttl < 1) ttl = 1; + } + + /* Relocate valid (non expired) keys and values into the array in successive + * positions to remove holes created by the keys that were present + * in the first lookup but are now expired after the second lookup. */ + ov[non_expired] = ov[j]; + kv[non_expired++] = kv[j]; + + serverAssertWithInfo(c,NULL, + rioWriteBulkCount(&cmd,'*',replace ? 5 : 4)); + + if (server.cluster_enabled) + serverAssertWithInfo(c,NULL, + rioWriteBulkString(&cmd,"RESTORE-ASKING",14)); + else + serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"RESTORE",7)); + serverAssertWithInfo(c,NULL,sdsEncodedObject(kv[j])); + serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,kv[j]->ptr, + sdslen(kv[j]->ptr))); + serverAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,ttl)); + + /* Emit the payload argument, that is the serialized object using + * the DUMP format. */ + createDumpPayload(&payload,ov[j],kv[j],dbid); + serverAssertWithInfo(c,NULL, + rioWriteBulkString(&cmd,payload.io.buffer.ptr, + sdslen(payload.io.buffer.ptr))); + sdsfree(payload.io.buffer.ptr); + + /* Add the REPLACE option to the RESTORE command if it was specified + * as a MIGRATE option. */ + if (replace) + serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"REPLACE",7)); + } + + /* Fix the actual number of keys we are migrating. */ + num_keys = non_expired; + + /* Transfer the query to the other node in 64K chunks. */ + errno = 0; + { + sds buf = cmd.io.buffer.ptr; + size_t pos = 0, towrite; + int nwritten = 0; + + while ((towrite = sdslen(buf)-pos) > 0) { + towrite = (towrite > (64*1024) ? 
(64*1024) : towrite); + nwritten = connSyncWrite(cs->conn,buf+pos,towrite,timeout); + if (nwritten != (signed)towrite) { + write_error = 1; + goto socket_err; + } + pos += nwritten; + } + } + + char buf0[1024]; /* Auth reply. */ + char buf1[1024]; /* Select reply. */ + char buf2[1024]; /* Restore reply. */ + + /* Read the AUTH reply if needed. */ + if (password && connSyncReadLine(cs->conn, buf0, sizeof(buf0), timeout) <= 0) + goto socket_err; + + /* Read the SELECT reply if needed. */ + if (select && connSyncReadLine(cs->conn, buf1, sizeof(buf1), timeout) <= 0) + goto socket_err; + + /* Read the RESTORE replies. */ + int error_from_target = 0; + int socket_error = 0; + int del_idx = 1; /* Index of the key argument for the replicated DEL op. */ + + /* Allocate the new argument vector that will replace the current command, + * to propagate the MIGRATE as a DEL command (if no COPY option was given). + * We allocate num_keys+1 because the additional argument is for "DEL" + * command name itself. */ + if (!copy) newargv = zmalloc(sizeof(robj*)*(num_keys+1)); + + for (j = 0; j < num_keys; j++) { + if (connSyncReadLine(cs->conn, buf2, sizeof(buf2), timeout) <= 0) { + socket_error = 1; + break; + } + if ((password && buf0[0] == '-') || + (select && buf1[0] == '-') || + buf2[0] == '-') + { + /* On error assume that last_dbid is no longer valid. */ + if (!error_from_target) { + cs->last_dbid = -1; + char *errbuf; + if (password && buf0[0] == '-') errbuf = buf0; + else if (select && buf1[0] == '-') errbuf = buf1; + else errbuf = buf2; + + error_from_target = 1; + addReplyErrorFormat(c,"Target instance replied with error: %s", + errbuf+1); + } + } else { + if (!copy) { + /* No COPY option: remove the local key, signal the change. */ + dbDelete(c->db,kv[j]); + signalModifiedKey(c,c->db,kv[j]); + notifyKeyspaceEvent(NOTIFY_GENERIC,"del",kv[j],c->db->id); + server.dirty++; + + /* Populate the argument vector to replace the old one. 
*/ + newargv[del_idx++] = kv[j]; + incrRefCount(kv[j]); + } + } + } + + /* On socket error, if we want to retry, do it now before rewriting the + * command vector. We only retry if we are sure nothing was processed + * and we failed to read the first reply (j == 0 test). */ + if (!error_from_target && socket_error && j == 0 && may_retry && + errno != ETIMEDOUT) + { + goto socket_err; /* A retry is guaranteed because of tested conditions.*/ + } + + /* On socket errors, close the migration socket now that we still have + * the original host/port in the ARGV. Later the original command may be + * rewritten to DEL and will be too later. */ + if (socket_error) migrateCloseSocket(c->argv[1],c->argv[2]); + + if (!copy) { + /* Translate MIGRATE as DEL for replication/AOF. Note that we do + * this only for the keys for which we received an acknowledgement + * from the receiving Redis server, by using the del_idx index. */ + if (del_idx > 1) { + newargv[0] = createStringObject("DEL",3); + /* Note that the following call takes ownership of newargv. */ + replaceClientCommandVector(c,del_idx,newargv); + argv_rewritten = 1; + } else { + /* No key transfer acknowledged, no need to rewrite as DEL. */ + zfree(newargv); + } + newargv = NULL; /* Make it safe to call zfree() on it in the future. */ + } + + /* If we are here and a socket error happened, we don't want to retry. + * Just signal the problem to the client, but only do it if we did not + * already queue a different error reported by the destination server. */ + if (!error_from_target && socket_error) { + may_retry = 0; + goto socket_err; + } + + if (!error_from_target) { + /* Success! Update the last_dbid in migrateCachedSocket, so that we can + * avoid SELECT the next time if the target DB is the same. Reply +OK. + * + * Note: If we reached this point, even if socket_error is true + * still the SELECT command succeeded (otherwise the code jumps to + * socket_err label. 
*/ + cs->last_dbid = dbid; + addReply(c,shared.ok); + } else { + /* On error we already sent it in the for loop above, and set + * the currently selected socket to -1 to force SELECT the next time. */ + } + + sdsfree(cmd.io.buffer.ptr); + zfree(ov); zfree(kv); zfree(newargv); + return; + +/* On socket errors we try to close the cached socket and try again. + * It is very common for the cached socket to get closed, if just reopening + * it works it's a shame to notify the error to the caller. */ + socket_err: + /* Cleanup we want to perform in both the retry and no retry case. + * Note: Closing the migrate socket will also force SELECT next time. */ + sdsfree(cmd.io.buffer.ptr); + + /* If the command was rewritten as DEL and there was a socket error, + * we already closed the socket earlier. While migrateCloseSocket() + * is idempotent, the host/port arguments are now gone, so don't do it + * again. */ + if (!argv_rewritten) migrateCloseSocket(c->argv[1],c->argv[2]); + zfree(newargv); + newargv = NULL; /* This will get reallocated on retry. */ + + /* Retry only if it's not a timeout and we never attempted a retry + * (or the code jumping here did not set may_retry to zero). */ + if (errno != ETIMEDOUT && may_retry) { + may_retry = 0; + goto try_again; + } + + /* Cleanup we want to do if no retry is attempted. */ + zfree(ov); zfree(kv); + addReplyErrorSds(c, sdscatprintf(sdsempty(), + "-IOERR error or timeout %s to target instance", + write_error ? "writing" : "reading")); + return; +} + +/* Cluster node sanity check. Returns C_OK if the node id + * is valid an C_ERR otherwise. 
*/ +int verifyClusterNodeId(const char *name, int length) { + if (length != CLUSTER_NAMELEN) return C_ERR; + for (int i = 0; i < length; i++) { + if (name[i] >= 'a' && name[i] <= 'z') continue; + if (name[i] >= '0' && name[i] <= '9') continue; + return C_ERR; + } + return C_OK; +} + +int isValidAuxChar(int c) { + return isalnum(c) || (strchr("!#$%&()*+:;<>?@[]^{|}~", c) == NULL); +} + +int isValidAuxString(char *s, unsigned int length) { + for (unsigned i = 0; i < length; i++) { + if (!isValidAuxChar(s[i])) return 0; + } + return 1; +} diff --git a/src/cluster.h b/src/cluster.h index 6a8b0b423e0..d9c6d9413b6 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -78,4 +78,5 @@ int clusterNodeIsFailing(clusterNode *node); int clusterNodeIsNoFailover(clusterNode *node); char **clusterDebugCommandHelp(void); +ConnectionType *connTypeOfCluster(void); #endif /* __CLUSTER_H */ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 88c30f09584..aa6625fd1f2 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -40,9 +40,7 @@ #include #include #include -#include #include -#include /* A global reference to myself is handy to make code more clear. * Myself always points to server.cluster->myself, that is, the clusterNode @@ -136,13 +134,6 @@ static int shouldReturnTlsInfo(void) { } } -/* Links to the next and previous entries for keys in the same slot are stored - * in the dict entry metadata. See Slot to Key API below. 
*/ -#define dictEntryNextInSlot(de) \ - (((clusterDictEntryMetadata *)dictEntryMetadata(de))->next) -#define dictEntryPrevInSlot(de) \ - (((clusterDictEntryMetadata *)dictEntryMetadata(de))->prev) - #define isSlotUnclaimed(slot) \ (server.cluster->slots[slot] == NULL || \ bitmapTestBit(server.cluster->owner_not_claiming_slot, slot)) @@ -175,13 +166,6 @@ dictType clusterNodesBlackListDictType = { NULL /* allow to expand */ }; -static ConnectionType *connTypeOfCluster(void) { - if (server.tls_cluster) { - return connectionTypeTls(); - } - - return connectionTypeTcp(); -} /* Cluster shards hash table, mapping shard id to list of nodes */ dictType clusterSdsToListType = { dictSdsHash, /* hash function */ @@ -238,17 +222,6 @@ auxFieldHandler auxFieldHandlers[] = { {"tls-port", auxTlsPortSetter, auxTlsPortGetter, auxTlsPortPresent}, }; -int isValidAuxChar(int c) { - return isalnum(c) || (strchr("!#$%&()*+:;<>?@[]^{|}~", c) == NULL); -} - -int isValidAuxString(char *s, unsigned int length) { - for (unsigned i = 0; i < length; i++) { - if (!isValidAuxChar(s[i])) return 0; - } - return 1; -} - int auxShardIdSetter(clusterNode *n, void *value, int length) { if (verifyClusterNodeId(value, length) == C_ERR) { return C_ERR; @@ -1339,67 +1312,6 @@ unsigned long getClusterConnectionsCount(void) { ((dictSize(server.cluster->nodes)-1)*2) : 0; } -/* ----------------------------------------------------------------------------- - * Key space handling - * -------------------------------------------------------------------------- */ - -/* We have 16384 hash slots. The hash slot of a given key is obtained - * as the least significant 14 bits of the crc16 of the key. - * - * However if the key contains the {...} pattern, only the part between - * { and } is hashed. This may be useful in the future to force certain - * keys to be in the same node (assuming no resharding is in progress). 
*/ -unsigned int keyHashSlot(char *key, int keylen) { - int s, e; /* start-end indexes of { and } */ - - for (s = 0; s < keylen; s++) - if (key[s] == '{') break; - - /* No '{' ? Hash the whole key. This is the base case. */ - if (s == keylen) return crc16(key,keylen) & 0x3FFF; - - /* '{' found? Check if we have the corresponding '}'. */ - for (e = s+1; e < keylen; e++) - if (key[e] == '}') break; - - /* No '}' or nothing between {} ? Hash the whole key. */ - if (e == keylen || e == s+1) return crc16(key,keylen) & 0x3FFF; - - /* If we are here there is both a { and a } on its right. Hash - * what is in the middle between { and }. */ - return crc16(key+s+1,e-s-1) & 0x3FFF; -} - -/* If it can be inferred that the given glob-style pattern, as implemented in - * stringmatchlen() in util.c, only can match keys belonging to a single slot, - * that slot is returned. Otherwise -1 is returned. */ -int patternHashSlot(char *pattern, int length) { - int s = -1; /* index of the first '{' */ - - for (int i = 0; i < length; i++) { - if (pattern[i] == '*' || pattern[i] == '?' || pattern[i] == '[') { - /* Wildcard or character class found. Keys can be in any slot. */ - return -1; - } else if (pattern[i] == '\\') { - /* Escaped character. Computing slot in this case is not - * implemented. We would need a temp buffer. */ - return -1; - } else if (s == -1 && pattern[i] == '{') { - /* Opening brace '{' found. */ - s = i; - } else if (s >= 0 && pattern[i] == '}' && i == s + 1) { - /* Empty tag '{}' found. The whole key is hashed. Ignore braces. */ - s = -2; - } else if (s >= 0 && pattern[i] == '}') { - /* Non-empty tag '{...}' found. Hash what's between braces. */ - return crc16(pattern + s + 1, i - s - 1) & 0x3FFF; - } - } - - /* The pattern matches a single key. Hash the whole pattern. 
*/ - return crc16(pattern, length) & 0x3FFF; -} - /* ----------------------------------------------------------------------------- * CLUSTER node API * -------------------------------------------------------------------------- */ @@ -1667,18 +1579,6 @@ void clusterDelNode(clusterNode *delnode) { freeClusterNode(delnode); } -/* Cluster node sanity check. Returns C_OK if the node id - * is valid an C_ERR otherwise. */ -int verifyClusterNodeId(const char *name, int length) { - if (length != CLUSTER_NAMELEN) return C_ERR; - for (int i = 0; i < length; i++) { - if (name[i] >= 'a' && name[i] <= 'z') continue; - if (name[i] >= '0' && name[i] <= '9') continue; - return C_ERR; - } - return C_OK; -} - /* Node lookup by name */ clusterNode *clusterLookupNode(const char *name, int length) { if (verifyClusterNodeId(name, length) != C_OK) return NULL; @@ -6537,656 +6437,6 @@ void removeChannelsInSlot(unsigned int slot) { zfree(channels); } -/* ----------------------------------------------------------------------------- - * DUMP, RESTORE and MIGRATE commands - * -------------------------------------------------------------------------- */ - -/* Generates a DUMP-format representation of the object 'o', adding it to the - * io stream pointed by 'rio'. This function can't fail. */ -void createDumpPayload(rio *payload, robj *o, robj *key, int dbid) { - unsigned char buf[2]; - uint64_t crc; - - /* Serialize the object in an RDB-like format. It consist of an object type - * byte followed by the serialized object. This is understood by RESTORE. */ - rioInitWithBuffer(payload,sdsempty()); - serverAssert(rdbSaveObjectType(payload,o)); - serverAssert(rdbSaveObject(payload,o,key,dbid)); - - /* Write the footer, this is how it looks like: - * ----------------+---------------------+---------------+ - * ... RDB payload | 2 bytes RDB version | 8 bytes CRC64 | - * ----------------+---------------------+---------------+ - * RDB version and CRC are both in little endian. 
- */ - - /* RDB version */ - buf[0] = RDB_VERSION & 0xff; - buf[1] = (RDB_VERSION >> 8) & 0xff; - payload->io.buffer.ptr = sdscatlen(payload->io.buffer.ptr,buf,2); - - /* CRC64 */ - crc = crc64(0,(unsigned char*)payload->io.buffer.ptr, - sdslen(payload->io.buffer.ptr)); - memrev64ifbe(&crc); - payload->io.buffer.ptr = sdscatlen(payload->io.buffer.ptr,&crc,8); -} - -/* Verify that the RDB version of the dump payload matches the one of this Redis - * instance and that the checksum is ok. - * If the DUMP payload looks valid C_OK is returned, otherwise C_ERR - * is returned. If rdbver_ptr is not NULL, its populated with the value read - * from the input buffer. */ -int verifyDumpPayload(unsigned char *p, size_t len, uint16_t *rdbver_ptr) { - unsigned char *footer; - uint16_t rdbver; - uint64_t crc; - - /* At least 2 bytes of RDB version and 8 of CRC64 should be present. */ - if (len < 10) return C_ERR; - footer = p+(len-10); - - /* Set and verify RDB version. */ - rdbver = (footer[1] << 8) | footer[0]; - if (rdbver_ptr) { - *rdbver_ptr = rdbver; - } - if (rdbver > RDB_VERSION) return C_ERR; - - if (server.skip_checksum_validation) - return C_OK; - - /* Verify CRC64 */ - crc = crc64(0,p,len-8); - memrev64ifbe(&crc); - return (memcmp(&crc,footer+2,8) == 0) ? C_OK : C_ERR; -} - -/* DUMP keyname - * DUMP is actually not used by Redis Cluster but it is the obvious - * complement of RESTORE and can be useful for different applications. */ -void dumpCommand(client *c) { - robj *o; - rio payload; - - /* Check if the key is here. */ - if ((o = lookupKeyRead(c->db,c->argv[1])) == NULL) { - addReplyNull(c); - return; - } - - /* Create the DUMP encoded representation. 
*/ - createDumpPayload(&payload,o,c->argv[1],c->db->id); - - /* Transfer to the client */ - addReplyBulkSds(c,payload.io.buffer.ptr); - return; -} - -/* RESTORE key ttl serialized-value [REPLACE] [ABSTTL] [IDLETIME seconds] [FREQ frequency] */ -void restoreCommand(client *c) { - long long ttl, lfu_freq = -1, lru_idle = -1, lru_clock = -1; - rio payload; - int j, type, replace = 0, absttl = 0; - robj *obj; - - /* Parse additional options */ - for (j = 4; j < c->argc; j++) { - int additional = c->argc-j-1; - if (!strcasecmp(c->argv[j]->ptr,"replace")) { - replace = 1; - } else if (!strcasecmp(c->argv[j]->ptr,"absttl")) { - absttl = 1; - } else if (!strcasecmp(c->argv[j]->ptr,"idletime") && additional >= 1 && - lfu_freq == -1) - { - if (getLongLongFromObjectOrReply(c,c->argv[j+1],&lru_idle,NULL) - != C_OK) return; - if (lru_idle < 0) { - addReplyError(c,"Invalid IDLETIME value, must be >= 0"); - return; - } - lru_clock = LRU_CLOCK(); - j++; /* Consume additional arg. */ - } else if (!strcasecmp(c->argv[j]->ptr,"freq") && additional >= 1 && - lru_idle == -1) - { - if (getLongLongFromObjectOrReply(c,c->argv[j+1],&lfu_freq,NULL) - != C_OK) return; - if (lfu_freq < 0 || lfu_freq > 255) { - addReplyError(c,"Invalid FREQ value, must be >= 0 and <= 255"); - return; - } - j++; /* Consume additional arg. */ - } else { - addReplyErrorObject(c,shared.syntaxerr); - return; - } - } - - /* Make sure this key does not already exist here... */ - robj *key = c->argv[1]; - if (!replace && lookupKeyWrite(c->db,key) != NULL) { - addReplyErrorObject(c,shared.busykeyerr); - return; - } - - /* Check if the TTL value makes sense */ - if (getLongLongFromObjectOrReply(c,c->argv[2],&ttl,NULL) != C_OK) { - return; - } else if (ttl < 0) { - addReplyError(c,"Invalid TTL value, must be >= 0"); - return; - } - - /* Verify RDB version and data checksum. 
*/ - if (verifyDumpPayload(c->argv[3]->ptr,sdslen(c->argv[3]->ptr),NULL) == C_ERR) - { - addReplyError(c,"DUMP payload version or checksum are wrong"); - return; - } - - rioInitWithBuffer(&payload,c->argv[3]->ptr); - if (((type = rdbLoadObjectType(&payload)) == -1) || - ((obj = rdbLoadObject(type,&payload,key->ptr,c->db->id,NULL)) == NULL)) - { - addReplyError(c,"Bad data format"); - return; - } - - /* Remove the old key if needed. */ - int deleted = 0; - if (replace) - deleted = dbDelete(c->db,key); - - if (ttl && !absttl) ttl+=commandTimeSnapshot(); - if (ttl && checkAlreadyExpired(ttl)) { - if (deleted) { - robj *aux = server.lazyfree_lazy_server_del ? shared.unlink : shared.del; - rewriteClientCommandVector(c, 2, aux, key); - signalModifiedKey(c,c->db,key); - notifyKeyspaceEvent(NOTIFY_GENERIC,"del",key,c->db->id); - server.dirty++; - } - decrRefCount(obj); - addReply(c, shared.ok); - return; - } - - /* Create the key and set the TTL if any */ - dbAdd(c->db,key,obj); - if (ttl) { - setExpire(c,c->db,key,ttl); - if (!absttl) { - /* Propagate TTL as absolute timestamp */ - robj *ttl_obj = createStringObjectFromLongLong(ttl); - rewriteClientCommandArgument(c,2,ttl_obj); - decrRefCount(ttl_obj); - rewriteClientCommandArgument(c,c->argc,shared.absttl); - } - } - objectSetLRUOrLFU(obj,lfu_freq,lru_idle,lru_clock,1000); - signalModifiedKey(c,c->db,key); - notifyKeyspaceEvent(NOTIFY_GENERIC,"restore",key,c->db->id); - addReply(c,shared.ok); - server.dirty++; -} - -/* MIGRATE socket cache implementation. - * - * We take a map between host:ip and a TCP socket that we used to connect - * to this instance in recent time. - * This sockets are closed when the max number we cache is reached, and also - * in serverCron() when they are around for more than a few seconds. */ -#define MIGRATE_SOCKET_CACHE_ITEMS 64 /* max num of items in the cache. */ -#define MIGRATE_SOCKET_CACHE_TTL 10 /* close cached sockets after 10 sec. 
*/ - -typedef struct migrateCachedSocket { - connection *conn; - long last_dbid; - time_t last_use_time; -} migrateCachedSocket; - -/* Return a migrateCachedSocket containing a TCP socket connected with the - * target instance, possibly returning a cached one. - * - * This function is responsible of sending errors to the client if a - * connection can't be established. In this case -1 is returned. - * Otherwise on success the socket is returned, and the caller should not - * attempt to free it after usage. - * - * If the caller detects an error while using the socket, migrateCloseSocket() - * should be called so that the connection will be created from scratch - * the next time. */ -migrateCachedSocket* migrateGetSocket(client *c, robj *host, robj *port, long timeout) { - connection *conn; - sds name = sdsempty(); - migrateCachedSocket *cs; - - /* Check if we have an already cached socket for this ip:port pair. */ - name = sdscatlen(name,host->ptr,sdslen(host->ptr)); - name = sdscatlen(name,":",1); - name = sdscatlen(name,port->ptr,sdslen(port->ptr)); - cs = dictFetchValue(server.migrate_cached_sockets,name); - if (cs) { - sdsfree(name); - cs->last_use_time = server.unixtime; - return cs; - } - - /* No cached socket, create one. */ - if (dictSize(server.migrate_cached_sockets) == MIGRATE_SOCKET_CACHE_ITEMS) { - /* Too many items, drop one at random. */ - dictEntry *de = dictGetRandomKey(server.migrate_cached_sockets); - cs = dictGetVal(de); - connClose(cs->conn); - zfree(cs); - dictDelete(server.migrate_cached_sockets,dictGetKey(de)); - } - - /* Create the connection */ - conn = connCreate(connTypeOfCluster()); - if (connBlockingConnect(conn, host->ptr, atoi(port->ptr), timeout) - != C_OK) { - addReplyError(c,"-IOERR error or timeout connecting to the client"); - connClose(conn); - sdsfree(name); - return NULL; - } - connEnableTcpNoDelay(conn); - - /* Add to the cache and return it to the caller. 
*/ - cs = zmalloc(sizeof(*cs)); - cs->conn = conn; - - cs->last_dbid = -1; - cs->last_use_time = server.unixtime; - dictAdd(server.migrate_cached_sockets,name,cs); - return cs; -} - -/* Free a migrate cached connection. */ -void migrateCloseSocket(robj *host, robj *port) { - sds name = sdsempty(); - migrateCachedSocket *cs; - - name = sdscatlen(name,host->ptr,sdslen(host->ptr)); - name = sdscatlen(name,":",1); - name = sdscatlen(name,port->ptr,sdslen(port->ptr)); - cs = dictFetchValue(server.migrate_cached_sockets,name); - if (!cs) { - sdsfree(name); - return; - } - - connClose(cs->conn); - zfree(cs); - dictDelete(server.migrate_cached_sockets,name); - sdsfree(name); -} - -void migrateCloseTimedoutSockets(void) { - dictIterator *di = dictGetSafeIterator(server.migrate_cached_sockets); - dictEntry *de; - - while((de = dictNext(di)) != NULL) { - migrateCachedSocket *cs = dictGetVal(de); - - if ((server.unixtime - cs->last_use_time) > MIGRATE_SOCKET_CACHE_TTL) { - connClose(cs->conn); - zfree(cs); - dictDelete(server.migrate_cached_sockets,dictGetKey(de)); - } - } - dictReleaseIterator(di); -} - -/* MIGRATE host port key dbid timeout [COPY | REPLACE | AUTH password | - * AUTH2 username password] - * - * On in the multiple keys form: - * - * MIGRATE host port "" dbid timeout [COPY | REPLACE | AUTH password | - * AUTH2 username password] KEYS key1 key2 ... keyN */ -void migrateCommand(client *c) { - migrateCachedSocket *cs; - int copy = 0, replace = 0, j; - char *username = NULL; - char *password = NULL; - long timeout; - long dbid; - robj **ov = NULL; /* Objects to migrate. */ - robj **kv = NULL; /* Key names. */ - robj **newargv = NULL; /* Used to rewrite the command as DEL ... keys ... */ - rio cmd, payload; - int may_retry = 1; - int write_error = 0; - int argv_rewritten = 0; - - /* To support the KEYS option we need the following additional state. */ - int first_key = 3; /* Argument index of the first key. 
*/ - int num_keys = 1; /* By default only migrate the 'key' argument. */ - - /* Parse additional options */ - for (j = 6; j < c->argc; j++) { - int moreargs = (c->argc-1) - j; - if (!strcasecmp(c->argv[j]->ptr,"copy")) { - copy = 1; - } else if (!strcasecmp(c->argv[j]->ptr,"replace")) { - replace = 1; - } else if (!strcasecmp(c->argv[j]->ptr,"auth")) { - if (!moreargs) { - addReplyErrorObject(c,shared.syntaxerr); - return; - } - j++; - password = c->argv[j]->ptr; - redactClientCommandArgument(c,j); - } else if (!strcasecmp(c->argv[j]->ptr,"auth2")) { - if (moreargs < 2) { - addReplyErrorObject(c,shared.syntaxerr); - return; - } - username = c->argv[++j]->ptr; - redactClientCommandArgument(c,j); - password = c->argv[++j]->ptr; - redactClientCommandArgument(c,j); - } else if (!strcasecmp(c->argv[j]->ptr,"keys")) { - if (sdslen(c->argv[3]->ptr) != 0) { - addReplyError(c, - "When using MIGRATE KEYS option, the key argument" - " must be set to the empty string"); - return; - } - first_key = j+1; - num_keys = c->argc - j - 1; - break; /* All the remaining args are keys. */ - } else { - addReplyErrorObject(c,shared.syntaxerr); - return; - } - } - - /* Sanity check */ - if (getLongFromObjectOrReply(c,c->argv[5],&timeout,NULL) != C_OK || - getLongFromObjectOrReply(c,c->argv[4],&dbid,NULL) != C_OK) - { - return; - } - if (timeout <= 0) timeout = 1000; - - /* Check if the keys are here. If at least one key is to migrate, do it - * otherwise if all the keys are missing reply with "NOKEY" to signal - * the caller there was nothing to migrate. We don't return an error in - * this case, since often this is due to a normal condition like the key - * expiring in the meantime. 
*/ - ov = zrealloc(ov,sizeof(robj*)*num_keys); - kv = zrealloc(kv,sizeof(robj*)*num_keys); - int oi = 0; - - for (j = 0; j < num_keys; j++) { - if ((ov[oi] = lookupKeyRead(c->db,c->argv[first_key+j])) != NULL) { - kv[oi] = c->argv[first_key+j]; - oi++; - } - } - num_keys = oi; - if (num_keys == 0) { - zfree(ov); zfree(kv); - addReplySds(c,sdsnew("+NOKEY\r\n")); - return; - } - -try_again: - write_error = 0; - - /* Connect */ - cs = migrateGetSocket(c,c->argv[1],c->argv[2],timeout); - if (cs == NULL) { - zfree(ov); zfree(kv); - return; /* error sent to the client by migrateGetSocket() */ - } - - rioInitWithBuffer(&cmd,sdsempty()); - - /* Authentication */ - if (password) { - int arity = username ? 3 : 2; - serverAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',arity)); - serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"AUTH",4)); - if (username) { - serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,username, - sdslen(username))); - } - serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,password, - sdslen(password))); - } - - /* Send the SELECT command if the current DB is not already selected. */ - int select = cs->last_dbid != dbid; /* Should we emit SELECT? */ - if (select) { - serverAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',2)); - serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"SELECT",6)); - serverAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,dbid)); - } - - int non_expired = 0; /* Number of keys that we'll find non expired. - Note that serializing large keys may take some time - so certain keys that were found non expired by the - lookupKey() function, may be expired later. */ - - /* Create RESTORE payload and generate the protocol to call the command. 
*/ - for (j = 0; j < num_keys; j++) { - long long ttl = 0; - long long expireat = getExpire(c->db,kv[j]); - - if (expireat != -1) { - ttl = expireat-commandTimeSnapshot(); - if (ttl < 0) { - continue; - } - if (ttl < 1) ttl = 1; - } - - /* Relocate valid (non expired) keys and values into the array in successive - * positions to remove holes created by the keys that were present - * in the first lookup but are now expired after the second lookup. */ - ov[non_expired] = ov[j]; - kv[non_expired++] = kv[j]; - - serverAssertWithInfo(c,NULL, - rioWriteBulkCount(&cmd,'*',replace ? 5 : 4)); - - if (server.cluster_enabled) - serverAssertWithInfo(c,NULL, - rioWriteBulkString(&cmd,"RESTORE-ASKING",14)); - else - serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"RESTORE",7)); - serverAssertWithInfo(c,NULL,sdsEncodedObject(kv[j])); - serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,kv[j]->ptr, - sdslen(kv[j]->ptr))); - serverAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,ttl)); - - /* Emit the payload argument, that is the serialized object using - * the DUMP format. */ - createDumpPayload(&payload,ov[j],kv[j],dbid); - serverAssertWithInfo(c,NULL, - rioWriteBulkString(&cmd,payload.io.buffer.ptr, - sdslen(payload.io.buffer.ptr))); - sdsfree(payload.io.buffer.ptr); - - /* Add the REPLACE option to the RESTORE command if it was specified - * as a MIGRATE option. */ - if (replace) - serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"REPLACE",7)); - } - - /* Fix the actual number of keys we are migrating. */ - num_keys = non_expired; - - /* Transfer the query to the other node in 64K chunks. */ - errno = 0; - { - sds buf = cmd.io.buffer.ptr; - size_t pos = 0, towrite; - int nwritten = 0; - - while ((towrite = sdslen(buf)-pos) > 0) { - towrite = (towrite > (64*1024) ? 
(64*1024) : towrite); - nwritten = connSyncWrite(cs->conn,buf+pos,towrite,timeout); - if (nwritten != (signed)towrite) { - write_error = 1; - goto socket_err; - } - pos += nwritten; - } - } - - char buf0[1024]; /* Auth reply. */ - char buf1[1024]; /* Select reply. */ - char buf2[1024]; /* Restore reply. */ - - /* Read the AUTH reply if needed. */ - if (password && connSyncReadLine(cs->conn, buf0, sizeof(buf0), timeout) <= 0) - goto socket_err; - - /* Read the SELECT reply if needed. */ - if (select && connSyncReadLine(cs->conn, buf1, sizeof(buf1), timeout) <= 0) - goto socket_err; - - /* Read the RESTORE replies. */ - int error_from_target = 0; - int socket_error = 0; - int del_idx = 1; /* Index of the key argument for the replicated DEL op. */ - - /* Allocate the new argument vector that will replace the current command, - * to propagate the MIGRATE as a DEL command (if no COPY option was given). - * We allocate num_keys+1 because the additional argument is for "DEL" - * command name itself. */ - if (!copy) newargv = zmalloc(sizeof(robj*)*(num_keys+1)); - - for (j = 0; j < num_keys; j++) { - if (connSyncReadLine(cs->conn, buf2, sizeof(buf2), timeout) <= 0) { - socket_error = 1; - break; - } - if ((password && buf0[0] == '-') || - (select && buf1[0] == '-') || - buf2[0] == '-') - { - /* On error assume that last_dbid is no longer valid. */ - if (!error_from_target) { - cs->last_dbid = -1; - char *errbuf; - if (password && buf0[0] == '-') errbuf = buf0; - else if (select && buf1[0] == '-') errbuf = buf1; - else errbuf = buf2; - - error_from_target = 1; - addReplyErrorFormat(c,"Target instance replied with error: %s", - errbuf+1); - } - } else { - if (!copy) { - /* No COPY option: remove the local key, signal the change. */ - dbDelete(c->db,kv[j]); - signalModifiedKey(c,c->db,kv[j]); - notifyKeyspaceEvent(NOTIFY_GENERIC,"del",kv[j],c->db->id); - server.dirty++; - - /* Populate the argument vector to replace the old one. 
*/ - newargv[del_idx++] = kv[j]; - incrRefCount(kv[j]); - } - } - } - - /* On socket error, if we want to retry, do it now before rewriting the - * command vector. We only retry if we are sure nothing was processed - * and we failed to read the first reply (j == 0 test). */ - if (!error_from_target && socket_error && j == 0 && may_retry && - errno != ETIMEDOUT) - { - goto socket_err; /* A retry is guaranteed because of tested conditions.*/ - } - - /* On socket errors, close the migration socket now that we still have - * the original host/port in the ARGV. Later the original command may be - * rewritten to DEL and will be too later. */ - if (socket_error) migrateCloseSocket(c->argv[1],c->argv[2]); - - if (!copy) { - /* Translate MIGRATE as DEL for replication/AOF. Note that we do - * this only for the keys for which we received an acknowledgement - * from the receiving Redis server, by using the del_idx index. */ - if (del_idx > 1) { - newargv[0] = createStringObject("DEL",3); - /* Note that the following call takes ownership of newargv. */ - replaceClientCommandVector(c,del_idx,newargv); - argv_rewritten = 1; - } else { - /* No key transfer acknowledged, no need to rewrite as DEL. */ - zfree(newargv); - } - newargv = NULL; /* Make it safe to call zfree() on it in the future. */ - } - - /* If we are here and a socket error happened, we don't want to retry. - * Just signal the problem to the client, but only do it if we did not - * already queue a different error reported by the destination server. */ - if (!error_from_target && socket_error) { - may_retry = 0; - goto socket_err; - } - - if (!error_from_target) { - /* Success! Update the last_dbid in migrateCachedSocket, so that we can - * avoid SELECT the next time if the target DB is the same. Reply +OK. - * - * Note: If we reached this point, even if socket_error is true - * still the SELECT command succeeded (otherwise the code jumps to - * socket_err label. 
*/ - cs->last_dbid = dbid; - addReply(c,shared.ok); - } else { - /* On error we already sent it in the for loop above, and set - * the currently selected socket to -1 to force SELECT the next time. */ - } - - sdsfree(cmd.io.buffer.ptr); - zfree(ov); zfree(kv); zfree(newargv); - return; - -/* On socket errors we try to close the cached socket and try again. - * It is very common for the cached socket to get closed, if just reopening - * it works it's a shame to notify the error to the caller. */ -socket_err: - /* Cleanup we want to perform in both the retry and no retry case. - * Note: Closing the migrate socket will also force SELECT next time. */ - sdsfree(cmd.io.buffer.ptr); - - /* If the command was rewritten as DEL and there was a socket error, - * we already closed the socket earlier. While migrateCloseSocket() - * is idempotent, the host/port arguments are now gone, so don't do it - * again. */ - if (!argv_rewritten) migrateCloseSocket(c->argv[1],c->argv[2]); - zfree(newargv); - newargv = NULL; /* This will get reallocated on retry. */ - - /* Retry only if it's not a timeout and we never attempted a retry - * (or the code jumping here did not set may_retry to zero). */ - if (errno != ETIMEDOUT && may_retry) { - may_retry = 0; - goto try_again; - } - - /* Cleanup we want to do if no retry is attempted. */ - zfree(ov); zfree(kv); - addReplyErrorSds(c, sdscatprintf(sdsempty(), - "-IOERR error or timeout %s to target instance", - write_error ? 
"writing" : "reading")); - return; -} /* ----------------------------------------------------------------------------- * Cluster functions related to serving / redirecting clients From 33ef6a30039692ee3a106d058e2e9df1c7198ecf Mon Sep 17 00:00:00 2001 From: Josh Hershberg Date: Wed, 1 Nov 2023 12:37:00 +0200 Subject: [PATCH 09/15] Cluster refactor: s/clusterNodeGetSlotBit/clusterNodeCoversSlot/ Simple rename, "GetSlotBit" is implementation specific Signed-off-by: Josh Hershberg --- src/cluster.h | 2 +- src/cluster_legacy.c | 10 +++++----- src/db.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/cluster.h b/src/cluster.h index d9c6d9413b6..7cf412351fa 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -56,7 +56,7 @@ void clusterUpdateMyselfHostname(void); void clusterUpdateMyselfAnnouncedPorts(void); sds clusterGenNodesDescription(client *c, int filter, int tls_primary); sds genClusterInfoString(void); -int clusterNodeGetSlotBit(clusterNode *n, int slot); +int clusterNodeCoversSlot(clusterNode *n, int slot); void clusterUpdateMyselfHumanNodename(void); int isValidAuxString(char *s, unsigned int length); int getNodeDefaultClientPort(clusterNode *n); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index aa6625fd1f2..bba4395f391 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -55,7 +55,7 @@ void clusterSendPing(clusterLink *link, int type); void clusterSendFail(char *nodename); void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request); void clusterUpdateState(void); -int clusterNodeGetSlotBit(clusterNode *n, int slot); +int clusterNodeCoversSlot(clusterNode *n, int slot); list *clusterGetNodesInMyShard(clusterNode *node); int clusterNodeAddSlave(clusterNode *master, clusterNode *slave); int clusterAddSlot(clusterNode *n, int slot); @@ -4065,7 +4065,7 @@ void clusterFailoverReplaceYourMaster(void) { /* 2) Claim all the slots assigned to our master. 
*/ for (j = 0; j < CLUSTER_SLOTS; j++) { - if (clusterNodeGetSlotBit(oldmaster,j)) { + if (clusterNodeCoversSlot(oldmaster, j)) { clusterDelSlot(j); clusterAddSlot(myself,j); } @@ -4843,7 +4843,7 @@ int clusterNodeClearSlotBit(clusterNode *n, int slot) { } /* Return the slot bit from the cluster node structure. */ -int clusterNodeGetSlotBit(clusterNode *n, int slot) { +int clusterNodeCoversSlot(clusterNode *n, int slot) { return bitmapTestBit(n->slots,slot); } @@ -4882,7 +4882,7 @@ int clusterDelNodeSlots(clusterNode *node) { int deleted = 0, j; for (j = 0; j < CLUSTER_SLOTS; j++) { - if (clusterNodeGetSlotBit(node,j)) { + if (clusterNodeCoversSlot(node, j)) { clusterDelSlot(j); deleted++; } @@ -5234,7 +5234,7 @@ sds clusterGenNodeDescription(client *c, clusterNode *node, int tls_primary) { for (j = 0; j < CLUSTER_SLOTS; j++) { int bit; - if ((bit = clusterNodeGetSlotBit(node,j)) != 0) { + if ((bit = clusterNodeCoversSlot(node, j)) != 0) { if (start == -1) start = j; } if (start != -1 && (!bit || j == CLUSTER_SLOTS-1)) { diff --git a/src/db.c b/src/db.c index f77db3f885c..a369c9a9a22 100644 --- a/src/db.c +++ b/src/db.c @@ -2197,7 +2197,7 @@ int dbExpand(const redisDb *db, uint64_t db_size, dbKeyType keyType, int try_exp dict *d; if (server.cluster_enabled) { for (int i = 0; i < CLUSTER_SLOTS; i++) { - if (clusterNodeGetSlotBit(getMyClusterNode(), i)) { + if (clusterNodeCoversSlot(getMyClusterNode(), i)) { /* We don't know exact number of keys that would fall into each slot, but we can approximate it, assuming even distribution. */ if (keyType == DB_MAIN) { d = db->dict[i]; From 4afc54ad9b77de13379b32b3fb934b00e640583a Mon Sep 17 00:00:00 2001 From: Josh Hershberg Date: Wed, 1 Nov 2023 14:51:49 +0200 Subject: [PATCH 10/15] Cluster refactor: break up clusterCommand Divide up clusterCommand into clusterCommand for shared sub-commands and clusterCommandSpecial for implementation specific sub-commands. 
So too, the cluster command help sub-command has been divided into two implementations, clusterCommandHelp and clusterCommandSpecialHelp. Some common sub-command implementations have been extracted and their implementations either made shared or else implementation specific. Signed-off-by: Josh Hershberg --- src/cluster.c | 174 ++++ src/cluster.h | 13 +- src/cluster_legacy.c | 2081 ++++++++++++++++++++---------------------- src/module.c | 2 +- 4 files changed, 1165 insertions(+), 1105 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index dd053de708a..3439dab0a2e 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -744,3 +744,177 @@ int isValidAuxString(char *s, unsigned int length) { } return 1; } + +void clusterCommandMyId(client *c) { + char *name = clusterNodeGetName(getMyClusterNode()); + if (name) { + addReplyBulkCBuffer(c,name, CLUSTER_NAMELEN); + } else { + addReplyError(c, "No ID yet"); + } +} + +void clusterCommandMyShardId(client *c) { + char *sid = clusterNodeGetShardId(getMyClusterNode()); + if (sid) { + addReplyBulkCBuffer(c,sid, CLUSTER_NAMELEN); + } else { + addReplyError(c, "No shard ID yet"); + } +} + +/* When a cluster command is called, we need to decide whether to return TLS info or + * non-TLS info by the client's connection type. However if the command is called by + * a Lua script or RM_call, there is no connection in the fake client, so we use + * server.current_client here to get the real client if available. And if it is not + * available (modules may call commands without a real client), we return the default + * info, which is determined by server.tls_cluster.
*/ +static int shouldReturnTlsInfo(void) { + if (server.current_client && server.current_client->conn) { + return connIsTLS(server.current_client->conn); + } else { + return server.tls_cluster; + } +} + +unsigned int countKeysInSlot(unsigned int slot) { + return dictSize(server.db->dict[slot]); +} + +void clusterCommandHelp(client *c) { + const char *help[] = { + "COUNTKEYSINSLOT ", + " Return the number of keys in .", + "GETKEYSINSLOT ", + " Return key names stored by current node in a slot.", + "INFO", + " Return information about the cluster.", + "KEYSLOT ", + " Return the hash slot for .", + "MYID", + " Return the node id.", + "MYSHARDID", + " Return the node's shard id.", + "NODES", + " Return cluster configuration seen by node. Output format:", + " ...", + "REPLICAS ", + " Return replicas.", + "SLOTS", + " Return information about slots range mappings. Each range is made of:", + " start, end, master and replicas IP addresses, ports and ids", + "SHARDS", + " Return information about slot range mappings and the nodes associated with them.", + NULL + }; + + addExtendedReplyHelp(c, help, clusterCommandSpecialHelp()); +} + +void clusterCommand(client *c) { + if (server.cluster_enabled == 0) { + addReplyError(c,"This instance has cluster support disabled"); + return; + } + + if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"help")) { + clusterCommandHelp(c); + } else if (!strcasecmp(c->argv[1]->ptr,"nodes") && c->argc == 2) { + /* CLUSTER NODES */ + /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. 
*/ + sds nodes = clusterGenNodesDescription(c, 0, shouldReturnTlsInfo()); + addReplyVerbatim(c,nodes,sdslen(nodes),"txt"); + sdsfree(nodes); + } else if (!strcasecmp(c->argv[1]->ptr,"myid") && c->argc == 2) { + /* CLUSTER MYID */ + clusterCommandMyId(c); + } else if (!strcasecmp(c->argv[1]->ptr,"myshardid") && c->argc == 2) { + /* CLUSTER MYSHARDID */ + clusterCommandMyShardId(c); + } else if (!strcasecmp(c->argv[1]->ptr,"slots") && c->argc == 2) { + /* CLUSTER SLOTS */ + clusterCommandSlots(c); + } else if (!strcasecmp(c->argv[1]->ptr,"shards") && c->argc == 2) { + /* CLUSTER SHARDS */ + clusterCommandShards(c); + } else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) { + /* CLUSTER INFO */ + + sds info = genClusterInfoString(); + + /* Produce the reply protocol. */ + addReplyVerbatim(c,info,sdslen(info),"txt"); + sdsfree(info); + } else if (!strcasecmp(c->argv[1]->ptr,"keyslot") && c->argc == 3) { + /* CLUSTER KEYSLOT */ + sds key = c->argv[2]->ptr; + + addReplyLongLong(c,keyHashSlot(key,sdslen(key))); + } else if (!strcasecmp(c->argv[1]->ptr,"countkeysinslot") && c->argc == 3) { + /* CLUSTER COUNTKEYSINSLOT */ + long long slot; + + if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK) + return; + if (slot < 0 || slot >= CLUSTER_SLOTS) { + addReplyError(c,"Invalid slot"); + return; + } + addReplyLongLong(c,countKeysInSlot(slot)); + } else if (!strcasecmp(c->argv[1]->ptr,"getkeysinslot") && c->argc == 4) { + /* CLUSTER GETKEYSINSLOT */ + long long maxkeys, slot; + + if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK) + return; + if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL) + != C_OK) + return; + if (slot < 0 || slot >= CLUSTER_SLOTS || maxkeys < 0) { + addReplyError(c,"Invalid slot or number of keys"); + return; + } + + unsigned int keys_in_slot = countKeysInSlot(slot); + unsigned int numkeys = maxkeys > keys_in_slot ? 
keys_in_slot : maxkeys; + addReplyArrayLen(c,numkeys); + dictIterator *iter = NULL; + dictEntry *de = NULL; + iter = dictGetIterator(server.db->dict[slot]); + for (unsigned int i = 0; i < numkeys; i++) { + de = dictNext(iter); + serverAssert(de != NULL); + sds sdskey = dictGetKey(de); + addReplyBulkCBuffer(c, sdskey, sdslen(sdskey)); + } + dictReleaseIterator(iter); + } else if ((!strcasecmp(c->argv[1]->ptr,"slaves") || + !strcasecmp(c->argv[1]->ptr,"replicas")) && c->argc == 3) { + /* CLUSTER SLAVES */ + /* CLUSTER REPLICAS */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + int j; + + /* Lookup the specified node in our table. */ + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return; + } + + if (clusterNodeIsSlave(n)) { + addReplyError(c,"The specified node is not a master"); + return; + } + + /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ + addReplyArrayLen(c, getNumSlaves(n)); + for (j = 0; j < getNumSlaves(n); j++) { + sds ni = clusterGenNodeDescription(c, getSlave(n, j), shouldReturnTlsInfo()); + addReplyBulkCString(c,ni); + sdsfree(ni); + } + } else if(!clusterCommandSpecial(c)) { + addReplySubcommandSyntaxError(c); + return; + } +} diff --git a/src/cluster.h b/src/cluster.h index 7cf412351fa..5160582b387 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -68,7 +68,7 @@ int getClusterSize(void); char** getClusterNodesList(size_t *numnodes); int nodeIsMaster(clusterNode *n); int handleDebugClusterCommand(client *c); -int clusterNodeConfirmedReachable(clusterNode *node); +int clusterNodePending(clusterNode *node); char* clusterNodeIp(clusterNode *node); int clusterNodeIsSlave(clusterNode *node); clusterNode *clusterNodeGetSlaveof(clusterNode *node); @@ -76,6 +76,17 @@ char* clusterNodeGetName(clusterNode *node); int clusterNodeTimedOut(clusterNode *node); int clusterNodeIsFailing(clusterNode *node); int clusterNodeIsNoFailover(clusterNode *node); +void 
clusterCommand(client *c); +int clusterCommandSpecial(client *c); +const char** clusterCommandSpecialHelp(void); +char* clusterNodeGetShardId(clusterNode *node); +void clusterCommandSlots(client * c); +void clusterCommandMyId(client *c); +void clusterCommandMyShardId(client *c); +void clusterCommandShards(client *c); +sds clusterGenNodeDescription(client *c, clusterNode *node, int tls_primary); +int getNumSlaves(clusterNode *node); +clusterNode *getSlave(clusterNode *node, int slave_idx); char **clusterDebugCommandHelp(void); ConnectionType *connTypeOfCluster(void); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index bba4395f391..2b080ef891d 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -120,20 +120,6 @@ static inline int defaultClientPort(void) { return server.tls_cluster ? server.tls_port : server.port; } -/* When a cluster command is called, we need to decide whether to return TLS info or - * non-TLS info by the client's connection type. However if the command is called by - * a Lua script or RM_call, there is no connection in the fake client, so we use - * server.current_client here to get the real client if available. And if it is not - * available (modules may call commands without a real client), we return the default - * info, which is determined by server.tls_cluster. */ -static int shouldReturnTlsInfo(void) { - if (server.current_client && server.current_client->conn) { - return connIsTLS(server.current_client->conn); - } else { - return server.tls_cluster; - } -} - #define isSlotUnclaimed(slot) \ (server.cluster->slots[slot] == NULL || \ bitmapTestBit(server.cluster->owner_not_claiming_slot, slot)) @@ -5678,7 +5664,7 @@ void addShardReplyForClusterShards(client *c, list *nodes) { /* Add to the output buffer of the given client, an array of slot (start, end) * pair owned by the shard, also the primary and set of replica(s) along with * information about each node. 
*/ -void clusterReplyShards(client *c) { +void clusterCommandShards(client *c) { addReplyArrayLen(c, dictSize(server.cluster->shards)); /* This call will add slot_info_pairs to all nodes */ clusterGenNodesSlotsInfo(0); @@ -5689,7 +5675,7 @@ void clusterReplyShards(client *c) { dictReleaseIterator(di); } -void clusterReplyMultiBulkSlots(client * c) { +void clusterCommandSlots(client * c) { /* Format: 1) 1) start slot * 2) end slot * 3) 1) master IP @@ -5804,1228 +5790,1117 @@ sds genClusterInfoString(void) { return info; } -void clusterCommand(client *c) { + +void removeChannelsInSlot(unsigned int slot) { + unsigned int channelcount = countChannelsInSlot(slot); + if (channelcount == 0) return; + + /* Retrieve all the channels for the slot. */ + robj **channels = zmalloc(sizeof(robj*)*channelcount); + raxIterator iter; + int j = 0; + unsigned char indexed[2]; + + indexed[0] = (slot >> 8) & 0xff; + indexed[1] = slot & 0xff; + raxStart(&iter,server.cluster->slots_to_channels); + raxSeek(&iter,">=",indexed,2); + while(raxNext(&iter)) { + if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; + channels[j++] = createStringObject((char*)iter.key + 2, iter.key_len - 2); + } + raxStop(&iter); + + pubsubUnsubscribeShardChannels(channels, channelcount); + zfree(channels); +} + + +/* ----------------------------------------------------------------------------- + * Cluster functions related to serving / redirecting clients + * -------------------------------------------------------------------------- */ + +/* The ASKING command is required after a -ASK redirection. + * The client should issue ASKING before to actually send the command to + * the target instance. See the Redis Cluster specification for more + * information. 
*/ +void askingCommand(client *c) { if (server.cluster_enabled == 0) { addReplyError(c,"This instance has cluster support disabled"); return; } + c->flags |= CLIENT_ASKING; + addReply(c,shared.ok); +} - if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"help")) { - const char *help[] = { -"ADDSLOTS [ ...]", -" Assign slots to current node.", -"ADDSLOTSRANGE [ ...]", -" Assign slots which are between and to current node.", -"BUMPEPOCH", -" Advance the cluster config epoch.", -"COUNT-FAILURE-REPORTS ", -" Return number of failure reports for .", -"COUNTKEYSINSLOT ", -" Return the number of keys in .", -"DELSLOTS [ ...]", -" Delete slots information from current node.", -"DELSLOTSRANGE [ ...]", -" Delete slots information which are between and from current node.", -"FAILOVER [FORCE|TAKEOVER]", -" Promote current replica node to being a master.", -"FORGET ", -" Remove a node from the cluster.", -"GETKEYSINSLOT ", -" Return key names stored by current node in a slot.", -"FLUSHSLOTS", -" Delete current node own slots information.", -"INFO", -" Return information about the cluster.", -"KEYSLOT ", -" Return the hash slot for .", -"MEET []", -" Connect nodes into a working cluster.", -"MYID", -" Return the node id.", -"MYSHARDID", -" Return the node's shard id.", -"NODES", -" Return cluster configuration seen by node. Output format:", -" ...", -"REPLICATE ", -" Configure current node as replica to .", -"RESET [HARD|SOFT]", -" Reset current node (default: soft).", -"SET-CONFIG-EPOCH ", -" Set config epoch of current node.", -"SETSLOT (IMPORTING |MIGRATING |STABLE|NODE )", -" Set slot state.", -"REPLICAS ", -" Return replicas.", -"SAVECONFIG", -" Force saving cluster configuration on disk.", -"SLOTS", -" Return information about slots range mappings. 
Each range is made of:", -" start, end, master and replicas IP addresses, ports and ids", -"SHARDS", -" Return information about slot range mappings and the nodes associated with them.", -"LINKS", -" Return information about all network links between this node and its peers.", -" Output format is an array where each array element is a map containing attributes of a link", -NULL - }; - addReplyHelp(c, help); - } else if (!strcasecmp(c->argv[1]->ptr,"meet") && (c->argc == 4 || c->argc == 5)) { - /* CLUSTER MEET [cport] */ - long long port, cport; +/* The READONLY command is used by clients to enter the read-only mode. + * In this mode slaves will not redirect clients as long as clients access + * with read-only commands to keys that are served by the slave's master. */ +void readonlyCommand(client *c) { + if (server.cluster_enabled == 0) { + addReplyError(c,"This instance has cluster support disabled"); + return; + } + c->flags |= CLIENT_READONLY; + addReply(c,shared.ok); +} - if (getLongLongFromObject(c->argv[3], &port) != C_OK) { - addReplyErrorFormat(c,"Invalid base port specified: %s", - (char*)c->argv[3]->ptr); - return; - } +/* The READWRITE command just clears the READONLY command state. */ +void readwriteCommand(client *c) { + if (server.cluster_enabled == 0) { + addReplyError(c,"This instance has cluster support disabled"); + return; + } + c->flags &= ~CLIENT_READONLY; + addReply(c,shared.ok); +} - if (c->argc == 5) { - if (getLongLongFromObject(c->argv[4], &cport) != C_OK) { - addReplyErrorFormat(c,"Invalid bus port specified: %s", - (char*)c->argv[4]->ptr); - return; - } - } else { - cport = port + CLUSTER_PORT_INCR; - } +/* Return the pointer to the cluster node that is able to serve the command. + * For the function to succeed the command should only target either: + * + * 1) A single key (even multiple times like RPOPLPUSH mylist mylist). + * 2) Multiple keys in the same hash slot, while the slot is stable (no + * resharding in progress). 
+ * + * On success the function returns the node that is able to serve the request. + * If the node is not 'myself' a redirection must be performed. The kind of + * redirection is specified setting the integer passed by reference + * 'error_code', which will be set to CLUSTER_REDIR_ASK or + * CLUSTER_REDIR_MOVED. + * + * When the node is 'myself' 'error_code' is set to CLUSTER_REDIR_NONE. + * + * If the command fails NULL is returned, and the reason of the failure is + * provided via 'error_code', which will be set to: + * + * CLUSTER_REDIR_CROSS_SLOT if the request contains multiple keys that + * don't belong to the same hash slot. + * + * CLUSTER_REDIR_UNSTABLE if the request contains multiple keys + * belonging to the same slot, but the slot is not stable (in migration or + * importing state, likely because a resharding is in progress). + * + * CLUSTER_REDIR_DOWN_UNBOUND if the request addresses a slot which is + * not bound to any node. In this case the cluster global state should be + * already "down" but it is fragile to rely on the update of the global state, + * so we also handle it here. + * + * CLUSTER_REDIR_DOWN_STATE and CLUSTER_REDIR_DOWN_RO_STATE if the cluster is + * down but the user attempts to execute a command that addresses one or more keys. 
*/ +clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) { + clusterNode *n = NULL; + robj *firstkey = NULL; + int multiple_keys = 0; + multiState *ms, _ms; + multiCmd mc; + int i, slot = 0, migrating_slot = 0, importing_slot = 0, missing_keys = 0, + existing_keys = 0; - if (clusterStartHandshake(c->argv[2]->ptr,port,cport) == 0 && - errno == EINVAL) - { - addReplyErrorFormat(c,"Invalid node address specified: %s:%s", - (char*)c->argv[2]->ptr, (char*)c->argv[3]->ptr); - } else { - addReply(c,shared.ok); - } - } else if (!strcasecmp(c->argv[1]->ptr,"nodes") && c->argc == 2) { - /* CLUSTER NODES */ - /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ - sds nodes = clusterGenNodesDescription(c, 0, shouldReturnTlsInfo()); - addReplyVerbatim(c,nodes,sdslen(nodes),"txt"); - sdsfree(nodes); - } else if (!strcasecmp(c->argv[1]->ptr,"myid") && c->argc == 2) { - /* CLUSTER MYID */ - addReplyBulkCBuffer(c,myself->name, CLUSTER_NAMELEN); - } else if (!strcasecmp(c->argv[1]->ptr,"myshardid") && c->argc == 2) { - /* CLUSTER MYSHARDID */ - addReplyBulkCBuffer(c,myself->shard_id, CLUSTER_NAMELEN); - } else if (!strcasecmp(c->argv[1]->ptr,"slots") && c->argc == 2) { - /* CLUSTER SLOTS */ - clusterReplyMultiBulkSlots(c); - } else if (!strcasecmp(c->argv[1]->ptr,"shards") && c->argc == 2) { - /* CLUSTER SHARDS */ - clusterReplyShards(c); - } else if (!strcasecmp(c->argv[1]->ptr,"flushslots") && c->argc == 2) { - /* CLUSTER FLUSHSLOTS */ - if (dbSize(&server.db[0], DB_MAIN) != 0) { - addReplyError(c,"DB must be empty to perform CLUSTER FLUSHSLOTS."); - return; - } - clusterDelNodeSlots(myself); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if ((!strcasecmp(c->argv[1]->ptr,"addslots") || - !strcasecmp(c->argv[1]->ptr,"delslots")) && c->argc >= 3) - { - /* CLUSTER ADDSLOTS [slot] ... */ - /* CLUSTER DELSLOTS [slot] ... 
*/ - int j, slot; - unsigned char *slots = zmalloc(CLUSTER_SLOTS); - int del = !strcasecmp(c->argv[1]->ptr,"delslots"); + /* Allow any key to be set if a module disabled cluster redirections. */ + if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) + return myself; - memset(slots,0,CLUSTER_SLOTS); - /* Check that all the arguments are parseable.*/ - for (j = 2; j < c->argc; j++) { - if ((slot = getSlotOrReply(c,c->argv[j])) == C_ERR) { - zfree(slots); - return; - } - } - /* Check that the slots are not already busy. */ - for (j = 2; j < c->argc; j++) { - slot = getSlotOrReply(c,c->argv[j]); - if (checkSlotAssignmentsOrReply(c, slots, del, slot, slot) == C_ERR) { - zfree(slots); - return; - } - } - clusterUpdateSlots(c, slots, del); - zfree(slots); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if ((!strcasecmp(c->argv[1]->ptr,"addslotsrange") || - !strcasecmp(c->argv[1]->ptr,"delslotsrange")) && c->argc >= 4) { - if (c->argc % 2 == 1) { - addReplyErrorArity(c); - return; - } - /* CLUSTER ADDSLOTSRANGE [ ...] */ - /* CLUSTER DELSLOTSRANGE [ ...] */ - int j, startslot, endslot; - unsigned char *slots = zmalloc(CLUSTER_SLOTS); - int del = !strcasecmp(c->argv[1]->ptr,"delslotsrange"); + /* Set error code optimistically for the base case. */ + if (error_code) *error_code = CLUSTER_REDIR_NONE; - memset(slots,0,CLUSTER_SLOTS); - /* Check that all the arguments are parseable and that all the - * slots are not already busy. 
*/ - for (j = 2; j < c->argc; j += 2) { - if ((startslot = getSlotOrReply(c,c->argv[j])) == C_ERR) { - zfree(slots); - return; - } - if ((endslot = getSlotOrReply(c,c->argv[j+1])) == C_ERR) { - zfree(slots); - return; - } - if (startslot > endslot) { - addReplyErrorFormat(c,"start slot number %d is greater than end slot number %d", startslot, endslot); - zfree(slots); - return; - } + /* Modules can turn off Redis Cluster redirection: this is useful + * when writing a module that implements a completely different + * distributed system. */ - if (checkSlotAssignmentsOrReply(c, slots, del, startslot, endslot) == C_ERR) { - zfree(slots); - return; - } - } - clusterUpdateSlots(c, slots, del); - zfree(slots); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"setslot") && c->argc >= 4) { - /* SETSLOT 10 MIGRATING */ - /* SETSLOT 10 IMPORTING */ - /* SETSLOT 10 STABLE */ - /* SETSLOT 10 NODE */ - int slot; - clusterNode *n; + /* We handle all the cases as if they were EXEC commands, so we have + * a common code path for everything */ + if (cmd->proc == execCommand) { + /* If CLIENT_MULTI flag is not set EXEC is just going to return an + * error. */ + if (!(c->flags & CLIENT_MULTI)) return myself; + ms = &c->mstate; + } else { + /* In order to have a single codepath create a fake Multi State + * structure if the client is not in MULTI/EXEC state, this way + * we have a single codepath below. 
*/ + ms = &_ms; + _ms.commands = &mc; + _ms.count = 1; + mc.argv = argv; + mc.argc = argc; + mc.cmd = cmd; + } - if (nodeIsSlave(myself)) { - addReplyError(c,"Please use SETSLOT only with masters."); - return; - } + int is_pubsubshard = cmd->proc == ssubscribeCommand || + cmd->proc == sunsubscribeCommand || + cmd->proc == spublishCommand; - if ((slot = getSlotOrReply(c,c->argv[2])) == -1) return; + /* Check that all the keys are in the same hash slot, and obtain this + * slot and the node associated. */ + for (i = 0; i < ms->count; i++) { + struct redisCommand *mcmd; + robj **margv; + int margc, numkeys, j; + keyReference *keyindex; - if (!strcasecmp(c->argv[3]->ptr,"migrating") && c->argc == 5) { - if (server.cluster->slots[slot] != myself) { - addReplyErrorFormat(c,"I'm not the owner of hash slot %u",slot); - return; - } - n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (n == NULL) { - addReplyErrorFormat(c,"I don't know about node %s", - (char*)c->argv[4]->ptr); - return; - } - if (nodeIsSlave(n)) { - addReplyError(c,"Target node is not a master"); - return; - } - server.cluster->migrating_slots_to[slot] = n; - } else if (!strcasecmp(c->argv[3]->ptr,"importing") && c->argc == 5) { - if (server.cluster->slots[slot] == myself) { - addReplyErrorFormat(c, - "I'm already the owner of hash slot %u",slot); - return; - } - n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (n == NULL) { - addReplyErrorFormat(c,"I don't know about node %s", - (char*)c->argv[4]->ptr); - return; - } - if (nodeIsSlave(n)) { - addReplyError(c,"Target node is not a master"); - return; - } - server.cluster->importing_slots_from[slot] = n; - } else if (!strcasecmp(c->argv[3]->ptr,"stable") && c->argc == 4) { - /* CLUSTER SETSLOT STABLE */ - server.cluster->importing_slots_from[slot] = NULL; - server.cluster->migrating_slots_to[slot] = NULL; - } else if (!strcasecmp(c->argv[3]->ptr,"node") && c->argc == 5) { - /* CLUSTER SETSLOT NODE */ - n = 
clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", - (char*)c->argv[4]->ptr); - return; - } - if (nodeIsSlave(n)) { - addReplyError(c,"Target node is not a master"); - return; - } - /* If this hash slot was served by 'myself' before to switch - * make sure there are no longer local keys for this hash slot. */ - if (server.cluster->slots[slot] == myself && n != myself) { - if (countKeysInSlot(slot) != 0) { - addReplyErrorFormat(c, - "Can't assign hashslot %d to a different node " - "while I still hold keys for this hash slot.", slot); - return; + mcmd = ms->commands[i].cmd; + margc = ms->commands[i].argc; + margv = ms->commands[i].argv; + + getKeysResult result = GETKEYS_RESULT_INIT; + numkeys = getKeysFromCommand(mcmd,margv,margc,&result); + keyindex = result.keys; + + for (j = 0; j < numkeys; j++) { + robj *thiskey = margv[keyindex[j].pos]; + int thisslot = keyHashSlot((char*)thiskey->ptr, + sdslen(thiskey->ptr)); + + if (firstkey == NULL) { + /* This is the first key we see. Check what is the slot + * and node. */ + firstkey = thiskey; + slot = thisslot; + n = server.cluster->slots[slot]; + + /* Error: If a slot is not served, we are in "cluster down" + * state. However the state is yet to be updated, so this was + * not trapped earlier in processCommand(). Report the same + * error to the client. */ + if (n == NULL) { + getKeysFreeResult(&result); + if (error_code) + *error_code = CLUSTER_REDIR_DOWN_UNBOUND; + return NULL; } - } - /* If this slot is in migrating status but we have no keys - * for it assigning the slot to another node will clear - * the migrating status. 
*/ - if (countKeysInSlot(slot) == 0 && - server.cluster->migrating_slots_to[slot]) - server.cluster->migrating_slots_to[slot] = NULL; - int slot_was_mine = server.cluster->slots[slot] == myself; - clusterDelSlot(slot); - clusterAddSlot(n,slot); + /* If we are migrating or importing this slot, we need to check + * if we have all the keys in the request (the only way we + * can safely serve the request, otherwise we return a TRYAGAIN + * error). To do so we set the importing/migrating state and + * increment a counter for every missing key. */ + if (n == myself && + server.cluster->migrating_slots_to[slot] != NULL) + { + migrating_slot = 1; + } else if (server.cluster->importing_slots_from[slot] != NULL) { + importing_slot = 1; + } + } else { + /* If it is not the first key/channel, make sure it is exactly + * the same key/channel as the first we saw. */ + if (slot != thisslot) { + /* Error: multiple keys from different slots. */ + getKeysFreeResult(&result); + if (error_code) + *error_code = CLUSTER_REDIR_CROSS_SLOT; + return NULL; + } + if (importing_slot && !multiple_keys && !equalStringObjects(firstkey,thiskey)) { + /* Flag this request as one with multiple different + * keys/channels when the slot is in importing state. */ + multiple_keys = 1; + } + } - /* If we are a master left without slots, we should turn into a - * replica of the new master. */ - if (slot_was_mine && - n != myself && - myself->numslots == 0 && - server.cluster_allow_replica_migration) + /* Migrating / Importing slot? Count keys we don't have. + * If it is pubsubshard command, it isn't required to check + * the channel being present or not in the node during the + * slot migration, the channel will be served from the source + * node until the migration completes with CLUSTER SETSLOT + * NODE . 
*/ + int flags = LOOKUP_NOTOUCH | LOOKUP_NOSTATS | LOOKUP_NONOTIFY | LOOKUP_NOEXPIRE; + if ((migrating_slot || importing_slot) && !is_pubsubshard) { - serverLog(LL_NOTICE, - "Configuration change detected. Reconfiguring myself " - "as a replica of %.40s (%s)", n->name, n->human_nodename); - clusterSetMaster(n); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | - CLUSTER_TODO_UPDATE_STATE | - CLUSTER_TODO_FSYNC_CONFIG); + if (lookupKeyReadWithFlags(&server.db[0], thiskey, flags) == NULL) missing_keys++; + else existing_keys++; } + } + getKeysFreeResult(&result); + } - /* If this node was importing this slot, assigning the slot to - * itself also clears the importing status. */ - if (n == myself && - server.cluster->importing_slots_from[slot]) - { - /* This slot was manually migrated, set this node configEpoch - * to a new epoch so that the new version can be propagated - * by the cluster. - * - * Note that if this ever results in a collision with another - * node getting the same configEpoch, for example because a - * failover happens at the same time we close the slot, the - * configEpoch collision resolution will fix it assigning - * a different epoch to each node. */ - if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { - serverLog(LL_NOTICE, - "configEpoch updated after importing slot %d", slot); - } - server.cluster->importing_slots_from[slot] = NULL; - /* After importing this slot, let the other nodes know as - * soon as possible. */ - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + /* No key at all in command? then we can serve the request + * without redirections or errors in all the cases. */ + if (n == NULL) return myself; + + uint64_t cmd_flags = getCommandFlags(c); + /* Cluster is globally down but we got keys? We only serve the request + * if it is a read command and when allow_reads_when_down is enabled. 
*/ + if (server.cluster->state != CLUSTER_OK) { + if (is_pubsubshard) { + if (!server.cluster_allow_pubsubshard_when_down) { + if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; + return NULL; } + } else if (!server.cluster_allow_reads_when_down) { + /* The cluster is configured to block commands when the + * cluster is down. */ + if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; + return NULL; + } else if (cmd_flags & CMD_WRITE) { + /* The cluster is configured to allow read only commands */ + if (error_code) *error_code = CLUSTER_REDIR_DOWN_RO_STATE; + return NULL; } else { - addReplyError(c, - "Invalid CLUSTER SETSLOT action or number of arguments. Try CLUSTER HELP"); - return; + /* Fall through and allow the command to be executed: + * this happens when server.cluster_allow_reads_when_down is + * true and the command is not a write command */ } - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"bumpepoch") && c->argc == 2) { - /* CLUSTER BUMPEPOCH */ - int retval = clusterBumpConfigEpochWithoutConsensus(); - sds reply = sdscatprintf(sdsempty(),"+%s %llu\r\n", - (retval == C_OK) ? "BUMPED" : "STILL", - (unsigned long long) myself->configEpoch); - addReplySds(c,reply); - } else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) { - /* CLUSTER INFO */ - - sds info = genClusterInfoString(); - - /* Produce the reply protocol. */ - addReplyVerbatim(c,info,sdslen(info),"txt"); - sdsfree(info); - } else if (!strcasecmp(c->argv[1]->ptr,"saveconfig") && c->argc == 2) { - int retval = clusterSaveConfig(1); + } - if (retval == 0) - addReply(c,shared.ok); - else - addReplyErrorFormat(c,"error saving the cluster node config: %s", - strerror(errno)); - } else if (!strcasecmp(c->argv[1]->ptr,"keyslot") && c->argc == 3) { - /* CLUSTER KEYSLOT */ - sds key = c->argv[2]->ptr; + /* Return the hashslot by reference. 
*/ + if (hashslot) *hashslot = slot; - addReplyLongLong(c,keyHashSlot(key,sdslen(key))); - } else if (!strcasecmp(c->argv[1]->ptr,"countkeysinslot") && c->argc == 3) { - /* CLUSTER COUNTKEYSINSLOT */ - long long slot; + /* MIGRATE always works in the context of the local node if the slot + * is open (migrating or importing state). We need to be able to freely + * move keys among instances in this case. */ + if ((migrating_slot || importing_slot) && cmd->proc == migrateCommand) + return myself; - if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK) - return; - if (slot < 0 || slot >= CLUSTER_SLOTS) { - addReplyError(c,"Invalid slot"); - return; + /* If we don't have all the keys and we are migrating the slot, send + * an ASK redirection or TRYAGAIN. */ + if (migrating_slot && missing_keys) { + /* If we have keys but we don't have all keys, we return TRYAGAIN */ + if (existing_keys) { + if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; + return NULL; + } else { + if (error_code) *error_code = CLUSTER_REDIR_ASK; + return server.cluster->migrating_slots_to[slot]; } - addReplyLongLong(c,countKeysInSlot(slot)); - } else if (!strcasecmp(c->argv[1]->ptr,"getkeysinslot") && c->argc == 4) { - /* CLUSTER GETKEYSINSLOT */ - long long maxkeys, slot; + } - if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK) - return; - if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL) - != C_OK) - return; - if (slot < 0 || slot >= CLUSTER_SLOTS || maxkeys < 0) { - addReplyError(c,"Invalid slot or number of keys"); - return; + /* If we are receiving the slot, and the client correctly flagged the + * request as "ASKING", we can serve the request. However if the request + * involves multiple keys and we don't have them all, the only option is + * to send a TRYAGAIN error. 
*/ + if (importing_slot && + (c->flags & CLIENT_ASKING || cmd_flags & CMD_ASKING)) + { + if (multiple_keys && missing_keys) { + if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; + return NULL; + } else { + return myself; } + } - unsigned int keys_in_slot = countKeysInSlot(slot); - unsigned int numkeys = maxkeys > keys_in_slot ? keys_in_slot : maxkeys; - addReplyArrayLen(c,numkeys); - dictIterator *iter = NULL; - dictEntry *de = NULL; - iter = dictGetIterator(server.db->dict[slot]); - for (unsigned int i = 0; i < numkeys; i++) { - de = dictNext(iter); - serverAssert(de != NULL); - sds sdskey = dictGetKey(de); - addReplyBulkCBuffer(c, sdskey, sdslen(sdskey)); - } - dictReleaseIterator(iter); - } else if (!strcasecmp(c->argv[1]->ptr,"forget") && c->argc == 3) { - /* CLUSTER FORGET */ - clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); - if (!n) { - if (clusterBlacklistExists((char*)c->argv[2]->ptr)) - /* Already forgotten. The deletion may have been gossipped by - * another node, so we pretend it succeeded. */ - addReply(c,shared.ok); - else - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); - return; - } else if (n == myself) { - addReplyError(c,"I tried hard but I can't forget myself..."); - return; - } else if (nodeIsSlave(myself) && myself->slaveof == n) { - addReplyError(c,"Can't forget my master!"); - return; - } - clusterBlacklistAddNode(n); - clusterDelNode(n); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc == 3) { - /* CLUSTER REPLICATE */ - /* Lookup the specified node in our table. */ - clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); - return; - } - - /* I can't replicate myself. 
*/ - if (n == myself) { - addReplyError(c,"Can't replicate myself"); - return; - } + /* Handle the read-only client case reading from a slave: if this + * node is a slave and the request is about a hash slot our master + * is serving, we can reply without redirection. */ + int is_write_command = (cmd_flags & CMD_WRITE) || + (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_WRITE)); + if (((c->flags & CLIENT_READONLY) || is_pubsubshard) && + !is_write_command && + nodeIsSlave(myself) && + myself->slaveof == n) + { + return myself; + } - /* Can't replicate a slave. */ - if (nodeIsSlave(n)) { - addReplyError(c,"I can only replicate a master, not a replica."); - return; - } + /* Base case: just return the right node. However if this node is not + * myself, set error_code to MOVED since we need to issue a redirection. */ + if (n != myself && error_code) *error_code = CLUSTER_REDIR_MOVED; + return n; +} - /* If the instance is currently a master, it should have no assigned - * slots nor keys to accept to replicate some other node. - * Slaves can switch to another master without issues. */ - if (nodeIsMaster(myself) && - (myself->numslots != 0 || dbSize(&server.db[0], DB_MAIN) != 0)) { - addReplyError(c, - "To set a master the node must be empty and " - "without assigned slots."); - return; - } +/* Send the client the right redirection code, according to error_code + * that should be set to one of CLUSTER_REDIR_* macros. + * + * If CLUSTER_REDIR_ASK or CLUSTER_REDIR_MOVED error codes + * are used, then the node 'n' should not be NULL, but should be the + * node we want to mention in the redirection. Moreover hashslot should + * be set to the hash slot that caused the redirection. 
*/ +void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code) { + if (error_code == CLUSTER_REDIR_CROSS_SLOT) { + addReplyError(c,"-CROSSSLOT Keys in request don't hash to the same slot"); + } else if (error_code == CLUSTER_REDIR_UNSTABLE) { + /* The request spawns multiple keys in the same slot, + * but the slot is not "stable" currently as there is + * a migration or import in progress. */ + addReplyError(c,"-TRYAGAIN Multiple keys request during rehashing of slot"); + } else if (error_code == CLUSTER_REDIR_DOWN_STATE) { + addReplyError(c,"-CLUSTERDOWN The cluster is down"); + } else if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) { + addReplyError(c,"-CLUSTERDOWN The cluster is down and only accepts read commands"); + } else if (error_code == CLUSTER_REDIR_DOWN_UNBOUND) { + addReplyError(c,"-CLUSTERDOWN Hash slot not served"); + } else if (error_code == CLUSTER_REDIR_MOVED || + error_code == CLUSTER_REDIR_ASK) + { + /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ + int port = getNodeClientPort(n, shouldReturnTlsInfo()); + addReplyErrorSds(c,sdscatprintf(sdsempty(), + "-%s %d %s:%d", + (error_code == CLUSTER_REDIR_ASK) ? "ASK" : "MOVED", + hashslot, getPreferredEndpoint(n), port)); + } else { + serverPanic("getNodeByQuery() unknown error."); + } +} - /* Set the master. */ - clusterSetMaster(n); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if ((!strcasecmp(c->argv[1]->ptr,"slaves") || - !strcasecmp(c->argv[1]->ptr,"replicas")) && c->argc == 3) { - /* CLUSTER SLAVES */ - /* CLUSTER REPLICAS */ - clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); - int j; +/* This function is called by the function processing clients incrementally + * to detect timeouts, in order to handle the following case: + * + * 1) A client blocks with BLPOP or similar blocking operation. 
+ * 2) The master migrates the hash slot elsewhere or turns into a slave. + * 3) The client may remain blocked forever (or up to the max timeout time) + * waiting for a key change that will never happen. + * + * If the client is found to be blocked into a hash slot this node no + * longer handles, the client is sent a redirection error, and the function + * returns 1. Otherwise 0 is returned and no operation is performed. */ +int clusterRedirectBlockedClientIfNeeded(client *c) { + if (c->flags & CLIENT_BLOCKED && + (c->bstate.btype == BLOCKED_LIST || + c->bstate.btype == BLOCKED_ZSET || + c->bstate.btype == BLOCKED_STREAM || + c->bstate.btype == BLOCKED_MODULE)) + { + dictEntry *de; + dictIterator *di; - /* Lookup the specified node in our table. */ - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); - return; + /* If the cluster is down, unblock the client with the right error. + * If the cluster is configured to allow reads on cluster down, we + * still want to emit this error since a write will be required + * to unblock them which may never come. */ + if (server.cluster->state == CLUSTER_FAIL) { + clusterRedirectClient(c,NULL,0,CLUSTER_REDIR_DOWN_STATE); + return 1; } - if (nodeIsSlave(n)) { - addReplyError(c,"The specified node is not a master"); - return; - } + /* If the client is blocked on module, but not on a specific key, + * don't unblock it (except for the CLUSTER_FAIL case above). */ + if (c->bstate.btype == BLOCKED_MODULE && !moduleClientIsBlockedOnKeys(c)) + return 0; - /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. 
*/ - addReplyArrayLen(c,n->numslaves); - for (j = 0; j < n->numslaves; j++) { - sds ni = clusterGenNodeDescription(c, n->slaves[j], shouldReturnTlsInfo()); - addReplyBulkCString(c,ni); - sdsfree(ni); - } - } else if (!strcasecmp(c->argv[1]->ptr,"count-failure-reports") && - c->argc == 3) - { - /* CLUSTER COUNT-FAILURE-REPORTS */ - clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + /* All keys must belong to the same slot, so check first key only. */ + di = dictGetIterator(c->bstate.keys); + if ((de = dictNext(di)) != NULL) { + robj *key = dictGetKey(de); + int slot = keyHashSlot((char*)key->ptr, sdslen(key->ptr)); + clusterNode *node = server.cluster->slots[slot]; - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); - return; - } else { - addReplyLongLong(c,clusterNodeFailureReportsCount(n)); - } - } else if (!strcasecmp(c->argv[1]->ptr,"failover") && - (c->argc == 2 || c->argc == 3)) - { - /* CLUSTER FAILOVER [FORCE|TAKEOVER] */ - int force = 0, takeover = 0; + /* if the client is read-only and attempting to access key that our + * replica can handle, allow it. */ + if ((c->flags & CLIENT_READONLY) && + !(c->lastcmd->flags & CMD_WRITE) && + nodeIsSlave(myself) && myself->slaveof == node) + { + node = myself; + } - if (c->argc == 3) { - if (!strcasecmp(c->argv[2]->ptr,"force")) { - force = 1; - } else if (!strcasecmp(c->argv[2]->ptr,"takeover")) { - takeover = 1; - force = 1; /* Takeover also implies force. */ - } else { - addReplyErrorObject(c,shared.syntaxerr); - return; + /* We send an error and unblock the client if: + * 1) The slot is unassigned, emitting a cluster down error. + * 2) The slot is not handled by this node, nor being imported. 
*/ + if (node != myself && + server.cluster->importing_slots_from[slot] == NULL) + { + if (node == NULL) { + clusterRedirectClient(c,NULL,0, + CLUSTER_REDIR_DOWN_UNBOUND); + } else { + clusterRedirectClient(c,node,slot, + CLUSTER_REDIR_MOVED); + } + dictReleaseIterator(di); + return 1; } } + dictReleaseIterator(di); + } + return 0; +} - /* Check preconditions. */ - if (nodeIsMaster(myself)) { - addReplyError(c,"You should send CLUSTER FAILOVER to a replica"); - return; - } else if (myself->slaveof == NULL) { - addReplyError(c,"I'm a replica but my master is unknown to me"); - return; - } else if (!force && - (nodeFailed(myself->slaveof) || - myself->slaveof->link == NULL)) - { - addReplyError(c,"Master is down or failed, " - "please use CLUSTER FAILOVER FORCE"); - return; - } - resetManualFailover(); - server.cluster->mf_end = mstime() + CLUSTER_MF_TIMEOUT; +/* Remove all the keys in the specified hash slot. + * The number of removed items is returned. */ +unsigned int delKeysInSlot(unsigned int hashslot) { + unsigned int j = 0; - if (takeover) { - /* A takeover does not perform any initial check. It just - * generates a new configuration epoch for this node without - * consensus, claims the master's slots, and broadcast the new - * configuration. */ - serverLog(LL_NOTICE,"Taking over the master (user request)."); - clusterBumpConfigEpochWithoutConsensus(); - clusterFailoverReplaceYourMaster(); - } else if (force) { - /* If this is a forced failover, we don't need to talk with our - * master to agree about the offset. We just failover taking over - * it without coordination. 
*/ - serverLog(LL_NOTICE,"Forced failover user request accepted."); - server.cluster->mf_can_start = 1; - } else { - serverLog(LL_NOTICE,"Manual failover user request accepted."); - clusterSendMFStart(myself->slaveof); - } - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"set-config-epoch") && c->argc == 3) - { - /* CLUSTER SET-CONFIG-EPOCH - * - * The user is allowed to set the config epoch only when a node is - * totally fresh: no config epoch, no other known node, and so forth. - * This happens at cluster creation time to start with a cluster where - * every node has a different node ID, without to rely on the conflicts - * resolution system which is too slow when a big cluster is created. */ - long long epoch; + dictIterator *iter = NULL; + dictEntry *de = NULL; + iter = dictGetSafeIterator(server.db->dict[hashslot]); + while((de = dictNext(iter)) != NULL) { + sds sdskey = dictGetKey(de); + robj *key = createStringObject(sdskey, sdslen(sdskey)); + dbDelete(&server.db[0], key); + propagateDeletion(&server.db[0], key, server.lazyfree_lazy_server_del); + signalModifiedKey(NULL, &server.db[0], key); + /* The keys are not actually logically deleted from the database, just moved to another node. + * The modules needs to know that these keys are no longer available locally, so just send the + * keyspace notification to the modules, but not to clients. 
*/ + moduleNotifyKeyspaceEvent(NOTIFY_GENERIC, "del", key, server.db[0].id); + postExecutionUnitOperations(); + decrRefCount(key); + j++; + server.dirty++; + } + dictReleaseIterator(iter); - if (getLongLongFromObjectOrReply(c,c->argv[2],&epoch,NULL) != C_OK) - return; + return j; +} - if (epoch < 0) { - addReplyErrorFormat(c,"Invalid config epoch specified: %lld",epoch); - } else if (dictSize(server.cluster->nodes) > 1) { - addReplyError(c,"The user can assign a config epoch only when the " - "node does not know any other node."); - } else if (myself->configEpoch != 0) { - addReplyError(c,"Node config epoch is already non-zero"); - } else { - myself->configEpoch = epoch; - serverLog(LL_NOTICE, - "configEpoch set to %llu via CLUSTER SET-CONFIG-EPOCH", - (unsigned long long) myself->configEpoch); - - if (server.cluster->currentEpoch < (uint64_t)epoch) - server.cluster->currentEpoch = epoch; - /* No need to fsync the config here since in the unlucky event - * of a failure to persist the config, the conflict resolution code - * will assign a unique config to this node. */ - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } - } else if (!strcasecmp(c->argv[1]->ptr,"reset") && - (c->argc == 2 || c->argc == 3)) - { - /* CLUSTER RESET [SOFT|HARD] */ - int hard = 0; +/* ----------------------------------------------------------------------------- + * Operation(s) on channel rax tree. + * -------------------------------------------------------------------------- */ - /* Parse soft/hard argument. Default is soft. 
*/ - if (c->argc == 3) { - if (!strcasecmp(c->argv[2]->ptr,"hard")) { - hard = 1; - } else if (!strcasecmp(c->argv[2]->ptr,"soft")) { - hard = 0; - } else { - addReplyErrorObject(c,shared.syntaxerr); - return; - } - } +void slotToChannelUpdate(sds channel, int add) { + size_t keylen = sdslen(channel); + unsigned int hashslot = keyHashSlot(channel,keylen); + unsigned char buf[64]; + unsigned char *indexed = buf; - /* Slaves can be reset while containing data, but not master nodes - * that must be empty. */ - if (nodeIsMaster(myself) && dbSize(c->db, DB_MAIN) != 0) { - addReplyError(c,"CLUSTER RESET can't be called with " - "master nodes containing keys"); - return; - } - clusterReset(hard); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"links") && c->argc == 2) { - /* CLUSTER LINKS */ - addReplyClusterLinksDescription(c); + if (keylen+2 > 64) indexed = zmalloc(keylen+2); + indexed[0] = (hashslot >> 8) & 0xff; + indexed[1] = hashslot & 0xff; + memcpy(indexed+2,channel,keylen); + if (add) { + raxInsert(server.cluster->slots_to_channels,indexed,keylen+2,NULL,NULL); } else { - addReplySubcommandSyntaxError(c); - return; + raxRemove(server.cluster->slots_to_channels,indexed,keylen+2,NULL); } + if (indexed != buf) zfree(indexed); } -void removeChannelsInSlot(unsigned int slot) { - unsigned int channelcount = countChannelsInSlot(slot); - if (channelcount == 0) return; +void slotToChannelAdd(sds channel) { + slotToChannelUpdate(channel,1); +} - /* Retrieve all the channels for the slot. */ - robj **channels = zmalloc(sizeof(robj*)*channelcount); +void slotToChannelDel(sds channel) { + slotToChannelUpdate(channel,0); +} + +/* Get the count of the channels for a given slot. 
*/ +unsigned int countChannelsInSlot(unsigned int hashslot) { raxIterator iter; int j = 0; unsigned char indexed[2]; - indexed[0] = (slot >> 8) & 0xff; - indexed[1] = slot & 0xff; + indexed[0] = (hashslot >> 8) & 0xff; + indexed[1] = hashslot & 0xff; raxStart(&iter,server.cluster->slots_to_channels); raxSeek(&iter,">=",indexed,2); while(raxNext(&iter)) { if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; - channels[j++] = createStringObject((char*)iter.key + 2, iter.key_len - 2); + j++; } raxStop(&iter); + return j; +} - pubsubUnsubscribeShardChannels(channels, channelcount); - zfree(channels); +int clusterNodeIsMyself(clusterNode *n) { + return n == server.cluster->myself; } +clusterNode* getMyClusterNode(void) { + return server.cluster->myself; +} -/* ----------------------------------------------------------------------------- - * Cluster functions related to serving / redirecting clients - * -------------------------------------------------------------------------- */ +int clusterManualFailoverTimeLimit(void) { + return server.cluster->mf_end; +} -/* The ASKING command is required after a -ASK redirection. - * The client should issue ASKING before to actually send the command to - * the target instance. See the Redis Cluster specification for more - * information. */ -void askingCommand(client *c) { - if (server.cluster_enabled == 0) { - addReplyError(c,"This instance has cluster support disabled"); - return; - } - c->flags |= CLIENT_ASKING; - addReply(c,shared.ok); +char* getMyClusterId(void) { + return server.cluster->myself->name; } -/* The READONLY command is used by clients to enter the read-only mode. - * In this mode slaves will not redirect clients as long as clients access - * with read-only commands to keys that are served by the slave's master. 
*/ -void readonlyCommand(client *c) { - if (server.cluster_enabled == 0) { - addReplyError(c,"This instance has cluster support disabled"); - return; - } - c->flags |= CLIENT_READONLY; - addReply(c,shared.ok); +int getClusterSize(void) { + return dictSize(server.cluster->nodes); } -/* The READWRITE command just clears the READONLY command state. */ -void readwriteCommand(client *c) { - if (server.cluster_enabled == 0) { - addReplyError(c,"This instance has cluster support disabled"); - return; +char** getClusterNodesList(size_t *numnodes) { + size_t count = dictSize(server.cluster->nodes); + char **ids = zmalloc((count+1)*CLUSTER_NAMELEN); + dictIterator *di = dictGetIterator(server.cluster->nodes); + dictEntry *de; + int j = 0; + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + if (node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) continue; + ids[j] = zmalloc(CLUSTER_NAMELEN); + memcpy(ids[j],node->name,CLUSTER_NAMELEN); + j++; } - c->flags &= ~CLIENT_READONLY; - addReply(c,shared.ok); + *numnodes = j; + ids[j] = NULL; /* Null term so that FreeClusterNodesList does not need + * to also get the count argument. */ + dictReleaseIterator(di); + return ids; } -/* Return the pointer to the cluster node that is able to serve the command. - * For the function to succeed the command should only target either: - * - * 1) A single key (even multiple times like RPOPLPUSH mylist mylist). - * 2) Multiple keys in the same hash slot, while the slot is stable (no - * resharding in progress). - * - * On success the function returns the node that is able to serve the request. - * If the node is not 'myself' a redirection must be performed. The kind of - * redirection is specified setting the integer passed by reference - * 'error_code', which will be set to CLUSTER_REDIR_ASK or - * CLUSTER_REDIR_MOVED. - * - * When the node is 'myself' 'error_code' is set to CLUSTER_REDIR_NONE. 
- * - * If the command fails NULL is returned, and the reason of the failure is - * provided via 'error_code', which will be set to: - * - * CLUSTER_REDIR_CROSS_SLOT if the request contains multiple keys that - * don't belong to the same hash slot. - * - * CLUSTER_REDIR_UNSTABLE if the request contains multiple keys - * belonging to the same slot, but the slot is not stable (in migration or - * importing state, likely because a resharding is in progress). - * - * CLUSTER_REDIR_DOWN_UNBOUND if the request addresses a slot which is - * not bound to any node. In this case the cluster global state should be - * already "down" but it is fragile to rely on the update of the global state, - * so we also handle it here. - * - * CLUSTER_REDIR_DOWN_STATE and CLUSTER_REDIR_DOWN_RO_STATE if the cluster is - * down but the user attempts to execute a command that addresses one or more keys. */ -clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) { - clusterNode *n = NULL; - robj *firstkey = NULL; - int multiple_keys = 0; - multiState *ms, _ms; - multiCmd mc; - int i, slot = 0, migrating_slot = 0, importing_slot = 0, missing_keys = 0, - existing_keys = 0; +int nodeIsMaster(clusterNode *n) { + return n->flags & CLUSTER_NODE_MASTER; +} - /* Allow any key to be set if a module disabled cluster redirections. */ - if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) - return myself; +int handleDebugClusterCommand(client *c) { + if (strcasecmp(c->argv[1]->ptr, "CLUSTERLINK") || + strcasecmp(c->argv[2]->ptr, "KILL") || + c->argc != 5) { + return 0; + } - /* Set error code optimistically for the base case. 
*/ - if (error_code) *error_code = CLUSTER_REDIR_NONE; + if (!server.cluster_enabled) { + addReplyError(c, "Debug option only available for cluster mode enabled setup!"); + return 1; + } - /* Modules can turn off Redis Cluster redirection: this is useful - * when writing a module that implements a completely different - * distributed system. */ + /* Find the node. */ + clusterNode *n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (!n) { + addReplyErrorFormat(c, "Unknown node %s", (char *) c->argv[4]->ptr); + return 1; + } - /* We handle all the cases as if they were EXEC commands, so we have - * a common code path for everything */ - if (cmd->proc == execCommand) { - /* If CLIENT_MULTI flag is not set EXEC is just going to return an - * error. */ - if (!(c->flags & CLIENT_MULTI)) return myself; - ms = &c->mstate; + /* Terminate the link based on the direction or all. */ + if (!strcasecmp(c->argv[3]->ptr, "from")) { + freeClusterLink(n->inbound_link); + } else if (!strcasecmp(c->argv[3]->ptr, "to")) { + freeClusterLink(n->link); + } else if (!strcasecmp(c->argv[3]->ptr, "all")) { + freeClusterLink(n->link); + freeClusterLink(n->inbound_link); } else { - /* In order to have a single codepath create a fake Multi State - * structure if the client is not in MULTI/EXEC state, this way - * we have a single codepath below. */ - ms = &_ms; - _ms.commands = &mc; - _ms.count = 1; - mc.argv = argv; - mc.argc = argc; - mc.cmd = cmd; + addReplyErrorFormat(c, "Unknown direction %s", (char *) c->argv[3]->ptr); } + addReply(c, shared.ok); - int is_pubsubshard = cmd->proc == ssubscribeCommand || - cmd->proc == sunsubscribeCommand || - cmd->proc == spublishCommand; + return 1; +} - /* Check that all the keys are in the same hash slot, and obtain this - * slot and the node associated. 
*/ - for (i = 0; i < ms->count; i++) { - struct redisCommand *mcmd; - robj **margv; - int margc, numkeys, j; - keyReference *keyindex; +int clusterNodePending(clusterNode *node) { + return node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE); +} - mcmd = ms->commands[i].cmd; - margc = ms->commands[i].argc; - margv = ms->commands[i].argv; +char* clusterNodeIp(clusterNode *node) { + return node->ip; +} - getKeysResult result = GETKEYS_RESULT_INIT; - numkeys = getKeysFromCommand(mcmd,margv,margc,&result); - keyindex = result.keys; +int clusterNodeIsSlave(clusterNode *node) { + return node->flags & CLUSTER_NODE_SLAVE; +} - for (j = 0; j < numkeys; j++) { - robj *thiskey = margv[keyindex[j].pos]; - int thisslot = keyHashSlot((char*)thiskey->ptr, - sdslen(thiskey->ptr)); +clusterNode *clusterNodeGetSlaveof(clusterNode *node) { + return node->slaveof; +} - if (firstkey == NULL) { - /* This is the first key we see. Check what is the slot - * and node. */ - firstkey = thiskey; - slot = thisslot; - n = server.cluster->slots[slot]; +char* clusterNodeGetName(clusterNode *node) { + return node->name; +} - /* Error: If a slot is not served, we are in "cluster down" - * state. However the state is yet to be updated, so this was - * not trapped earlier in processCommand(). Report the same - * error to the client. */ - if (n == NULL) { - getKeysFreeResult(&result); - if (error_code) - *error_code = CLUSTER_REDIR_DOWN_UNBOUND; - return NULL; - } +int clusterNodeTimedOut(clusterNode *node) { + return nodeTimedOut(node); +} - /* If we are migrating or importing this slot, we need to check - * if we have all the keys in the request (the only way we - * can safely serve the request, otherwise we return a TRYAGAIN - * error). To do so we set the importing/migrating state and - * increment a counter for every missing key. 
*/ - if (n == myself && - server.cluster->migrating_slots_to[slot] != NULL) - { - migrating_slot = 1; - } else if (server.cluster->importing_slots_from[slot] != NULL) { - importing_slot = 1; - } - } else { - /* If it is not the first key/channel, make sure it is exactly - * the same key/channel as the first we saw. */ - if (slot != thisslot) { - /* Error: multiple keys from different slots. */ - getKeysFreeResult(&result); - if (error_code) - *error_code = CLUSTER_REDIR_CROSS_SLOT; - return NULL; - } - if (importing_slot && !multiple_keys && !equalStringObjects(firstkey,thiskey)) { - /* Flag this request as one with multiple different - * keys/channels when the slot is in importing state. */ - multiple_keys = 1; - } - } +int clusterNodeIsFailing(clusterNode *node) { + return nodeFailed(node); +} - /* Migrating / Importing slot? Count keys we don't have. - * If it is pubsubshard command, it isn't required to check - * the channel being present or not in the node during the - * slot migration, the channel will be served from the source - * node until the migration completes with CLUSTER SETSLOT - * NODE . */ - int flags = LOOKUP_NOTOUCH | LOOKUP_NOSTATS | LOOKUP_NONOTIFY | LOOKUP_NOEXPIRE; - if ((migrating_slot || importing_slot) && !is_pubsubshard) - { - if (lookupKeyReadWithFlags(&server.db[0], thiskey, flags) == NULL) missing_keys++; - else existing_keys++; - } - } - getKeysFreeResult(&result); - } +int clusterNodeIsNoFailover(clusterNode *node) { + return node->flags & CLUSTER_NODE_NOFAILOVER; +} - /* No key at all in command? then we can serve the request - * without redirections or errors in all the cases. */ - if (n == NULL) return myself; +const char **clusterDebugCommandHelp(void) { + static const char *help[] = { + "CLUSTERLINK KILL ", + " Kills the link based on the direction to/from (both) with the provided node.", + NULL + }; - uint64_t cmd_flags = getCommandFlags(c); - /* Cluster is globally down but we got keys? 
We only serve the request - * if it is a read command and when allow_reads_when_down is enabled. */ - if (server.cluster->state != CLUSTER_OK) { - if (is_pubsubshard) { - if (!server.cluster_allow_pubsubshard_when_down) { - if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; - return NULL; - } - } else if (!server.cluster_allow_reads_when_down) { - /* The cluster is configured to block commands when the - * cluster is down. */ - if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; - return NULL; - } else if (cmd_flags & CMD_WRITE) { - /* The cluster is configured to allow read only commands */ - if (error_code) *error_code = CLUSTER_REDIR_DOWN_RO_STATE; - return NULL; - } else { - /* Fall through and allow the command to be executed: - * this happens when server.cluster_allow_reads_when_down is - * true and the command is not a write command */ - } - } + return help; +} - /* Return the hashslot by reference. */ - if (hashslot) *hashslot = slot; +char* clusterNodeGetShardId(clusterNode *node) { + return node->shard_id; +} - /* MIGRATE always works in the context of the local node if the slot - * is open (migrating or importing state). We need to be able to freely - * move keys among instances in this case. */ - if ((migrating_slot || importing_slot) && cmd->proc == migrateCommand) - return myself; +int clusterCommandSpecial(client *c) { + if (!strcasecmp(c->argv[1]->ptr,"meet") && (c->argc == 4 || c->argc == 5)) { + /* CLUSTER MEET [cport] */ + long long port, cport; - /* If we don't have all the keys and we are migrating the slot, send - * an ASK redirection or TRYAGAIN. 
*/ - if (migrating_slot && missing_keys) { - /* If we have keys but we don't have all keys, we return TRYAGAIN */ - if (existing_keys) { - if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; - return NULL; + if (getLongLongFromObject(c->argv[3], &port) != C_OK) { + addReplyErrorFormat(c,"Invalid base port specified: %s", + (char*)c->argv[3]->ptr); + return 1; + } + + if (c->argc == 5) { + if (getLongLongFromObject(c->argv[4], &cport) != C_OK) { + addReplyErrorFormat(c,"Invalid bus port specified: %s", + (char*)c->argv[4]->ptr); + return 1; + } } else { - if (error_code) *error_code = CLUSTER_REDIR_ASK; - return server.cluster->migrating_slots_to[slot]; + cport = port + CLUSTER_PORT_INCR; } - } - /* If we are receiving the slot, and the client correctly flagged the - * request as "ASKING", we can serve the request. However if the request - * involves multiple keys and we don't have them all, the only option is - * to send a TRYAGAIN error. */ - if (importing_slot && - (c->flags & CLIENT_ASKING || cmd_flags & CMD_ASKING)) - { - if (multiple_keys && missing_keys) { - if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; - return NULL; + if (clusterStartHandshake(c->argv[2]->ptr,port,cport) == 0 && + errno == EINVAL) + { + addReplyErrorFormat(c,"Invalid node address specified: %s:%s", + (char*)c->argv[2]->ptr, (char*)c->argv[3]->ptr); } else { - return myself; + addReply(c,shared.ok); } - } + } else if (!strcasecmp(c->argv[1]->ptr,"flushslots") && c->argc == 2) { + /* CLUSTER FLUSHSLOTS */ + if (dbSize(&server.db[0], DB_MAIN) != 0) { + addReplyError(c,"DB must be empty to perform CLUSTER FLUSHSLOTS."); + return 1; + } + clusterDelNodeSlots(myself); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if ((!strcasecmp(c->argv[1]->ptr,"addslots") || + !strcasecmp(c->argv[1]->ptr,"delslots")) && c->argc >= 3) { + /* CLUSTER ADDSLOTS [slot] ... */ + /* CLUSTER DELSLOTS [slot] ... 
*/ + int j, slot; + unsigned char *slots = zmalloc(CLUSTER_SLOTS); + int del = !strcasecmp(c->argv[1]->ptr,"delslots"); - /* Handle the read-only client case reading from a slave: if this - * node is a slave and the request is about a hash slot our master - * is serving, we can reply without redirection. */ - int is_write_command = (cmd_flags & CMD_WRITE) || - (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_WRITE)); - if (((c->flags & CLIENT_READONLY) || is_pubsubshard) && - !is_write_command && - nodeIsSlave(myself) && - myself->slaveof == n) - { - return myself; - } + memset(slots,0,CLUSTER_SLOTS); + /* Check that all the arguments are parseable.*/ + for (j = 2; j < c->argc; j++) { + if ((slot = getSlotOrReply(c,c->argv[j])) == C_ERR) { + zfree(slots); + return 1; + } + } + /* Check that the slots are not already busy. */ + for (j = 2; j < c->argc; j++) { + slot = getSlotOrReply(c,c->argv[j]); + if (checkSlotAssignmentsOrReply(c, slots, del, slot, slot) == C_ERR) { + zfree(slots); + return 1; + } + } + clusterUpdateSlots(c, slots, del); + zfree(slots); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if ((!strcasecmp(c->argv[1]->ptr,"addslotsrange") || + !strcasecmp(c->argv[1]->ptr,"delslotsrange")) && c->argc >= 4) { + if (c->argc % 2 == 1) { + addReplyErrorArity(c); + return 1; + } + /* CLUSTER ADDSLOTSRANGE [ ...] */ + /* CLUSTER DELSLOTSRANGE [ ...] */ + int j, startslot, endslot; + unsigned char *slots = zmalloc(CLUSTER_SLOTS); + int del = !strcasecmp(c->argv[1]->ptr,"delslotsrange"); - /* Base case: just return the right node. However if this node is not - * myself, set error_code to MOVED since we need to issue a redirection. */ - if (n != myself && error_code) *error_code = CLUSTER_REDIR_MOVED; - return n; -} + memset(slots,0,CLUSTER_SLOTS); + /* Check that all the arguments are parseable and that all the + * slots are not already busy. 
*/ + for (j = 2; j < c->argc; j += 2) { + if ((startslot = getSlotOrReply(c,c->argv[j])) == C_ERR) { + zfree(slots); + return 1; + } + if ((endslot = getSlotOrReply(c,c->argv[j+1])) == C_ERR) { + zfree(slots); + return 1; + } + if (startslot > endslot) { + addReplyErrorFormat(c,"start slot number %d is greater than end slot number %d", startslot, endslot); + zfree(slots); + return 1; + } -/* Send the client the right redirection code, according to error_code - * that should be set to one of CLUSTER_REDIR_* macros. - * - * If CLUSTER_REDIR_ASK or CLUSTER_REDIR_MOVED error codes - * are used, then the node 'n' should not be NULL, but should be the - * node we want to mention in the redirection. Moreover hashslot should - * be set to the hash slot that caused the redirection. */ -void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code) { - if (error_code == CLUSTER_REDIR_CROSS_SLOT) { - addReplyError(c,"-CROSSSLOT Keys in request don't hash to the same slot"); - } else if (error_code == CLUSTER_REDIR_UNSTABLE) { - /* The request spawns multiple keys in the same slot, - * but the slot is not "stable" currently as there is - * a migration or import in progress. */ - addReplyError(c,"-TRYAGAIN Multiple keys request during rehashing of slot"); - } else if (error_code == CLUSTER_REDIR_DOWN_STATE) { - addReplyError(c,"-CLUSTERDOWN The cluster is down"); - } else if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) { - addReplyError(c,"-CLUSTERDOWN The cluster is down and only accepts read commands"); - } else if (error_code == CLUSTER_REDIR_DOWN_UNBOUND) { - addReplyError(c,"-CLUSTERDOWN Hash slot not served"); - } else if (error_code == CLUSTER_REDIR_MOVED || - error_code == CLUSTER_REDIR_ASK) - { - /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ - int port = getNodeClientPort(n, shouldReturnTlsInfo()); - addReplyErrorSds(c,sdscatprintf(sdsempty(), - "-%s %d %s:%d", - (error_code == CLUSTER_REDIR_ASK) ? 
"ASK" : "MOVED", - hashslot, getPreferredEndpoint(n), port)); - } else { - serverPanic("getNodeByQuery() unknown error."); - } -} - -/* This function is called by the function processing clients incrementally - * to detect timeouts, in order to handle the following case: - * - * 1) A client blocks with BLPOP or similar blocking operation. - * 2) The master migrates the hash slot elsewhere or turns into a slave. - * 3) The client may remain blocked forever (or up to the max timeout time) - * waiting for a key change that will never happen. - * - * If the client is found to be blocked into a hash slot this node no - * longer handles, the client is sent a redirection error, and the function - * returns 1. Otherwise 0 is returned and no operation is performed. */ -int clusterRedirectBlockedClientIfNeeded(client *c) { - if (c->flags & CLIENT_BLOCKED && - (c->bstate.btype == BLOCKED_LIST || - c->bstate.btype == BLOCKED_ZSET || - c->bstate.btype == BLOCKED_STREAM || - c->bstate.btype == BLOCKED_MODULE)) - { - dictEntry *de; - dictIterator *di; + if (checkSlotAssignmentsOrReply(c, slots, del, startslot, endslot) == C_ERR) { + zfree(slots); + return 1; + } + } + clusterUpdateSlots(c, slots, del); + zfree(slots); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"setslot") && c->argc >= 4) { + /* SETSLOT 10 MIGRATING */ + /* SETSLOT 10 IMPORTING */ + /* SETSLOT 10 STABLE */ + /* SETSLOT 10 NODE */ + int slot; + clusterNode *n; - /* If the cluster is down, unblock the client with the right error. - * If the cluster is configured to allow reads on cluster down, we - * still want to emit this error since a write will be required - * to unblock them which may never come. 
*/ - if (server.cluster->state == CLUSTER_FAIL) { - clusterRedirectClient(c,NULL,0,CLUSTER_REDIR_DOWN_STATE); + if (nodeIsSlave(myself)) { + addReplyError(c,"Please use SETSLOT only with masters."); return 1; } - /* If the client is blocked on module, but not on a specific key, - * don't unblock it (except for the CLUSTER_FAIL case above). */ - if (c->bstate.btype == BLOCKED_MODULE && !moduleClientIsBlockedOnKeys(c)) - return 0; - - /* All keys must belong to the same slot, so check first key only. */ - di = dictGetIterator(c->bstate.keys); - if ((de = dictNext(di)) != NULL) { - robj *key = dictGetKey(de); - int slot = keyHashSlot((char*)key->ptr, sdslen(key->ptr)); - clusterNode *node = server.cluster->slots[slot]; + if ((slot = getSlotOrReply(c, c->argv[2])) == -1) return 1; - /* if the client is read-only and attempting to access key that our - * replica can handle, allow it. */ - if ((c->flags & CLIENT_READONLY) && - !(c->lastcmd->flags & CMD_WRITE) && - nodeIsSlave(myself) && myself->slaveof == node) - { - node = myself; + if (!strcasecmp(c->argv[3]->ptr,"migrating") && c->argc == 5) { + if (server.cluster->slots[slot] != myself) { + addReplyErrorFormat(c,"I'm not the owner of hash slot %u",slot); + return 1; } - - /* We send an error and unblock the client if: - * 1) The slot is unassigned, emitting a cluster down error. - * 2) The slot is not handled by this node, nor being imported. */ - if (node != myself && - server.cluster->importing_slots_from[slot] == NULL) - { - if (node == NULL) { - clusterRedirectClient(c,NULL,0, - CLUSTER_REDIR_DOWN_UNBOUND); - } else { - clusterRedirectClient(c,node,slot, - CLUSTER_REDIR_MOVED); - } - dictReleaseIterator(di); + n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (n == NULL) { + addReplyErrorFormat(c,"I don't know about node %s", + (char*)c->argv[4]->ptr); return 1; } - } - dictReleaseIterator(di); - } - return 0; -} - -/* Remove all the keys in the specified hash slot. 
- * The number of removed items is returned. */ -unsigned int delKeysInSlot(unsigned int hashslot) { - unsigned int j = 0; - - dictIterator *iter = NULL; - dictEntry *de = NULL; - iter = dictGetSafeIterator(server.db->dict[hashslot]); - while((de = dictNext(iter)) != NULL) { - sds sdskey = dictGetKey(de); - robj *key = createStringObject(sdskey, sdslen(sdskey)); - dbDelete(&server.db[0], key); - propagateDeletion(&server.db[0], key, server.lazyfree_lazy_server_del); - signalModifiedKey(NULL, &server.db[0], key); - /* The keys are not actually logically deleted from the database, just moved to another node. - * The modules needs to know that these keys are no longer available locally, so just send the - * keyspace notification to the modules, but not to clients. */ - moduleNotifyKeyspaceEvent(NOTIFY_GENERIC, "del", key, server.db[0].id); - postExecutionUnitOperations(); - decrRefCount(key); - j++; - server.dirty++; - } - dictReleaseIterator(iter); - - return j; -} - -unsigned int countKeysInSlot(unsigned int slot) { - return dictSize(server.db->dict[slot]); -} - -/* ----------------------------------------------------------------------------- - * Operation(s) on channel rax tree. 
- * -------------------------------------------------------------------------- */ - -void slotToChannelUpdate(sds channel, int add) { - size_t keylen = sdslen(channel); - unsigned int hashslot = keyHashSlot(channel,keylen); - unsigned char buf[64]; - unsigned char *indexed = buf; + if (nodeIsSlave(n)) { + addReplyError(c,"Target node is not a master"); + return 1; + } + server.cluster->migrating_slots_to[slot] = n; + } else if (!strcasecmp(c->argv[3]->ptr,"importing") && c->argc == 5) { + if (server.cluster->slots[slot] == myself) { + addReplyErrorFormat(c, + "I'm already the owner of hash slot %u",slot); + return 1; + } + n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (n == NULL) { + addReplyErrorFormat(c,"I don't know about node %s", + (char*)c->argv[4]->ptr); + return 1; + } + if (nodeIsSlave(n)) { + addReplyError(c,"Target node is not a master"); + return 1; + } + server.cluster->importing_slots_from[slot] = n; + } else if (!strcasecmp(c->argv[3]->ptr,"stable") && c->argc == 4) { + /* CLUSTER SETSLOT STABLE */ + server.cluster->importing_slots_from[slot] = NULL; + server.cluster->migrating_slots_to[slot] = NULL; + } else if (!strcasecmp(c->argv[3]->ptr,"node") && c->argc == 5) { + /* CLUSTER SETSLOT NODE */ + n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", + (char*)c->argv[4]->ptr); + return 1; + } + if (nodeIsSlave(n)) { + addReplyError(c,"Target node is not a master"); + return 1; + } + /* If this hash slot was served by 'myself' before to switch + * make sure there are no longer local keys for this hash slot. 
*/ + if (server.cluster->slots[slot] == myself && n != myself) { + if (countKeysInSlot(slot) != 0) { + addReplyErrorFormat(c, + "Can't assign hashslot %d to a different node " + "while I still hold keys for this hash slot.", slot); + return 1; + } + } + /* If this slot is in migrating status but we have no keys + * for it assigning the slot to another node will clear + * the migrating status. */ + if (countKeysInSlot(slot) == 0 && + server.cluster->migrating_slots_to[slot]) + server.cluster->migrating_slots_to[slot] = NULL; - if (keylen+2 > 64) indexed = zmalloc(keylen+2); - indexed[0] = (hashslot >> 8) & 0xff; - indexed[1] = hashslot & 0xff; - memcpy(indexed+2,channel,keylen); - if (add) { - raxInsert(server.cluster->slots_to_channels,indexed,keylen+2,NULL,NULL); - } else { - raxRemove(server.cluster->slots_to_channels,indexed,keylen+2,NULL); - } - if (indexed != buf) zfree(indexed); -} + int slot_was_mine = server.cluster->slots[slot] == myself; + clusterDelSlot(slot); + clusterAddSlot(n,slot); -void slotToChannelAdd(sds channel) { - slotToChannelUpdate(channel,1); -} + /* If we are a master left without slots, we should turn into a + * replica of the new master. */ + if (slot_was_mine && + n != myself && + myself->numslots == 0 && + server.cluster_allow_replica_migration) { + serverLog(LL_NOTICE, + "Configuration change detected. Reconfiguring myself " + "as a replica of %.40s (%s)", n->name, n->human_nodename); + clusterSetMaster(n); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | + CLUSTER_TODO_UPDATE_STATE | + CLUSTER_TODO_FSYNC_CONFIG); + } -void slotToChannelDel(sds channel) { - slotToChannelUpdate(channel,0); -} + /* If this node was importing this slot, assigning the slot to + * itself also clears the importing status. */ + if (n == myself && + server.cluster->importing_slots_from[slot]) { + /* This slot was manually migrated, set this node configEpoch + * to a new epoch so that the new version can be propagated + * by the cluster. 
+ * + * Note that if this ever results in a collision with another + * node getting the same configEpoch, for example because a + * failover happens at the same time we close the slot, the + * configEpoch collision resolution will fix it assigning + * a different epoch to each node. */ + if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { + serverLog(LL_NOTICE, + "configEpoch updated after importing slot %d", slot); + } + server.cluster->importing_slots_from[slot] = NULL; + /* After importing this slot, let the other nodes know as + * soon as possible. */ + clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + } + } else { + addReplyError(c, + "Invalid CLUSTER SETSLOT action or number of arguments. Try CLUSTER HELP"); + return 1; + } + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"bumpepoch") && c->argc == 2) { + /* CLUSTER BUMPEPOCH */ + int retval = clusterBumpConfigEpochWithoutConsensus(); + sds reply = sdscatprintf(sdsempty(),"+%s %llu\r\n", + (retval == C_OK) ? "BUMPED" : "STILL", + (unsigned long long) myself->configEpoch); + addReplySds(c,reply); + } else if (!strcasecmp(c->argv[1]->ptr,"saveconfig") && c->argc == 2) { + int retval = clusterSaveConfig(1); -/* Get the count of the channels for a given slot. */ -unsigned int countChannelsInSlot(unsigned int hashslot) { - raxIterator iter; - int j = 0; - unsigned char indexed[2]; + if (retval == 0) + addReply(c,shared.ok); + else + addReplyErrorFormat(c,"error saving the cluster node config: %s", + strerror(errno)); + } else if (!strcasecmp(c->argv[1]->ptr,"forget") && c->argc == 3) { + /* CLUSTER FORGET */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + if (!n) { + if (clusterBlacklistExists((char*)c->argv[2]->ptr)) + /* Already forgotten. The deletion may have been gossipped by + * another node, so we pretend it succeeded. 
*/ + addReply(c,shared.ok); + else + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return 1; + } else if (n == myself) { + addReplyError(c,"I tried hard but I can't forget myself..."); + return 1; + } else if (nodeIsSlave(myself) && myself->slaveof == n) { + addReplyError(c,"Can't forget my master!"); + return 1; + } + clusterBlacklistAddNode(n); + clusterDelNode(n); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc == 3) { + /* CLUSTER REPLICATE */ + /* Lookup the specified node in our table. */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return 1; + } - indexed[0] = (hashslot >> 8) & 0xff; - indexed[1] = hashslot & 0xff; - raxStart(&iter,server.cluster->slots_to_channels); - raxSeek(&iter,">=",indexed,2); - while(raxNext(&iter)) { - if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; - j++; - } - raxStop(&iter); - return j; -} + /* I can't replicate myself. */ + if (n == myself) { + addReplyError(c,"Can't replicate myself"); + return 1; + } -int clusterNodeIsMyself(clusterNode *n) { - return n == server.cluster->myself; -} + /* Can't replicate a slave. */ + if (nodeIsSlave(n)) { + addReplyError(c,"I can only replicate a master, not a replica."); + return 1; + } -clusterNode* getMyClusterNode(void) { - return server.cluster->myself; -} + /* If the instance is currently a master, it should have no assigned + * slots nor keys to accept to replicate some other node. + * Slaves can switch to another master without issues. 
*/ + if (nodeIsMaster(myself) && + (myself->numslots != 0 || dbSize(&server.db[0], DB_MAIN) != 0)) { + addReplyError(c, + "To set a master the node must be empty and " + "without assigned slots."); + return 1; + } -int clusterManualFailoverTimeLimit(void) { - return server.cluster->mf_end; -} + /* Set the master. */ + clusterSetMaster(n); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"count-failure-reports") && + c->argc == 3) + { + /* CLUSTER COUNT-FAILURE-REPORTS */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); -char* getMyClusterId(void) { - return server.cluster->myself->name; -} + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return 1; + } else { + addReplyLongLong(c,clusterNodeFailureReportsCount(n)); + } + } else if (!strcasecmp(c->argv[1]->ptr,"failover") && + (c->argc == 2 || c->argc == 3)) + { + /* CLUSTER FAILOVER [FORCE|TAKEOVER] */ + int force = 0, takeover = 0; -int getClusterSize(void) { - return dictSize(server.cluster->nodes); -} + if (c->argc == 3) { + if (!strcasecmp(c->argv[2]->ptr,"force")) { + force = 1; + } else if (!strcasecmp(c->argv[2]->ptr,"takeover")) { + takeover = 1; + force = 1; /* Takeover also implies force. 
*/ + } else { + addReplyErrorObject(c,shared.syntaxerr); + return 1; + } + } -char** getClusterNodesList(size_t *numnodes) { - size_t count = dictSize(server.cluster->nodes); - char **ids = zmalloc((count+1)*CLUSTER_NAMELEN); - dictIterator *di = dictGetIterator(server.cluster->nodes); - dictEntry *de; - int j = 0; - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - if (node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) continue; - ids[j] = zmalloc(CLUSTER_NAMELEN); - memcpy(ids[j],node->name,CLUSTER_NAMELEN); - j++; - } - *numnodes = j; - ids[j] = NULL; /* Null term so that FreeClusterNodesList does not need - * to also get the count argument. */ - dictReleaseIterator(di); - return ids; -} + /* Check preconditions. */ + if (nodeIsMaster(myself)) { + addReplyError(c,"You should send CLUSTER FAILOVER to a replica"); + return 1; + } else if (myself->slaveof == NULL) { + addReplyError(c,"I'm a replica but my master is unknown to me"); + return 1; + } else if (!force && + (nodeFailed(myself->slaveof) || + myself->slaveof->link == NULL)) + { + addReplyError(c,"Master is down or failed, " + "please use CLUSTER FAILOVER FORCE"); + return 1; + } + resetManualFailover(); + server.cluster->mf_end = mstime() + CLUSTER_MF_TIMEOUT; -int nodeIsMaster(clusterNode *n) { - return n->flags & CLUSTER_NODE_MASTER; -} + if (takeover) { + /* A takeover does not perform any initial check. It just + * generates a new configuration epoch for this node without + * consensus, claims the master's slots, and broadcast the new + * configuration. */ + serverLog(LL_NOTICE,"Taking over the master (user request)."); + clusterBumpConfigEpochWithoutConsensus(); + clusterFailoverReplaceYourMaster(); + } else if (force) { + /* If this is a forced failover, we don't need to talk with our + * master to agree about the offset. We just failover taking over + * it without coordination. 
*/ + serverLog(LL_NOTICE,"Forced failover user request accepted."); + server.cluster->mf_can_start = 1; + } else { + serverLog(LL_NOTICE,"Manual failover user request accepted."); + clusterSendMFStart(myself->slaveof); + } + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"set-config-epoch") && c->argc == 3) + { + /* CLUSTER SET-CONFIG-EPOCH + * + * The user is allowed to set the config epoch only when a node is + * totally fresh: no config epoch, no other known node, and so forth. + * This happens at cluster creation time to start with a cluster where + * every node has a different node ID, without to rely on the conflicts + * resolution system which is too slow when a big cluster is created. */ + long long epoch; -int handleDebugClusterCommand(client *c) { - if (strcasecmp(c->argv[1]->ptr, "CLUSTERLINK") || - strcasecmp(c->argv[2]->ptr, "KILL") || - c->argc != 5) { - return 0; - } + if (getLongLongFromObjectOrReply(c,c->argv[2],&epoch,NULL) != C_OK) + return 1; - if (!server.cluster_enabled) { - addReplyError(c, "Debug option only available for cluster mode enabled setup!"); - return 1; - } + if (epoch < 0) { + addReplyErrorFormat(c,"Invalid config epoch specified: %lld",epoch); + } else if (dictSize(server.cluster->nodes) > 1) { + addReplyError(c,"The user can assign a config epoch only when the " + "node does not know any other node."); + } else if (myself->configEpoch != 0) { + addReplyError(c,"Node config epoch is already non-zero"); + } else { + myself->configEpoch = epoch; + serverLog(LL_NOTICE, + "configEpoch set to %llu via CLUSTER SET-CONFIG-EPOCH", + (unsigned long long) myself->configEpoch); - /* Find the node. 
*/ - clusterNode *n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (!n) { - addReplyErrorFormat(c, "Unknown node %s", (char *) c->argv[4]->ptr); - return 1; - } + if (server.cluster->currentEpoch < (uint64_t)epoch) + server.cluster->currentEpoch = epoch; + /* No need to fsync the config here since in the unlucky event + * of a failure to persist the config, the conflict resolution code + * will assign a unique config to this node. */ + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } + } else if (!strcasecmp(c->argv[1]->ptr,"reset") && + (c->argc == 2 || c->argc == 3)) + { + /* CLUSTER RESET [SOFT|HARD] */ + int hard = 0; - /* Terminate the link based on the direction or all. */ - if (!strcasecmp(c->argv[3]->ptr, "from")) { - freeClusterLink(n->inbound_link); - } else if (!strcasecmp(c->argv[3]->ptr, "to")) { - freeClusterLink(n->link); - } else if (!strcasecmp(c->argv[3]->ptr, "all")) { - freeClusterLink(n->link); - freeClusterLink(n->inbound_link); + /* Parse soft/hard argument. Default is soft. */ + if (c->argc == 3) { + if (!strcasecmp(c->argv[2]->ptr,"hard")) { + hard = 1; + } else if (!strcasecmp(c->argv[2]->ptr,"soft")) { + hard = 0; + } else { + addReplyErrorObject(c,shared.syntaxerr); + return 1; + } + } + + /* Slaves can be reset while containing data, but not master nodes + * that must be empty. 
*/ + if (nodeIsMaster(myself) && dbSize(c->db, DB_MAIN) != 0) { + addReplyError(c,"CLUSTER RESET can't be called with " + "master nodes containing keys"); + return 1; + } + clusterReset(hard); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"links") && c->argc == 2) { + /* CLUSTER LINKS */ + addReplyClusterLinksDescription(c); } else { - addReplyErrorFormat(c, "Unknown direction %s", (char *) c->argv[3]->ptr); + return 0; } - addReply(c, shared.ok); return 1; } -int clusterNodeConfirmedReachable(clusterNode *node) { - return !(node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)); -} - -char* clusterNodeIp(clusterNode *node) { - return node->ip; -} - -int clusterNodeIsSlave(clusterNode *node) { - return !nodeIsMaster(node); -} - -clusterNode *clusterNodeGetSlaveof(clusterNode *node) { - return node->slaveof; -} - -char* clusterNodeGetName(clusterNode *node) { - return node->name; -} - -int clusterNodeTimedOut(clusterNode *node) { - return nodeTimedOut(node); -} +const char** clusterCommandSpecialHelp(void) { + static const char *help[] = { + "ADDSLOTS [ ...]", + " Assign slots to current node.", + "ADDSLOTSRANGE [ ...]", + " Assign slots which are between and to current node.", + "BUMPEPOCH", + " Advance the cluster config epoch.", + "COUNT-FAILURE-REPORTS ", + " Return number of failure reports for .", + "DELSLOTS [ ...]", + " Delete slots information from current node.", + "DELSLOTSRANGE [ ...]", + " Delete slots information which are between and from current node.", + "FAILOVER [FORCE|TAKEOVER]", + " Promote current replica node to being a master.", + "FORGET ", + " Remove a node from the cluster.", + "FLUSHSLOTS", + " Delete current node own slots information.", + "MEET []", + " Connect nodes into a working cluster.", + "REPLICATE ", + " Configure current node as replica to .", + "RESET [HARD|SOFT]", + " Reset current node (default: soft).", + "SET-CONFIG-EPOCH ", + " Set config epoch of current node.", + "SETSLOT (IMPORTING |MIGRATING 
|STABLE|NODE )", + " Set slot state.", + "SAVECONFIG", + " Force saving cluster configuration on disk.", + "LINKS", + " Return information about all network links between this node and its peers.", + " Output format is an array where each array element is a map containing attributes of a link", + NULL + }; -int clusterNodeIsFailing(clusterNode *node) { - return nodeFailed(node); + return help; } -int clusterNodeIsNoFailover(clusterNode *node) { - return node->flags & CLUSTER_NODE_NOFAILOVER; +int getNumSlaves(clusterNode *node) { + return node->numslaves; } -char **clusterDebugCommandHelp(void) { - const char *help[] = { - "CLUSTERLINK KILL ", - " Kills the link based on the direction to/from (both) with the provided node." , - NULL - }; - - return help; +clusterNode *getSlave(clusterNode *node, int slave_idx) { + return node->slaves[slave_idx]; } diff --git a/src/module.c b/src/module.c index 115a7cbc438..5a813fb5c5d 100644 --- a/src/module.c +++ b/src/module.c @@ -8967,7 +8967,7 @@ int RM_GetClusterNodeInfo(RedisModuleCtx *ctx, const char *id, char *ip, char *m UNUSED(ctx); clusterNode *node = clusterLookupNode(id, strlen(id)); - if (node == NULL || !clusterNodeConfirmedReachable(node)) + if (node == NULL || clusterNodePending(node)) { return REDISMODULE_ERR; } From c6157b3510ca4641f1be63cdf810991441b04d85 Mon Sep 17 00:00:00 2001 From: Josh Hershberg Date: Thu, 2 Nov 2023 11:38:31 +0200 Subject: [PATCH 11/15] Cluster refactor: Make clustering functions common Move primary functions used to implement datapath clustering into cluster.c, making them shared. This required adding "accessor" and other functions to abstract access to node details and cluster state. 
Signed-off-by: Josh Hershberg --- src/cluster.c | 526 +++++++++++++++++++++++++++++++++++++++- src/cluster.h | 9 + src/cluster_legacy.c | 566 +++---------------------------------------- 3 files changed, 569 insertions(+), 532 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 3439dab0a2e..9b8b3b3b864 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -10,7 +10,7 @@ /* We have 16384 hash slots. The hash slot of a given key is obtained * as the least significant 14 bits of the crc16 of the key. * - * However if the key contains the {...} pattern, only the part between + * However, if the key contains the {...} pattern, only the part between * { and } is hashed. This may be useful in the future to force certain * keys to be in the same node (assuming no resharding is in progress). */ unsigned int keyHashSlot(char *key, int keylen) { @@ -754,6 +754,10 @@ void clusterCommandMyId(client *c) { } } +char* getMyClusterId(void) { + return clusterNodeGetName(getMyClusterNode()); +} + void clusterCommandMyShardId(client *c) { char *sid = clusterNodeGetShardId(getMyClusterNode()); if (sid) { @@ -918,3 +922,523 @@ void clusterCommand(client *c) { return; } } + +/* Return the pointer to the cluster node that is able to serve the command. + * For the function to succeed the command should only target either: + * + * 1) A single key (even multiple times like RPOPLPUSH mylist mylist). + * 2) Multiple keys in the same hash slot, while the slot is stable (no + * resharding in progress). + * + * On success the function returns the node that is able to serve the request. + * If the node is not 'myself' a redirection must be performed. The kind of + * redirection is specified setting the integer passed by reference + * 'error_code', which will be set to CLUSTER_REDIR_ASK or + * CLUSTER_REDIR_MOVED. + * + * When the node is 'myself' 'error_code' is set to CLUSTER_REDIR_NONE. 
+ * + * If the command fails NULL is returned, and the reason of the failure is + * provided via 'error_code', which will be set to: + * + * CLUSTER_REDIR_CROSS_SLOT if the request contains multiple keys that + * don't belong to the same hash slot. + * + * CLUSTER_REDIR_UNSTABLE if the request contains multiple keys + * belonging to the same slot, but the slot is not stable (in migration or + * importing state, likely because a resharding is in progress). + * + * CLUSTER_REDIR_DOWN_UNBOUND if the request addresses a slot which is + * not bound to any node. In this case the cluster global state should be + * already "down" but it is fragile to rely on the update of the global state, + * so we also handle it here. + * + * CLUSTER_REDIR_DOWN_STATE and CLUSTER_REDIR_DOWN_RO_STATE if the cluster is + * down but the user attempts to execute a command that addresses one or more keys. */ +clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) { + clusterNode *myself = getMyClusterNode(); + clusterNode *n = NULL; + robj *firstkey = NULL; + int multiple_keys = 0; + multiState *ms, _ms; + multiCmd mc; + int i, slot = 0, migrating_slot = 0, importing_slot = 0, missing_keys = 0, + existing_keys = 0; + + /* Allow any key to be set if a module disabled cluster redirections. */ + if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) + return myself; + + /* Set error code optimistically for the base case. */ + if (error_code) *error_code = CLUSTER_REDIR_NONE; + + /* Modules can turn off Redis Cluster redirection: this is useful + * when writing a module that implements a completely different + * distributed system. */ + + /* We handle all the cases as if they were EXEC commands, so we have + * a common code path for everything */ + if (cmd->proc == execCommand) { + /* If CLIENT_MULTI flag is not set EXEC is just going to return an + * error. 
*/ + if (!(c->flags & CLIENT_MULTI)) return myself; + ms = &c->mstate; + } else { + /* In order to have a single codepath create a fake Multi State + * structure if the client is not in MULTI/EXEC state, this way + * we have a single codepath below. */ + ms = &_ms; + _ms.commands = &mc; + _ms.count = 1; + mc.argv = argv; + mc.argc = argc; + mc.cmd = cmd; + } + + int is_pubsubshard = cmd->proc == ssubscribeCommand || + cmd->proc == sunsubscribeCommand || + cmd->proc == spublishCommand; + + /* Check that all the keys are in the same hash slot, and obtain this + * slot and the node associated. */ + for (i = 0; i < ms->count; i++) { + struct redisCommand *mcmd; + robj **margv; + int margc, numkeys, j; + keyReference *keyindex; + + mcmd = ms->commands[i].cmd; + margc = ms->commands[i].argc; + margv = ms->commands[i].argv; + + getKeysResult result = GETKEYS_RESULT_INIT; + numkeys = getKeysFromCommand(mcmd,margv,margc,&result); + keyindex = result.keys; + + for (j = 0; j < numkeys; j++) { + robj *thiskey = margv[keyindex[j].pos]; + int thisslot = keyHashSlot((char*)thiskey->ptr, + sdslen(thiskey->ptr)); + + if (firstkey == NULL) { + /* This is the first key we see. Check what is the slot + * and node. */ + firstkey = thiskey; + slot = thisslot; + n = getNodeBySlot(slot); + + /* Error: If a slot is not served, we are in "cluster down" + * state. However the state is yet to be updated, so this was + * not trapped earlier in processCommand(). Report the same + * error to the client. */ + if (n == NULL) { + getKeysFreeResult(&result); + if (error_code) + *error_code = CLUSTER_REDIR_DOWN_UNBOUND; + return NULL; + } + + /* If we are migrating or importing this slot, we need to check + * if we have all the keys in the request (the only way we + * can safely serve the request, otherwise we return a TRYAGAIN + * error). To do so we set the importing/migrating state and + * increment a counter for every missing key. 
*/ + if (n == myself && + getMigratingSlotDest(slot) != NULL) + { + migrating_slot = 1; + } else if (getImportingSlotSource(slot) != NULL) { + importing_slot = 1; + } + } else { + /* If it is not the first key/channel, make sure it is exactly + * the same key/channel as the first we saw. */ + if (slot != thisslot) { + /* Error: multiple keys from different slots. */ + getKeysFreeResult(&result); + if (error_code) + *error_code = CLUSTER_REDIR_CROSS_SLOT; + return NULL; + } + if (importing_slot && !multiple_keys && !equalStringObjects(firstkey,thiskey)) { + /* Flag this request as one with multiple different + * keys/channels when the slot is in importing state. */ + multiple_keys = 1; + } + } + + /* Migrating / Importing slot? Count keys we don't have. + * If it is pubsubshard command, it isn't required to check + * the channel being present or not in the node during the + * slot migration, the channel will be served from the source + * node until the migration completes with CLUSTER SETSLOT + * NODE . */ + int flags = LOOKUP_NOTOUCH | LOOKUP_NOSTATS | LOOKUP_NONOTIFY | LOOKUP_NOEXPIRE; + if ((migrating_slot || importing_slot) && !is_pubsubshard) + { + if (lookupKeyReadWithFlags(&server.db[0], thiskey, flags) == NULL) missing_keys++; + else existing_keys++; + } + } + getKeysFreeResult(&result); + } + + /* No key at all in command? then we can serve the request + * without redirections or errors in all the cases. */ + if (n == NULL) return myself; + + uint64_t cmd_flags = getCommandFlags(c); + /* Cluster is globally down but we got keys? We only serve the request + * if it is a read command and when allow_reads_when_down is enabled. */ + if (!isClusterHealthy()) { + if (is_pubsubshard) { + if (!server.cluster_allow_pubsubshard_when_down) { + if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; + return NULL; + } + } else if (!server.cluster_allow_reads_when_down) { + /* The cluster is configured to block commands when the + * cluster is down. 
*/ + if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; + return NULL; + } else if (cmd_flags & CMD_WRITE) { + /* The cluster is configured to allow read only commands */ + if (error_code) *error_code = CLUSTER_REDIR_DOWN_RO_STATE; + return NULL; + } else { + /* Fall through and allow the command to be executed: + * this happens when server.cluster_allow_reads_when_down is + * true and the command is not a write command */ + } + } + + /* Return the hashslot by reference. */ + if (hashslot) *hashslot = slot; + + /* MIGRATE always works in the context of the local node if the slot + * is open (migrating or importing state). We need to be able to freely + * move keys among instances in this case. */ + if ((migrating_slot || importing_slot) && cmd->proc == migrateCommand) + return myself; + + /* If we don't have all the keys and we are migrating the slot, send + * an ASK redirection or TRYAGAIN. */ + if (migrating_slot && missing_keys) { + /* If we have keys but we don't have all keys, we return TRYAGAIN */ + if (existing_keys) { + if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; + return NULL; + } else { + if (error_code) *error_code = CLUSTER_REDIR_ASK; + return getMigratingSlotDest(slot); + } + } + + /* If we are receiving the slot, and the client correctly flagged the + * request as "ASKING", we can serve the request. However if the request + * involves multiple keys and we don't have them all, the only option is + * to send a TRYAGAIN error. */ + if (importing_slot && + (c->flags & CLIENT_ASKING || cmd_flags & CMD_ASKING)) + { + if (multiple_keys && missing_keys) { + if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; + return NULL; + } else { + return myself; + } + } + + /* Handle the read-only client case reading from a slave: if this + * node is a slave and the request is about a hash slot our master + * is serving, we can reply without redirection. 
*/ + int is_write_command = (cmd_flags & CMD_WRITE) || + (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_WRITE)); + if (((c->flags & CLIENT_READONLY) || is_pubsubshard) && + !is_write_command && + clusterNodeIsSlave(myself) && + clusterNodeGetSlaveof(myself) == n) + { + return myself; + } + + /* Base case: just return the right node. However, if this node is not + * myself, set error_code to MOVED since we need to issue a redirection. */ + if (n != myself && error_code) *error_code = CLUSTER_REDIR_MOVED; + return n; +} + +/* Send the client the right redirection code, according to error_code + * that should be set to one of CLUSTER_REDIR_* macros. + * + * If CLUSTER_REDIR_ASK or CLUSTER_REDIR_MOVED error codes + * are used, then the node 'n' should not be NULL, but should be the + * node we want to mention in the redirection. Moreover hashslot should + * be set to the hash slot that caused the redirection. */ +void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code) { + if (error_code == CLUSTER_REDIR_CROSS_SLOT) { + addReplyError(c,"-CROSSSLOT Keys in request don't hash to the same slot"); + } else if (error_code == CLUSTER_REDIR_UNSTABLE) { + /* The request spawns multiple keys in the same slot, + * but the slot is not "stable" currently as there is + * a migration or import in progress. */ + addReplyError(c,"-TRYAGAIN Multiple keys request during rehashing of slot"); + } else if (error_code == CLUSTER_REDIR_DOWN_STATE) { + addReplyError(c,"-CLUSTERDOWN The cluster is down"); + } else if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) { + addReplyError(c,"-CLUSTERDOWN The cluster is down and only accepts read commands"); + } else if (error_code == CLUSTER_REDIR_DOWN_UNBOUND) { + addReplyError(c,"-CLUSTERDOWN Hash slot not served"); + } else if (error_code == CLUSTER_REDIR_MOVED || + error_code == CLUSTER_REDIR_ASK) + { + /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. 
*/ + int port = getNodeClientPort(n, shouldReturnTlsInfo()); + addReplyErrorSds(c,sdscatprintf(sdsempty(), + "-%s %d %s:%d", + (error_code == CLUSTER_REDIR_ASK) ? "ASK" : "MOVED", + hashslot, getPreferredEndpoint(n), port)); + } else { + serverPanic("getNodeByQuery() unknown error."); + } +} + +/* This function is called by the function processing clients incrementally + * to detect timeouts, in order to handle the following case: + * + * 1) A client blocks with BLPOP or similar blocking operation. + * 2) The master migrates the hash slot elsewhere or turns into a slave. + * 3) The client may remain blocked forever (or up to the max timeout time) + * waiting for a key change that will never happen. + * + * If the client is found to be blocked into a hash slot this node no + * longer handles, the client is sent a redirection error, and the function + * returns 1. Otherwise 0 is returned and no operation is performed. */ +int clusterRedirectBlockedClientIfNeeded(client *c) { + clusterNode *myself = getMyClusterNode(); + if (c->flags & CLIENT_BLOCKED && + (c->bstate.btype == BLOCKED_LIST || + c->bstate.btype == BLOCKED_ZSET || + c->bstate.btype == BLOCKED_STREAM || + c->bstate.btype == BLOCKED_MODULE)) + { + dictEntry *de; + dictIterator *di; + + /* If the cluster is down, unblock the client with the right error. + * If the cluster is configured to allow reads on cluster down, we + * still want to emit this error since a write will be required + * to unblock them which may never come. */ + if (!isClusterHealthy()) { + clusterRedirectClient(c,NULL,0,CLUSTER_REDIR_DOWN_STATE); + return 1; + } + + /* If the client is blocked on module, but not on a specific key, + * don't unblock it (except for the CLUSTER_FAIL case above). */ + if (c->bstate.btype == BLOCKED_MODULE && !moduleClientIsBlockedOnKeys(c)) + return 0; + + /* All keys must belong to the same slot, so check first key only. 
*/ + di = dictGetIterator(c->bstate.keys); + if ((de = dictNext(di)) != NULL) { + robj *key = dictGetKey(de); + int slot = keyHashSlot((char*)key->ptr, sdslen(key->ptr)); + clusterNode *node = getNodeBySlot(slot); + + /* if the client is read-only and attempting to access key that our + * replica can handle, allow it. */ + if ((c->flags & CLIENT_READONLY) && + !(c->lastcmd->flags & CMD_WRITE) && + clusterNodeIsSlave(myself) && clusterNodeGetSlaveof(myself) == node) + { + node = myself; + } + + /* We send an error and unblock the client if: + * 1) The slot is unassigned, emitting a cluster down error. + * 2) The slot is not handled by this node, nor being imported. */ + if (node != myself && getImportingSlotSource(slot) == NULL) + { + if (node == NULL) { + clusterRedirectClient(c,NULL,0, + CLUSTER_REDIR_DOWN_UNBOUND); + } else { + clusterRedirectClient(c,node,slot, + CLUSTER_REDIR_MOVED); + } + dictReleaseIterator(di); + return 1; + } + } + dictReleaseIterator(di); + } + return 0; +} + +/* Returns an indication if the replica node is fully available + * and should be listed in CLUSTER SLOTS response. + * Returns 1 for available nodes, 0 for nodes that have + * not finished their initial sync, in failed state, or are + * otherwise considered not available to serve read commands. */ +static int isReplicaAvailable(clusterNode *node) { + if (clusterNodeIsFailing(node)) { + return 0; + } + long long repl_offset = getReplOffset(node); + if (clusterNodeIsMyself(node)) { + /* Nodes do not update their own information + * in the cluster node list. 
*/ + repl_offset = replicationGetSlaveOffset(); + } + return (repl_offset != 0); +} + +void addNodeToNodeReply(client *c, clusterNode *node) { + char* hostname = clusterNodeHostname(node); + addReplyArrayLen(c, 4); + if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_IP) { + addReplyBulkCString(c, clusterNodeIp(node)); + } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_HOSTNAME) { + if (hostname != NULL && hostname[0] != '\0') { + addReplyBulkCString(c, hostname); + } else { + addReplyBulkCString(c, "?"); + } + } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT) { + addReplyNull(c); + } else { + serverPanic("Unrecognized preferred endpoint type"); + } + + /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ + addReplyLongLong(c, getNodeClientPort(node, shouldReturnTlsInfo())); + addReplyBulkCBuffer(c, clusterNodeGetName(node), CLUSTER_NAMELEN); + + /* Add the additional endpoint information, this is all the known networking information + * that is not the preferred endpoint. Note the logic is evaluated twice so we can + * correctly report the number of additional network arguments without using a deferred + * map, an assertion is made at the end to check we set the right length. 
*/ + int length = 0; + if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) { + length++; + } + if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME + && hostname != NULL && hostname[0] != '\0') + { + length++; + } + addReplyMapLen(c, length); + + if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) { + addReplyBulkCString(c, "ip"); + addReplyBulkCString(c, clusterNodeIp(node)); + length--; + } + if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME + && hostname != NULL && hostname[0] != '\0') + { + addReplyBulkCString(c, "hostname"); + addReplyBulkCString(c, hostname); + length--; + } + serverAssert(length == 0); +} + +void addNodeReplyForClusterSlot(client *c, clusterNode *node, int start_slot, int end_slot) { + int i, nested_elements = 3; /* slots (2) + master addr (1) */ + for (i = 0; i < getNumSlaves(node); i++) { + if (!isReplicaAvailable(getSlave(node, i))) continue; + nested_elements++; + } + addReplyArrayLen(c, nested_elements); + addReplyLongLong(c, start_slot); + addReplyLongLong(c, end_slot); + addNodeToNodeReply(c, node); + + /* Remaining nodes in reply are replicas for slot range */ + for (i = 0; i < getNumSlaves(node); i++) { + /* This loop is copy/pasted from clusterGenNodeDescription() + * with modifications for per-slot node aggregation. */ + if (!isReplicaAvailable(getSlave(node, i))) continue; + addNodeToNodeReply(c, getSlave(node, i)); + nested_elements--; + } + serverAssert(nested_elements == 3); /* Original 3 elements */ +} + +void clusterCommandSlots(client * c) { + /* Format: 1) 1) start slot + * 2) end slot + * 3) 1) master IP + * 2) master port + * 3) node ID + * 4) 1) replica IP + * 2) replica port + * 3) node ID + * ... continued until done + */ + clusterNode *n = NULL; + int num_masters = 0, start = -1; + void *slot_replylen = addReplyDeferredLen(c); + + for (int i = 0; i <= CLUSTER_SLOTS; i++) { + /* Find start node and slot id. 
*/ + if (n == NULL) { + if (i == CLUSTER_SLOTS) break; + n = getNodeBySlot(i); + start = i; + continue; + } + + /* Add cluster slots info when occur different node with start + * or end of slot. */ + if (i == CLUSTER_SLOTS || n != getNodeBySlot(i)) { + addNodeReplyForClusterSlot(c, n, start, i-1); + num_masters++; + if (i == CLUSTER_SLOTS) break; + n = getNodeBySlot(i); + start = i; + } + } + setDeferredArrayLen(c, slot_replylen, num_masters); +} + +/* ----------------------------------------------------------------------------- + * Cluster functions related to serving / redirecting clients + * -------------------------------------------------------------------------- */ + +/* The ASKING command is required after a -ASK redirection. + * The client should issue ASKING before to actually send the command to + * the target instance. See the Redis Cluster specification for more + * information. */ +void askingCommand(client *c) { + if (server.cluster_enabled == 0) { + addReplyError(c,"This instance has cluster support disabled"); + return; + } + c->flags |= CLIENT_ASKING; + addReply(c,shared.ok); +} + +/* The READONLY command is used by clients to enter the read-only mode. + * In this mode slaves will not redirect clients as long as clients access + * with read-only commands to keys that are served by the slave's master. */ +void readonlyCommand(client *c) { + if (server.cluster_enabled == 0) { + addReplyError(c,"This instance has cluster support disabled"); + return; + } + c->flags |= CLIENT_READONLY; + addReply(c,shared.ok); +} + +/* The READWRITE command just clears the READONLY command state. 
*/ +void readwriteCommand(client *c) { + if (server.cluster_enabled == 0) { + addReplyError(c,"This instance has cluster support disabled"); + return; + } + c->flags &= ~CLIENT_READONLY; + addReply(c,shared.ok); +} diff --git a/src/cluster.h b/src/cluster.h index 5160582b387..9f6e482f43d 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -87,6 +87,15 @@ void clusterCommandShards(client *c); sds clusterGenNodeDescription(client *c, clusterNode *node, int tls_primary); int getNumSlaves(clusterNode *node); clusterNode *getSlave(clusterNode *node, int slave_idx); +clusterNode *getMigratingSlotDest(int slot); +clusterNode *getImportingSlotSource(int slot); +int isClusterHealthy(void); +clusterNode *getNodeBySlot(int slot); +int getNodeClientPort(clusterNode *n, int use_tls); +char* clusterNodeHostname(clusterNode *node); +const char *getPreferredEndpoint(clusterNode *n); +void migrateCommand(client *c); +long long getReplOffset(clusterNode *node); char **clusterDebugCommandHelp(void); ConnectionType *connTypeOfCluster(void); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 2b080ef891d..0fd8b0a207f 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -112,7 +112,7 @@ static inline int getNodeDefaultReplicationPort(clusterNode *n) { return server.tls_replication ? n->tls_port : n->tcp_port; } -static inline int getNodeClientPort(clusterNode *n, int use_tls) { +int getNodeClientPort(clusterNode *n, int use_tls) { return use_tls ? n->tls_port : n->tcp_port; } @@ -5402,15 +5402,6 @@ void addReplyClusterLinksDescription(client *c) { * CLUSTER command * -------------------------------------------------------------------------- */ -const char *getPreferredEndpoint(clusterNode *n) { - switch(server.cluster_preferred_endpoint_type) { - case CLUSTER_ENDPOINT_TYPE_IP: return n->ip; - case CLUSTER_ENDPOINT_TYPE_HOSTNAME: return (sdslen(n->hostname) != 0) ? 
n->hostname : "?"; - case CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT: return ""; - } - return "unknown"; -} - const char *clusterGetMessageTypeString(int type) { switch(type) { case CLUSTERMSG_TYPE_PING: return "ping"; @@ -5440,24 +5431,6 @@ int getSlotOrReply(client *c, robj *o) { return (int) slot; } -/* Returns an indication if the replica node is fully available - * and should be listed in CLUSTER SLOTS response. - * Returns 1 for available nodes, 0 for nodes that have - * not finished their initial sync, in failed state, or are - * otherwise considered not available to serve read commands. */ -static int isReplicaAvailable(clusterNode *node) { - if (nodeFailed(node)) { - return 0; - } - long long repl_offset = node->repl_offset; - if (node->flags & CLUSTER_NODE_MYSELF) { - /* Nodes do not update their own information - * in the cluster node list. */ - repl_offset = replicationGetSlaveOffset(); - } - return (repl_offset != 0); -} - int checkSlotAssignmentsOrReply(client *c, unsigned char *slots, int del, int start_slot, int end_slot) { int slot; for (slot = start_slot; slot <= end_slot; slot++) { @@ -5494,78 +5467,6 @@ void clusterUpdateSlots(client *c, unsigned char *slots, int del) { } } -void addNodeToNodeReply(client *c, clusterNode *node) { - addReplyArrayLen(c, 4); - if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_IP) { - addReplyBulkCString(c, node->ip); - } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_HOSTNAME) { - if (sdslen(node->hostname) != 0) { - addReplyBulkCBuffer(c, node->hostname, sdslen(node->hostname)); - } else { - addReplyBulkCString(c, "?"); - } - } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT) { - addReplyNull(c); - } else { - serverPanic("Unrecognized preferred endpoint type"); - } - - /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. 
*/ - addReplyLongLong(c, getNodeClientPort(node, shouldReturnTlsInfo())); - addReplyBulkCBuffer(c, node->name, CLUSTER_NAMELEN); - - /* Add the additional endpoint information, this is all the known networking information - * that is not the preferred endpoint. Note the logic is evaluated twice so we can - * correctly report the number of additional network arguments without using a deferred - * map, an assertion is made at the end to check we set the right length. */ - int length = 0; - if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) { - length++; - } - if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME - && sdslen(node->hostname) != 0) - { - length++; - } - addReplyMapLen(c, length); - - if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) { - addReplyBulkCString(c, "ip"); - addReplyBulkCString(c, node->ip); - length--; - } - if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME - && sdslen(node->hostname) != 0) - { - addReplyBulkCString(c, "hostname"); - addReplyBulkCBuffer(c, node->hostname, sdslen(node->hostname)); - length--; - } - serverAssert(length == 0); -} - -void addNodeReplyForClusterSlot(client *c, clusterNode *node, int start_slot, int end_slot) { - int i, nested_elements = 3; /* slots (2) + master addr (1) */ - for (i = 0; i < node->numslaves; i++) { - if (!isReplicaAvailable(node->slaves[i])) continue; - nested_elements++; - } - addReplyArrayLen(c, nested_elements); - addReplyLongLong(c, start_slot); - addReplyLongLong(c, end_slot); - addNodeToNodeReply(c, node); - - /* Remaining nodes in reply are replicas for slot range */ - for (i = 0; i < node->numslaves; i++) { - /* This loop is copy/pasted from clusterGenNodeDescription() - * with modifications for per-slot node aggregation. 
*/ - if (!isReplicaAvailable(node->slaves[i])) continue; - addNodeToNodeReply(c, node->slaves[i]); - nested_elements--; - } - serverAssert(nested_elements == 3); /* Original 3 elements */ -} - /* Add detailed information of a node to the output buffer of the given client. */ void addNodeDetailsToShardReply(client *c, clusterNode *node) { int reply_count = 0; @@ -5675,43 +5576,6 @@ void clusterCommandShards(client *c) { dictReleaseIterator(di); } -void clusterCommandSlots(client * c) { - /* Format: 1) 1) start slot - * 2) end slot - * 3) 1) master IP - * 2) master port - * 3) node ID - * 4) 1) replica IP - * 2) replica port - * 3) node ID - * ... continued until done - */ - clusterNode *n = NULL; - int num_masters = 0, start = -1; - void *slot_replylen = addReplyDeferredLen(c); - - for (int i = 0; i <= CLUSTER_SLOTS; i++) { - /* Find start node and slot id. */ - if (n == NULL) { - if (i == CLUSTER_SLOTS) break; - n = server.cluster->slots[i]; - start = i; - continue; - } - - /* Add cluster slots info when occur different node with start - * or end of slot. */ - if (i == CLUSTER_SLOTS || n != server.cluster->slots[i]) { - addNodeReplyForClusterSlot(c, n, start, i-1); - num_masters++; - if (i == CLUSTER_SLOTS) break; - n = server.cluster->slots[i]; - start = i; - } - } - setDeferredArrayLen(c, slot_replylen, num_masters); -} - sds genClusterInfoString(void) { sds info = sdsempty(); char *statestr[] = {"ok","fail"}; @@ -5816,396 +5680,6 @@ void removeChannelsInSlot(unsigned int slot) { } -/* ----------------------------------------------------------------------------- - * Cluster functions related to serving / redirecting clients - * -------------------------------------------------------------------------- */ - -/* The ASKING command is required after a -ASK redirection. - * The client should issue ASKING before to actually send the command to - * the target instance. See the Redis Cluster specification for more - * information. 
*/ -void askingCommand(client *c) { - if (server.cluster_enabled == 0) { - addReplyError(c,"This instance has cluster support disabled"); - return; - } - c->flags |= CLIENT_ASKING; - addReply(c,shared.ok); -} - -/* The READONLY command is used by clients to enter the read-only mode. - * In this mode slaves will not redirect clients as long as clients access - * with read-only commands to keys that are served by the slave's master. */ -void readonlyCommand(client *c) { - if (server.cluster_enabled == 0) { - addReplyError(c,"This instance has cluster support disabled"); - return; - } - c->flags |= CLIENT_READONLY; - addReply(c,shared.ok); -} - -/* The READWRITE command just clears the READONLY command state. */ -void readwriteCommand(client *c) { - if (server.cluster_enabled == 0) { - addReplyError(c,"This instance has cluster support disabled"); - return; - } - c->flags &= ~CLIENT_READONLY; - addReply(c,shared.ok); -} - -/* Return the pointer to the cluster node that is able to serve the command. - * For the function to succeed the command should only target either: - * - * 1) A single key (even multiple times like RPOPLPUSH mylist mylist). - * 2) Multiple keys in the same hash slot, while the slot is stable (no - * resharding in progress). - * - * On success the function returns the node that is able to serve the request. - * If the node is not 'myself' a redirection must be performed. The kind of - * redirection is specified setting the integer passed by reference - * 'error_code', which will be set to CLUSTER_REDIR_ASK or - * CLUSTER_REDIR_MOVED. - * - * When the node is 'myself' 'error_code' is set to CLUSTER_REDIR_NONE. - * - * If the command fails NULL is returned, and the reason of the failure is - * provided via 'error_code', which will be set to: - * - * CLUSTER_REDIR_CROSS_SLOT if the request contains multiple keys that - * don't belong to the same hash slot. 
- * - * CLUSTER_REDIR_UNSTABLE if the request contains multiple keys - * belonging to the same slot, but the slot is not stable (in migration or - * importing state, likely because a resharding is in progress). - * - * CLUSTER_REDIR_DOWN_UNBOUND if the request addresses a slot which is - * not bound to any node. In this case the cluster global state should be - * already "down" but it is fragile to rely on the update of the global state, - * so we also handle it here. - * - * CLUSTER_REDIR_DOWN_STATE and CLUSTER_REDIR_DOWN_RO_STATE if the cluster is - * down but the user attempts to execute a command that addresses one or more keys. */ -clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) { - clusterNode *n = NULL; - robj *firstkey = NULL; - int multiple_keys = 0; - multiState *ms, _ms; - multiCmd mc; - int i, slot = 0, migrating_slot = 0, importing_slot = 0, missing_keys = 0, - existing_keys = 0; - - /* Allow any key to be set if a module disabled cluster redirections. */ - if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) - return myself; - - /* Set error code optimistically for the base case. */ - if (error_code) *error_code = CLUSTER_REDIR_NONE; - - /* Modules can turn off Redis Cluster redirection: this is useful - * when writing a module that implements a completely different - * distributed system. */ - - /* We handle all the cases as if they were EXEC commands, so we have - * a common code path for everything */ - if (cmd->proc == execCommand) { - /* If CLIENT_MULTI flag is not set EXEC is just going to return an - * error. */ - if (!(c->flags & CLIENT_MULTI)) return myself; - ms = &c->mstate; - } else { - /* In order to have a single codepath create a fake Multi State - * structure if the client is not in MULTI/EXEC state, this way - * we have a single codepath below. 
*/ - ms = &_ms; - _ms.commands = &mc; - _ms.count = 1; - mc.argv = argv; - mc.argc = argc; - mc.cmd = cmd; - } - - int is_pubsubshard = cmd->proc == ssubscribeCommand || - cmd->proc == sunsubscribeCommand || - cmd->proc == spublishCommand; - - /* Check that all the keys are in the same hash slot, and obtain this - * slot and the node associated. */ - for (i = 0; i < ms->count; i++) { - struct redisCommand *mcmd; - robj **margv; - int margc, numkeys, j; - keyReference *keyindex; - - mcmd = ms->commands[i].cmd; - margc = ms->commands[i].argc; - margv = ms->commands[i].argv; - - getKeysResult result = GETKEYS_RESULT_INIT; - numkeys = getKeysFromCommand(mcmd,margv,margc,&result); - keyindex = result.keys; - - for (j = 0; j < numkeys; j++) { - robj *thiskey = margv[keyindex[j].pos]; - int thisslot = keyHashSlot((char*)thiskey->ptr, - sdslen(thiskey->ptr)); - - if (firstkey == NULL) { - /* This is the first key we see. Check what is the slot - * and node. */ - firstkey = thiskey; - slot = thisslot; - n = server.cluster->slots[slot]; - - /* Error: If a slot is not served, we are in "cluster down" - * state. However the state is yet to be updated, so this was - * not trapped earlier in processCommand(). Report the same - * error to the client. */ - if (n == NULL) { - getKeysFreeResult(&result); - if (error_code) - *error_code = CLUSTER_REDIR_DOWN_UNBOUND; - return NULL; - } - - /* If we are migrating or importing this slot, we need to check - * if we have all the keys in the request (the only way we - * can safely serve the request, otherwise we return a TRYAGAIN - * error). To do so we set the importing/migrating state and - * increment a counter for every missing key. 
*/ - if (n == myself && - server.cluster->migrating_slots_to[slot] != NULL) - { - migrating_slot = 1; - } else if (server.cluster->importing_slots_from[slot] != NULL) { - importing_slot = 1; - } - } else { - /* If it is not the first key/channel, make sure it is exactly - * the same key/channel as the first we saw. */ - if (slot != thisslot) { - /* Error: multiple keys from different slots. */ - getKeysFreeResult(&result); - if (error_code) - *error_code = CLUSTER_REDIR_CROSS_SLOT; - return NULL; - } - if (importing_slot && !multiple_keys && !equalStringObjects(firstkey,thiskey)) { - /* Flag this request as one with multiple different - * keys/channels when the slot is in importing state. */ - multiple_keys = 1; - } - } - - /* Migrating / Importing slot? Count keys we don't have. - * If it is pubsubshard command, it isn't required to check - * the channel being present or not in the node during the - * slot migration, the channel will be served from the source - * node until the migration completes with CLUSTER SETSLOT - * NODE . */ - int flags = LOOKUP_NOTOUCH | LOOKUP_NOSTATS | LOOKUP_NONOTIFY | LOOKUP_NOEXPIRE; - if ((migrating_slot || importing_slot) && !is_pubsubshard) - { - if (lookupKeyReadWithFlags(&server.db[0], thiskey, flags) == NULL) missing_keys++; - else existing_keys++; - } - } - getKeysFreeResult(&result); - } - - /* No key at all in command? then we can serve the request - * without redirections or errors in all the cases. */ - if (n == NULL) return myself; - - uint64_t cmd_flags = getCommandFlags(c); - /* Cluster is globally down but we got keys? We only serve the request - * if it is a read command and when allow_reads_when_down is enabled. 
*/ - if (server.cluster->state != CLUSTER_OK) { - if (is_pubsubshard) { - if (!server.cluster_allow_pubsubshard_when_down) { - if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; - return NULL; - } - } else if (!server.cluster_allow_reads_when_down) { - /* The cluster is configured to block commands when the - * cluster is down. */ - if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; - return NULL; - } else if (cmd_flags & CMD_WRITE) { - /* The cluster is configured to allow read only commands */ - if (error_code) *error_code = CLUSTER_REDIR_DOWN_RO_STATE; - return NULL; - } else { - /* Fall through and allow the command to be executed: - * this happens when server.cluster_allow_reads_when_down is - * true and the command is not a write command */ - } - } - - /* Return the hashslot by reference. */ - if (hashslot) *hashslot = slot; - - /* MIGRATE always works in the context of the local node if the slot - * is open (migrating or importing state). We need to be able to freely - * move keys among instances in this case. */ - if ((migrating_slot || importing_slot) && cmd->proc == migrateCommand) - return myself; - - /* If we don't have all the keys and we are migrating the slot, send - * an ASK redirection or TRYAGAIN. */ - if (migrating_slot && missing_keys) { - /* If we have keys but we don't have all keys, we return TRYAGAIN */ - if (existing_keys) { - if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; - return NULL; - } else { - if (error_code) *error_code = CLUSTER_REDIR_ASK; - return server.cluster->migrating_slots_to[slot]; - } - } - - /* If we are receiving the slot, and the client correctly flagged the - * request as "ASKING", we can serve the request. However if the request - * involves multiple keys and we don't have them all, the only option is - * to send a TRYAGAIN error. 
*/ - if (importing_slot && - (c->flags & CLIENT_ASKING || cmd_flags & CMD_ASKING)) - { - if (multiple_keys && missing_keys) { - if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; - return NULL; - } else { - return myself; - } - } - - /* Handle the read-only client case reading from a slave: if this - * node is a slave and the request is about a hash slot our master - * is serving, we can reply without redirection. */ - int is_write_command = (cmd_flags & CMD_WRITE) || - (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_WRITE)); - if (((c->flags & CLIENT_READONLY) || is_pubsubshard) && - !is_write_command && - nodeIsSlave(myself) && - myself->slaveof == n) - { - return myself; - } - - /* Base case: just return the right node. However if this node is not - * myself, set error_code to MOVED since we need to issue a redirection. */ - if (n != myself && error_code) *error_code = CLUSTER_REDIR_MOVED; - return n; -} - -/* Send the client the right redirection code, according to error_code - * that should be set to one of CLUSTER_REDIR_* macros. - * - * If CLUSTER_REDIR_ASK or CLUSTER_REDIR_MOVED error codes - * are used, then the node 'n' should not be NULL, but should be the - * node we want to mention in the redirection. Moreover hashslot should - * be set to the hash slot that caused the redirection. */ -void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code) { - if (error_code == CLUSTER_REDIR_CROSS_SLOT) { - addReplyError(c,"-CROSSSLOT Keys in request don't hash to the same slot"); - } else if (error_code == CLUSTER_REDIR_UNSTABLE) { - /* The request spawns multiple keys in the same slot, - * but the slot is not "stable" currently as there is - * a migration or import in progress. 
*/ - addReplyError(c,"-TRYAGAIN Multiple keys request during rehashing of slot"); - } else if (error_code == CLUSTER_REDIR_DOWN_STATE) { - addReplyError(c,"-CLUSTERDOWN The cluster is down"); - } else if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) { - addReplyError(c,"-CLUSTERDOWN The cluster is down and only accepts read commands"); - } else if (error_code == CLUSTER_REDIR_DOWN_UNBOUND) { - addReplyError(c,"-CLUSTERDOWN Hash slot not served"); - } else if (error_code == CLUSTER_REDIR_MOVED || - error_code == CLUSTER_REDIR_ASK) - { - /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ - int port = getNodeClientPort(n, shouldReturnTlsInfo()); - addReplyErrorSds(c,sdscatprintf(sdsempty(), - "-%s %d %s:%d", - (error_code == CLUSTER_REDIR_ASK) ? "ASK" : "MOVED", - hashslot, getPreferredEndpoint(n), port)); - } else { - serverPanic("getNodeByQuery() unknown error."); - } -} - -/* This function is called by the function processing clients incrementally - * to detect timeouts, in order to handle the following case: - * - * 1) A client blocks with BLPOP or similar blocking operation. - * 2) The master migrates the hash slot elsewhere or turns into a slave. - * 3) The client may remain blocked forever (or up to the max timeout time) - * waiting for a key change that will never happen. - * - * If the client is found to be blocked into a hash slot this node no - * longer handles, the client is sent a redirection error, and the function - * returns 1. Otherwise 0 is returned and no operation is performed. */ -int clusterRedirectBlockedClientIfNeeded(client *c) { - if (c->flags & CLIENT_BLOCKED && - (c->bstate.btype == BLOCKED_LIST || - c->bstate.btype == BLOCKED_ZSET || - c->bstate.btype == BLOCKED_STREAM || - c->bstate.btype == BLOCKED_MODULE)) - { - dictEntry *de; - dictIterator *di; - - /* If the cluster is down, unblock the client with the right error. 
- * If the cluster is configured to allow reads on cluster down, we - * still want to emit this error since a write will be required - * to unblock them which may never come. */ - if (server.cluster->state == CLUSTER_FAIL) { - clusterRedirectClient(c,NULL,0,CLUSTER_REDIR_DOWN_STATE); - return 1; - } - - /* If the client is blocked on module, but not on a specific key, - * don't unblock it (except for the CLUSTER_FAIL case above). */ - if (c->bstate.btype == BLOCKED_MODULE && !moduleClientIsBlockedOnKeys(c)) - return 0; - - /* All keys must belong to the same slot, so check first key only. */ - di = dictGetIterator(c->bstate.keys); - if ((de = dictNext(di)) != NULL) { - robj *key = dictGetKey(de); - int slot = keyHashSlot((char*)key->ptr, sdslen(key->ptr)); - clusterNode *node = server.cluster->slots[slot]; - - /* if the client is read-only and attempting to access key that our - * replica can handle, allow it. */ - if ((c->flags & CLIENT_READONLY) && - !(c->lastcmd->flags & CMD_WRITE) && - nodeIsSlave(myself) && myself->slaveof == node) - { - node = myself; - } - - /* We send an error and unblock the client if: - * 1) The slot is unassigned, emitting a cluster down error. - * 2) The slot is not handled by this node, nor being imported. */ - if (node != myself && - server.cluster->importing_slots_from[slot] == NULL) - { - if (node == NULL) { - clusterRedirectClient(c,NULL,0, - CLUSTER_REDIR_DOWN_UNBOUND); - } else { - clusterRedirectClient(c,node,slot, - CLUSTER_REDIR_MOVED); - } - dictReleaseIterator(di); - return 1; - } - } - dictReleaseIterator(di); - } - return 0; -} /* Remove all the keys in the specified hash slot. * The number of removed items is returned. 
*/ @@ -6295,10 +5769,6 @@ int clusterManualFailoverTimeLimit(void) { return server.cluster->mf_end; } -char* getMyClusterId(void) { - return server.cluster->myself->name; -} - int getClusterSize(void) { return dictSize(server.cluster->nodes); } @@ -6904,3 +6374,37 @@ int getNumSlaves(clusterNode *node) { clusterNode *getSlave(clusterNode *node, int slave_idx) { return node->slaves[slave_idx]; } + +clusterNode *getMigratingSlotDest(int slot) { + return server.cluster->migrating_slots_to[slot]; +} + +clusterNode *getImportingSlotSource(int slot) { + return server.cluster->importing_slots_from[slot]; +} + +int isClusterHealthy(void) { + return server.cluster->state == CLUSTER_OK; +} + +clusterNode *getNodeBySlot(int slot) { + return server.cluster->slots[slot]; +} + +char* clusterNodeHostname(clusterNode *node) { + return node->hostname; +} + +long long getReplOffset(clusterNode *node) { + return node->repl_offset; +} + +const char *getPreferredEndpoint(clusterNode *n) { + char* hostname = clusterNodeHostname(n); + switch(server.cluster_preferred_endpoint_type) { + case CLUSTER_ENDPOINT_TYPE_IP: return clusterNodeIp(n); + case CLUSTER_ENDPOINT_TYPE_HOSTNAME: return (hostname != NULL && hostname[0] != '\0') ? hostname : "?"; + case CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT: return ""; + } + return "unknown"; +} From 2e5181ef28ff0db00cca013ba7986456bd1d32d0 Mon Sep 17 00:00:00 2001 From: Josh Hershberg Date: Sun, 5 Nov 2023 10:47:57 +0200 Subject: [PATCH 12/15] Cluster refactor: Add failover cmd support to cluster api Until now, the FAILOVER command has not been supported in cluster mode. This commit allows a cluster implementation to support the command. The legacy clustering implementation still does not support this command.
Signed-off-by: Josh Hershberg --- src/cluster.h | 2 ++ src/cluster_legacy.c | 26 +++++++++++++++++++++----- src/replication.c | 12 +++++++----- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/cluster.h b/src/cluster.h index 9f6e482f43d..8b8feb24e53 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -96,6 +96,8 @@ char* clusterNodeHostname(clusterNode *node); const char *getPreferredEndpoint(clusterNode *n); void migrateCommand(client *c); long long getReplOffset(clusterNode *node); +int clusterAllowFailoverCmd(client *c); +void clusterPromoteSelfToMaster(void); char **clusterDebugCommandHelp(void); ConnectionType *connTypeOfCluster(void); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 0fd8b0a207f..72ab7428a11 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -6400,11 +6400,27 @@ long long getReplOffset(clusterNode *node) { } const char *getPreferredEndpoint(clusterNode *n) { - char* hostname = clusterNodeHostname(n); - switch(server.cluster_preferred_endpoint_type) { - case CLUSTER_ENDPOINT_TYPE_IP: return clusterNodeIp(n); - case CLUSTER_ENDPOINT_TYPE_HOSTNAME: return (hostname != NULL && hostname[0] != '\0') ? hostname : "?"; - case CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT: return ""; + char *hostname = clusterNodeHostname(n); + switch (server.cluster_preferred_endpoint_type) { + case CLUSTER_ENDPOINT_TYPE_IP: + return clusterNodeIp(n); + case CLUSTER_ENDPOINT_TYPE_HOSTNAME: + return (hostname != NULL && hostname[0] != '\0') ? hostname : "?"; + case CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT: + return ""; } return "unknown"; } + +int clusterAllowFailoverCmd(client *c) { + if (!server.cluster_enabled) { + return 1; + } + addReplyError(c,"FAILOVER not allowed in cluster mode. 
" + "Use CLUSTER FAILOVER command instead."); + return 0; +} + +void clusterPromoteSelfToMaster(void) { + replicationUnsetMaster(); +} diff --git a/src/replication.c b/src/replication.c index e64251663b4..e4b7c42e118 100644 --- a/src/replication.c +++ b/src/replication.c @@ -951,7 +951,11 @@ void syncCommand(client *c) { } if (!strcasecmp(c->argv[1]->ptr,server.replid)) { - replicationUnsetMaster(); + if (server.cluster_enabled) { + clusterPromoteSelfToMaster(); + } else { + replicationUnsetMaster(); + } sds client = catClientInfoString(sdsempty(),c); serverLog(LL_NOTICE, "MASTER MODE enabled (failover request from '%s')",client); @@ -4061,12 +4065,10 @@ void abortFailover(const char *err) { * will attempt forever and must be manually aborted. */ void failoverCommand(client *c) { - if (server.cluster_enabled) { - addReplyError(c,"FAILOVER not allowed in cluster mode. " - "Use CLUSTER FAILOVER command instead."); + if (!clusterAllowFailoverCmd(c)) { return; } - + /* Handle special case for abort */ if ((c->argc == 2) && !strcasecmp(c->argv[1]->ptr,"abort")) { if (server.failover_state == NO_FAILOVER) { From 13b754853c11b6234512778e620e5576e546394b Mon Sep 17 00:00:00 2001 From: Josh Hershberg Date: Mon, 6 Nov 2023 09:42:32 +0200 Subject: [PATCH 13/15] Cluster refactor: cluster.h - reorder functions into logical groups Signed-off-by: Josh Hershberg --- src/cluster.h | 67 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/src/cluster.h b/src/cluster.h index 8b8feb24e53..9b422d38567 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -33,36 +33,51 @@ struct clusterState; #define CLUSTER_MODULE_FLAG_NO_REDIRECTION (1<<2) /* ---------------------- API exported outside cluster.c -------------------- */ +/* functions requiring mechanism specific implementations */ void clusterInit(void); void clusterInitListeners(void); void clusterCron(void); void clusterBeforeSleep(void); -clusterNode *getNodeByQuery(client 
*c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask); -clusterNode *clusterLookupNode(const char *name, int length); -int clusterRedirectBlockedClientIfNeeded(client *c); -void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code); -void migrateCloseTimedoutSockets(void); int verifyClusterConfigWithData(void); -unsigned long getClusterConnectionsCount(void); + int clusterSendModuleMessageToTarget(const char *target, uint64_t module_id, uint8_t type, const char *payload, uint32_t len); -void clusterPropagatePublish(robj *channel, robj *message, int sharded); -unsigned int keyHashSlot(char *key, int keylen); -int patternHashSlot(char *pattern, int length); + void clusterUpdateMyselfFlags(void); void clusterUpdateMyselfIp(void); -void slotToChannelAdd(sds channel); -void slotToChannelDel(sds channel); void clusterUpdateMyselfHostname(void); void clusterUpdateMyselfAnnouncedPorts(void); +void clusterUpdateMyselfHumanNodename(void); + +void slotToChannelAdd(sds channel); +void slotToChannelDel(sds channel); +void clusterPropagatePublish(robj *channel, robj *message, int sharded); + +unsigned long getClusterConnectionsCount(void); +int isClusterHealthy(void); + sds clusterGenNodesDescription(client *c, int filter, int tls_primary); sds genClusterInfoString(void); +/* handle implementation specific debug cluster commands. Return 1 if handled, 0 otherwise. */ +int handleDebugClusterCommand(client *c); +const char **clusterDebugCommandHelp(void); +/* handle implementation specific cluster commands. Return 1 if handled, 0 otherwise. 
*/ +int clusterCommandSpecial(client *c); +const char** clusterCommandSpecialHelp(void); + +int clusterAllowFailoverCmd(client *c); +void clusterPromoteSelfToMaster(void); +int clusterManualFailoverTimeLimit(void); + +void clusterCommandSlots(client * c); +void clusterCommandMyId(client *c); +void clusterCommandMyShardId(client *c); +void clusterCommandShards(client *c); +sds clusterGenNodeDescription(client *c, clusterNode *node, int tls_primary); + int clusterNodeCoversSlot(clusterNode *n, int slot); -void clusterUpdateMyselfHumanNodename(void); -int isValidAuxString(char *s, unsigned int length); int getNodeDefaultClientPort(clusterNode *n); int clusterNodeIsMyself(clusterNode *n); clusterNode* getMyClusterNode(void); -int clusterManualFailoverTimeLimit(void); char* getMyClusterId(void); int getClusterSize(void); char** getClusterNodesList(size_t *numnodes); @@ -76,29 +91,27 @@ char* clusterNodeGetName(clusterNode *node); int clusterNodeTimedOut(clusterNode *node); int clusterNodeIsFailing(clusterNode *node); int clusterNodeIsNoFailover(clusterNode *node); -void clusterCommand(client *c); -int clusterCommandSpecial(client *c); -const char** clusterCommandSpecialHelp(void); char* clusterNodeGetShardId(clusterNode *node); -void clusterCommandSlots(client * c); -void clusterCommandMyId(client *c); -void clusterCommandMyShardId(client *c); -void clusterCommandShards(client *c); -sds clusterGenNodeDescription(client *c, clusterNode *node, int tls_primary); int getNumSlaves(clusterNode *node); clusterNode *getSlave(clusterNode *node, int slave_idx); clusterNode *getMigratingSlotDest(int slot); clusterNode *getImportingSlotSource(int slot); -int isClusterHealthy(void); clusterNode *getNodeBySlot(int slot); int getNodeClientPort(clusterNode *n, int use_tls); char* clusterNodeHostname(clusterNode *node); const char *getPreferredEndpoint(clusterNode *n); -void migrateCommand(client *c); long long getReplOffset(clusterNode *node); -int clusterAllowFailoverCmd(client *c); 
-void clusterPromoteSelfToMaster(void); +clusterNode *clusterLookupNode(const char *name, int length); -char **clusterDebugCommandHelp(void); +/* functions with shared implementations */ +clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask); +int clusterRedirectBlockedClientIfNeeded(client *c); +void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code); +void migrateCloseTimedoutSockets(void); +unsigned int keyHashSlot(char *key, int keylen); +int patternHashSlot(char *pattern, int length); +int isValidAuxString(char *s, unsigned int length); +void migrateCommand(client *c); +void clusterCommand(client *c); ConnectionType *connTypeOfCluster(void); #endif /* __CLUSTER_H */ From 290f376429ddfb43dd88215e274b2ffa3d5bcac9 Mon Sep 17 00:00:00 2001 From: Josh Hershberg Date: Thu, 9 Nov 2023 11:04:47 +0200 Subject: [PATCH 14/15] Cluster refactor: fn renames + small compilation issue on ubuntu Signed-off-by: Josh Hershberg --- src/cluster.c | 26 +++++++-------- src/cluster.h | 18 +++++----- src/cluster_legacy.c | 78 ++++++++++++++++++++++---------------------- src/debug.c | 2 +- src/module.c | 2 +- src/server.c | 4 +-- 6 files changed, 65 insertions(+), 65 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 9b8b3b3b864..74d6a4d6d19 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -812,7 +812,7 @@ void clusterCommandHelp(client *c) { NULL }; - addExtendedReplyHelp(c, help, clusterCommandSpecialHelp()); + addExtendedReplyHelp(c, help, clusterCommandExtendedHelp()); } void clusterCommand(client *c) { @@ -911,9 +911,9 @@ void clusterCommand(client *c) { } /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. 
*/ - addReplyArrayLen(c, getNumSlaves(n)); - for (j = 0; j < getNumSlaves(n); j++) { - sds ni = clusterGenNodeDescription(c, getSlave(n, j), shouldReturnTlsInfo()); + addReplyArrayLen(c, clusterNodeNumSlaves(n)); + for (j = 0; j < clusterNodeNumSlaves(n); j++) { + sds ni = clusterGenNodeDescription(c, clusterNodeGetSlave(n, j), shouldReturnTlsInfo()); addReplyBulkCString(c,ni); sdsfree(ni); } @@ -1193,11 +1193,11 @@ void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_co error_code == CLUSTER_REDIR_ASK) { /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ - int port = getNodeClientPort(n, shouldReturnTlsInfo()); + int port = clusterNodeClientPort(n, shouldReturnTlsInfo()); addReplyErrorSds(c,sdscatprintf(sdsempty(), "-%s %d %s:%d", (error_code == CLUSTER_REDIR_ASK) ? "ASK" : "MOVED", - hashslot, getPreferredEndpoint(n), port)); + hashslot, clusterNodePreferredEndpoint(n), port)); } else { serverPanic("getNodeByQuery() unknown error."); } @@ -1285,7 +1285,7 @@ static int isReplicaAvailable(clusterNode *node) { if (clusterNodeIsFailing(node)) { return 0; } - long long repl_offset = getReplOffset(node); + long long repl_offset = clusterNodeReplOffset(node); if (clusterNodeIsMyself(node)) { /* Nodes do not update their own information * in the cluster node list. */ @@ -1312,7 +1312,7 @@ void addNodeToNodeReply(client *c, clusterNode *node) { } /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. 
*/ - addReplyLongLong(c, getNodeClientPort(node, shouldReturnTlsInfo())); + addReplyLongLong(c, clusterNodeClientPort(node, shouldReturnTlsInfo())); addReplyBulkCBuffer(c, clusterNodeGetName(node), CLUSTER_NAMELEN); /* Add the additional endpoint information, this is all the known networking information @@ -1347,8 +1347,8 @@ void addNodeToNodeReply(client *c, clusterNode *node) { void addNodeReplyForClusterSlot(client *c, clusterNode *node, int start_slot, int end_slot) { int i, nested_elements = 3; /* slots (2) + master addr (1) */ - for (i = 0; i < getNumSlaves(node); i++) { - if (!isReplicaAvailable(getSlave(node, i))) continue; + for (i = 0; i < clusterNodeNumSlaves(node); i++) { + if (!isReplicaAvailable(clusterNodeGetSlave(node, i))) continue; nested_elements++; } addReplyArrayLen(c, nested_elements); @@ -1357,11 +1357,11 @@ void addNodeReplyForClusterSlot(client *c, clusterNode *node, int start_slot, in addNodeToNodeReply(c, node); /* Remaining nodes in reply are replicas for slot range */ - for (i = 0; i < getNumSlaves(node); i++) { + for (i = 0; i < clusterNodeNumSlaves(node); i++) { /* This loop is copy/pasted from clusterGenNodeDescription() * with modifications for per-slot node aggregation. 
*/ - if (!isReplicaAvailable(getSlave(node, i))) continue; - addNodeToNodeReply(c, getSlave(node, i)); + if (!isReplicaAvailable(clusterNodeGetSlave(node, i))) continue; + addNodeToNodeReply(c, clusterNodeGetSlave(node, i)); nested_elements--; } serverAssert(nested_elements == 3); /* Original 3 elements */ diff --git a/src/cluster.h b/src/cluster.h index 9b422d38567..ec64dc8b621 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -35,7 +35,7 @@ struct clusterState; /* ---------------------- API exported outside cluster.c -------------------- */ /* functions requiring mechanism specific implementations */ void clusterInit(void); -void clusterInitListeners(void); +void clusterInitLast(void); void clusterCron(void); void clusterBeforeSleep(void); int verifyClusterConfigWithData(void); @@ -59,10 +59,10 @@ sds clusterGenNodesDescription(client *c, int filter, int tls_primary); sds genClusterInfoString(void); /* handle implementation specific debug cluster commands. Return 1 if handled, 0 otherwise. */ int handleDebugClusterCommand(client *c); -const char **clusterDebugCommandHelp(void); +const char **clusterDebugCommandExtendedHelp(void); /* handle implementation specific cluster commands. Return 1 if handled, 0 otherwise. 
*/ int clusterCommandSpecial(client *c); -const char** clusterCommandSpecialHelp(void); +const char** clusterCommandExtendedHelp(void); int clusterAllowFailoverCmd(client *c); void clusterPromoteSelfToMaster(void); @@ -81,9 +81,9 @@ clusterNode* getMyClusterNode(void); char* getMyClusterId(void); int getClusterSize(void); char** getClusterNodesList(size_t *numnodes); -int nodeIsMaster(clusterNode *n); int handleDebugClusterCommand(client *c); int clusterNodePending(clusterNode *node); +int clusterNodeIsMaster(clusterNode *n); char* clusterNodeIp(clusterNode *node); int clusterNodeIsSlave(clusterNode *node); clusterNode *clusterNodeGetSlaveof(clusterNode *node); @@ -92,15 +92,15 @@ int clusterNodeTimedOut(clusterNode *node); int clusterNodeIsFailing(clusterNode *node); int clusterNodeIsNoFailover(clusterNode *node); char* clusterNodeGetShardId(clusterNode *node); -int getNumSlaves(clusterNode *node); -clusterNode *getSlave(clusterNode *node, int slave_idx); +int clusterNodeNumSlaves(clusterNode *node); +clusterNode *clusterNodeGetSlave(clusterNode *node, int slave_idx); clusterNode *getMigratingSlotDest(int slot); clusterNode *getImportingSlotSource(int slot); clusterNode *getNodeBySlot(int slot); -int getNodeClientPort(clusterNode *n, int use_tls); +int clusterNodeClientPort(clusterNode *n, int use_tls); char* clusterNodeHostname(clusterNode *node); -const char *getPreferredEndpoint(clusterNode *n); -long long getReplOffset(clusterNode *node); +const char *clusterNodePreferredEndpoint(clusterNode *n); +long long clusterNodeReplOffset(clusterNode *node); clusterNode *clusterLookupNode(const char *name, int length); /* functions with shared implementations */ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 72ab7428a11..98d6f38c3f0 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -41,6 +41,7 @@ #include #include #include +#include /* A global reference to myself is handy to make code more clear. 
* Myself always points to server.cluster->myself, that is, the clusterNode @@ -112,7 +113,7 @@ static inline int getNodeDefaultReplicationPort(clusterNode *n) { return server.tls_replication ? n->tls_port : n->tcp_port; } -int getNodeClientPort(clusterNode *n, int use_tls) { +int clusterNodeClientPort(clusterNode *n, int use_tls) { return use_tls ? n->tls_port : n->tcp_port; } @@ -1028,7 +1029,7 @@ void clusterInit(void) { clusterUpdateMyselfHumanNodename(); } -void clusterInitListeners(void) { +void clusterInitLast(void) { if (connectionIndexByType(connTypeOfCluster()->get_type(NULL)) < 0) { serverLog(LL_WARNING, "Missing connection type %s, but it is required for the Cluster bus.", connTypeOfCluster()->get_type(NULL)); exit(1); @@ -1755,7 +1756,7 @@ int clusterBumpConfigEpochWithoutConsensus(void) { void clusterHandleConfigEpochCollision(clusterNode *sender) { /* Prerequisites: nodes have the same configEpoch and are both masters. */ if (sender->configEpoch != myself->configEpoch || - !nodeIsMaster(sender) || !nodeIsMaster(myself)) return; + !clusterNodeIsMaster(sender) || !clusterNodeIsMaster(myself)) return; /* Don't act if the colliding node has a smaller Node ID. */ if (memcmp(sender->name,myself->name,CLUSTER_NAMELEN) <= 0) return; /* Get the next ID available at the best of this node knowledge. */ @@ -1877,7 +1878,7 @@ void markNodeAsFailingIfNeeded(clusterNode *node) { failures = clusterNodeFailureReportsCount(node); /* Also count myself as a voter if I'm a master. */ - if (nodeIsMaster(myself)) failures++; + if (clusterNodeIsMaster(myself)) failures++; if (failures < needed_quorum) return; /* No weak agreement from masters. */ serverLog(LL_NOTICE, @@ -1920,7 +1921,7 @@ void clearNodeFailureIfNeeded(clusterNode *node) { * 1) The FAIL state is old enough. * 2) It is yet serving slots from our point of view (not failed over). * Apparently no one is going to fix these slots, clear the FAIL flag. 
*/ - if (nodeIsMaster(node) && node->numslots > 0 && + if (clusterNodeIsMaster(node) && node->numslots > 0 && (now - node->fail_time) > (server.cluster_node_timeout * CLUSTER_FAIL_UNDO_TIME_MULT)) { @@ -2070,7 +2071,7 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { if (node) { /* We already know this node. Handle failure reports, only when the sender is a master. */ - if (sender && nodeIsMaster(sender) && node != myself) { + if (sender && clusterNodeIsMaster(sender) && node != myself) { if (flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) { if (clusterNodeAddFailureReport(node,sender)) { serverLog(LL_VERBOSE, @@ -2233,7 +2234,7 @@ int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link, * a node that we believed to be a slave is now acting as master in order to * update the state of the node. */ void clusterSetNodeAsMaster(clusterNode *n) { - if (nodeIsMaster(n)) return; + if (clusterNodeIsMaster(n)) return; if (n->slaveof) { clusterNodeRemoveSlave(n->slaveof,n); @@ -2281,7 +2282,7 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc /* Here we set curmaster to this node or the node this node * replicates to if it's a slave. In the for loop we are * interested to check if slots are taken away from curmaster. */ - curmaster = nodeIsMaster(myself) ? myself : myself->slaveof; + curmaster = clusterNodeIsMaster(myself) ? myself : myself->slaveof; if (sender == myself) { serverLog(LL_NOTICE,"Discarding UPDATE message about myself."); @@ -2920,7 +2921,7 @@ int clusterProcessPacket(clusterLink *link) { /* Node is a slave. */ clusterNode *master = clusterLookupNode(hdr->slaveof, CLUSTER_NAMELEN); - if (nodeIsMaster(sender)) { + if (clusterNodeIsMaster(sender)) { /* Master turned into a slave! Reconfigure the node. 
*/ clusterDelNodeSlots(sender); sender->flags &= ~(CLUSTER_NODE_MASTER| @@ -2958,7 +2959,7 @@ int clusterProcessPacket(clusterLink *link) { int dirty_slots = 0; /* Sender claimed slots don't match my view? */ if (sender) { - sender_master = nodeIsMaster(sender) ? sender : sender->slaveof; + sender_master = clusterNodeIsMaster(sender) ? sender : sender->slaveof; if (sender_master) { dirty_slots = memcmp(sender_master->slots, hdr->myslots,sizeof(hdr->myslots)) != 0; @@ -2968,7 +2969,7 @@ int clusterProcessPacket(clusterLink *link) { /* 1) If the sender of the message is a master, and we detected that * the set of slots it claims changed, scan the slots to see if we * need to update our configuration. */ - if (sender && nodeIsMaster(sender) && dirty_slots) + if (sender && clusterNodeIsMaster(sender) && dirty_slots) clusterUpdateSlotsConfigWith(sender,senderConfigEpoch,hdr->myslots); /* 2) We also check for the reverse condition, that is, the sender @@ -3017,8 +3018,7 @@ int clusterProcessPacket(clusterLink *link) { /* If our config epoch collides with the sender's try to fix * the problem. */ - if (sender && - nodeIsMaster(myself) && nodeIsMaster(sender) && + if (sender && clusterNodeIsMaster(myself) && clusterNodeIsMaster(sender) && senderConfigEpoch == myself->configEpoch) { clusterHandleConfigEpochCollision(sender); @@ -3083,7 +3083,7 @@ int clusterProcessPacket(clusterLink *link) { /* We consider this vote only if the sender is a master serving * a non zero number of slots, and its currentEpoch is greater or * equal to epoch where this node started the election. */ - if (nodeIsMaster(sender) && sender->numslots > 0 && + if (clusterNodeIsMaster(sender) && sender->numslots > 0 && senderCurrentEpoch >= server.cluster->failover_auth_epoch) { server.cluster->failover_auth_count++; @@ -3438,7 +3438,7 @@ static void clusterBuildMessageHdr(clusterMsg *hdr, int type, size_t msglen) { hdr->offset = htonu64(offset); /* Set the message flags. 
*/ - if (nodeIsMaster(myself) && server.cluster->mf_end) + if (clusterNodeIsMaster(myself) && server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_PAUSED; hdr->totlen = htonl(msglen); @@ -3869,10 +3869,10 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* Node must be a slave and its master down. * The master can be non failing if the request is flagged * with CLUSTERMSG_FLAG0_FORCEACK (manual failover). */ - if (nodeIsMaster(node) || master == NULL || + if (clusterNodeIsMaster(node) || master == NULL || (!nodeFailed(master) && !force_ack)) { - if (nodeIsMaster(node)) { + if (clusterNodeIsMaster(node)) { serverLog(LL_WARNING, "Failover auth denied to %.40s (%s): it is a master node", node->name, node->human_nodename); @@ -4043,7 +4043,7 @@ void clusterFailoverReplaceYourMaster(void) { int j; clusterNode *oldmaster = myself->slaveof; - if (nodeIsMaster(myself) || oldmaster == NULL) return; + if (clusterNodeIsMaster(myself) || oldmaster == NULL) return; /* 1) Turn this node into a master. */ clusterSetNodeAsMaster(myself); @@ -4105,7 +4105,7 @@ void clusterHandleSlaveFailover(void) { * 3) We don't have the no failover configuration set, and this is * not a manual failover. * 4) It is serving slots. */ - if (nodeIsMaster(myself) || + if (clusterNodeIsMaster(myself) || myself->slaveof == NULL || (!nodeFailed(myself->slaveof) && !manual_failover) || (server.cluster_slave_no_failover && !manual_failover) || @@ -4316,7 +4316,7 @@ void clusterHandleSlaveMigration(int max_slaves) { if (!(node->flags & CLUSTER_NODE_MIGRATE_TO)) is_orphaned = 0; /* Check number of working slaves. 
*/ - if (nodeIsMaster(node)) okslaves = clusterCountNonFailingSlaves(node); + if (clusterNodeIsMaster(node)) okslaves = clusterCountNonFailingSlaves(node); if (okslaves > 0) is_orphaned = 0; if (is_orphaned) { @@ -4591,7 +4591,7 @@ void clusterCron(void) { /* Orphaned master check, useful only if the current instance * is a slave that may migrate to another master. */ - if (nodeIsSlave(myself) && nodeIsMaster(node) && !nodeFailed(node)) { + if (nodeIsSlave(myself) && clusterNodeIsMaster(node) && !nodeFailed(node)) { int okslaves = clusterCountNonFailingSlaves(node); /* A master is orphaned if it is serving a non-zero number of @@ -4642,7 +4642,7 @@ void clusterCron(void) { /* If we are a master and one of the slaves requested a manual * failover, ping it continuously. */ if (server.cluster->mf_end && - nodeIsMaster(myself) && + clusterNodeIsMaster(myself) && server.cluster->mf_slave == node && node->link) { @@ -4912,7 +4912,7 @@ void clusterUpdateState(void) { * the first call to this function and not since the server start, in order * to not count the DB loading time. 
*/ if (first_call_time == 0) first_call_time = mstime(); - if (nodeIsMaster(myself) && + if (clusterNodeIsMaster(myself) && server.cluster->state == CLUSTER_FAIL && mstime() - first_call_time < CLUSTER_WRITABLE_DELAY) return; @@ -4946,7 +4946,7 @@ void clusterUpdateState(void) { while((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); - if (nodeIsMaster(node) && node->numslots) { + if (clusterNodeIsMaster(node) && node->numslots) { server.cluster->size++; if ((node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) == 0) reachable_masters++; @@ -4980,7 +4980,7 @@ void clusterUpdateState(void) { rejoin_delay = CLUSTER_MIN_REJOIN_DELAY; if (new_state == CLUSTER_OK && - nodeIsMaster(myself) && + clusterNodeIsMaster(myself) && mstime() - among_minority_time < rejoin_delay) { return; @@ -5068,7 +5068,7 @@ int verifyClusterConfigWithData(void) { /* Remove all the shard channel related information not owned by the current shard. */ static inline void removeAllNotOwnedShardChannelSubscriptions(void) { if (!dictSize(server.pubsubshard_channels)) return; - clusterNode *currmaster = nodeIsMaster(myself) ? myself : myself->slaveof; + clusterNode *currmaster = clusterNodeIsMaster(myself) ? 
myself : myself->slaveof; for (int j = 0; j < CLUSTER_SLOTS; j++) { if (server.cluster->slots[j] != currmaster) { removeChannelsInSlot(j); @@ -5086,7 +5086,7 @@ void clusterSetMaster(clusterNode *n) { serverAssert(n != myself); serverAssert(myself->numslots == 0); - if (nodeIsMaster(myself)) { + if (clusterNodeIsMaster(myself)) { myself->flags &= ~(CLUSTER_NODE_MASTER|CLUSTER_NODE_MIGRATE_TO); myself->flags |= CLUSTER_NODE_SLAVE; clusterCloseAllSlots(); @@ -5160,7 +5160,7 @@ sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_cou sds clusterGenNodeDescription(client *c, clusterNode *node, int tls_primary) { int j, start; sds ci; - int port = getNodeClientPort(node, tls_primary); + int port = clusterNodeClientPort(node, tls_primary); /* Node coordinates */ ci = sdscatlen(sdsempty(),node->name,CLUSTER_NAMELEN); @@ -5492,7 +5492,7 @@ void addNodeDetailsToShardReply(client *c, clusterNode *node) { reply_count++; addReplyBulkCString(c, "endpoint"); - addReplyBulkCString(c, getPreferredEndpoint(node)); + addReplyBulkCString(c, clusterNodePreferredEndpoint(node)); reply_count++; if (sdslen(node->hostname) != 0) { @@ -5793,7 +5793,7 @@ char** getClusterNodesList(size_t *numnodes) { return ids; } -int nodeIsMaster(clusterNode *n) { +int clusterNodeIsMaster(clusterNode *n) { return n->flags & CLUSTER_NODE_MASTER; } @@ -5864,7 +5864,7 @@ int clusterNodeIsNoFailover(clusterNode *node) { return node->flags & CLUSTER_NODE_NOFAILOVER; } -const char **clusterDebugCommandHelp(void) { +const char **clusterDebugCommandExtendedHelp(void) { static const char *help[] = { "CLUSTERLINK KILL ", " Kills the link based on the direction to/from (both) with the provided node.", @@ -6174,7 +6174,7 @@ int clusterCommandSpecial(client *c) { /* If the instance is currently a master, it should have no assigned * slots nor keys to accept to replicate some other node. * Slaves can switch to another master without issues. 
*/ - if (nodeIsMaster(myself) && + if (clusterNodeIsMaster(myself) && (myself->numslots != 0 || dbSize(&server.db[0], DB_MAIN) != 0)) { addReplyError(c, "To set a master the node must be empty and " @@ -6217,7 +6217,7 @@ int clusterCommandSpecial(client *c) { } /* Check preconditions. */ - if (nodeIsMaster(myself)) { + if (clusterNodeIsMaster(myself)) { addReplyError(c,"You should send CLUSTER FAILOVER to a replica"); return 1; } else if (myself->slaveof == NULL) { @@ -6309,7 +6309,7 @@ int clusterCommandSpecial(client *c) { /* Slaves can be reset while containing data, but not master nodes * that must be empty. */ - if (nodeIsMaster(myself) && dbSize(c->db, DB_MAIN) != 0) { + if (clusterNodeIsMaster(myself) && dbSize(c->db, DB_MAIN) != 0) { addReplyError(c,"CLUSTER RESET can't be called with " "master nodes containing keys"); return 1; @@ -6326,7 +6326,7 @@ int clusterCommandSpecial(client *c) { return 1; } -const char** clusterCommandSpecialHelp(void) { +const char** clusterCommandExtendedHelp(void) { static const char *help[] = { "ADDSLOTS [ ...]", " Assign slots to current node.", @@ -6367,11 +6367,11 @@ const char** clusterCommandSpecialHelp(void) { return help; } -int getNumSlaves(clusterNode *node) { +int clusterNodeNumSlaves(clusterNode *node) { return node->numslaves; } -clusterNode *getSlave(clusterNode *node, int slave_idx) { +clusterNode *clusterNodeGetSlave(clusterNode *node, int slave_idx) { return node->slaves[slave_idx]; } @@ -6395,11 +6395,11 @@ char* clusterNodeHostname(clusterNode *node) { return node->hostname; } -long long getReplOffset(clusterNode *node) { +long long clusterNodeReplOffset(clusterNode *node) { return node->repl_offset; } -const char *getPreferredEndpoint(clusterNode *n) { +const char *clusterNodePreferredEndpoint(clusterNode *n) { char *hostname = clusterNodeHostname(n); switch (server.cluster_preferred_endpoint_type) { case CLUSTER_ENDPOINT_TYPE_IP: diff --git a/src/debug.c b/src/debug.c index b6ed93a79cd..2758e2b1807 100644 
--- a/src/debug.c +++ b/src/debug.c @@ -498,7 +498,7 @@ void debugCommand(client *c) { " Enable or disable the reply buffer resize cron job", NULL }; - addExtendedReplyHelp(c, help, clusterDebugCommandHelp()); + addExtendedReplyHelp(c, help, clusterDebugCommandExtendedHelp()); } else if (!strcasecmp(c->argv[1]->ptr,"segfault")) { /* Compiler gives warnings about writing to a random address * e.g "*((char*)-1) = 'x';". As a workaround, we map a read-only area diff --git a/src/module.c b/src/module.c index 5a813fb5c5d..b33192e0889 100644 --- a/src/module.c +++ b/src/module.c @@ -8990,7 +8990,7 @@ int RM_GetClusterNodeInfo(RedisModuleCtx *ctx, const char *id, char *ip, char *m if (flags) { *flags = 0; if (clusterNodeIsMyself(node)) *flags |= REDISMODULE_NODE_MYSELF; - if (nodeIsMaster(node)) *flags |= REDISMODULE_NODE_MASTER; + if (clusterNodeIsMaster(node)) *flags |= REDISMODULE_NODE_MASTER; if (clusterNodeIsSlave(node)) *flags |= REDISMODULE_NODE_SLAVE; if (clusterNodeTimedOut(node)) *flags |= REDISMODULE_NODE_PFAIL; if (clusterNodeIsFailing(node)) *flags |= REDISMODULE_NODE_FAIL; diff --git a/src/server.c b/src/server.c index 32767569054..454bd969d8c 100644 --- a/src/server.c +++ b/src/server.c @@ -6838,7 +6838,7 @@ int redisIsSupervised(int mode) { int iAmMaster(void) { return ((!server.cluster_enabled && server.masterhost == NULL) || - (server.cluster_enabled && nodeIsMaster(getMyClusterNode()))); + (server.cluster_enabled && clusterNodeIsMaster(getMyClusterNode()))); } #ifdef REDIS_TEST @@ -7161,7 +7161,7 @@ int main(int argc, char **argv) { ACLLoadUsersAtStartup(); initListeners(); if (server.cluster_enabled) { - clusterInitListeners(); + clusterInitLast(); } InitServerLast(); From eebb02582676c65300f37d4c470ae0f67ecb3723 Mon Sep 17 00:00:00 2001 From: Josh Hershberg Date: Tue, 14 Nov 2023 14:32:51 +0200 Subject: [PATCH 15/15] Cluster refactor: Some code convention fixes Signed-off-by: Josh Hershberg --- src/cluster.h | 15 ++++++++------- src/cluster_legacy.c | 
14 +++++++------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/cluster.h b/src/cluster.h index ec64dc8b621..02c5f67f374 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -77,28 +77,29 @@ sds clusterGenNodeDescription(client *c, clusterNode *node, int tls_primary); int clusterNodeCoversSlot(clusterNode *n, int slot); int getNodeDefaultClientPort(clusterNode *n); int clusterNodeIsMyself(clusterNode *n); -clusterNode* getMyClusterNode(void); -char* getMyClusterId(void); +clusterNode *getMyClusterNode(void); +char *getMyClusterId(void); int getClusterSize(void); -char** getClusterNodesList(size_t *numnodes); int handleDebugClusterCommand(client *c); int clusterNodePending(clusterNode *node); int clusterNodeIsMaster(clusterNode *n); -char* clusterNodeIp(clusterNode *node); +char **getClusterNodesList(size_t *numnodes); +int clusterNodeIsMaster(clusterNode *n); +char *clusterNodeIp(clusterNode *node); int clusterNodeIsSlave(clusterNode *node); clusterNode *clusterNodeGetSlaveof(clusterNode *node); -char* clusterNodeGetName(clusterNode *node); +char *clusterNodeGetName(clusterNode *node); int clusterNodeTimedOut(clusterNode *node); int clusterNodeIsFailing(clusterNode *node); int clusterNodeIsNoFailover(clusterNode *node); -char* clusterNodeGetShardId(clusterNode *node); +char *clusterNodeGetShardId(clusterNode *node); int clusterNodeNumSlaves(clusterNode *node); clusterNode *clusterNodeGetSlave(clusterNode *node, int slave_idx); clusterNode *getMigratingSlotDest(int slot); clusterNode *getImportingSlotSource(int slot); clusterNode *getNodeBySlot(int slot); int clusterNodeClientPort(clusterNode *n, int use_tls); -char* clusterNodeHostname(clusterNode *node); +char *clusterNodeHostname(clusterNode *node); const char *clusterNodePreferredEndpoint(clusterNode *n); long long clusterNodeReplOffset(clusterNode *node); clusterNode *clusterLookupNode(const char *name, int length); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 
98d6f38c3f0..1f957c99dc4 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -5761,7 +5761,7 @@ int clusterNodeIsMyself(clusterNode *n) { return n == server.cluster->myself; } -clusterNode* getMyClusterNode(void) { +clusterNode *getMyClusterNode(void) { return server.cluster->myself; } @@ -5773,7 +5773,7 @@ int getClusterSize(void) { return dictSize(server.cluster->nodes); } -char** getClusterNodesList(size_t *numnodes) { +char **getClusterNodesList(size_t *numnodes) { size_t count = dictSize(server.cluster->nodes); char **ids = zmalloc((count+1)*CLUSTER_NAMELEN); dictIterator *di = dictGetIterator(server.cluster->nodes); @@ -5836,7 +5836,7 @@ int clusterNodePending(clusterNode *node) { return node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE); } -char* clusterNodeIp(clusterNode *node) { +char *clusterNodeIp(clusterNode *node) { return node->ip; } @@ -5848,7 +5848,7 @@ clusterNode *clusterNodeGetSlaveof(clusterNode *node) { return node->slaveof; } -char* clusterNodeGetName(clusterNode *node) { +char *clusterNodeGetName(clusterNode *node) { return node->name; } @@ -5874,7 +5874,7 @@ const char **clusterDebugCommandExtendedHelp(void) { return help; } -char* clusterNodeGetShardId(clusterNode *node) { +char *clusterNodeGetShardId(clusterNode *node) { return node->shard_id; } @@ -6326,7 +6326,7 @@ int clusterCommandSpecial(client *c) { return 1; } -const char** clusterCommandExtendedHelp(void) { +const char **clusterCommandExtendedHelp(void) { static const char *help[] = { "ADDSLOTS [ ...]", " Assign slots to current node.", @@ -6391,7 +6391,7 @@ clusterNode *getNodeBySlot(int slot) { return server.cluster->slots[slot]; } -char* clusterNodeHostname(clusterNode *node) { +char *clusterNodeHostname(clusterNode *node) { return node->hostname; }