From c85a9b7896b9727709b71fed414bf8a117436d71 Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 12 Dec 2023 02:15:19 +0800 Subject: [PATCH 01/58] Fix delKeysInSlot server events are not executed inside an execution unit (#12745) This is a follow-up fix to #12733. We need to apply the same changes to delKeysInSlot. Refer to #12733 for more details. This PR contains some other minor cleanups / improvements to the test suite and docs. It uses the postnotifications test module in a cluster mode test which revealed a leak in the test module (fixed). --- redis.conf | 1 + src/cluster_legacy.c | 2 ++ tests/modules/postnotifications.c | 15 ++++++++++----- tests/support/server.tcl | 4 ++-- tests/unit/moduleapi/cluster.tcl | 10 +++++++--- 5 files changed, 22 insertions(+), 10 deletions(-) diff --git a/redis.conf b/redis.conf index 5296a6fa56b..c7499ce1f1f 100644 --- a/redis.conf +++ b/redis.conf @@ -51,6 +51,7 @@ # # loadmodule /path/to/my_module.so # loadmodule /path/to/other_module.so +# loadmodule /path/to/args_module.so [arg [arg ...]] ################################## NETWORK ##################################### diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 340b2dfe9f8..801becf3ef1 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -5698,6 +5698,7 @@ unsigned int delKeysInSlot(unsigned int hashslot) { dictEntry *de = NULL; iter = dictGetSafeIterator(server.db->dict[hashslot]); while((de = dictNext(iter)) != NULL) { + enterExecutionUnit(1, 0); sds sdskey = dictGetKey(de); robj *key = createStringObject(sdskey, sdslen(sdskey)); dbDelete(&server.db[0], key); @@ -5707,6 +5708,7 @@ unsigned int delKeysInSlot(unsigned int hashslot) { * The modules needs to know that these keys are no longer available locally, so just send the * keyspace notification to the modules, but not to clients. 
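 * Because the dbDelete() above and this notification are wrapped in a
 * single enterExecutionUnit()/exitExecutionUnit() pair, the
 * postExecutionUnitOperations() call that follows can run module
 * post-notification jobs for each deleted key right away, mirroring
 * the #12733 fix for server events.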
*/ moduleNotifyKeyspaceEvent(NOTIFY_GENERIC, "del", key, server.db[0].id); + exitExecutionUnit(); postExecutionUnitOperations(); decrRefCount(key); j++; diff --git a/tests/modules/postnotifications.c b/tests/modules/postnotifications.c index e40cf29946a..770711bc3fd 100644 --- a/tests/modules/postnotifications.c +++ b/tests/modules/postnotifications.c @@ -75,7 +75,8 @@ static int KeySpace_NotificationExpired(RedisModuleCtx *ctx, int type, const cha REDISMODULE_NOT_USED(key); RedisModuleString *new_key = RedisModule_CreateString(NULL, "expired", 7); - RedisModule_AddPostNotificationJob(ctx, KeySpace_PostNotificationString, new_key, KeySpace_PostNotificationStringFreePD); + int res = RedisModule_AddPostNotificationJob(ctx, KeySpace_PostNotificationString, new_key, KeySpace_PostNotificationStringFreePD); + if (res == REDISMODULE_ERR) KeySpace_PostNotificationStringFreePD(new_key); return REDISMODULE_OK; } @@ -95,7 +96,8 @@ static int KeySpace_NotificationEvicted(RedisModuleCtx *ctx, int type, const cha } RedisModuleString *new_key = RedisModule_CreateString(NULL, "evicted", 7); - RedisModule_AddPostNotificationJob(ctx, KeySpace_PostNotificationString, new_key, KeySpace_PostNotificationStringFreePD); + int res = RedisModule_AddPostNotificationJob(ctx, KeySpace_PostNotificationString, new_key, KeySpace_PostNotificationStringFreePD); + if (res == REDISMODULE_ERR) KeySpace_PostNotificationStringFreePD(new_key); return REDISMODULE_OK; } @@ -121,7 +123,8 @@ static int KeySpace_NotificationString(RedisModuleCtx *ctx, int type, const char new_key = RedisModule_CreateStringPrintf(NULL, "string_changed{%s}", key_str); } - RedisModule_AddPostNotificationJob(ctx, KeySpace_PostNotificationString, new_key, KeySpace_PostNotificationStringFreePD); + int res = RedisModule_AddPostNotificationJob(ctx, KeySpace_PostNotificationString, new_key, KeySpace_PostNotificationStringFreePD); + if (res == REDISMODULE_ERR) KeySpace_PostNotificationStringFreePD(new_key); return REDISMODULE_OK; } @@ -137,7 +140,8 @@ static int KeySpace_LazyExpireInsidePostNotificationJob(RedisModuleCtx *ctx, int } RedisModuleString *new_key = RedisModule_CreateString(NULL, key_str + 5, strlen(key_str) - 5);; - RedisModule_AddPostNotificationJob(ctx, KeySpace_PostNotificationReadKey, new_key, KeySpace_PostNotificationStringFreePD); + int res = RedisModule_AddPostNotificationJob(ctx, KeySpace_PostNotificationReadKey, new_key, KeySpace_PostNotificationStringFreePD); + if (res == REDISMODULE_ERR) KeySpace_PostNotificationStringFreePD(new_key); return REDISMODULE_OK; } @@ -236,7 +240,8 @@ static void KeySpace_ServerEventCallback(RedisModuleCtx *ctx, RedisModuleEvent e KeySpace_EventPostNotificationCtx *pn_ctx = RedisModule_Alloc(sizeof(*pn_ctx)); pn_ctx->triggered_on = RedisModule_HoldString(NULL, (RedisModuleString*)key_name); pn_ctx->new_key = RedisModule_CreateString(NULL, events[subevent], strlen(events[subevent])); - RedisModule_AddPostNotificationJob(ctx, KeySpace_ServerEventPostNotification, pn_ctx, KeySpace_ServerEventPostNotificationFree); + int res = RedisModule_AddPostNotificationJob(ctx, KeySpace_ServerEventPostNotification, pn_ctx, KeySpace_ServerEventPostNotificationFree); + if (res == REDISMODULE_ERR) KeySpace_ServerEventPostNotificationFree(pn_ctx); } /* This function must be present on each Redis module. 
It is used in order to diff --git a/tests/support/server.tcl b/tests/support/server.tcl index 018c659e9d9..8f5659d9bd1 100644 --- a/tests/support/server.tcl +++ b/tests/support/server.tcl @@ -5,9 +5,9 @@ set ::valgrind_errors {} proc start_server_error {config_file error} { set err {} append err "Can't start the Redis server\n" - append err "CONFIGURATION:" + append err "CONFIGURATION:\n" append err [exec cat $config_file] - append err "\nERROR:" + append err "\nERROR:\n" append err [string trim $error] send_data_packet $::test_server_fd err $err } diff --git a/tests/unit/moduleapi/cluster.tcl b/tests/unit/moduleapi/cluster.tcl index 80750838729..cddd90f759b 100644 --- a/tests/unit/moduleapi/cluster.tcl +++ b/tests/unit/moduleapi/cluster.tcl @@ -163,21 +163,23 @@ start_cluster 3 0 [list config_lines $modules] { $node2_rd close } -set modules [list loadmodule [file normalize tests/modules/keyspace_events.so]] +set testmodule_keyspace_events [file normalize tests/modules/keyspace_events.so] +set testmodule_postnotifications "[file normalize tests/modules/postnotifications.so] with_key_events" +set modules [list loadmodule $testmodule_keyspace_events loadmodule $testmodule_postnotifications] start_cluster 2 2 [list config_lines $modules] { set master1 [srv 0 client] set master2 [srv -1 client] set replica1 [srv -2 client] set replica2 [srv -3 client] - + test "Verify keys deletion and notification effects happened on cluster slots change are replicated inside multi exec" { $master2 set count_dels_{4oi} 1 $master2 del count_dels_{4oi} assert_equal 1 [$master2 keyspace.get_dels] assert_equal 1 [$replica2 keyspace.get_dels] $master2 set count_dels_{4oi} 1 - + set repl [attach_to_replication_stream_on_connection -3] $master1 cluster bumpepoch @@ -195,10 +197,12 @@ start_cluster 2 2 [list config_lines $modules] { fail "replica did not increase del counter" } + # the {lpush before_deleted count_dels_{4oi}} is a post notification job registered when 'count_dels_{4oi}' was removed assert_replication_stream $repl { {multi} {del count_dels_{4oi}} {keyspace.incr_dels} + {lpush before_deleted count_dels_{4oi}} {exec} } close_replication_stream $repl From f9cc25c1dde7da159819380d8397aef25797409b Mon Sep 17 00:00:00 2001 From: Chen Tianjie Date: Wed, 13 Dec 2023 13:44:13 +0800 Subject: [PATCH 02/58] Add metric to INFO CLIENTS: pubsub_clients. (#12849) In the INFO CLIENTS section, we already have blocked_clients and tracking_clients. We should add a new metric showing the number of pubsub connections, which helps performance monitoring and troubleshooting. --- src/networking.c | 4 +++- src/pubsub.c | 35 +++++++++++++++++++++++++++-------- src/server.c | 2 ++ src/server.h | 2 ++ tests/unit/info.tcl | 32 ++++++++++++++++++++++++++++++++ 5 files changed, 66 insertions(+), 9 deletions(-) diff --git a/src/networking.c b/src/networking.c index 847eee3d584..4d8daecb3d1 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1546,6 +1546,7 @@ void clearClientConnectionState(client *c) { pubsubUnsubscribeAllChannels(c,0); pubsubUnsubscribeShardAllChannels(c, 0); pubsubUnsubscribeAllPatterns(c,0); + unmarkClientAsPubSub(c); if (c->name) { decrRefCount(c->name); @@ -1556,7 +1557,7 @@ void clearClientConnectionState(client *c) { * represent the client library behind the connection.
*/ /* Selectively clear state flags not covered above */ - c->flags &= ~(CLIENT_ASKING|CLIENT_READONLY|CLIENT_PUBSUB|CLIENT_REPLY_OFF| + c->flags &= ~(CLIENT_ASKING|CLIENT_READONLY|CLIENT_REPLY_OFF| CLIENT_REPLY_SKIP_NEXT|CLIENT_NO_TOUCH|CLIENT_NO_EVICT); } @@ -1631,6 +1632,7 @@ void freeClient(client *c) { pubsubUnsubscribeAllChannels(c,0); pubsubUnsubscribeShardAllChannels(c, 0); pubsubUnsubscribeAllPatterns(c,0); + unmarkClientAsPubSub(c); dictRelease(c->pubsub_channels); dictRelease(c->pubsub_patterns); dictRelease(c->pubsubshard_channels); diff --git a/src/pubsub.c b/src/pubsub.c index a13c5a61fbe..2fe7a3ff56c 100644 --- a/src/pubsub.c +++ b/src/pubsub.c @@ -241,6 +241,20 @@ int clientTotalPubSubSubscriptionCount(client *c) { return clientSubscriptionsCount(c) + clientShardSubscriptionsCount(c); } +void markClientAsPubSub(client *c) { + if (!(c->flags & CLIENT_PUBSUB)) { + c->flags |= CLIENT_PUBSUB; + server.pubsub_clients++; + } +} + +void unmarkClientAsPubSub(client *c) { + if (c->flags & CLIENT_PUBSUB) { + c->flags &= ~CLIENT_PUBSUB; + server.pubsub_clients--; + } +} + /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or * 0 if the client was already subscribed to that channel. */ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { @@ -326,7 +340,7 @@ void pubsubShardUnsubscribeAllClients(robj *channel) { /* If the client has no other pubsub subscription, * move out of pubsub mode. */ if (clientTotalPubSubSubscriptionCount(c) == 0) { - c->flags &= ~CLIENT_PUBSUB; + unmarkClientAsPubSub(c); } } } @@ -546,7 +560,7 @@ void subscribeCommand(client *c) { } for (j = 1; j < c->argc; j++) pubsubSubscribeChannel(c,c->argv[j],pubSubType); - c->flags |= CLIENT_PUBSUB; + markClientAsPubSub(c); } /* UNSUBSCRIBE [channel ...] */ @@ -559,7 +573,9 @@ void unsubscribeCommand(client *c) { for (j = 1; j < c->argc; j++) pubsubUnsubscribeChannel(c,c->argv[j],1,pubSubType); } - if (clientTotalPubSubSubscriptionCount(c) == 0) c->flags &= ~CLIENT_PUBSUB; + if (clientTotalPubSubSubscriptionCount(c) == 0) { + unmarkClientAsPubSub(c); + } } /* PSUBSCRIBE pattern [pattern ...] */ @@ -579,7 +595,7 @@ void psubscribeCommand(client *c) { for (j = 1; j < c->argc; j++) pubsubSubscribePattern(c,c->argv[j]); - c->flags |= CLIENT_PUBSUB; + markClientAsPubSub(c); } /* PUNSUBSCRIBE [pattern [pattern ...]] */ @@ -592,7 +608,9 @@ void punsubscribeCommand(client *c) { for (j = 1; j < c->argc; j++) pubsubUnsubscribePattern(c,c->argv[j],1); } - if (clientTotalPubSubSubscriptionCount(c) == 0) c->flags &= ~CLIENT_PUBSUB; + if (clientTotalPubSubSubscriptionCount(c) == 0) { + unmarkClientAsPubSub(c); + } } /* This function wraps pubsubPublishMessage and also propagates the message to cluster. 
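 * Publishing itself never moves the new server.pubsub_clients gauge:
 * it is raised only by markClientAsPubSub() on the subscribe paths and
 * dropped by unmarkClientAsPubSub() once a client's last subscription
 * (or the client itself) goes away, so it counts clients in subscriber
 * mode rather than subscriptions.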
@@ -727,10 +745,9 @@ void ssubscribeCommand(client *c) { } pubsubSubscribeChannel(c, c->argv[j], pubSubShardType); } - c->flags |= CLIENT_PUBSUB; + markClientAsPubSub(c); } - /* SUNSUBSCRIBE [shardchannel [shardchannel ...]] */ void sunsubscribeCommand(client *c) { if (c->argc == 1) { @@ -740,7 +757,9 @@ void sunsubscribeCommand(client *c) { pubsubUnsubscribeChannel(c, c->argv[j], 1, pubSubShardType); } } - if (clientTotalPubSubSubscriptionCount(c) == 0) c->flags &= ~CLIENT_PUBSUB; + if (clientTotalPubSubSubscriptionCount(c) == 0) { + unmarkClientAsPubSub(c); + } } size_t pubsubMemOverhead(client *c) { diff --git a/src/server.c b/src/server.c index 9205cf6d43a..29282958d93 100644 --- a/src/server.c +++ b/src/server.c @@ -2758,6 +2758,7 @@ void initServer(void) { server.pubsub_channels = dictCreate(&keylistDictType); server.pubsub_patterns = dictCreate(&keylistDictType); server.pubsubshard_channels = dictCreate(&keylistDictType); + server.pubsub_clients = 0; server.cronloops = 0; server.in_exec = 0; server.busy_module_yield_flags = BUSY_MODULE_YIELD_NONE; @@ -5650,6 +5651,7 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { "client_recent_max_output_buffer:%zu\r\n", maxout, "blocked_clients:%d\r\n", server.blocked_clients, "tracking_clients:%d\r\n", server.tracking_clients, + "pubsub_clients:%d\r\n", server.pubsub_clients, "clients_in_timeout_table:%llu\r\n", (unsigned long long) raxSize(server.clients_timeout_table), "total_blocking_keys:%lu\r\n", blocking_keys, "total_blocking_keys_on_nokey:%lu\r\n", blocking_keys_on_nokey)); diff --git a/src/server.h b/src/server.h index 77ebb0f5b9a..a0b028f00bc 100644 --- a/src/server.h +++ b/src/server.h @@ -1990,6 +1990,7 @@ struct redisServer { int notify_keyspace_events; /* Events to propagate via Pub/Sub. This is an xor of NOTIFY_... flags. */ dict *pubsubshard_channels; /* Map shard channels to list of subscribed clients */ + unsigned int pubsub_clients; /* # of clients in Pub/Sub mode */ /* Cluster */ int cluster_enabled; /* Is cluster enabled? */ int cluster_port; /* Set the cluster port for a node. 
*/ @@ -3199,6 +3200,7 @@ void addReplyPubsubMessage(client *c, robj *channel, robj *msg, robj *message_bu int serverPubsubSubscriptionCount(void); int serverPubsubShardSubscriptionCount(void); size_t pubsubMemOverhead(client *c); +void unmarkClientAsPubSub(client *c); /* Keyspace events notification */ void notifyKeyspaceEvent(int type, char *event, robj *key, int dbid); diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index 0459676ae26..05e4bbb07ab 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -369,5 +369,37 @@ start_server {tags {"info" "external:skip"}} { assert_equal [getInfoProperty $info client_output_buffer_limit_disconnections] {1} r config set client-output-buffer-limit $org_outbuf_limit } {OK} {logreqres:skip} ;# same as obuf-limits.tcl, skip logreqres + + test {clients: pubsub clients} { + set info [r info clients] + assert_equal [getInfoProperty $info pubsub_clients] {0} + set rd1 [redis_deferring_client] + set rd2 [redis_deferring_client] + # basic count + assert_equal {1} [ssubscribe $rd1 {chan1}] + assert_equal {1} [subscribe $rd2 {chan2}] + set info [r info clients] + assert_equal [getInfoProperty $info pubsub_clients] {2} + # unsubscribe non existing channel + assert_equal {1} [unsubscribe $rd2 {non-exist-chan}] + set info [r info clients] + assert_equal [getInfoProperty $info pubsub_clients] {2} + # count change when client unsubscribe all channels + assert_equal {0} [unsubscribe $rd2 {chan2}] + set info [r info clients] + assert_equal [getInfoProperty $info pubsub_clients] {1} + # non-pubsub clients should not be involved + assert_equal {0} [unsubscribe $rd2 {non-exist-chan}] + set info [r info clients] + assert_equal [getInfoProperty $info pubsub_clients] {1} + # close all clients + $rd1 close + $rd2 close + wait_for_condition 100 50 { + [getInfoProperty [r info clients] pubsub_clients] eq {0} + } else { + fail "pubsub clients did not clear" + } + } } } From 3c0fd2520128a69a7d5826e18a2b834fb2bcef3d Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 13 Dec 2023 21:28:13 +0800 Subject: [PATCH 03/58] Redact ACL username information and mark *-key-file-pass configs as sensitive (#12860) In #11489, we considered the ACL username to be sensitive information and treated ACL GETUSER as a sensitive command, removing it from the redis-cli history file. This PR redacts username information in ACL GETUSER and ACL DELUSER from SLOWLOG, and also removes ACL DELUSER from the redis-cli history file. This PR also marks tls-key-file-pass and tls-client-key-file-pass as sensitive configs, redacting them from SLOWLOG and removing them from the redis-cli history file. --- src/acl.c | 7 +++++++ src/config.c | 4 ++-- src/redis-cli.c | 7 +++++-- tests/unit/slowlog.tcl | 25 +++++++++++++++++++------ 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/src/acl.c b/src/acl.c index 1d65faf6997..58a9a3972ba 100644 --- a/src/acl.c +++ b/src/acl.c @@ -2837,6 +2837,10 @@ void aclCommand(client *c) { } return; } else if (!strcasecmp(sub,"deluser") && c->argc >= 3) { + /* Initially redact all the arguments to not leak any information + * about the users. */ + for (int j = 2; j < c->argc; j++) redactClientCommandArgument(c, j); + int deleted = 0; for (int j = 2; j < c->argc; j++) { sds username = c->argv[j]->ptr; @@ -2859,6 +2863,9 @@ void aclCommand(client *c) { } addReplyLongLong(c,deleted); } else if (!strcasecmp(sub,"getuser") && c->argc == 3) { + /* Redact the username to not leak any information about the user.
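+ * With the argument redacted, a slow ACL GETUSER is logged as
+ * "acl getuser (redacted)" instead of exposing the username.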
*/ + redactClientCommandArgument(c, 2); + user *u = ACLGetUserByName(c->argv[2]->ptr,sdslen(c->argv[2]->ptr)); if (u == NULL) { addReplyNull(c); diff --git a/src/config.c b/src/config.c index 3231b244249..b152a8fa538 100644 --- a/src/config.c +++ b/src/config.c @@ -3244,10 +3244,10 @@ standardConfig static_configs[] = { createBoolConfig("tls-session-caching", NULL, MODIFIABLE_CONFIG, server.tls_ctx_config.session_caching, 1, NULL, applyTlsCfg), createStringConfig("tls-cert-file", NULL, VOLATILE_CONFIG | MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.cert_file, NULL, NULL, applyTlsCfg), createStringConfig("tls-key-file", NULL, VOLATILE_CONFIG | MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.key_file, NULL, NULL, applyTlsCfg), - createStringConfig("tls-key-file-pass", NULL, MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.key_file_pass, NULL, NULL, applyTlsCfg), + createStringConfig("tls-key-file-pass", NULL, MODIFIABLE_CONFIG | SENSITIVE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.key_file_pass, NULL, NULL, applyTlsCfg), createStringConfig("tls-client-cert-file", NULL, VOLATILE_CONFIG | MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.client_cert_file, NULL, NULL, applyTlsCfg), createStringConfig("tls-client-key-file", NULL, VOLATILE_CONFIG | MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.client_key_file, NULL, NULL, applyTlsCfg), - createStringConfig("tls-client-key-file-pass", NULL, MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.client_key_file_pass, NULL, NULL, applyTlsCfg), + createStringConfig("tls-client-key-file-pass", NULL, MODIFIABLE_CONFIG | SENSITIVE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.client_key_file_pass, NULL, NULL, applyTlsCfg), createStringConfig("tls-dh-params-file", NULL, VOLATILE_CONFIG | MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.dh_params_file, NULL, NULL, applyTlsCfg), createStringConfig("tls-ca-cert-file", NULL, VOLATILE_CONFIG | MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.ca_cert_file, NULL, NULL, applyTlsCfg), createStringConfig("tls-ca-cert-dir", NULL, VOLATILE_CONFIG | MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.ca_cert_dir, NULL, NULL, applyTlsCfg), diff --git a/src/redis-cli.c b/src/redis-cli.c index 96d667c86de..f18961eaf9d 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -3266,8 +3266,8 @@ void cliLoadPreferences(void) { /* Some commands can include sensitive information and shouldn't be put in the * history file. 
Currently these commands are include: * - AUTH - * - ACL SETUSER, ACL GETUSER - * - CONFIG SET masterauth/masteruser/requirepass + * - ACL DELUSER, ACL SETUSER, ACL GETUSER + * - CONFIG SET masterauth/masteruser/tls-key-file-pass/tls-client-key-file-pass/requirepass * - HELLO with [AUTH username password] * - MIGRATE with [AUTH password] or [AUTH2 username password] * - SENTINEL CONFIG SET sentinel-pass password, SENTINEL CONFIG SET sentinel-user username @@ -3277,6 +3277,7 @@ static int isSensitiveCommand(int argc, char **argv) { return 1; } else if (argc > 1 && !strcasecmp(argv[0],"acl") && ( + !strcasecmp(argv[1],"deluser") || !strcasecmp(argv[1],"setuser") || !strcasecmp(argv[1],"getuser"))) { @@ -3287,6 +3288,8 @@ static int isSensitiveCommand(int argc, char **argv) { for (int j = 2; j < argc; j = j+2) { if (!strcasecmp(argv[j],"masterauth") || !strcasecmp(argv[j],"masteruser") || + !strcasecmp(argv[j],"tls-key-file-pass") || + !strcasecmp(argv[j],"tls-client-key-file-pass") || !strcasecmp(argv[j],"requirepass")) { return 1; } diff --git a/tests/unit/slowlog.tcl b/tests/unit/slowlog.tcl index 3c547b924be..a5e8862d7b7 100644 --- a/tests/unit/slowlog.tcl +++ b/tests/unit/slowlog.tcl @@ -24,7 +24,7 @@ start_server {tags {"slowlog"} overrides {slowlog-log-slower-than 1000000}} { } {10} test {SLOWLOG - GET optional argument to limit output len works} { - + assert_equal 5 [llength [r slowlog get 5]] assert_equal 10 [llength [r slowlog get -1]] assert_equal 10 [llength [r slowlog get 20]] @@ -50,22 +50,35 @@ start_server {tags {"slowlog"} overrides {slowlog-log-slower-than 1000000}} { } {} {needs:debug} test {SLOWLOG - Certain commands are omitted that contain sensitive information} { + r config set slowlog-max-len 100 r config set slowlog-log-slower-than 0 r slowlog reset catch {r acl setuser "slowlog test user" +get +set} _ + r config set masteruser "" r config set masterauth "" + r config set requirepass "" + r config set tls-key-file-pass "" + r config set tls-client-key-file-pass "" r acl setuser slowlog-test-user +get +set + r acl getuser slowlog-test-user + r acl deluser slowlog-test-user non-existing-user r config set slowlog-log-slower-than 0 r config set slowlog-log-slower-than -1 - set slowlog_resp [r slowlog get] + set slowlog_resp [r slowlog get -1] # Make sure normal configs work, but the two sensitive # commands are omitted or redacted - assert_equal 5 [llength $slowlog_resp] - assert_equal {slowlog reset} [lindex [lindex $slowlog_resp 4] 3] + assert_equal 11 [llength $slowlog_resp] + assert_equal {slowlog reset} [lindex [lindex $slowlog_resp 10] 3] + assert_equal {acl setuser (redacted) (redacted) (redacted)} [lindex [lindex $slowlog_resp 9] 3] + assert_equal {config set masteruser (redacted)} [lindex [lindex $slowlog_resp 8] 3] + assert_equal {config set masterauth (redacted)} [lindex [lindex $slowlog_resp 7] 3] + assert_equal {config set requirepass (redacted)} [lindex [lindex $slowlog_resp 6] 3] + assert_equal {config set tls-key-file-pass (redacted)} [lindex [lindex $slowlog_resp 5] 3] + assert_equal {config set tls-client-key-file-pass (redacted)} [lindex [lindex $slowlog_resp 4] 3] assert_equal {acl setuser (redacted) (redacted) (redacted)} [lindex [lindex $slowlog_resp 3] 3] - assert_equal {config set masterauth (redacted)} [lindex [lindex $slowlog_resp 2] 3] - assert_equal {acl setuser (redacted) (redacted) (redacted)} [lindex [lindex $slowlog_resp 1] 3] + assert_equal {acl getuser (redacted)} [lindex [lindex $slowlog_resp 2] 3] + assert_equal {acl deluser (redacted) 
(redacted)} [lindex [lindex $slowlog_resp 1] 3] assert_equal {config set slowlog-log-slower-than 0} [lindex [lindex $slowlog_resp 0] 3] } {} {needs:repl} From e95a5d483133366be4180792fae75f64601826b8 Mon Sep 17 00:00:00 2001 From: Chen Tianjie Date: Thu, 14 Dec 2023 03:16:36 +0800 Subject: [PATCH 04/58] Support by/get options for sort(_ro) in cluster mode when pattern implies slot. (#12728) The by/get options of sort/sort_ro command used to be forbidden in cluster mode, since we are not sure which slot the pattern may be in. As the optimization done in #12536, patterns now can be mapped to slots, we should allow by/get options in cluster mode when the pattern maps to the same slot as the key. --- src/sort.c | 19 +++++++++++++------ tests/unit/sort.tcl | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 6 deletions(-) diff --git a/src/sort.c b/src/sort.c index 77f4cbbc4c6..a8b9391b117 100644 --- a/src/sort.c +++ b/src/sort.c @@ -32,6 +32,7 @@ #include "server.h" #include "pqsort.h" /* Partial qsort for SORT+LIMIT */ #include /* isnan() */ +#include "cluster.h" zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank); @@ -235,10 +236,12 @@ void sortCommandGeneric(client *c, int readonly) { if (strchr(c->argv[j+1]->ptr,'*') == NULL) { dontsort = 1; } else { - /* If BY is specified with a real pattern, we can't accept - * it in cluster mode. */ - if (server.cluster_enabled) { - addReplyError(c,"BY option of SORT denied in Cluster mode."); + /* If BY is specified with a real pattern, we can't accept it in cluster mode, + * unless we can make sure the keys formed by the pattern are in the same slot + * as the key to sort. */ + if (server.cluster_enabled && patternHashSlot(sortby->ptr, sdslen(sortby->ptr)) != c->slot) { + addReplyError(c, "BY option of SORT denied in Cluster mode when " + "keys formed by the pattern may be in different slots."); syntax_error++; break; } @@ -252,8 +255,12 @@ void sortCommandGeneric(client *c, int readonly) { } j++; } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) { - if (server.cluster_enabled) { - addReplyError(c,"GET option of SORT denied in Cluster mode."); + /* If GET is specified with a real pattern, we can't accept it in cluster mode, + * unless we can make sure the keys formed by the pattern are in the same slot + * as the key to sort. 
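+ * For example, SORT {a}mylist GET {a}get* is accepted because the {a}
+ * hash tag pins the pattern to the key's slot, while a bare GET get*
+ * is still rejected (see the cluster tests added below).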
*/ if (server.cluster_enabled && patternHashSlot(c->argv[j+1]->ptr, sdslen(c->argv[j+1]->ptr)) != c->slot) { addReplyError(c, "GET option of SORT denied in Cluster mode when " "keys formed by the pattern may be in different slots."); syntax_error++; break; } diff --git a/tests/unit/sort.tcl b/tests/unit/sort.tcl index 109f6615e9b..eade6ea341f 100644 --- a/tests/unit/sort.tcl +++ b/tests/unit/sort.tcl @@ -357,3 +357,39 @@ foreach command {SORT SORT_RO} { } } } + +start_cluster 1 0 {tags {"external:skip cluster sort"}} { + + r flushall + r lpush "{a}mylist" 1 2 3 + r set "{a}by1" 20 + r set "{a}by2" 30 + r set "{a}by3" 0 + r set "{a}get1" 200 + r set "{a}get2" 100 + r set "{a}get3" 30 + + test "sort by in cluster mode" { + catch {r sort "{a}mylist" by by*} e + assert_match {ERR BY option of SORT denied in Cluster mode when *} $e + r sort "{a}mylist" by "{a}by*" + } {3 1 2} + + test "sort get in cluster mode" { + catch {r sort "{a}mylist" by "{a}by*" get get*} e + assert_match {ERR GET option of SORT denied in Cluster mode when *} $e + r sort "{a}mylist" by "{a}by*" get "{a}get*" + } {30 200 100} + + test "sort_ro by in cluster mode" { + catch {r sort_ro "{a}mylist" by by*} e + assert_match {ERR BY option of SORT denied in Cluster mode when *} $e + r sort_ro "{a}mylist" by "{a}by*" + } {3 1 2} + + test "sort_ro get in cluster mode" { + catch {r sort_ro "{a}mylist" by "{a}by*" get get*} e + assert_match {ERR GET option of SORT denied in Cluster mode when *} $e + r sort_ro "{a}mylist" by "{a}by*" get "{a}get*" + } {30 200 100} +} From 967fb3c6e812ed3ff4259b499a4a8401fe37b0fd Mon Sep 17 00:00:00 2001 From: Guillaume Koenig <106696198+knggk@users.noreply.github.com> Date: Thu, 14 Dec 2023 17:50:18 -0500 Subject: [PATCH 05/58] Extend rax usage by allowing any long long value (#12837) The raxFind implementation uses a special pointer value (the address of a static string) as the "not found" value. It works as long as only actual pointers are stored. However, we've seen usages where long long, non-pointer values have been used. It creates a risk that one of those long long values is precisely the address of the special "not found" value. This commit changes raxFind to return 1 or 0 to indicate membership, and to take a new void **value to optionally return the associated value. By extension, this also allows the RedisModule_DictSet/Replace operations to safely insert integers instead of just pointers. --- src/acl.c | 10 +++++----- src/module.c | 50 ++++++++++++++++++++++++++++++------------------ src/networking.c | 5 +++-- src/rax.c | 18 +++++++----------- src/rax.h | 5 +---- src/rdb.c | 5 +++-- src/server.c | 12 +++++++----- src/t_stream.c | 40 ++++++++++++++++++++++---------------- src/tracking.c | 25 ++++++++++++++++-------- 9 files changed, 97 insertions(+), 73 deletions(-) diff --git a/src/acl.c b/src/acl.c index 58a9a3972ba..841f101cb6a 100644 --- a/src/acl.c +++ b/src/acl.c @@ -437,7 +437,7 @@ aclSelector *ACLUserGetRootSelector(user *u) { * * If the user with such name already exists NULL is returned.
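 * (Note the lookup below: with the new API, passing NULL as the value
 * pointer turns raxFind() into a plain existence check.)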
*/ user *ACLCreateUser(const char *name, size_t namelen) { - if (raxFind(Users,(unsigned char*)name,namelen) != raxNotFound) return NULL; + if (raxFind(Users,(unsigned char*)name,namelen,NULL)) return NULL; user *u = zmalloc(sizeof(*u)); u->name = sdsnewlen(name,namelen); u->flags = USER_FLAG_DISABLED; @@ -1553,8 +1553,8 @@ unsigned long ACLGetCommandID(sds cmdname) { sds lowername = sdsdup(cmdname); sdstolower(lowername); if (commandId == NULL) commandId = raxNew(); - void *id = raxFind(commandId,(unsigned char*)lowername,sdslen(lowername)); - if (id != raxNotFound) { + void *id; + if (raxFind(commandId,(unsigned char*)lowername,sdslen(lowername),&id)) { sdsfree(lowername); return (unsigned long)id; } @@ -1585,8 +1585,8 @@ void ACLClearCommandID(void) { /* Return an username by its name, or NULL if the user does not exist. */ user *ACLGetUserByName(const char *name, size_t namelen) { - void *myuser = raxFind(Users,(unsigned char*)name,namelen); - if (myuser == raxNotFound) return NULL; + void *myuser = NULL; + raxFind(Users,(unsigned char*)name,namelen,&myuser); return myuser; } diff --git a/src/module.c b/src/module.c index 96bc61e0f29..b966998c671 100644 --- a/src/module.c +++ b/src/module.c @@ -9130,7 +9130,7 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod while(1) { key = htonu64(expiretime); - if (raxFind(Timers, (unsigned char*)&key,sizeof(key)) == raxNotFound) { + if (!raxFind(Timers, (unsigned char*)&key,sizeof(key),NULL)) { raxInsert(Timers,(unsigned char*)&key,sizeof(key),timer,NULL); break; } else { @@ -9169,8 +9169,11 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod * If not NULL, the data pointer is set to the value of the data argument when * the timer was created. */ int RM_StopTimer(RedisModuleCtx *ctx, RedisModuleTimerID id, void **data) { - RedisModuleTimer *timer = raxFind(Timers,(unsigned char*)&id,sizeof(id)); - if (timer == raxNotFound || timer->module != ctx->module) + void *result; + if (!raxFind(Timers,(unsigned char*)&id,sizeof(id),&result)) + return REDISMODULE_ERR; + RedisModuleTimer *timer = result; + if (timer->module != ctx->module) return REDISMODULE_ERR; if (data) *data = timer->data; raxRemove(Timers,(unsigned char*)&id,sizeof(id),NULL); @@ -9185,8 +9188,11 @@ int RM_StopTimer(RedisModuleCtx *ctx, RedisModuleTimerID id, void **data) { * REDISMODULE_OK is returned. The arguments remaining or data can be NULL if * the caller does not need certain information. */ int RM_GetTimerInfo(RedisModuleCtx *ctx, RedisModuleTimerID id, uint64_t *remaining, void **data) { - RedisModuleTimer *timer = raxFind(Timers,(unsigned char*)&id,sizeof(id)); - if (timer == raxNotFound || timer->module != ctx->module) + void *result; + if (!raxFind(Timers,(unsigned char*)&id,sizeof(id),&result)) + return REDISMODULE_ERR; + RedisModuleTimer *timer = result; + if (timer->module != ctx->module) return REDISMODULE_ERR; if (remaining) { int64_t rem = ntohu64(id)-ustime(); @@ -9954,9 +9960,10 @@ int RM_DictReplace(RedisModuleDict *d, RedisModuleString *key, void *ptr) { * be set by reference to 1 if the key does not exist, or to 0 if the key * exists. */ void *RM_DictGetC(RedisModuleDict *d, void *key, size_t keylen, int *nokey) { - void *res = raxFind(d->rax,key,keylen); - if (nokey) *nokey = (res == raxNotFound); - return (res == raxNotFound) ? 
NULL : res; + void *res = NULL; + int found = raxFind(d->rax,key,keylen,&res); + if (nokey) *nokey = !found; + return res; } /* Like RedisModule_DictGetC() but takes the key as a RedisModuleString. */ @@ -10378,8 +10385,10 @@ void RM_FreeServerInfo(RedisModuleCtx *ctx, RedisModuleServerInfoData *data) { * mechanism to release the returned string. Return value will be NULL if the * field was not found. */ RedisModuleString *RM_ServerInfoGetField(RedisModuleCtx *ctx, RedisModuleServerInfoData *data, const char* field) { - sds val = raxFind(data->rax, (unsigned char *)field, strlen(field)); - if (val == raxNotFound) return NULL; + void *result; + if (!raxFind(data->rax, (unsigned char *)field, strlen(field), &result)) + return NULL; + sds val = result; RedisModuleString *o = createStringObject(val,sdslen(val)); if (ctx != NULL) autoMemoryAdd(ctx,REDISMODULE_AM_STRING,o); return o; @@ -10387,9 +10396,9 @@ RedisModuleString *RM_ServerInfoGetField(RedisModuleCtx *ctx, RedisModuleServerI /* Similar to RM_ServerInfoGetField, but returns a char* which should not be freed but the caller. */ const char *RM_ServerInfoGetFieldC(RedisModuleServerInfoData *data, const char* field) { - sds val = raxFind(data->rax, (unsigned char *)field, strlen(field)); - if (val == raxNotFound) return NULL; - return val; + void *result = NULL; + raxFind(data->rax, (unsigned char *)field, strlen(field), &result); + return result; } /* Get the value of a field from data collected with RM_GetServerInfo(). If the @@ -10397,11 +10406,12 @@ const char *RM_ServerInfoGetFieldC(RedisModuleServerInfoData *data, const char* * 0, and the optional out_err argument will be set to REDISMODULE_ERR. */ long long RM_ServerInfoGetFieldSigned(RedisModuleServerInfoData *data, const char* field, int *out_err) { long long ll; - sds val = raxFind(data->rax, (unsigned char *)field, strlen(field)); - if (val == raxNotFound) { + void *result; + if (!raxFind(data->rax, (unsigned char *)field, strlen(field), &result)) { if (out_err) *out_err = REDISMODULE_ERR; return 0; } + sds val = result; if (!string2ll(val,sdslen(val),&ll)) { if (out_err) *out_err = REDISMODULE_ERR; return 0; @@ -10415,11 +10425,12 @@ long long RM_ServerInfoGetFieldSigned(RedisModuleServerInfoData *data, const cha * 0, and the optional out_err argument will be set to REDISMODULE_ERR. */ unsigned long long RM_ServerInfoGetFieldUnsigned(RedisModuleServerInfoData *data, const char* field, int *out_err) { unsigned long long ll; - sds val = raxFind(data->rax, (unsigned char *)field, strlen(field)); - if (val == raxNotFound) { + void *result; + if (!raxFind(data->rax, (unsigned char *)field, strlen(field), &result)) { if (out_err) *out_err = REDISMODULE_ERR; return 0; } + sds val = result; if (!string2ull(val,&ll)) { if (out_err) *out_err = REDISMODULE_ERR; return 0; @@ -10433,11 +10444,12 @@ unsigned long long RM_ServerInfoGetFieldUnsigned(RedisModuleServerInfoData *data * optional out_err argument will be set to REDISMODULE_ERR. 
*/ double RM_ServerInfoGetFieldDouble(RedisModuleServerInfoData *data, const char* field, int *out_err) { double dbl; - sds val = raxFind(data->rax, (unsigned char *)field, strlen(field)); - if (val == raxNotFound) { + void *result; + if (!raxFind(data->rax, (unsigned char *)field, strlen(field), &result)) { if (out_err) *out_err = REDISMODULE_ERR; return 0; } + sds val = result; if (!string2d(val,sdslen(val),&dbl)) { if (out_err) *out_err = REDISMODULE_ERR; return 0; diff --git a/src/networking.c b/src/networking.c index 4d8daecb3d1..c020faf897f 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1812,8 +1812,9 @@ int freeClientsInAsyncFreeQueue(void) { * are not registered clients. */ client *lookupClientByID(uint64_t id) { id = htonu64(id); - client *c = raxFind(server.clients_index,(unsigned char*)&id,sizeof(id)); - return (c == raxNotFound) ? NULL : c; + void *c = NULL; + raxFind(server.clients_index,(unsigned char*)&id,sizeof(id),&c); + return c; } /* This function should be called from _writeToClient when the reply list is not empty, diff --git a/src/rax.c b/src/rax.c index 304b26fe8f6..100744d790d 100644 --- a/src/rax.c +++ b/src/rax.c @@ -44,11 +44,6 @@ #include RAX_MALLOC_INCLUDE -/* This is a special pointer that is guaranteed to never have the same value - * of a radix tree node. It's used in order to report "not found" error without - * requiring the function to have multiple return values. */ -void *raxNotFound = (void*)"rax-not-found-pointer"; - /* -------------------------------- Debugging ------------------------------ */ void raxDebugShowNode(const char *msg, raxNode *n); @@ -912,18 +907,19 @@ int raxTryInsert(rax *rax, unsigned char *s, size_t len, void *data, void **old) return raxGenericInsert(rax,s,len,data,old,0); } -/* Find a key in the rax, returns raxNotFound special void pointer value - * if the item was not found, otherwise the value associated with the - * item is returned. */ -void *raxFind(rax *rax, unsigned char *s, size_t len) { +/* Find a key in the rax: return 1 if the item is found, 0 otherwise. + * If there is an item and 'value' is passed in a non-NULL pointer, + * the value associated with the item is set at that address. */ +int raxFind(rax *rax, unsigned char *s, size_t len, void **value) { raxNode *h; debugf("### Lookup: %.*s\n", (int)len, s); int splitpos = 0; size_t i = raxLowWalk(rax,s,len,&h,NULL,&splitpos,NULL); if (i != len || (h->iscompr && splitpos != 0) || !h->iskey) - return raxNotFound; - return raxGetData(h); + return 0; + if (value != NULL) *value = raxGetData(h); + return 1; } /* Return the memory address where the 'parent' node stores the specified diff --git a/src/rax.h b/src/rax.h index 6b1fd4188cc..c58c28b2c63 100644 --- a/src/rax.h +++ b/src/rax.h @@ -185,15 +185,12 @@ typedef struct raxIterator { raxNodeCallback node_cb; /* Optional node callback. Normally set to NULL. */ } raxIterator; -/* A special pointer returned for not found items. */ -extern void *raxNotFound; - /* Exported API. 
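 * Typical lookup with the new raxFind() signature (a sketch; 'rt' and
 * "mykey" are placeholders):
 *
 *   void *val = NULL;
 *   if (raxFind(rt, (unsigned char*)"mykey", 5, &val)) {
 *       ... key present; 'val' holds the stored value (may itself be NULL)
 *   }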
*/ rax *raxNew(void); int raxInsert(rax *rax, unsigned char *s, size_t len, void *data, void **old); int raxTryInsert(rax *rax, unsigned char *s, size_t len, void *data, void **old); int raxRemove(rax *rax, unsigned char *s, size_t len, void **old); -void *raxFind(rax *rax, unsigned char *s, size_t len); +int raxFind(rax *rax, unsigned char *s, size_t len, void **value); void raxFree(rax *rax); void raxFreeWithCallback(rax *rax, void (*free_callback)(void*)); void raxStart(raxIterator *it, rax *rt); diff --git a/src/rdb.c b/src/rdb.c index b50ea7867c4..f6b0054cc03 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -2751,13 +2751,14 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { decrRefCount(o); return NULL; } - streamNACK *nack = raxFind(cgroup->pel,rawid,sizeof(rawid)); - if (nack == raxNotFound) { + void *result; + if (!raxFind(cgroup->pel,rawid,sizeof(rawid),&result)) { rdbReportCorruptRDB("Consumer entry not found in " "group global PEL"); decrRefCount(o); return NULL; } + streamNACK *nack = result; /* Set the NACK consumer, that was left to NULL when * loading the global PEL. Then set the same shared diff --git a/src/server.c b/src/server.c index 29282958d93..25a20a49bcc 100644 --- a/src/server.c +++ b/src/server.c @@ -4279,13 +4279,15 @@ int processCommand(client *c) { /* ====================== Error lookup and execution ===================== */ void incrementErrorCount(const char *fullerr, size_t namelen) { - struct redisError *error = raxFind(server.errors,(unsigned char*)fullerr,namelen); - if (error == raxNotFound) { - error = zmalloc(sizeof(*error)); - error->count = 0; + void *result; + if (!raxFind(server.errors,(unsigned char*)fullerr,namelen,&result)) { + struct redisError *error = zmalloc(sizeof(*error)); + error->count = 1; raxInsert(server.errors,(unsigned char*)fullerr,namelen,error,NULL); + } else { + struct redisError *error = result; + error->count++; } - error->count++; } /*================================== Shutdown =============================== */ diff --git a/src/t_stream.c b/src/t_stream.c index ccb566bae2b..733ccfc8c4e 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -242,10 +242,12 @@ robj *streamDup(robj *o) { raxStart(&ri_cpel, consumer->pel); raxSeek(&ri_cpel, "^", NULL, 0); while (raxNext(&ri_cpel)) { - streamNACK *new_nack = raxFind(new_cg->pel,ri_cpel.key,sizeof(streamID)); + void *result; + int found = raxFind(new_cg->pel,ri_cpel.key,sizeof(streamID),&result); - serverAssert(new_nack != raxNotFound); + serverAssert(found); + streamNACK *new_nack = result; new_nack->consumer = new_consumer; raxInsert(new_consumer->pel,ri_cpel.key,sizeof(streamID),new_nack,NULL); } @@ -1760,8 +1762,10 @@ size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end * or update it if the consumer is the same as before. */ if (group_inserted == 0) { streamFreeNACK(nack); - nack = raxFind(group->pel,buf,sizeof(buf)); - serverAssert(nack != raxNotFound); + void *result; + int found = raxFind(group->pel,buf,sizeof(buf),&result); + serverAssert(found); + nack = result; raxRemove(nack->consumer->pel,buf,sizeof(buf),NULL); /* Update the consumer and NACK metadata. */ nack->consumer = consumer; @@ -2473,7 +2477,7 @@ void streamFreeConsumer(streamConsumer *sc) { * consumer group is returned. 
*/ streamCG *streamCreateCG(stream *s, char *name, size_t namelen, streamID *id, long long entries_read) { if (s->cgroups == NULL) s->cgroups = raxNew(); - if (raxFind(s->cgroups,(unsigned char*)name,namelen) != raxNotFound) + if (raxFind(s->cgroups,(unsigned char*)name,namelen,NULL)) return NULL; streamCG *cg = zmalloc(sizeof(*cg)); @@ -2496,9 +2500,9 @@ void streamFreeCG(streamCG *cg) { * pointer, otherwise if there is no such group, NULL is returned. */ streamCG *streamLookupCG(stream *s, sds groupname) { if (s->cgroups == NULL) return NULL; - streamCG *cg = raxFind(s->cgroups,(unsigned char*)groupname, - sdslen(groupname)); - return (cg == raxNotFound) ? NULL : cg; + void *cg = NULL; + raxFind(s->cgroups,(unsigned char*)groupname,sdslen(groupname),&cg); + return cg; } /* Create a consumer with the specified name in the group 'cg' and return. @@ -2528,9 +2532,8 @@ streamConsumer *streamCreateConsumer(streamCG *cg, sds name, robj *key, int dbid /* Lookup the consumer with the specified name in the group 'cg'. */ streamConsumer *streamLookupConsumer(streamCG *cg, sds name) { if (cg == NULL) return NULL; - streamConsumer *consumer = raxFind(cg->consumers,(unsigned char*)name, - sdslen(name)); - if (consumer == raxNotFound) return NULL; + void *consumer = NULL; + raxFind(cg->consumers,(unsigned char*)name,sdslen(name),&consumer); return consumer; } @@ -2844,8 +2847,9 @@ void xackCommand(client *c) { /* Lookup the ID in the group PEL: it will have a reference to the * NACK structure that will have a reference to the consumer, so that * we are able to remove the entry from both PELs. */ - streamNACK *nack = raxFind(group->pel,buf,sizeof(buf)); - if (nack != raxNotFound) { + void *result; + if (raxFind(group->pel,buf,sizeof(buf),&result)) { + streamNACK *nack = result; raxRemove(group->pel,buf,sizeof(buf),NULL); raxRemove(nack->consumer->pel,buf,sizeof(buf),NULL); streamFreeNACK(nack); @@ -3224,12 +3228,14 @@ void xclaimCommand(client *c) { streamEncodeID(buf,&id); /* Lookup the ID in the group PEL. */ - streamNACK *nack = raxFind(group->pel,buf,sizeof(buf)); + void *result = NULL; + raxFind(group->pel,buf,sizeof(buf),&result); + streamNACK *nack = result; /* Item must exist for us to transfer it to another consumer. */ if (!streamEntryExists(o->ptr,&id)) { /* Clear this entry from the PEL, it no longer exists */ - if (nack != raxNotFound) { + if (nack != NULL) { /* Propagate this change (we are going to delete the NACK). */ streamPropagateXCLAIM(c,c->argv[1],group,c->argv[2],c->argv[j],nack); propagate_last_id = 0; /* Will be propagated by XCLAIM itself. */ @@ -3247,13 +3253,13 @@ void xclaimCommand(client *c) { * entry in the PEL from scratch, so that XCLAIM can also * be used to create entries in the PEL. Useful for AOF * and replication of consumer groups. */ - if (force && nack == raxNotFound) { + if (force && nack == NULL) { /* Create the NACK. */ nack = streamCreateNACK(NULL); raxInsert(group->pel,buf,sizeof(buf),nack,NULL); } - if (nack != raxNotFound) { + if (nack != NULL) { /* We need to check if the minimum idle time requested * by the caller is satisfied by this entry. 
* diff --git a/src/tracking.c b/src/tracking.c index 5a9b114aa80..429770065bd 100644 --- a/src/tracking.c +++ b/src/tracking.c @@ -72,8 +72,10 @@ void disableTracking(client *c) { raxStart(&ri,c->client_tracking_prefixes); raxSeek(&ri,"^",NULL,0); while(raxNext(&ri)) { - bcastState *bs = raxFind(PrefixTable,ri.key,ri.key_len); - serverAssert(bs != raxNotFound); + void *result; + int found = raxFind(PrefixTable,ri.key,ri.key_len,&result); + serverAssert(found); + bcastState *bs = result; raxRemove(bs->clients,(unsigned char*)&c,sizeof(c),NULL); /* Was it the last client? Remove the prefix from the * table. */ @@ -153,14 +155,17 @@ int checkPrefixCollisionsOrReply(client *c, robj **prefixes, size_t numprefix) { /* Set the client 'c' to track the prefix 'prefix'. If the client 'c' is * already registered for the specified prefix, no operation is performed. */ void enableBcastTrackingForPrefix(client *c, char *prefix, size_t plen) { - bcastState *bs = raxFind(PrefixTable,(unsigned char*)prefix,plen); + void *result; + bcastState *bs; /* If this is the first client subscribing to such prefix, create * the prefix in the table. */ - if (bs == raxNotFound) { + if (!raxFind(PrefixTable,(unsigned char*)prefix,plen,&result)) { bs = zmalloc(sizeof(*bs)); bs->keys = raxNew(); bs->clients = raxNew(); raxInsert(PrefixTable,(unsigned char*)prefix,plen,bs,NULL); + } else { + bs = result; } if (raxTryInsert(bs->clients,(unsigned char*)&c,sizeof(c),NULL,NULL)) { if (c->client_tracking_prefixes == NULL) @@ -240,12 +245,15 @@ void trackingRememberKeys(client *tracking, client *executing) { for(int j = 0; j < numkeys; j++) { int idx = keys[j].pos; sds sdskey = executing->argv[idx]->ptr; - rax *ids = raxFind(TrackingTable,(unsigned char*)sdskey,sdslen(sdskey)); - if (ids == raxNotFound) { + void *result; + rax *ids; + if (!raxFind(TrackingTable,(unsigned char*)sdskey,sdslen(sdskey),&result)) { ids = raxNew(); int inserted = raxTryInsert(TrackingTable,(unsigned char*)sdskey, sdslen(sdskey),ids, NULL); serverAssert(inserted == 1); + } else { + ids = result; } if (raxTryInsert(ids,(unsigned char*)&tracking->id,sizeof(tracking->id),NULL,NULL)) TrackingTableTotalItems++; @@ -372,8 +380,9 @@ void trackingInvalidateKey(client *c, robj *keyobj, int bcast) { if (bcast && raxSize(PrefixTable) > 0) trackingRememberKeyToBroadcast(c,(char *)key,keylen); - rax *ids = raxFind(TrackingTable,key,keylen); - if (ids == raxNotFound) return; + void *result; + if (!raxFind(TrackingTable,key,keylen,&result)) return; + rax *ids = result; raxIterator ri; raxStart(&ri,ids); From d8a21c5767b29d53e72c4f52c00c6bd15d8648ec Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Fri, 15 Dec 2023 10:42:53 +0800 Subject: [PATCH 06/58] Unified db rehash method for both standalone and cluster (#12848) After #11695, we added two functions `rehashingStarted` and `rehashingCompleted` to the dict structure. We also registered two handlers for the main database's dict and expire structures. This allows the main database to record the dict in `rehashing` list when rehashing starts. Later, in `serverCron`, the `incrementallyRehash` function is continuously called to perform the rehashing operation. However, currently, when rehashing is completed, `rehashingCompleted` does not remove the dict from the `rehashing` list. This results in the `rehashing` list containing many invalid dicts. Although subsequent cron checks and removes dicts that don't require rehashing, it is still inefficient. 
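A back-pointer kept beside each dict makes the removal O(1); condensed, as a sketch, from the hooks this patch adds:

    /* per-dict metadata, zeroed when the dict is created */
    typedef struct dbDictMetadata {
        listNode *rehashing_node; /* node in server.rehashing, or NULL */
    } dbDictMetadata;

    /* rehashing started: append the dict and remember its node */
    listAddNodeTail(server.rehashing, d);
    metadata->rehashing_node = listLast(server.rehashing);

    /* rehashing completed (or dict emptied/released): unlink in O(1) */
    if (metadata->rehashing_node) {
        listDelNode(server.rehashing, metadata->rehashing_node);
        metadata->rehashing_node = NULL;
    }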
This PR implements the functionality to remove the dict from the `rehashing` list in `rehashingCompleted`. This is achieved by adding `metadata` to the dict structure, which keeps track of its position in the `rehashing` list, allowing for quick removal. This approach avoids storing duplicate dicts in the `rehashing` list. Additionally, there are other modifications: 1. Whether in standalone or cluster mode, the dict in database is inserted into the rehashing linked list when rehashing starts. This eliminates the need to distinguish between standalone and cluster mode in `incrementallyRehash`. The function only needs to focus on the dicts in the `rehashing` list that require rehashing. 2. `rehashing` list is moved from per-database to Redis server level. This decouples `incrementallyRehash` from the database ID, and in standalone mode, there is no need to iterate over all databases, avoiding unnecessary access to databases that do not require rehashing. In the future, even if unsharded-cluster mode supports multiple databases, there will be no risk involved. 3. The insertion and removal operations of dict structures in the `rehashing` list are decoupled from `activerehashing` config. `activerehashing` only controls whether `incrementallyRehash` is executed in serverCron. There is no need for additional steps when modifying the `activerehashing` switch, as in #12705. --- src/db.c | 22 +++++--- src/dict.c | 16 ++++-- src/dict.h | 11 +++- src/lazyfree.c | 14 +++++ src/server.c | 147 +++++++++++++++++++++---------------------------- src/server.h | 9 ++- 6 files changed, 115 insertions(+), 104 deletions(-) diff --git a/src/db.c b/src/db.c index 5ab8d32c19b..50d6bd46030 100644 --- a/src/db.c +++ b/src/db.c @@ -669,9 +669,21 @@ long long emptyDbStructure(redisDb *dbarray, int dbnum, int async, if (async) { emptyDbAsync(&dbarray[j]); } else { + dbDictMetadata *metadata; for (int k = 0; k < dbarray[j].dict_count; k++) { dictEmpty(dbarray[j].dict[k],callback); + metadata = (dbDictMetadata *)dictMetadata(dbarray[j].dict[k]); + if (metadata->rehashing_node) { + listDelNode(server.rehashing, metadata->rehashing_node); + metadata->rehashing_node = NULL; + } + dictEmpty(dbarray[j].expires[k],callback); + metadata = (dbDictMetadata *)dictMetadata(dbarray[j].expires[k]); + if (metadata->rehashing_node) { + listDelNode(server.rehashing, metadata->rehashing_node); + metadata->rehashing_node = NULL; + } } } /* Because all keys of database are removed, reset average ttl. */ @@ -682,8 +694,6 @@ long long emptyDbStructure(redisDb *dbarray, int dbnum, int async, dbarray[j].sub_dict[subdict].key_count = 0; dbarray[j].sub_dict[subdict].resize_cursor = -1; if (server.cluster_enabled) { - if (dbarray[j].sub_dict[subdict].rehashing) - listEmpty(dbarray[j].sub_dict[subdict].rehashing); dbarray[j].sub_dict[subdict].bucket_count = 0; unsigned long long *slot_size_index = dbarray[j].sub_dict[subdict].slot_size_index; memset(slot_size_index, 0, sizeof(unsigned long long) * (CLUSTER_SLOTS + 1)); @@ -757,7 +767,6 @@ redisDb *initTempDb(void) { tempDb[i].dict = dictCreateMultiple(&dbDictType, tempDb[i].dict_count); tempDb[i].expires = dictCreateMultiple(&dbExpiresDictType, tempDb[i].dict_count); for (dbKeyType subdict = DB_MAIN; subdict <= DB_EXPIRES; subdict++) { - tempDb[i].sub_dict[subdict].rehashing = listCreate(); tempDb[i].sub_dict[subdict].slot_size_index = server.cluster_enabled ? 
zcalloc(sizeof(unsigned long long) * (CLUSTER_SLOTS + 1)) : NULL; } } @@ -779,7 +788,6 @@ void discardTempDb(redisDb *tempDb, void(callback)(dict*)) { zfree(tempDb[i].dict); zfree(tempDb[i].expires); for (dbKeyType subdict = DB_MAIN; subdict <= DB_EXPIRES; subdict++) { - listRelease(tempDb[i].sub_dict[subdict].rehashing); if (server.cluster_enabled) { zfree(tempDb[i].sub_dict[subdict].slot_size_index); } @@ -1445,7 +1453,7 @@ size_t dbMemUsage(redisDb *db, dbKeyType keyType) { unsigned long long keys_count = dbSize(db, keyType); mem += keys_count * dictEntryMemUsage() + dbBuckets(db, keyType) * sizeof(dictEntry*) + - db->dict_count * sizeof(dict); + db->dict_count * (sizeof(dict) + dictMetadataSize(db->dict[0])); if (keyType == DB_MAIN) { mem+=keys_count * sizeof(robj); } @@ -1890,7 +1898,6 @@ int dbSwapDatabases(int id1, int id2) { db1->expires_cursor = db2->expires_cursor; db1->dict_count = db2->dict_count; for (dbKeyType subdict = DB_MAIN; subdict <= DB_EXPIRES; subdict++) { - db1->sub_dict[subdict].rehashing = db2->sub_dict[subdict].rehashing; db1->sub_dict[subdict].key_count = db2->sub_dict[subdict].key_count; db1->sub_dict[subdict].bucket_count = db2->sub_dict[subdict].bucket_count; db1->sub_dict[subdict].non_empty_slots = db2->sub_dict[subdict].non_empty_slots; @@ -1904,7 +1911,6 @@ int dbSwapDatabases(int id1, int id2) { db2->expires_cursor = aux.expires_cursor; db2->dict_count = aux.dict_count; for (dbKeyType subdict = DB_MAIN; subdict <= DB_EXPIRES; subdict++) { - db2->sub_dict[subdict].rehashing = aux.sub_dict[subdict].rehashing; db2->sub_dict[subdict].key_count = aux.sub_dict[subdict].key_count; db2->sub_dict[subdict].bucket_count = aux.sub_dict[subdict].bucket_count; db2->sub_dict[subdict].non_empty_slots = aux.sub_dict[subdict].non_empty_slots; @@ -1950,7 +1956,6 @@ void swapMainDbWithTempDb(redisDb *tempDb) { activedb->expires_cursor = newdb->expires_cursor; activedb->dict_count = newdb->dict_count; for (dbKeyType subdict = DB_MAIN; subdict <= DB_EXPIRES; subdict++) { - activedb->sub_dict[subdict].rehashing = newdb->sub_dict[subdict].rehashing; activedb->sub_dict[subdict].key_count = newdb->sub_dict[subdict].key_count; activedb->sub_dict[subdict].bucket_count = newdb->sub_dict[subdict].bucket_count; activedb->sub_dict[subdict].non_empty_slots = newdb->sub_dict[subdict].non_empty_slots; @@ -1964,7 +1969,6 @@ void swapMainDbWithTempDb(redisDb *tempDb) { newdb->expires_cursor = aux.expires_cursor; newdb->dict_count = aux.dict_count; for (dbKeyType subdict = DB_MAIN; subdict <= DB_EXPIRES; subdict++) { - newdb->sub_dict[subdict].rehashing = aux.sub_dict[subdict].rehashing; newdb->sub_dict[subdict].key_count = aux.sub_dict[subdict].key_count; newdb->sub_dict[subdict].bucket_count = aux.sub_dict[subdict].bucket_count; newdb->sub_dict[subdict].non_empty_slots = aux.sub_dict[subdict].non_empty_slots; diff --git a/src/dict.c b/src/dict.c index f41575a9981..328c2dc8105 100644 --- a/src/dict.c +++ b/src/dict.c @@ -181,7 +181,11 @@ static void _dictReset(dict *d, int htidx) /* Create a new hash table */ dict *dictCreate(dictType *type) { - dict *d = zmalloc(sizeof(*d)); + size_t metasize = type->dictMetadataBytes ? type->dictMetadataBytes(NULL) : 0; + dict *d = zmalloc(sizeof(*d)+metasize); + if (metasize > 0) { + memset(dictMetadata(d), 0, metasize); + } _dictInit(d,type); return d; } @@ -399,10 +403,10 @@ long long timeInMilliseconds(void) { return (((long long)tv.tv_sec)*1000)+(tv.tv_usec/1000); } -/* Rehash in ms+"delta" milliseconds. 
The value of "delta" is larger - * than 0, and is smaller than 1 in most cases. The exact upper bound +/* Rehash in us+"delta" microseconds. The value of "delta" is larger + * than 0, and is smaller than 1000 in most cases. The exact upper bound * depends on the running time of dictRehash(d,100).*/ -int dictRehashMilliseconds(dict *d, unsigned int ms) { +int dictRehashMicroseconds(dict *d, uint64_t us) { if (d->pauserehash > 0) return 0; monotime timer; @@ -411,7 +415,7 @@ int dictRehashMilliseconds(dict *d, unsigned int ms) { while(dictRehash(d,100)) { rehashes += 100; - if (elapsedMs(timer) >= ms) break; + if (elapsedUs(timer) >= us) break; } return rehashes; } @@ -1714,7 +1718,7 @@ int dictTest(int argc, char **argv, int flags) { /* Wait for rehashing. */ while (dictIsRehashing(dict)) { - dictRehashMilliseconds(dict,100); + dictRehashMicroseconds(dict,100*1000); } start_benchmark(); diff --git a/src/dict.h b/src/dict.h index 334dc441e90..3d4de3be253 100644 --- a/src/dict.h +++ b/src/dict.h @@ -60,6 +60,9 @@ typedef struct dictType { /* Invoked at the end of dict initialization/rehashing of all the entries from old to new ht. Both ht still exists * and are cleaned up after this callback. */ void (*rehashingCompleted)(dict *d); + /* Allow a dict to carry extra caller-defined metadata. The + * extra memory is initialized to 0 when a dict is allocated. */ + size_t (*dictMetadataBytes)(dict *d); /* Flags */ /* The 'no_value' flag, if set, indicates that values are not used, i.e. the * dict is a set. When this flag is set, it's not possible to access the @@ -88,6 +91,7 @@ struct dict { /* Keep small vars at end for optimal (minimal) struct padding */ int16_t pauserehash; /* If >0 rehashing is paused (<0 indicates coding error) */ signed char ht_size_exp[2]; /* exponent of size. (size = 1<type->keyCompare((d), key1, key2) : \ (key1) == (key2)) +#define dictMetadata(d) (&(d)->metadata) +#define dictMetadataSize(d) ((d)->type->dictMetadataBytes \ + ? (d)->type->dictMetadataBytes(d) : 0) + #define dictHashKey(d, key) ((d)->type->hashFunction(key)) #define dictBuckets(d) (DICTHT_SIZE((d)->ht_size_exp[0])+DICTHT_SIZE((d)->ht_size_exp[1])) #define dictSize(d) ((d)->ht_used[0]+(d)->ht_used[1]) @@ -166,7 +174,6 @@ dict *dictCreate(dictType *type); dict **dictCreateMultiple(dictType *type, int count); int dictExpand(dict *d, unsigned long size); int dictTryExpand(dict *d, unsigned long size); -void *dictMetadata(dict *d); int dictAdd(dict *d, void *key, void *val); dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing); void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing); @@ -215,7 +222,7 @@ uint64_t dictGenCaseHashFunction(const unsigned char *buf, size_t len); void dictEmpty(dict *d, void(callback)(dict*)); void dictSetResizeEnabled(dictResizeEnable enable); int dictRehash(dict *d, int n); -int dictRehashMilliseconds(dict *d, unsigned int ms); +int dictRehashMicroseconds(dict *d, uint64_t us); void dictSetHashFunctionSeed(uint8_t *seed); uint8_t *dictGetHashFunctionSeed(void); unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, void *privdata); diff --git a/src/lazyfree.c b/src/lazyfree.c index 2a6d1b7e16d..1b58bb78e1c 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -179,6 +179,20 @@ void freeObjAsync(robj *key, robj *obj, int dbid) { * create a new empty set of hash tables and scheduling the old ones for * lazy freeing. 
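 * Any rehashing_node the old tables own in server.rehashing is unlinked
 * first, so incremental rehashing can never touch a dict that has been
 * handed to the lazyfree thread.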
*/ void emptyDbAsync(redisDb *db) { + dbDictMetadata *metadata; + for (int i = 0; i < db->dict_count; i++) { + metadata = (dbDictMetadata *)dictMetadata(db->dict[i]); + if (metadata->rehashing_node) { + listDelNode(server.rehashing, metadata->rehashing_node); + metadata->rehashing_node = NULL; + } + + metadata = (dbDictMetadata *)dictMetadata(db->expires[i]); + if (metadata->rehashing_node) { + listDelNode(server.rehashing, metadata->rehashing_node); + metadata->rehashing_node = NULL; + } + } dict **oldDict = db->dict; dict **oldExpires = db->expires; atomicIncr(lazyfree_objects,dbSize(db, DB_MAIN)); diff --git a/src/server.c b/src/server.c index 25a20a49bcc..361a0928ff7 100644 --- a/src/server.c +++ b/src/server.c @@ -419,52 +419,61 @@ int dictExpandAllowed(size_t moreMem, double usedRatio) { } } -/* Updates the bucket count in cluster-mode for the given dictionary in a DB. bucket count - * incremented with the new ht size during the rehashing phase. - * And also adds dictionary to the rehashing list in cluster mode, which allows us +/* Adds dictionary to the rehashing list, which allows us * to quickly find rehash targets during incremental rehashing. - * - * In non-cluster mode, bucket count can be retrieved directly from single dict bucket and - * we don't need this list as there is only one dictionary per DB. */ -void dictRehashingStarted(dict *d) { - if (!server.cluster_enabled) return; + * + * Updates the bucket count in cluster-mode for the given dictionary in a DB, bucket count + * incremented with the new ht size during the rehashing phase. In non-cluster mode, + * bucket count can be retrieved directly from single dict bucket. */ +void dictRehashingStarted(dict *d, dbKeyType keyType) { + dbDictMetadata *metadata = (dbDictMetadata *)dictMetadata(d); + listAddNodeTail(server.rehashing, d); + metadata->rehashing_node = listLast(server.rehashing); + if (!server.cluster_enabled) return; unsigned long long from, to; dictRehashingInfo(d, &from, &to); - server.db[0].sub_dict[DB_MAIN].bucket_count += to; /* Started rehashing (Add the new ht size) */ - if (from == 0) return; /* No entries are to be moved. */ - if (server.activerehashing) { - listAddNodeTail(server.db[0].sub_dict[DB_MAIN].rehashing, d); - } + server.db[0].sub_dict[keyType].bucket_count += to; /* Started rehashing (Add the new ht size) */ } -/* Updates the bucket count for the given dictionary in a DB. It removes +/* Remove dictionary from the rehashing list. + * + * Updates the bucket count for the given dictionary in a DB. It removes * the old ht size of the dictionary from the total sum of buckets for a DB. 
*/ -void dictRehashingCompleted(dict *d) { +void dictRehashingCompleted(dict *d, dbKeyType keyType) { + dbDictMetadata *metadata = (dbDictMetadata *)dictMetadata(d); + if (metadata->rehashing_node) { + listDelNode(server.rehashing, metadata->rehashing_node); + metadata->rehashing_node = NULL; + } + if (!server.cluster_enabled) return; unsigned long long from, to; dictRehashingInfo(d, &from, &to); - server.db[0].sub_dict[DB_MAIN].bucket_count -= from; /* Finished rehashing (Remove the old ht size) */ + server.db[0].sub_dict[keyType].bucket_count -= from; /* Finished rehashing (Remove the old ht size) */ } -void dictRehashingStartedForExpires(dict *d) { - if (!server.cluster_enabled) return; +void dbDictRehashingStarted(dict *d) { + dictRehashingStarted(d, DB_MAIN); +} - unsigned long long from, to; - dictRehashingInfo(d, &from, &to); - server.db[0].sub_dict[DB_EXPIRES].bucket_count += to; /* Started rehashing (Add the new ht size) */ - if (from == 0) return; /* No entries are to be moved. */ - if (server.activerehashing) { - listAddNodeTail(server.db[0].sub_dict[DB_EXPIRES].rehashing, d); - } +void dbDictRehashingCompleted(dict *d) { + dictRehashingCompleted(d, DB_MAIN); } -void dictRehashingCompletedForExpires(dict *d) { - if (!server.cluster_enabled) return; +void dbExpiresRehashingStarted(dict *d) { + dictRehashingStarted(d, DB_EXPIRES); +} - unsigned long long from, to; - dictRehashingInfo(d, &from, &to); - server.db[0].sub_dict[DB_EXPIRES].bucket_count -= from; /* Finished rehashing (Remove the old ht size) */ +void dbExpiresRehashingCompleted(dict *d) { + dictRehashingCompleted(d, DB_EXPIRES); +} + +/* Returns the size of the DB dict metadata in bytes. */ +size_t dbDictMetadataSize(dict *d) { + UNUSED(d); + /* NOTICE: this also affects overhead_ht_main and overhead_ht_expires in getMemoryOverheadData. */ + return sizeof(dbDictMetadata); } /* Generic hash table type where keys are Redis Objects, Values @@ -522,8 +531,9 @@ dictType dbDictType = { dictSdsDestructor, /* key destructor */ dictObjectDestructor, /* val destructor */ dictExpandAllowed, /* allow to expand */ - dictRehashingStarted, - dictRehashingCompleted, + dbDictRehashingStarted, + dbDictRehashingCompleted, + dbDictMetadataSize, }; /* Db->expires */ @@ -535,8 +545,9 @@ dictType dbExpiresDictType = { NULL, /* key destructor */ NULL, /* val destructor */ dictExpandAllowed, /* allow to expand */ - dictRehashingStartedForExpires, - dictRehashingCompletedForExpires, + dbExpiresRehashingStarted, + dbExpiresRehashingCompleted, + dbDictMetadataSize, }; /* Command table. sds string -> command struct pointer. */ @@ -683,45 +694,23 @@ void tryResizeHashTables(int dbid) { * * The function returns 1 if some rehashing was performed, otherwise 0 * is returned. */ -int incrementallyRehash(int dbid) { - /* Rehash main and expire dictionary . */ - if (server.cluster_enabled) { - listNode *node, *nextNode; - monotime timer; - elapsedStart(&timer); - /* Our goal is to rehash as many slot specific dictionaries as we can before reaching predefined threshold, - * while removing those that already finished rehashing from the queue. 
*/ - for (dbKeyType subdict = DB_MAIN; subdict <= DB_EXPIRES; subdict++) { - serverLog(LL_DEBUG,"Rehashing list length: %lu", listLength(server.db[dbid].sub_dict[subdict].rehashing)); - while ((node = listFirst(server.db[dbid].sub_dict[subdict].rehashing))) { - if (dictIsRehashing((dict *) listNodeValue(node))) { - dictRehashMilliseconds(listNodeValue(node), INCREMENTAL_REHASHING_THRESHOLD_MS); - if (elapsedMs(timer) >= INCREMENTAL_REHASHING_THRESHOLD_MS) { - return 1; /* Reached the time limit. */ - } - } else { /* It is possible that rehashing has already completed for this dictionary, simply remove it from the queue. */ - nextNode = listNextNode(node); - listDelNode(server.db[dbid].sub_dict[subdict].rehashing, node); - node = nextNode; - } - } - } - /* When cluster mode is disabled, only one dict is used for the entire DB and rehashing list isn't populated. */ - } else { - /* Rehash main dict. */ - dict *main_dict = server.db[dbid].dict[0]; - if (dictIsRehashing(main_dict)) { - dictRehashMilliseconds(main_dict, INCREMENTAL_REHASHING_THRESHOLD_MS); - return 1; /* already used our millisecond for this loop... */ - } - /* Rehash expires. */ - dict *expires_dict = server.db[dbid].expires[0]; - if (dictIsRehashing(expires_dict)) { - dictRehashMilliseconds(expires_dict, INCREMENTAL_REHASHING_THRESHOLD_MS); - return 1; /* already used our millisecond for this loop... */ +int incrementallyRehash(void) { + if (listLength(server.rehashing) == 0) return 0; + serverLog(LL_DEBUG,"Rehashing list length: %lu", listLength(server.rehashing)); + + /* Our goal is to rehash as many dictionaries as we can before reaching predefined threshold, + * after each dictionary completes rehashing, it removes itself from the list. */ + listNode *node; + monotime timer; + elapsedStart(&timer); + while ((node = listFirst(server.rehashing))) { + uint64_t elapsed_us = elapsedUs(timer); + if (elapsed_us >= INCREMENTAL_REHASHING_THRESHOLD_US) { + break; /* Reached the time limit. */ } + dictRehashMicroseconds(listNodeValue(node), INCREMENTAL_REHASHING_THRESHOLD_US - elapsed_us); } - return 0; + return 1; } /* This function is called once a background process of some kind terminates, @@ -1162,7 +1151,6 @@ void databasesCron(void) { * DB we'll be able to start from the successive in the next * cron loop iteration. */ static unsigned int resize_db = 0; - static unsigned int rehash_db = 0; int dbs_per_call = CRON_DBS_PER_CALL; int j; @@ -1177,18 +1165,7 @@ void databasesCron(void) { /* Rehash */ if (server.activerehashing) { - for (j = 0; j < dbs_per_call; j++) { - int work_done = incrementallyRehash(rehash_db); - if (work_done) { - /* If the function did some work, stop here, we'll do - * more at the next cron loop. */ - break; - } else { - /* If this db didn't need rehash, we'll try the next one. */ - rehash_db++; - rehash_db %= server.dbnum; - } - } + incrementallyRehash(); } } } @@ -2654,7 +2631,6 @@ void makeThreadKillable(void) { /* When adding fields, please check the initTempDb related logic. */ void initDbState(redisDb *db){ for (dbKeyType subdict = DB_MAIN; subdict <= DB_EXPIRES; subdict++) { - db->sub_dict[subdict].rehashing = listCreate(); db->sub_dict[subdict].non_empty_slots = 0; db->sub_dict[subdict].key_count = 0; db->sub_dict[subdict].resize_cursor = -1; @@ -2754,6 +2730,7 @@ void initServer(void) { initDbState(&server.db[j]); listSetFreeMethod(server.db[j].defrag_later,(void (*)(void*))sdsfree); } + server.rehashing = listCreate(); evictionPoolAlloc(); /* Initialize the LRU keys pool. 
    server.pubsub_channels = dictCreate(&keylistDictType);
    server.pubsub_patterns = dictCreate(&keylistDictType);
diff --git a/src/server.h b/src/server.h
index a0b028f00bc..c0a49f538e8 100644
--- a/src/server.h
+++ b/src/server.h
@@ -137,7 +137,7 @@ struct hdr_histogram;
 #define CONFIG_BINDADDR_MAX 16
 #define CONFIG_MIN_RESERVED_FDS 32
 #define CONFIG_DEFAULT_PROC_TITLE_TEMPLATE "{title} {listen-addr} {server-mode}"
-#define INCREMENTAL_REHASHING_THRESHOLD_MS 1
+#define INCREMENTAL_REHASHING_THRESHOLD_US 1000
 
 /* Bucket sizes for client eviction pools. Each bucket stores clients with
  * memory usage of up to twice the size of the bucket below it. */
@@ -971,7 +971,6 @@ typedef struct replBufBlock {
 
 /* When adding fields, please check the swap db related logic. */
 typedef struct dbDictState {
-    list *rehashing;     /* List of dictionaries in this DB that are currently rehashing. */
     int resize_cursor;   /* Cron job uses this cursor to gradually resize dictionaries (only used for cluster-enabled). */
     int non_empty_slots; /* The number of non-empty slots. */
     unsigned long long key_count; /* Total number of keys in this DB. */
@@ -984,6 +983,11 @@ typedef enum dbKeyType {
     DB_EXPIRES
 } dbKeyType;
 
+/* Dict metadata for a database, used to record the position in the rehashing list. */
+typedef struct dbDictMetadata {
+    listNode *rehashing_node; /* list node in rehashing list */
+} dbDictMetadata;
+
 /* Redis database representation. There are multiple databases identified
  * by integers from 0 (the default database) up to the max configured
  * database. The database number is the 'id' field in the structure. */
@@ -1569,6 +1573,7 @@ struct redisServer {
     int hz;                     /* serverCron() calls frequency in hertz */
     int in_fork_child;          /* indication that this is a fork child */
     redisDb *db;
+    list *rehashing;            /* List of dictionaries in DBs that are currently rehashing. */
     dict *commands;             /* Command table */
     dict *orig_commands;        /* Command table before command renaming. */
     aeEventLoop *el;

From adbb534f034f94453c619b106029500823f34cf6 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Fri, 15 Dec 2023 23:22:02 +0800
Subject: [PATCH 07/58] Always keep an in-memory history of all commands in redis-cli (#12862)

redis-cli avoids saving sensitive commands in its history (it doesn't
persist them to the history file). This means that if you made a typo
and want to re-run the command, you can't easily do that.

This PR changes that to keep an in-memory history of all the redacted
commands, and just not persist them to disk. This way we can press the
up arrow and re-try the command freely; it simply won't survive a
redis-cli restart.
---
 deps/linenoise/README.markdown |  2 +-
 deps/linenoise/linenoise.c     | 32 +++++++++++++++++++++++++++-----
 deps/linenoise/linenoise.h     |  2 +-
 src/redis-cli.c                | 11 ++++++-----
 4 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/deps/linenoise/README.markdown b/deps/linenoise/README.markdown
index 1afea2ae65c..b3752da162b 100644
--- a/deps/linenoise/README.markdown
+++ b/deps/linenoise/README.markdown
@@ -108,7 +108,7 @@ to search and re-edit already inserted lines of text.
The followings are the history API calls: - int linenoiseHistoryAdd(const char *line); + int linenoiseHistoryAdd(const char *line, int is_sensitive); int linenoiseHistorySetMaxLen(int len); int linenoiseHistorySave(const char *filename); int linenoiseHistoryLoad(const char *filename); diff --git a/deps/linenoise/linenoise.c b/deps/linenoise/linenoise.c index dd86abe86e2..75306390ecc 100644 --- a/deps/linenoise/linenoise.c +++ b/deps/linenoise/linenoise.c @@ -134,6 +134,8 @@ static int atexit_registered = 0; /* Register atexit just 1 time. */ static int history_max_len = LINENOISE_DEFAULT_HISTORY_MAX_LEN; static int history_len = 0; static char **history = NULL; +static int *history_sensitive = NULL; /* An array records whether each line in + * history is sensitive. */ /* The linenoiseState structure represents the state during line editing. * We pass this state to functions implementing specific editing @@ -177,7 +179,7 @@ enum KEY_ACTION{ }; static void linenoiseAtExit(void); -int linenoiseHistoryAdd(const char *line); +int linenoiseHistoryAdd(const char *line, int is_sensitive); static void refreshLine(struct linenoiseState *l); /* Debugging macro. */ @@ -818,7 +820,7 @@ static int linenoiseEdit(int stdin_fd, int stdout_fd, char *buf, size_t buflen, /* The latest history entry is always our current buffer, that * initially is just an empty string. */ - linenoiseHistoryAdd(""); + linenoiseHistoryAdd("", 0); if (write(l.ofd,prompt,l.plen) == -1) return -1; while(1) { @@ -1112,6 +1114,7 @@ static void freeHistory(void) { for (j = 0; j < history_len; j++) free(history[j]); free(history); + free(history_sensitive); } } @@ -1128,7 +1131,7 @@ static void linenoiseAtExit(void) { * histories, but will work well for a few hundred of entries. * * Using a circular buffer is smarter, but a bit more complex to handle. */ -int linenoiseHistoryAdd(const char *line) { +int linenoiseHistoryAdd(const char *line, int is_sensitive) { char *linecopy; if (history_max_len == 0) return 0; @@ -1137,7 +1140,14 @@ int linenoiseHistoryAdd(const char *line) { if (history == NULL) { history = malloc(sizeof(char*)*history_max_len); if (history == NULL) return 0; + history_sensitive = malloc(sizeof(int)*history_max_len); + if (history_sensitive == NULL) { + free(history); + history = NULL; + return 0; + } memset(history,0,(sizeof(char*)*history_max_len)); + memset(history_sensitive,0,(sizeof(int)*history_max_len)); } /* Don't add duplicated lines. */ @@ -1150,9 +1160,11 @@ int linenoiseHistoryAdd(const char *line) { if (history_len == history_max_len) { free(history[0]); memmove(history,history+1,sizeof(char*)*(history_max_len-1)); + memmove(history_sensitive,history_sensitive+1,sizeof(int)*(history_max_len-1)); history_len--; } history[history_len] = linecopy; + history_sensitive[history_len] = is_sensitive; history_len++; return 1; } @@ -1163,6 +1175,7 @@ int linenoiseHistoryAdd(const char *line) { * than the amount of items already inside the history. */ int linenoiseHistorySetMaxLen(int len) { char **new; + int *new_sensitive; if (len < 1) return 0; if (history) { @@ -1170,6 +1183,11 @@ int linenoiseHistorySetMaxLen(int len) { new = malloc(sizeof(char*)*len); if (new == NULL) return 0; + new_sensitive = malloc(sizeof(int)*len); + if (new_sensitive == NULL) { + free(new); + return 0; + } /* If we can't copy everything, free the elements we'll not use. 
 */
         if (len < tocopy) {
             int j;
 
             for (j = 0; j < (history_len-len); j++) free(history[j]);
             tocopy = len;
         }
         memset(new,0,sizeof(char*)*len);
+        memset(new_sensitive,0,sizeof(int)*len);
         memcpy(new,history+(history_len-tocopy), sizeof(char*)*tocopy);
+        memcpy(new_sensitive,history_sensitive+(history_len-tocopy), sizeof(int)*tocopy);
         free(history);
+        free(history_sensitive);
         history = new;
+        history_sensitive = new_sensitive;
     }
     history_max_len = len;
     if (history_len > history_max_len)
@@ -1201,7 +1223,7 @@ int linenoiseHistorySave(const char *filename) {
     if (fp == NULL) return -1;
     fchmod(fileno(fp),S_IRUSR|S_IWUSR);
     for (j = 0; j < history_len; j++)
-        fprintf(fp,"%s\n",history[j]);
+        if (!history_sensitive[j]) fprintf(fp,"%s\n",history[j]);
     fclose(fp);
     return 0;
 }
@@ -1223,7 +1245,7 @@ int linenoiseHistoryLoad(const char *filename) {
         p = strchr(buf,'\r');
         if (!p) p = strchr(buf,'\n');
         if (p) *p = '\0';
-        linenoiseHistoryAdd(buf);
+        linenoiseHistoryAdd(buf, 0);
     }
     fclose(fp);
     return 0;
diff --git a/deps/linenoise/linenoise.h b/deps/linenoise/linenoise.h
index 6dfee73bcd4..beac6df467a 100644
--- a/deps/linenoise/linenoise.h
+++ b/deps/linenoise/linenoise.h
@@ -58,7 +58,7 @@ void linenoiseAddCompletion(linenoiseCompletions *, const char *);
 
 char *linenoise(const char *prompt);
 void linenoiseFree(void *ptr);
-int linenoiseHistoryAdd(const char *line);
+int linenoiseHistoryAdd(const char *line, int is_sensitive);
 int linenoiseHistorySetMaxLen(int len);
 int linenoiseHistorySave(const char *filename);
 int linenoiseHistoryLoad(const char *filename);
diff --git a/src/redis-cli.c b/src/redis-cli.c
index f18961eaf9d..b9f44bb1595 100644
--- a/src/redis-cli.c
+++ b/src/redis-cli.c
@@ -3394,7 +3394,7 @@ static void repl(void) {
             if (argv == NULL) {
                 printf("Invalid argument(s)\n");
                 fflush(stdout);
-                if (history) linenoiseHistoryAdd(line);
+                if (history) linenoiseHistoryAdd(line, 0);
                 if (historyfile) linenoiseHistorySave(historyfile);
                 linenoiseFree(line);
                 continue;
@@ -3420,10 +3420,11 @@ static void repl(void) {
                 repeat = 1;
             }
 
-            if (!isSensitiveCommand(argc - skipargs, argv + skipargs)) {
-                if (history) linenoiseHistoryAdd(line);
-                if (historyfile) linenoiseHistorySave(historyfile);
-            }
+            /* Always keep in-memory history. But for commands with sensitive information,
+             * avoid writing them to the history file. */
+            int is_sensitive = isSensitiveCommand(argc - skipargs, argv + skipargs);
+            if (history) linenoiseHistoryAdd(line, is_sensitive);
+            if (!is_sensitive && historyfile) linenoiseHistorySave(historyfile);
 
             if (strcasecmp(argv[0],"quit") == 0 ||
                 strcasecmp(argv[0],"exit") == 0)

From 5dc631d880eea750d394651d70822ffabe13caae Mon Sep 17 00:00:00 2001
From: Wen Hui
Date: Sun, 17 Dec 2023 07:02:53 -0500
Subject: [PATCH 08/58] Add missing test cases for hash commands (#12851)

We don't have a test for HGETALL against a non-existing key, so this
adds one to the test suite; along with it, wrong-type cases are added
for the other hash commands that were missing them.
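For illustration only — not part of the patch — a minimal hiredis sketch of the behavior these tests assert, assuming a server listening on 127.0.0.1:6379; the key name is arbitrary and the header path may differ by installation:
```c
#include <stdio.h>
#include <hiredis/hiredis.h>

int main(void) {
    redisContext *c = redisConnect("127.0.0.1", 6379);
    if (c == NULL || c->err) return 1;

    /* Make the key hold a list, then run a hash command against it. */
    freeReplyObject(redisCommand(c, "DEL wrongtype"));
    freeReplyObject(redisCommand(c, "LPUSH wrongtype elem"));

    redisReply *reply = redisCommand(c, "HGETALL wrongtype");
    if (reply && reply->type == REDIS_REPLY_ERROR)
        printf("%s\n", reply->str); /* WRONGTYPE Operation against a key ... */

    freeReplyObject(reply);
    redisFree(c);
    return 0;
}
```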
---
 tests/unit/type/hash.tcl | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/unit/type/hash.tcl b/tests/unit/type/hash.tcl
index 2a26f445582..b3d7ddc771d 100644
--- a/tests/unit/type/hash.tcl
+++ b/tests/unit/type/hash.tcl
@@ -363,6 +363,11 @@ start_server {tags {"hash"}} {
         assert_error "WRONGTYPE Operation against a key*" {r hvals wrongtype}
         assert_error "WRONGTYPE Operation against a key*" {r hkeys wrongtype}
         assert_error "WRONGTYPE Operation against a key*" {r hexists wrongtype field1}
+        assert_error "WRONGTYPE Operation against a key*" {r hset wrongtype field1 val1}
+        assert_error "WRONGTYPE Operation against a key*" {r hmset wrongtype field1 val1 field2 val2}
+        assert_error "WRONGTYPE Operation against a key*" {r hsetnx wrongtype field1 val1}
+        assert_error "WRONGTYPE Operation against a key*" {r hlen wrongtype}
+        assert_error "WRONGTYPE Operation against a key*" {r hscan wrongtype 0}
     }
 
     test {HMGET - small hash} {
@@ -429,6 +434,11 @@ start_server {tags {"hash"}} {
         lsort [r hgetall bighash]
     } [lsort [array get bighash]]
 
+    test {HGETALL against non-existing key} {
+        r del htest
+        r hgetall htest
+    } {}
+
     test {HDEL and return value} {
         set rv {}
         lappend rv [r hdel smallhash nokey]

From 23e980e77a8e3d9e1f1f0a108106f3648c7b674a Mon Sep 17 00:00:00 2001
From: Binbin
Date: Thu, 21 Dec 2023 19:51:46 +0800
Subject: [PATCH 09/58] Move cliVersion to cli_common and add --version support for redis-check-aof (#10856)

This lets us see which version of Redis the tool is part of, similarly
to redis-cli, redis-benchmark and redis-check-rdb.

redis-check-rdb and redis-check-aof are actually symlinks to redis, so
they directly use getVersion in server; the format became:
```
{title} v={redis_version} sha={sha}:{dirty} malloc={malloc} bits={bits} build={build}
```

Move cliVersion into cli_common; redis-cli and redis-benchmark will use
it, and the format is unchanged:
```
{title} {redis_version} (git:{sha})
```
---
 src/cli_common.c      | 18 ++++++++++++++++++
 src/cli_common.h      |  2 ++
 src/redis-benchmark.c | 19 +------------------
 src/redis-check-aof.c |  8 ++++++++
 src/redis-check-rdb.c | 16 +---------------
 src/redis-cli.c       | 18 ------------------
 src/server.c          | 15 +++++++++++----
 src/server.h          |  1 +
 8 files changed, 42 insertions(+), 55 deletions(-)

diff --git a/src/cli_common.c b/src/cli_common.c
index 421e7d34a39..7d98addfc9b 100644
--- a/src/cli_common.c
+++ b/src/cli_common.c
@@ -30,6 +30,8 @@
 #include "fmacros.h"
 #include "cli_common.h"
+#include "version.h"
+
 #include
 #include
 #include
@@ -48,6 +50,9 @@
 
 #define UNUSED(V) ((void) V)
 
+char *redisGitSHA1(void);
+char *redisGitDirty(void);
+
 /* Wrapper around redisSecureConnection to avoid hiredis_ssl dependencies if
  * not building with TLS support. */
@@ -406,3 +411,16 @@ sds escapeJsonString(sds s, const char *p, size_t len) {
     }
     return sdscatlen(s,"\"",1);
 }
+
+sds cliVersion(void) {
+    sds version = sdscatprintf(sdsempty(), "%s", REDIS_VERSION);
+
+    /* Add git commit and working tree status when available.
*/ + if (strtoll(redisGitSHA1(),NULL,16)) { + version = sdscatprintf(version, " (git:%s", redisGitSHA1()); + if (strtoll(redisGitDirty(),NULL,10)) + version = sdscatprintf(version, "-dirty"); + version = sdscat(version, ")"); + } + return version; +} diff --git a/src/cli_common.h b/src/cli_common.h index cffdee61d89..3377eaf3a8f 100644 --- a/src/cli_common.h +++ b/src/cli_common.h @@ -51,4 +51,6 @@ void freeCliConnInfo(cliConnInfo connInfo); sds escapeJsonString(sds s, const char *p, size_t len); +sds cliVersion(void); + #endif /* __CLICOMMON_H */ diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c index ac2a313e8c0..05d054de162 100644 --- a/src/redis-benchmark.c +++ b/src/redis-benchmark.c @@ -29,7 +29,6 @@ */ #include "fmacros.h" -#include "version.h" #include #include @@ -186,8 +185,6 @@ typedef struct redisConfig { } redisConfig; /* Prototypes */ -char *redisGitSHA1(void); -char *redisGitDirty(void); static void writeHandler(aeEventLoop *el, int fd, void *privdata, int mask); static void createMissingClients(client c); static benchmarkThread *createBenchmarkThread(int index); @@ -205,20 +202,6 @@ static void updateClusterSlotsConfiguration(void); int showThroughput(struct aeEventLoop *eventLoop, long long id, void *clientData); -static sds benchmarkVersion(void) { - sds version; - version = sdscatprintf(sdsempty(), "%s", REDIS_VERSION); - - /* Add git commit and working tree status when available */ - if (strtoll(redisGitSHA1(),NULL,16)) { - version = sdscatprintf(version, " (git:%s", redisGitSHA1()); - if (strtoll(redisGitDirty(),NULL,10)) - version = sdscatprintf(version, "-dirty"); - version = sdscat(version, ")"); - } - return version; -} - /* Dict callbacks */ static uint64_t dictSdsHash(const void *key); static int dictSdsKeyCompare(dict *d, const void *key1, const void *key2); @@ -1423,7 +1406,7 @@ int parseOptions(int argc, char **argv) { if (lastarg) goto invalid; config.numclients = atoi(argv[++i]); } else if (!strcmp(argv[i],"-v") || !strcmp(argv[i], "--version")) { - sds version = benchmarkVersion(); + sds version = cliVersion(); printf("redis-benchmark %s\n", version); sdsfree(version); exit(0); diff --git a/src/redis-check-aof.c b/src/redis-check-aof.c index d39ac6109f1..9c2eb5eef70 100644 --- a/src/redis-check-aof.c +++ b/src/redis-check-aof.c @@ -29,6 +29,7 @@ */ #include "server.h" + #include #include #include @@ -515,6 +516,13 @@ int redis_check_aof_main(int argc, char **argv) { if (argc < 2) { goto invalid_args; } else if (argc == 2) { + if (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version")) { + sds version = getVersion(); + printf("redis-check-aof %s\n", version); + sdsfree(version); + exit(0); + } + filepath = argv[1]; } else if (argc == 3) { if (!strcmp(argv[1], "--fix")) { diff --git a/src/redis-check-rdb.c b/src/redis-check-rdb.c index 2344eec1c34..71cff0247c7 100644 --- a/src/redis-check-rdb.c +++ b/src/redis-check-rdb.c @@ -394,20 +394,6 @@ int redis_check_rdb(char *rdbfilename, FILE *fp) { return 1; } -static sds checkRdbVersion(void) { - sds version; - version = sdscatprintf(sdsempty(), "%s", REDIS_VERSION); - - /* Add git commit and working tree status when available */ - if (strtoll(redisGitSHA1(),NULL,16)) { - version = sdscatprintf(version, " (git:%s", redisGitSHA1()); - if (strtoll(redisGitDirty(),NULL,10)) - version = sdscatprintf(version, "-dirty"); - version = sdscat(version, ")"); - } - return version; -} - /* RDB check main: called form server.c when Redis is executed with the * redis-check-rdb alias, on during RDB loading errors. 
* @@ -427,7 +413,7 @@ int redis_check_rdb_main(int argc, char **argv, FILE *fp) { fprintf(stderr, "Usage: %s \n", argv[0]); exit(1); } else if (!strcmp(argv[1],"-v") || !strcmp(argv[1], "--version")) { - sds version = checkRdbVersion(); + sds version = getVersion(); printf("redis-check-rdb %s\n", version); sdsfree(version); exit(0); diff --git a/src/redis-cli.c b/src/redis-cli.c index b9f44bb1595..0510b770e16 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -29,7 +29,6 @@ */ #include "fmacros.h" -#include "version.h" #include #include @@ -64,7 +63,6 @@ #include "connection.h" #include "cli_common.h" #include "mt19937-64.h" - #include "cli_commands.h" #define UNUSED(V) ((void) V) @@ -287,8 +285,6 @@ static struct pref { static volatile sig_atomic_t force_cancel_loop = 0; static void usage(int err); static void slaveMode(int send_sync); -char *redisGitSHA1(void); -char *redisGitDirty(void); static int cliConnect(int flags); static char *getInfoField(char *info, char *field); @@ -424,20 +420,6 @@ typedef struct { static helpEntry *helpEntries = NULL; static int helpEntriesLen = 0; -static sds cliVersion(void) { - sds version; - version = sdscatprintf(sdsempty(), "%s", REDIS_VERSION); - - /* Add git commit and working tree status when available */ - if (strtoll(redisGitSHA1(),NULL,16)) { - version = sdscatprintf(version, " (git:%s", redisGitSHA1()); - if (strtoll(redisGitDirty(),NULL,10)) - version = sdscatprintf(version, "-dirty"); - version = sdscat(version, ")"); - } - return version; -} - /* For backwards compatibility with pre-7.0 servers. * cliLegacyInitHelp() sets up the helpEntries array with the command and group * names from the commands.c file. However the Redis instance we are connecting diff --git a/src/server.c b/src/server.c index 361a0928ff7..872c327a375 100644 --- a/src/server.c +++ b/src/server.c @@ -6250,15 +6250,16 @@ void daemonize(void) { } } -void version(void) { - printf("Redis server v=%s sha=%s:%d malloc=%s bits=%d build=%llx\n", +sds getVersion(void) { + sds version = sdscatprintf(sdsempty(), + "v=%s sha=%s:%d malloc=%s bits=%d build=%llx", REDIS_VERSION, redisGitSHA1(), atoi(redisGitDirty()) > 0, ZMALLOC_LIB, sizeof(long) == 4 ? 32 : 64, (unsigned long long) redisBuildId()); - exit(0); + return version; } void usage(void) { @@ -6992,7 +6993,13 @@ int main(int argc, char **argv) { /* Handle special options --help and --version */ if (strcmp(argv[1], "-v") == 0 || - strcmp(argv[1], "--version") == 0) version(); + strcmp(argv[1], "--version") == 0) + { + sds version = getVersion(); + printf("Redis server %s\n", version); + sdsfree(version); + exit(0); + } if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0) usage(); if (strcmp(argv[1], "--test-memory") == 0) { diff --git a/src/server.h b/src/server.h index c0a49f538e8..99bce884a53 100644 --- a/src/server.h +++ b/src/server.h @@ -3789,6 +3789,7 @@ void killIOThreads(void); void killThreads(void); void makeThreadKillable(void); void swapMainDbWithTempDb(redisDb *tempDb); +sds getVersion(void); /* Use macro for checking log level to avoid evaluating arguments in cases log * should be ignored due to low level. 
*/ From 09e0d338f5504f7ba82c9500effde0dc4e3f9509 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sun, 24 Dec 2023 16:40:34 +0800 Subject: [PATCH 10/58] redis-cli adds -4 / -6 options to determine IPV4 / IPV6 priority in DNS lookup (#11315) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR, we added -4 and -6 options to redis-cli to determine IPV4 / IPV6 priority in DNS lookup. This was mentioned in https://github.com/redis/redis/pull/11151#issuecomment-1231570651 For now it's only used in CLUSTER MEET. The options also made it possible to reliably test dns lookup in CI, using this option, we can add some localhost tests for #11151. The commit was cherry-picked from #11151, back then we decided to split the PR. Co-authored-by: Viktor Söderqvist --- src/anet.c | 19 +++++++++++++++++-- src/anet.h | 2 ++ src/redis-cli.c | 25 +++++++++++++++++++++++-- tests/unit/cluster/cli.tcl | 33 ++++++++++++++++----------------- 4 files changed, 58 insertions(+), 21 deletions(-) diff --git a/src/anet.c b/src/anet.c index 64824a23f84..369e1c64129 100644 --- a/src/anet.c +++ b/src/anet.c @@ -239,7 +239,11 @@ int anetRecvTimeout(char *err, int fd, long long ms) { * * If flags is set to ANET_IP_ONLY the function only resolves hostnames * that are actually already IPv4 or IPv6 addresses. This turns the function - * into a validating / normalizing function. */ + * into a validating / normalizing function. + * + * If the flag ANET_PREFER_IPV4 is set, IPv4 is preferred over IPv6. + * If the flag ANET_PREFER_IPV6 is set, IPv6 is preferred over IPv4. + * */ int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len, int flags) { @@ -249,9 +253,20 @@ int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len, memset(&hints,0,sizeof(hints)); if (flags & ANET_IP_ONLY) hints.ai_flags = AI_NUMERICHOST; hints.ai_family = AF_UNSPEC; + if (flags & ANET_PREFER_IPV4 && !(flags & ANET_PREFER_IPV6)) { + hints.ai_family = AF_INET; + } else if (flags & ANET_PREFER_IPV6 && !(flags & ANET_PREFER_IPV4)) { + hints.ai_family = AF_INET6; + } hints.ai_socktype = SOCK_STREAM; /* specify socktype to avoid dups */ - if ((rv = getaddrinfo(host, NULL, &hints, &info)) != 0) { + rv = getaddrinfo(host, NULL, &hints, &info); + if (rv != 0 && hints.ai_family != AF_UNSPEC) { + /* Try the other IP version. */ + hints.ai_family = (hints.ai_family == AF_INET) ? AF_INET6 : AF_INET; + rv = getaddrinfo(host, NULL, &hints, &info); + } + if (rv != 0) { anetSetError(err, "%s", gai_strerror(rv)); return ANET_ERR; } diff --git a/src/anet.h b/src/anet.h index b13c14f7758..08e01a4bcab 100644 --- a/src/anet.h +++ b/src/anet.h @@ -40,6 +40,8 @@ /* Flags used with certain functions. */ #define ANET_NONE 0 #define ANET_IP_ONLY (1<<0) +#define ANET_PREFER_IPV4 (1<<1) +#define ANET_PREFER_IPV6 (1<<2) #if defined(__sun) || defined(_AIX) #define AF_LOCAL AF_UNIX diff --git a/src/redis-cli.c b/src/redis-cli.c index 0510b770e16..930582dd957 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -275,6 +275,8 @@ static struct config { char *server_version; char *test_hint; char *test_hint_file; + int prefer_ipv4; /* Prefer IPv4 over IPv6 on DNS lookup. */ + int prefer_ipv6; /* Prefer IPv6 over IPv4 on DNS lookup. */ } config; /* User preferences. 
*/ @@ -2768,6 +2770,10 @@ static int parseOptions(int argc, char **argv) { config.set_errcode = 1; } else if (!strcmp(argv[i],"--verbose")) { config.verbose = 1; + } else if (!strcmp(argv[i],"-4")) { + config.prefer_ipv4 = 1; + } else if (!strcmp(argv[i],"-6")) { + config.prefer_ipv6 = 1; } else if (!strcmp(argv[i],"--cluster") && !lastarg) { if (CLUSTER_MANAGER_MODE()) usage(1); char *cmd = argv[++i]; @@ -2952,6 +2958,11 @@ static int parseOptions(int argc, char **argv) { exit(1); } + if (config.prefer_ipv4 && config.prefer_ipv6) { + fprintf(stderr, "Options -4 and -6 are mutually exclusive.\n"); + exit(1); + } + return i; } @@ -3028,6 +3039,8 @@ static void usage(int err) { " -D Delimiter between responses for raw formatting (default: \\n).\n" " -c Enable cluster mode (follow -ASK and -MOVED redirections).\n" " -e Return exit error code when command execution fails.\n" +" -4 Prefer IPv4 over IPv6 on DNS lookup.\n" +" -6 Prefer IPv6 over IPv4 on DNS lookup.\n" "%s" " --raw Use raw formatting for replies (default when STDOUT is\n" " not a tty).\n" @@ -7071,7 +7084,10 @@ static int clusterManagerCommandCreate(int argc, char **argv) { first = node; /* Although hiredis supports connecting to a hostname, CLUSTER * MEET requires an IP address, so we do a DNS lookup here. */ - if (anetResolve(NULL, first->ip, first_ip, sizeof(first_ip), ANET_NONE) + int anet_flags = ANET_NONE; + if (config.prefer_ipv4) anet_flags |= ANET_PREFER_IPV4; + if (config.prefer_ipv6) anet_flags |= ANET_PREFER_IPV6; + if (anetResolve(NULL, first->ip, first_ip, sizeof(first_ip), anet_flags) == ANET_ERR) { fprintf(stderr, "Invalid IP address or hostname specified: %s\n", first->ip); @@ -7266,7 +7282,10 @@ static int clusterManagerCommandAddNode(int argc, char **argv) { "join the cluster.\n", ip, port); /* CLUSTER MEET requires an IP address, so we do a DNS lookup here. */ char first_ip[NET_IP_STR_LEN]; - if (anetResolve(NULL, first->ip, first_ip, sizeof(first_ip), ANET_NONE) == ANET_ERR) { + int anet_flags = ANET_NONE; + if (config.prefer_ipv4) anet_flags |= ANET_PREFER_IPV4; + if (config.prefer_ipv6) anet_flags |= ANET_PREFER_IPV6; + if (anetResolve(NULL, first->ip, first_ip, sizeof(first_ip), anet_flags) == ANET_ERR) { fprintf(stderr, "Invalid IP address or hostname specified: %s\n", first->ip); success = 0; goto cleanup; @@ -9862,6 +9881,8 @@ int main(int argc, char **argv) { config.no_auth_warning = 0; config.in_multi = 0; config.server_version = NULL; + config.prefer_ipv4 = 0; + config.prefer_ipv6 = 0; config.cluster_manager_command.name = NULL; config.cluster_manager_command.argc = 0; config.cluster_manager_command.argv = NULL; diff --git a/tests/unit/cluster/cli.tcl b/tests/unit/cluster/cli.tcl index 76e97210fa6..ce4629ec92e 100644 --- a/tests/unit/cluster/cli.tcl +++ b/tests/unit/cluster/cli.tcl @@ -327,6 +327,8 @@ test {Migrate the last slot away from a node using redis-cli} { } } +foreach ip_or_localhost {127.0.0.1 localhost} { + # Test redis-cli --cluster create, add-node with cluster-port. # Create five nodes, three with custom cluster_port and two with default values. start_server [list overrides [list cluster-enabled yes cluster-node-timeout 1 cluster-port [find_available_port $::baseport $::portcount]]] { @@ -337,17 +339,12 @@ start_server [list overrides [list cluster-enabled yes cluster-node-timeout 1 cl # The first three are used to test --cluster create. 
# The last two are used to test --cluster add-node - set node1_rd [redis_client 0] - set node2_rd [redis_client -1] - set node3_rd [redis_client -2] - set node4_rd [redis_client -3] - set node5_rd [redis_client -4] - test {redis-cli --cluster create with cluster-port} { - exec src/redis-cli --cluster-yes --cluster create \ - 127.0.0.1:[srv 0 port] \ - 127.0.0.1:[srv -1 port] \ - 127.0.0.1:[srv -2 port] + test "redis-cli -4 --cluster create using $ip_or_localhost with cluster-port" { + exec src/redis-cli -4 --cluster-yes --cluster create \ + $ip_or_localhost:[srv 0 port] \ + $ip_or_localhost:[srv -1 port] \ + $ip_or_localhost:[srv -2 port] wait_for_condition 1000 50 { [CI 0 cluster_state] eq {ok} && @@ -363,11 +360,11 @@ start_server [list overrides [list cluster-enabled yes cluster-node-timeout 1 cl assert_equal 3 [CI 2 cluster_known_nodes] } - test {redis-cli --cluster add-node with cluster-port} { + test "redis-cli -4 --cluster add-node using $ip_or_localhost with cluster-port" { # Adding node to the cluster (without cluster-port) - exec src/redis-cli --cluster-yes --cluster add-node \ - 127.0.0.1:[srv -3 port] \ - 127.0.0.1:[srv 0 port] + exec src/redis-cli -4 --cluster-yes --cluster add-node \ + $ip_or_localhost:[srv -3 port] \ + $ip_or_localhost:[srv 0 port] wait_for_cluster_size 4 @@ -381,9 +378,9 @@ start_server [list overrides [list cluster-enabled yes cluster-node-timeout 1 cl } # Adding node to the cluster (with cluster-port) - exec src/redis-cli --cluster-yes --cluster add-node \ - 127.0.0.1:[srv -4 port] \ - 127.0.0.1:[srv 0 port] + exec src/redis-cli -4 --cluster-yes --cluster add-node \ + $ip_or_localhost:[srv -4 port] \ + $ip_or_localhost:[srv 0 port] wait_for_cluster_size 5 @@ -411,6 +408,8 @@ start_server [list overrides [list cluster-enabled yes cluster-node-timeout 1 cl } } +} ;# foreach ip_or_localhost + } ;# tags set ::singledb $old_singledb From 20214b26a4136836904e9231ac23508f15fca3b8 Mon Sep 17 00:00:00 2001 From: Slava Koyfman Date: Sun, 24 Dec 2023 11:56:44 +0200 Subject: [PATCH 11/58] Don't disconnect all clients in ACL LOAD (#12171) Previous implementation would disconnect _all_ clients when running `ACL LOAD`, which wasn't very useful. This change brings the behavior in line with that of `ACL SETUSER`, `ACL DELUSER`, in that only clients whose user is deleted or clients subscribed to channels which they no longer have access to will be disconnected. --------- Co-authored-by: Oran Agra Co-authored-by: Madelyn Olson <34459052+madolson@users.noreply.github.com> --- src/acl.c | 180 +++++++++++++++++++++++++++--------------- src/adlist.c | 2 + tests/assets/user.acl | 1 + tests/unit/acl.tcl | 74 +++++++++++++++-- 4 files changed, 185 insertions(+), 72 deletions(-) diff --git a/src/acl.c b/src/acl.c index 841f101cb6a..8ae867130c7 100644 --- a/src/acl.c +++ b/src/acl.c @@ -539,12 +539,6 @@ void ACLCopyUser(user *dst, user *src) { } } -/* Free all the users registered in the radix tree 'users' and free the - * radix tree itself. 
*/ -void ACLFreeUsersSet(rax *users) { - raxFreeWithCallback(users,(void(*)(void*))ACLFreeUserAndKillClients); -} - /* Given a command ID, this function set by reference 'word' and 'bit' * so that user->allowed_commands[word] will address the right word * where the corresponding bit for the provided ID is stored, and @@ -1909,29 +1903,26 @@ int ACLCheckAllPerm(client *c, int *idxptr) { return ACLCheckAllUserCommandPerm(c->user, c->cmd, c->argv, c->argc, idxptr); } -/* Check if the user's existing pub/sub clients violate the ACL pub/sub - * permissions specified via the upcoming argument, and kill them if so. */ -void ACLKillPubsubClientsIfNeeded(user *new, user *original) { - /* Do nothing if there are no subscribers. */ - if (!dictSize(server.pubsub_patterns) && - !dictSize(server.pubsub_channels) && - !dictSize(server.pubsubshard_channels)) - return; +int totalSubscriptions(void) { + return dictSize(server.pubsub_patterns) + + dictSize(server.pubsub_channels) + + dictSize(server.pubsubshard_channels); +} +/* If 'new' can access all channels 'original' could then return NULL; + Otherwise return a list of channels that the new user can access */ +list *getUpcomingChannelList(user *new, user *original) { listIter li, lpi; listNode *ln, *lpn; - robj *o; - int kill = 0; - - /* First optimization is we check if any selector has all channel - * permissions. */ + + /* Optimization: we check if any selector has all channel permissions. */ listRewind(new->selectors,&li); while((ln = listNext(&li))) { aclSelector *s = (aclSelector *) listNodeValue(ln); - if (s->flags & SELECTOR_FLAG_ALLCHANNELS) return; + if (s->flags & SELECTOR_FLAG_ALLCHANNELS) return NULL; } - /* Second optimization is to check if the new list of channels + /* Next, check if the new list of channels * is a strict superset of the original. This is done by * created an "upcoming" list of all channels that are in * the new user and checking each of the existing channels @@ -1969,58 +1960,87 @@ void ACLKillPubsubClientsIfNeeded(user *new, user *original) { if (match) { /* All channels were matched, no need to kill clients. */ listRelease(upcoming); - return; + return NULL; } - - /* Permissions have changed, so we need to iterate through all - * the clients and disconnect those that are no longer valid. - * Scan all connected clients to find the user's pub/subs. */ - listRewind(server.clients,&li); - while ((ln = listNext(&li)) != NULL) { - client *c = listNodeValue(ln); - kill = 0; - if (c->user == original && getClientType(c) == CLIENT_TYPE_PUBSUB) { - /* Check for pattern violations. */ - dictIterator *di = dictGetIterator(c->pubsub_patterns); - dictEntry *de; + return upcoming; +} + +/* Check if the client should be killed because it is subscribed to channels that were + * permitted in the past, are not in the `upcoming` channel list. */ +int ACLShouldKillPubsubClient(client *c, list *upcoming) { + robj *o; + int kill = 0; + + if (getClientType(c) == CLIENT_TYPE_PUBSUB) { + /* Check for pattern violations. */ + dictIterator *di = dictGetIterator(c->pubsub_patterns); + dictEntry *de; + while (!kill && ((de = dictNext(di)) != NULL)) { + o = dictGetKey(de); + int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 1); + kill = (res == ACL_DENIED_CHANNEL); + } + dictReleaseIterator(di); + + /* Check for channel violations. */ + if (!kill) { + /* Check for global channels violation. 
*/ + di = dictGetIterator(c->pubsub_channels); + while (!kill && ((de = dictNext(di)) != NULL)) { o = dictGetKey(de); - int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 1); + int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 0); kill = (res == ACL_DENIED_CHANNEL); } dictReleaseIterator(di); - - /* Check for channel violations. */ - if (!kill) { - /* Check for global channels violation. */ - di = dictGetIterator(c->pubsub_channels); - while (!kill && ((de = dictNext(di)) != NULL)) { - o = dictGetKey(de); - int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 0); - kill = (res == ACL_DENIED_CHANNEL); - } - dictReleaseIterator(di); - } - - if (!kill) { - /* Check for shard channels violation. */ - di = dictGetIterator(c->pubsubshard_channels); - while (!kill && ((de = dictNext(di)) != NULL)) { - o = dictGetKey(de); - int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 0); - kill = (res == ACL_DENIED_CHANNEL); - } - dictReleaseIterator(di); + } + if (!kill) { + /* Check for shard channels violation. */ + di = dictGetIterator(c->pubsubshard_channels); + while (!kill && ((de = dictNext(di)) != NULL)) { + o = dictGetKey(de); + int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 0); + kill = (res == ACL_DENIED_CHANNEL); } + dictReleaseIterator(di); + } - /* Kill it. */ - if (kill) { - freeClient(c); - } + if (kill) { + return 1; } } - listRelease(upcoming); + return 0; +} + +/* Check if the user's existing pub/sub clients violate the ACL pub/sub + * permissions specified via the upcoming argument, and kill them if so. */ +void ACLKillPubsubClientsIfNeeded(user *new, user *original) { + /* Do nothing if there are no subscribers. */ + if (totalSubscriptions() == 0) + return; + + list *channels = getUpcomingChannelList(new, original); + /* If the new user's pubsub permissions are a strict superset of the original, return early. */ + if (!channels) + return; + + listIter li; + listNode *ln; + + /* Permissions have changed, so we need to iterate through all + * the clients and disconnect those that are no longer valid. + * Scan all connected clients to find the user's pub/subs. */ + listRewind(server.clients,&li); + while ((ln = listNext(&li)) != NULL) { + client *c = listNodeValue(ln); + if (c->user != original) + continue; + if (ACLShouldKillPubsubClient(c, channels)) + freeClient(c); + } + + listRelease(channels); } /* ============================================================================= @@ -2427,11 +2447,43 @@ sds ACLLoadFromFile(const char *filename) { ACLFreeUser(new_default); raxInsert(Users,(unsigned char*)"default",7,DefaultUser,NULL); raxRemove(old_users,(unsigned char*)"default",7,NULL); - ACLFreeUsersSet(old_users); + + /* If there are some subscribers, we need to check if we need to drop some clients. 
*/ + rax *user_channels = NULL; + if (totalSubscriptions() > 0) { + user_channels = raxNew(); + } + + listIter li; + listNode *ln; + + listRewind(server.clients,&li); + while ((ln = listNext(&li)) != NULL) { + client *c = listNodeValue(ln); + user *original = c->user; + list *channels = NULL; + user *new = ACLGetUserByName(c->user->name, sdslen(c->user->name)); + if (new && user_channels) { + if (!raxFind(user_channels, (unsigned char*)(new->name), sdslen(new->name), (void**)&channels)) { + channels = getUpcomingChannelList(new, original); + raxInsert(user_channels, (unsigned char*)(new->name), sdslen(new->name), channels, NULL); + } + } + /* When the new channel list is NULL, it means the new user's channel list is a superset of the old user's list. */ + if (!new || (channels && ACLShouldKillPubsubClient(c, channels))) { + freeClient(c); + continue; + } + c->user = new; + } + + if (user_channels) + raxFreeWithCallback(user_channels, (void(*)(void*))listRelease); + raxFreeWithCallback(old_users,(void(*)(void*))ACLFreeUser); sdsfree(errors); return NULL; } else { - ACLFreeUsersSet(Users); + raxFreeWithCallback(Users,(void(*)(void*))ACLFreeUser); Users = old_users; errors = sdscat(errors,"WARNING: ACL errors detected, no change to the previously active ACL rules was performed"); return errors; diff --git a/src/adlist.c b/src/adlist.c index f031c46e87d..06eca7c4753 100644 --- a/src/adlist.c +++ b/src/adlist.c @@ -76,6 +76,8 @@ void listEmpty(list *list) * This function can't fail. */ void listRelease(list *list) { + if (!list) + return; listEmpty(list); zfree(list); } diff --git a/tests/assets/user.acl b/tests/assets/user.acl index 926ac54f6f8..56fc0c25493 100644 --- a/tests/assets/user.acl +++ b/tests/assets/user.acl @@ -1,3 +1,4 @@ user alice on allcommands allkeys &* >alice user bob on -@all +@set +acl ~set* &* >bob +user doug on resetchannels &test +@all ~* >doug user default on nopass ~* &* +@all diff --git a/tests/unit/acl.tcl b/tests/unit/acl.tcl index 36ef063706d..e1e610f3c78 100644 --- a/tests/unit/acl.tcl +++ b/tests/unit/acl.tcl @@ -1005,16 +1005,76 @@ start_server [list overrides [list "dir" $server_path "acl-pubsub-default" "allc set e } {*NOPERM*set*} + test {ACL LOAD only disconnects affected clients} { + reconnect + r ACL SETUSER doug on nopass resetchannels &test* +@all ~* + + set rd1 [redis_deferring_client] + set rd2 [redis_deferring_client] + + $rd1 AUTH alice alice + $rd1 read + $rd1 SUBSCRIBE test1 + $rd1 read + + $rd2 AUTH doug doug + $rd2 read + $rd2 SUBSCRIBE test1 + $rd2 read + + r ACL LOAD + r PUBLISH test1 test-message + + # Permissions for 'alice' haven't changed, so they should still be connected + assert_match {*test-message*} [$rd1 read] + + # 'doug' no longer has access to "test1" channel, so they should get disconnected + catch {$rd2 read} e + assert_match {*I/O error*} $e + + $rd1 close + $rd2 close + } + + test {ACL LOAD disconnects clients of deleted users} { + reconnect + r ACL SETUSER mortimer on >mortimer ~* &* +@all + + set rd1 [redis_deferring_client] + set rd2 [redis_deferring_client] + + $rd1 AUTH alice alice + $rd1 read + $rd1 SUBSCRIBE test + $rd1 read + + $rd2 AUTH mortimer mortimer + $rd2 read + $rd2 SUBSCRIBE test + $rd2 read + + r ACL LOAD + r PUBLISH test test-message + + # Permissions for 'alice' haven't changed, so they should still be connected + assert_match {*test-message*} [$rd1 read] + + # 'mortimer' has been deleted, so their client should get disconnected + catch {$rd2 read} e + assert_match {*I/O error*} $e + + $rd1 close + $rd2 
close
+    }
+
     test {ACL load and save} {
         r ACL setuser eve +get allkeys >eve on
         r ACL save
 
-        # ACL load will free user and kill clients
         r ACL load
-        catch {r ACL LIST} e
-        assert_match {*I/O error*} $e
-        reconnect
 
+        # Clients should not be disconnected since permissions haven't changed
+
         r AUTH alice alice
         r SET key value
         r AUTH eve eve
@@ -1028,12 +1088,10 @@ start_server [list overrides [list "dir" $server_path "acl-pubsub-default" "allc
         r ACL setuser harry on nopass resetchannels &test +@all ~*
         r ACL save
 
-        # ACL load will free user and kill clients
         r ACL load
-        catch {r ACL LIST} e
-        assert_match {*I/O error*} $e
-        reconnect
 
+        # Clients should not be disconnected since permissions haven't changed
+
         r AUTH harry anything
         r publish test bar
         catch {r publish test1 bar} e

From 71f31da66f26f38f87dbc435779d3b53cbba8aec Mon Sep 17 00:00:00 2001
From: Binbin
Date: Tue, 26 Dec 2023 10:36:44 +0800
Subject: [PATCH 12/58] Add restart option to create-cluster script (#12885)

Previously, when testing and debugging the cluster code, you needed to
stop the cluster after making changes and then start it again. Add a
restart option for ease of use.
---
 utils/create-cluster/create-cluster | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/utils/create-cluster/create-cluster b/utils/create-cluster/create-cluster
index d97ee2b9cce..dd200833138 100755
--- a/utils/create-cluster/create-cluster
+++ b/utils/create-cluster/create-cluster
@@ -58,6 +58,23 @@ then
     exit 0
 fi
 
+if [ "$1" == "restart" ]
+then
+    OLD_PORT=$PORT
+    while [ $((PORT < ENDPORT)) != "0" ]; do
+        PORT=$((PORT+1))
+        echo "Stopping $PORT"
+        $BIN_PATH/redis-cli -p $PORT shutdown nosave
+    done
+    PORT=$OLD_PORT
+    while [ $((PORT < ENDPORT)) != "0" ]; do
+        PORT=$((PORT+1))
+        echo "Starting $PORT"
+        $BIN_PATH/redis-server --port $PORT --protected-mode $PROTECTED_MODE --cluster-enabled yes --cluster-config-file nodes-${PORT}.conf --cluster-node-timeout $TIMEOUT --appendonly yes --appendfilename appendonly-${PORT}.aof --appenddirname appendonlydir-${PORT} --dbfilename dump-${PORT}.rdb --logfile ${PORT}.log --daemonize yes ${ADDITIONAL_OPTIONS}
+    done
+    exit 0
+fi
+
 if [ "$1" == "watch" ]
 then
     PORT=$((PORT+1))
@@ -113,10 +130,11 @@ then
     exit 0
 fi
 
-echo "Usage: $0 [start|create|stop|watch|tail|tailall|clean|clean-logs|call]"
+echo "Usage: $0 [start|create|stop|restart|watch|tail|tailall|clean|clean-logs|call]"
echo "start -- Launch Redis Cluster instances."
 echo "create [-f] -- Create a cluster using redis-cli --cluster create."
 echo "stop -- Stop Redis Cluster instances."
+echo "restart -- Restart Redis Cluster instances."
 echo "watch -- Show CLUSTER NODES output (first 30 lines) of first node."
 echo "tail -- Run tail -f of instance at base port + ID."
 echo "tailall -- Run tail -f for all the log files at once."

From baf5699d77e8573ccf5a2be17ee1dad9f4afe44f Mon Sep 17 00:00:00 2001
From: zalj <43174463+zalj@users.noreply.github.com>
Date: Tue, 26 Dec 2023 10:58:14 +0800
Subject: [PATCH 13/58] fix comment of aeProcessEvents (#12884)

The implementation of aeProcessEvents seems to have different behavior
from its top comment: it processes file events first, then time events.
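To make the corrected ordering concrete, here is a small self-contained toy sketch (names and structure are illustrative only, not taken from ae.c): within one loop pass, file events are dispatched first, so a time event registered by a file callback can already fire in the same pass.
```c
#include <stdio.h>

static int timer_due = 0;

static void fileCallback(void) {
    printf("1) file event handled\n");
    timer_due = 1; /* the file callback registers a time event */
}

static void timeCallback(void) {
    printf("2) time event handled\n");
}

/* One simplified loop iteration: pending file events first, then pending
 * time events (including any the file callbacks registered just now). */
static void processEventsOnce(void) {
    fileCallback();
    if (timer_due) timeCallback();
}

int main(void) {
    processEventsOnce();
    return 0;
}
```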
--- src/ae.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ae.c b/src/ae.c index ff60630e379..453bea2c50b 100644 --- a/src/ae.c +++ b/src/ae.c @@ -343,8 +343,8 @@ static int processTimeEvents(aeEventLoop *eventLoop) { return processed; } -/* Process every pending time event, then every pending file event - * (that may be registered by time event callbacks just processed). +/* Process every pending file event, then every pending time event + * (that may be registered by file event callbacks just processed). * Without special flags the function sleeps until some file event * fires, or when the next time event occurs (if any). * From 27a8e3b04e645e0c614c597668355e9bc945c408 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Tue, 26 Dec 2023 11:30:05 +0800 Subject: [PATCH 14/58] fix missing comments (#12878) add a missing comment for `dont_compress` and fix the bits calculation --- src/quicklist.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/quicklist.h b/src/quicklist.h index f17834b9943..3d33634f57a 100644 --- a/src/quicklist.h +++ b/src/quicklist.h @@ -42,7 +42,8 @@ * container: 2 bits, PLAIN=1 (a single item as char array), PACKED=2 (listpack with multiple items). * recompress: 1 bit, bool, true if node is temporary decompressed for usage. * attempted_compress: 1 bit, boolean, used for verifying during testing. - * extra: 10 bits, free for future use; pads out the remainder of 32 bits */ + * dont_compress: 1 bit, boolean, used for preventing compression of entry. + * extra: 9 bits, free for future use; pads out the remainder of 32 bits */ typedef struct quicklistNode { struct quicklistNode *prev; struct quicklistNode *next; From 1aa633d61bc22ad5f8865977ac9e5ec581c6f3c2 Mon Sep 17 00:00:00 2001 From: Andy Pan Date: Wed, 27 Dec 2023 00:44:18 +0800 Subject: [PATCH 15/58] Implement TCP Keep-Alives across most Unix-like systems (#12782) ## TCP Keep-Alives [TCP Keep-Alives](https://datatracker.ietf.org/doc/html/rfc9293#name-tcp-keep-alives) provides a way to detect whether a TCP connection is alive or dead, which can be useful for reducing system resources by cleaning up dead connections. There is full support of TCP Keep-Alives on Linux and partial support on macOS in `redis` at present. This PR intends to complete the rest. ## Unix-like OS's support `TCP_KEEPIDLE`, `TCP_KEEPINTVL`, and `TCP_KEEPCNT` are not included in the POSIX standard for `setsockopts`, while these three socket options are widely available on most Unix-like systems and Windows. ### References - [AIX](https://www.ibm.com/support/pages/ibm-aix-tcp-keepalive-probes) - [DragonflyBSD](https://man.dragonflybsd.org/?command=tcp§ion=4) - [FreeBSD](https://www.freebsd.org/cgi/man.cgi?query=tcp) - [HP-UX](https://docstore.mik.ua/manuals/hp-ux/en/B2355-60130/TCP.7P.html) - [illumos](https://illumos.org/man/4P/tcp) - [Linux](https://man7.org/linux/man-pages/man7/tcp.7.html) - [NetBSD](https://man.netbsd.org/NetBSD-8.0/tcp.4) - [Windows](https://learn.microsoft.com/en-us/windows/win32/winsock/ipproto-tcp-socket-options) ### Mac OS In earlier versions, macOS only supported setting `TCP_KEEPALIVE` (the equivalent of `TCP_KEEPIDLE` on other platforms), but since macOS 10.8 it has supported `TCP_KEEPINTVL` and `TCP_KEEPCNT`. Check out [this mailing list](https://lists.apple.com/archives/macnetworkprog/2012/Jul/msg00005.html) and [the source code](https://github.com/apple/darwin-xnu/blob/main/bsd/netinet/tcp.h#L215-L230) for more details. 
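Condensed into a sketch, the portable pattern on the platforms above looks roughly like this (illustrative only; error handling is trimmed and the Solaris fallback is omitted — see the src/anet.c hunks below for the real implementation):

```c
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Sketch: first probe after 'interval' seconds, probes every interval/3
 * seconds, and three unanswered probes before the peer is considered dead. */
int keepAliveSketch(int fd, int interval) {
    int yes = 1, idle = interval, intvl = interval / 3, cnt = 3;
    if (intvl == 0) intvl = 1;
    if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &yes, sizeof(yes))) return -1;
#ifdef TCP_KEEPIDLE
    setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));    /* time before first probe */
#elif defined(TCP_KEEPALIVE)
    setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, &idle, sizeof(idle));   /* macOS name for KEEPIDLE */
#endif
#ifdef TCP_KEEPINTVL
    setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl)); /* gap between probes */
#endif
#ifdef TCP_KEEPCNT
    setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));       /* unanswered probes before drop */
#endif
    return 0;
}
```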
### Solaris Solaris claimed it supported the TCP-Alives mechanism, but `TCP_KEEPIDLE`, `TCP_KEEPINTVL`, and `TCP_KEEPCNT` were not available on Solaris until the latest version 11.4. Therefore, we need to simulate the TCP-Alives mechanism on other platforms via `TCP_KEEPALIVE_THRESHOLD` + `TCP_KEEPALIVE_ABORT_THRESHOLD`. - [Solaris 11.3](https://docs.oracle.com/cd/E86824_01/html/E54777/tcp-7p.html) - [Solaris 11.4](https://docs.oracle.com/cd/E88353_01/html/E37851/tcp-4p.html) --------- Co-authored-by: Oran Agra --- src/anet.c | 135 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 110 insertions(+), 25 deletions(-) diff --git a/src/anet.c b/src/anet.c index 369e1c64129..6ed40b32ef3 100644 --- a/src/anet.c +++ b/src/anet.c @@ -130,57 +130,142 @@ int anetCloexec(int fd) { return r; } -/* Set TCP keep alive option to detect dead peers. The interval option - * is only used for Linux as we are using Linux-specific APIs to set - * the probe send time, interval, and count. */ +/* Enable TCP keep-alive mechanism to detect dead peers, + * TCP_KEEPIDLE, TCP_KEEPINTVL and TCP_KEEPCNT will be set accordingly. */ int anetKeepAlive(char *err, int fd, int interval) { - int val = 1; - - if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &val, sizeof(val)) == -1) + int enabled = 1; + if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &enabled, sizeof(enabled))) { anetSetError(err, "setsockopt SO_KEEPALIVE: %s", strerror(errno)); return ANET_ERR; } -#ifdef __linux__ + int idle; + int intvl; + int cnt; + +/* There are platforms that are expected to support the full mechanism of TCP keep-alive, + * we want the compiler to emit warnings of unused variables if the preprocessor directives + * somehow fail, and other than those platforms, just omit these warnings if they happen. + */ +#if !(defined(_AIX) || defined(__APPLE__) || defined(__DragonFly__) || \ + defined(__FreeBSD__) || defined(__illumos__) || defined(__linux__) || \ + defined(__NetBSD__) || defined(__sun)) + UNUSED(interval); + UNUSED(idle); + UNUSED(intvl); + UNUSED(cnt); +#endif + +/* The implementation of TCP keep-alive on Solaris/SmartOS is a bit unusual + * compared to other Unix-like systems. + * Thus, we need to specialize it on Solaris. */ +#ifdef __sun + /* There are two keep-alive mechanisms on Solaris: + * - By default, the first keep-alive probe is sent out after a TCP connection is idle for two hours. + * If the peer does not respond to the probe within eight minutes, the TCP connection is aborted. + * You can alter the interval for sending out the first probe using the socket option TCP_KEEPALIVE_THRESHOLD + * in milliseconds or TCP_KEEPIDLE in seconds. + * The system default is controlled by the TCP ndd parameter tcp_keepalive_interval. The minimum value is ten seconds. + * The maximum is ten days, while the default is two hours. If you receive no response to the probe, + * you can use the TCP_KEEPALIVE_ABORT_THRESHOLD socket option to change the time threshold for aborting a TCP connection. + * The option value is an unsigned integer in milliseconds. The value zero indicates that TCP should never time out and + * abort the connection when probing. The system default is controlled by the TCP ndd parameter tcp_keepalive_abort_interval. + * The default is eight minutes. + + * - The second implementation is activated if socket option TCP_KEEPINTVL and/or TCP_KEEPCNT are set. + * The time between each consequent probes is set by TCP_KEEPINTVL in seconds. + * The minimum value is ten seconds. 
The maximum is ten days, while the default is two hours. + * The TCP connection will be aborted after a certain number of probes (set by TCP_KEEPCNT) go unanswered. + */ + + idle = interval; + if (idle < 10) idle = 10; // kernel expects at least 10 seconds + if (idle > 10*24*60*60) idle = 10*24*60*60; // kernel expects at most 10 days + + /* `TCP_KEEPIDLE`, `TCP_KEEPINTVL`, and `TCP_KEEPCNT` were not available on Solaris + * until version 11.4, but let's take a chance here. */ + #if defined(TCP_KEEPIDLE) && defined(TCP_KEEPINTVL) && defined(TCP_KEEPCNT) + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle))) { + anetSetError(err, "setsockopt TCP_KEEPIDLE: %s\n", strerror(errno)); + return ANET_ERR; + } + intvl = idle/3; + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl))) { + anetSetError(err, "setsockopt TCP_KEEPINTVL: %s\n", strerror(errno)); + return ANET_ERR; + } + cnt = 3; + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt))) { + anetSetError(err, "setsockopt TCP_KEEPCNT: %s\n", strerror(errno)); + return ANET_ERR; + } + return ANET_OK; + #endif + + /* Fall back to the first keep-alive implementation on older Solaris releases, + * simulating the keep-alive mechanism via `TCP_KEEPALIVE_THRESHOLD` + `TCP_KEEPALIVE_ABORT_THRESHOLD`. + */ + idle *= 1000; // kernel expects milliseconds + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE_THRESHOLD, &idle, sizeof(idle))) { + anetSetError(err, "setsockopt TCP_KEEPALIVE_THRESHOLD: %s\n", strerror(errno)); + return ANET_ERR; + } + + /* Note that the subsequent probes will not be sent at equal intervals on Solaris, + * but will be sent using an exponential backoff algorithm. */ + intvl = idle/3; + cnt = 3; + int time_to_abort = intvl * cnt; + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE_ABORT_THRESHOLD, &time_to_abort, sizeof(time_to_abort))) { + anetSetError(err, "setsockopt TCP_KEEPALIVE_ABORT_THRESHOLD: %s\n", strerror(errno)); + return ANET_ERR; + } + + return ANET_OK; +#endif + +#ifdef TCP_KEEPIDLE /* Default settings are more or less garbage, with the keepalive time - * set to 7200 by default on Linux. Modify settings to make the feature - * actually useful. */ + * set to 7200 by default on Linux and other Unix-like systems. + * Modify settings to make the feature actually useful. */ /* Send first probe after interval. */ - val = interval; - if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &val, sizeof(val)) < 0) { + idle = interval; + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle))) { anetSetError(err, "setsockopt TCP_KEEPIDLE: %s\n", strerror(errno)); return ANET_ERR; } +#elif defined(TCP_KEEPALIVE) + /* Darwin/macOS uses TCP_KEEPALIVE in place of TCP_KEEPIDLE. */ + idle = interval; + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, &idle, sizeof(idle))) { + anetSetError(err, "setsockopt TCP_KEEPALIVE: %s\n", strerror(errno)); + return ANET_ERR; + } +#endif +#ifdef TCP_KEEPINTVL /* Send next probes after the specified interval. Note that we set the * delay as interval / 3, as we send three probes before detecting * an error (see the next setsockopt call).
*/ - val = interval/3; - if (val == 0) val = 1; - if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &val, sizeof(val)) < 0) { + intvl = interval/3; + if (intvl == 0) intvl = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl))) { anetSetError(err, "setsockopt TCP_KEEPINTVL: %s\n", strerror(errno)); return ANET_ERR; } +#endif +#ifdef TCP_KEEPCNT /* Consider the socket to be in an error state after we send three ACK * probes without getting a reply. */ - val = 3; - if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &val, sizeof(val)) < 0) { + cnt = 3; + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt))) { anetSetError(err, "setsockopt TCP_KEEPCNT: %s\n", strerror(errno)); return ANET_ERR; } -#elif defined(__APPLE__) - /* Set idle time with interval */ - val = interval; - if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, &val, sizeof(val)) < 0) { - anetSetError(err, "setsockopt TCP_KEEPALIVE: %s\n", strerror(errno)); - return ANET_ERR; - } -#else - ((void) interval); /* Avoid unused var warning for non Linux systems. */ #endif return ANET_OK; From bef5715374a2fe9700b690f152f314d15aa6eacd Mon Sep 17 00:00:00 2001 From: sundb Date: Wed, 27 Dec 2023 14:42:46 +0800 Subject: [PATCH 16/58] Fix oom-score-adj test due to no permission (#12887) Fix #12792 On Ubuntu 23 (Lunar), non-root users are not allowed to change the oom_score_adj of a process to a value that is too low. Since the terminal's default oom_score_adj is 200, if we run the test from a terminal, we won't be able to set the oom_score_adj of the redis process to 9 or 22, which is too low. Reproduction on an Ubuntu 23 (Lunar) terminal: ```sh $ cat /proc/`pgrep redis-server`/oom_score_adj 200 $ echo 100 > /proc/`pgrep redis-server`/oom_score_adj # success without error $ echo 99 > /proc/`pgrep redis-server`/oom_score_adj echo: write error: Permission denied ``` As the output above shows, the lowest oom score we can set for the redis process is 100. The test is therefore modified so that oom_score_adj only ever increases and never decreases.
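To make the permission rule concrete, here is a minimal C sketch of the asymmetry the test now works around. It is an illustration, not part of the test suite; the values 300 and -1 are arbitrary, and lowering to a negative value always requires CAP_SYS_RESOURCE.

```c
/* Raising our own oom_score_adj always succeeds; lowering it may be denied
 * for unprivileged processes (here we use a negative target, which always
 * needs CAP_SYS_RESOURCE). */
#include <errno.h>
#include <stdio.h>
#include <string.h>

static int write_oom_score_adj(int value) {
    FILE *f = fopen("/proc/self/oom_score_adj", "w");
    if (!f) return -1;
    int ok = fprintf(f, "%d", value) > 0;
    if (fclose(f) != 0) ok = 0; /* procfs may report the denial on flush/close */
    return ok ? 0 : -1;
}

int main(void) {
    if (write_oom_score_adj(300) == 0)
        printf("raised oom_score_adj to 300\n");
    if (write_oom_score_adj(-1) == -1) /* unprivileged: expect a permission error */
        printf("lowering denied: %s\n", strerror(errno));
    return 0;
}
```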
--------- Co-authored-by: debing.sun --- tests/unit/oom-score-adj.tcl | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/unit/oom-score-adj.tcl b/tests/unit/oom-score-adj.tcl index 6c7b7139264..c557fee4259 100644 --- a/tests/unit/oom-score-adj.tcl +++ b/tests/unit/oom-score-adj.tcl @@ -101,25 +101,27 @@ if {$system_name eq {linux}} { test {CONFIG SET oom score restored on disable} { r config set oom-score-adj no - set_oom_score_adj 22 - assert_equal [get_oom_score_adj] 22 + set custom_oom [expr [get_oom_score_adj] + 1] + set_oom_score_adj $custom_oom + assert_equal [get_oom_score_adj] $custom_oom r config set oom-score-adj-values "9 9 9" oom-score-adj yes - assert_equal [get_oom_score_adj] [expr 9+22] + assert_equal [get_oom_score_adj] [expr 9+$custom_oom] r config set oom-score-adj no - assert_equal [get_oom_score_adj] 22 + assert_equal [get_oom_score_adj] $custom_oom } test {CONFIG SET oom score relative and absolute} { - set custom_oom 9 r config set oom-score-adj no set base_oom [get_oom_score_adj] + set custom_oom 9 r config set oom-score-adj-values "$custom_oom $custom_oom $custom_oom" oom-score-adj relative assert_equal [get_oom_score_adj] [expr $base_oom+$custom_oom] - r config set oom-score-adj absolute + set custom_oom [expr [get_oom_score_adj] + 1] + r config set oom-score-adj-values "$custom_oom $custom_oom $custom_oom" oom-score-adj absolute assert_equal [get_oom_score_adj] $custom_oom } From fa751f9bef390a0e6de3687f61204d7a2df73d96 Mon Sep 17 00:00:00 2001 From: Moshe Kaplan Date: Wed, 27 Dec 2023 01:53:56 -0500 Subject: [PATCH 17/58] config.c: Avoid leaking file handle if file is 0 bytes (#12828) If fopen() is successful and redis_fstat determines that the file is 0 bytes, the file handle stored in fp will leak. This change closes the file handle stored in fp if the file is 0 bytes. Second attempt at fixing Coverity 390029. This is a follow-up to #12796. --- src/config.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/config.c b/src/config.c index b152a8fa538..b4e14eaf1e5 100644 --- a/src/config.c +++ b/src/config.c @@ -1128,7 +1128,14 @@ struct rewriteConfigState *rewriteConfigReadOldFile(char *path) { int linenum = -1; struct rewriteConfigState *state = rewriteConfigCreateState(); - if (fp == NULL || sb.st_size == 0) return state; + if (fp == NULL) { + return state; + } + + if (sb.st_size == 0) { + fclose(fp); + return state; + } /* Load the file content */ sds config = sdsnewlen(SDS_NOINIT,sb.st_size); From 852795959822b60cbed190e88f7821969bc35670 Mon Sep 17 00:00:00 2001 From: Chen Tianjie Date: Wed, 27 Dec 2023 17:40:45 +0800 Subject: [PATCH 18/58] Replace slots_to_channels radix tree with slot specific dictionaries for shard channels. (#12804) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We previously replaced the `slots_to_keys` radix tree with a key->slot linked list (#9356), and then replaced that list with slot-specific dictionaries for keys (#11695). Shard channels behave just like keys in many ways, and we also need a slots->channels mapping. Currently this mapping is still maintained with a radix tree. So we should split `server.pubsubshard_channels` into 16384 dicts and drop the radix tree, just as we did for the DBs. Some benefits (basically the benefits of what we've done to DBs): 1. Optimize counting channels in a slot. This is currently used only in removing channels in a slot.
But this is potentially more useful: sometimes we need to know how many channels there are in a specific slot when doing slot migration. Counting is currently implemented by traversing the radix tree; with this PR it becomes as simple as calling `dictSize`, going from O(n) to O(1). 2. The radix tree in the cluster has been removed. The shard channel names no longer require additional storage, which can save memory. 3. Potentially useful in slot migration, as shard channels are logically split by slots, making them easier to migrate, remove or add as a whole. 4. Avoid rehashing a big dict when there is a large number of channels. Drawbacks: 1. Takes more memory than the radix tree when there are relatively few shard channels. What this PR does: 1. In cluster mode, split `server.pubsubshard_channels` into 16384 dicts; in standalone mode, still use only one dict. 2. Drop the `slots_to_channels` radix tree. 3. To save memory (addressing the drawback above), all 16384 dicts are created lazily: a dict is initialized only when a channel is about to be inserted into it, and when all its channels are deleted, the dict deletes itself. 4. Use `server.shard_channel_count` to keep track of the number of all shard channels. --------- Co-authored-by: Viktor Söderqvist --- src/acl.c | 2 +- src/cluster.h | 2 - src/cluster_legacy.c | 73 +---------- src/cluster_legacy.h | 1 - src/pubsub.c | 171 +++++++++++++++---------- src/server.c | 15 ++- src/server.h | 6 +- tests/cluster/tests/26-pubsubshard.tcl | 38 +++++- 8 files changed, 159 insertions(+), 149 deletions(-) diff --git a/src/acl.c b/src/acl.c index 8ae867130c7..b7e43cffa51 100644 --- a/src/acl.c +++ b/src/acl.c @@ -1906,7 +1906,7 @@ int ACLCheckAllPerm(client *c, int *idxptr) { int totalSubscriptions(void) { return dictSize(server.pubsub_patterns) + dictSize(server.pubsub_channels) + - dictSize(server.pubsubshard_channels); + server.shard_channel_count; } /* If 'new' can access all channels 'original' could then return NULL; diff --git a/src/cluster.h b/src/cluster.h index 97a4febd532..0bd1eb6a051 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -48,8 +48,6 @@ void clusterUpdateMyselfHostname(void); void clusterUpdateMyselfAnnouncedPorts(void); void clusterUpdateMyselfHumanNodename(void); -void slotToChannelAdd(sds channel); -void slotToChannelDel(sds channel); void clusterPropagatePublish(robj *channel, robj *message, int sharded); unsigned long getClusterConnectionsCount(void); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 801becf3ef1..f203a9416f6 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1021,9 +1021,6 @@ void clusterInit(void) { exit(1); } - /* The slots -> channels map is a radix tree. Initialize it here. */ - server.cluster->slots_to_channels = raxNew(); - /* Set myself->port/cport/pport to my listening ports, we'll just need to * discover the IP address via MEET messages. */ deriveAnnouncedPorts(&myself->tcp_port, &myself->tls_port, &myself->cport); @@ -5075,7 +5072,7 @@ int verifyClusterConfigWithData(void) { /* Remove all the shard channel related information not owned by the current shard. */ static inline void removeAllNotOwnedShardChannelSubscriptions(void) { - if (!dictSize(server.pubsubshard_channels)) return; + if (!server.shard_channel_count) return; clusterNode *currmaster = clusterNodeIsMaster(myself) ?
myself : myself->slaveof; for (int j = 0; j < CLUSTER_SLOTS; j++) { if (server.cluster->slots[j] != currmaster) { @@ -5664,27 +5661,9 @@ sds genClusterInfoString(void) { void removeChannelsInSlot(unsigned int slot) { - unsigned int channelcount = countChannelsInSlot(slot); - if (channelcount == 0) return; - - /* Retrieve all the channels for the slot. */ - robj **channels = zmalloc(sizeof(robj*)*channelcount); - raxIterator iter; - int j = 0; - unsigned char indexed[2]; - - indexed[0] = (slot >> 8) & 0xff; - indexed[1] = slot & 0xff; - raxStart(&iter,server.cluster->slots_to_channels); - raxSeek(&iter,">=",indexed,2); - while(raxNext(&iter)) { - if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; - channels[j++] = createStringObject((char*)iter.key + 2, iter.key_len - 2); - } - raxStop(&iter); + if (countChannelsInSlot(slot) == 0) return; - pubsubUnsubscribeShardChannels(channels, channelcount); - zfree(channels); + pubsubShardUnsubscribeAllChannelsInSlot(slot); } @@ -5719,52 +5698,10 @@ unsigned int delKeysInSlot(unsigned int hashslot) { return j; } -/* ----------------------------------------------------------------------------- - * Operation(s) on channel rax tree. - * -------------------------------------------------------------------------- */ - -void slotToChannelUpdate(sds channel, int add) { - size_t keylen = sdslen(channel); - unsigned int hashslot = keyHashSlot(channel,keylen); - unsigned char buf[64]; - unsigned char *indexed = buf; - - if (keylen+2 > 64) indexed = zmalloc(keylen+2); - indexed[0] = (hashslot >> 8) & 0xff; - indexed[1] = hashslot & 0xff; - memcpy(indexed+2,channel,keylen); - if (add) { - raxInsert(server.cluster->slots_to_channels,indexed,keylen+2,NULL,NULL); - } else { - raxRemove(server.cluster->slots_to_channels,indexed,keylen+2,NULL); - } - if (indexed != buf) zfree(indexed); -} - -void slotToChannelAdd(sds channel) { - slotToChannelUpdate(channel,1); -} - -void slotToChannelDel(sds channel) { - slotToChannelUpdate(channel,0); -} - /* Get the count of the channels for a given slot. */ unsigned int countChannelsInSlot(unsigned int hashslot) { - raxIterator iter; - int j = 0; - unsigned char indexed[2]; - - indexed[0] = (hashslot >> 8) & 0xff; - indexed[1] = hashslot & 0xff; - raxStart(&iter,server.cluster->slots_to_channels); - raxSeek(&iter,">=",indexed,2); - while(raxNext(&iter)) { - if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; - j++; - } - raxStop(&iter); - return j; + dict *d = server.pubsubshard_channels[hashslot]; + return d ? dictSize(d) : 0; } int clusterNodeIsMyself(clusterNode *n) { diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 578b46fc3ff..a857184ab3e 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -318,7 +318,6 @@ struct clusterState { clusterNode *migrating_slots_to[CLUSTER_SLOTS]; clusterNode *importing_slots_from[CLUSTER_SLOTS]; clusterNode *slots[CLUSTER_SLOTS]; - rax *slots_to_channels; /* The following fields are used to take the slave state on elections. */ mstime_t failover_auth_time; /* Time of previous or next election. */ int failover_auth_count; /* Number of votes received so far. 
*/ diff --git a/src/pubsub.c b/src/pubsub.c index 2fe7a3ff56c..f8910ee4fb5 100644 --- a/src/pubsub.c +++ b/src/pubsub.c @@ -36,7 +36,7 @@ typedef struct pubsubtype { int shard; dict *(*clientPubSubChannels)(client*); int (*subscriptionCount)(client*); - dict **serverPubSubChannels; + dict **(*serverPubSubChannels)(unsigned int); robj **subscribeMsg; robj **unsubscribeMsg; robj **messageBulk; @@ -62,12 +62,22 @@ dict* getClientPubSubChannels(client *c); */ dict* getClientPubSubShardChannels(client *c); +/* + * Get server's global Pub/Sub channels dict. + */ +dict **getServerPubSubChannels(unsigned int slot); + +/* + * Get server's shard level Pub/Sub channels dict. + */ +dict **getServerPubSubShardChannels(unsigned int slot); + /* * Get list of channels client is subscribed to. * If a pattern is provided, the subset of channels is returned * matching the pattern. */ -void channelList(client *c, sds pat, dict* pubsub_channels); +void channelList(client *c, sds pat, dict** pubsub_channels, int is_sharded); /* * Pub/Sub type for global channels. @@ -76,7 +86,7 @@ pubsubtype pubSubType = { .shard = 0, .clientPubSubChannels = getClientPubSubChannels, .subscriptionCount = clientSubscriptionsCount, - .serverPubSubChannels = &server.pubsub_channels, + .serverPubSubChannels = getServerPubSubChannels, .subscribeMsg = &shared.subscribebulk, .unsubscribeMsg = &shared.unsubscribebulk, .messageBulk = &shared.messagebulk, @@ -89,7 +99,7 @@ pubsubtype pubSubShardType = { .shard = 1, .clientPubSubChannels = getClientPubSubShardChannels, .subscriptionCount = clientShardSubscriptionsCount, - .serverPubSubChannels = &server.pubsubshard_channels, + .serverPubSubChannels = getServerPubSubShardChannels, .subscribeMsg = &shared.ssubscribebulk, .unsubscribeMsg = &shared.sunsubscribebulk, .messageBulk = &shared.smessagebulk, @@ -213,7 +223,7 @@ int serverPubsubSubscriptionCount(void) { /* Return the number of pubsub shard level channels is handled. */ int serverPubsubShardSubscriptionCount(void) { - return dictSize(server.pubsubshard_channels); + return server.shard_channel_count; } @@ -235,6 +245,16 @@ dict* getClientPubSubShardChannels(client *c) { return c->pubsubshard_channels; } +dict **getServerPubSubChannels(unsigned int slot) { + UNUSED(slot); + return &server.pubsub_channels; +} + +dict **getServerPubSubShardChannels(unsigned int slot) { + serverAssert(server.cluster_enabled || slot == 0); + return &server.pubsubshard_channels[slot]; +} + /* Return the number of pubsub + pubsub shard level channels * a client is subscribed to. */ int clientTotalPubSubSubscriptionCount(client *c) { @@ -258,20 +278,32 @@ void unmarkClientAsPubSub(client *c) { /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or * 0 if the client was already subscribed to that channel. 
*/ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { + dict **d_ptr; dictEntry *de; list *clients = NULL; int retval = 0; + unsigned int slot = 0; /* Add the channel to the client -> channels hash table */ if (dictAdd(type.clientPubSubChannels(c),channel,NULL) == DICT_OK) { retval = 1; incrRefCount(channel); /* Add the client to the channel -> list of clients hash table */ - de = dictFind(*type.serverPubSubChannels, channel); + if (server.cluster_enabled && type.shard) { + slot = c->slot; + } + d_ptr = type.serverPubSubChannels(slot); + if (*d_ptr == NULL) { + *d_ptr = dictCreate(&keylistDictType); + } + de = dictFind(*d_ptr, channel); if (de == NULL) { clients = listCreate(); - dictAdd(*type.serverPubSubChannels, channel, clients); + dictAdd(*d_ptr, channel, clients); incrRefCount(channel); + if (type.shard) { + server.shard_channel_count++; + } } else { clients = dictGetVal(de); } @@ -285,10 +317,12 @@ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or * 0 if the client was not subscribed to the specified channel. */ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype type) { + dict *d; dictEntry *de; list *clients; listNode *ln; int retval = 0; + int slot = 0; /* Remove the channel from the client -> channels hash table */ incrRefCount(channel); /* channel may be just a pointer to the same object @@ -296,7 +330,12 @@ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype ty if (dictDelete(type.clientPubSubChannels(c),channel) == DICT_OK) { retval = 1; /* Remove the client from the channel -> clients list hash table */ - de = dictFind(*type.serverPubSubChannels, channel); + if (server.cluster_enabled && type.shard) { + slot = c->slot != -1 ? c->slot : (int)keyHashSlot(channel->ptr, sdslen(channel->ptr)); + } + d = *type.serverPubSubChannels(slot); + serverAssertWithInfo(c,NULL,d != NULL); + de = dictFind(d, channel); serverAssertWithInfo(c,NULL,de != NULL); clients = dictGetVal(de); ln = listSearchKey(clients,c); @@ -306,11 +345,14 @@ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype ty /* Free the list and associated hash entry at all if this was * the latest client, so that it will be possible to abuse * Redis PUBSUB creating millions of channels. */ - dictDelete(*type.serverPubSubChannels, channel); - /* As this channel isn't subscribed by anyone, it's safe - * to remove the channel from the slot. */ - if (server.cluster_enabled & type.shard) { - slotToChannelDel(channel->ptr); + dictDelete(d, channel); + if (type.shard) { + if (dictSize(d) == 0) { + dictRelease(d); + dict **d_ptr = type.serverPubSubChannels(slot); + *d_ptr = NULL; + } + server.shard_channel_count--; } } } @@ -322,19 +364,22 @@ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype ty return retval; } -void pubsubShardUnsubscribeAllClients(robj *channel) { - int retval; - dictEntry *de = dictFind(server.pubsubshard_channels, channel); - serverAssertWithInfo(NULL,channel,de != NULL); - list *clients = dictGetVal(de); - if (listLength(clients) > 0) { +/* Unsubscribe all shard channels in a slot. 
*/ +void pubsubShardUnsubscribeAllChannelsInSlot(unsigned int slot) { + dict *d = server.pubsubshard_channels[slot]; + if (!d) { + return; + } + dictIterator *di = dictGetSafeIterator(d); + dictEntry *de; + while ((de = dictNext(di)) != NULL) { + robj *channel = dictGetKey(de); + list *clients = dictGetVal(de); /* For each client subscribed to the channel, unsubscribe it. */ - listIter li; listNode *ln; - listRewind(clients, &li); - while ((ln = listNext(&li)) != NULL) { + while ((ln = listFirst(clients)) != NULL) { client *c = listNodeValue(ln); - retval = dictDelete(c->pubsubshard_channels, channel); + int retval = dictDelete(c->pubsubshard_channels, channel); serverAssertWithInfo(c,channel,retval == DICT_OK); addReplyPubsubUnsubscribed(c, channel, pubSubShardType); /* If the client has no other pubsub subscription, @@ -343,16 +388,14 @@ void pubsubShardUnsubscribeAllClients(robj *channel) { unmarkClientAsPubSub(c); } } + server.shard_channel_count--; + dictDelete(d, channel); } - /* Delete the channel from server pubsubshard channels hash table. */ - retval = dictDelete(server.pubsubshard_channels, channel); - /* Delete the channel from slots_to_channel mapping. */ - slotToChannelDel(channel->ptr); - serverAssertWithInfo(NULL,channel,retval == DICT_OK); - decrRefCount(channel); /* it is finally safe to release it */ + dictReleaseIterator(di); + dictRelease(d); + server.pubsubshard_channels[slot] = NULL; } - /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the client was already subscribed to that pattern. */ int pubsubSubscribePattern(client *c, robj *pattern) { dictEntry *de; @@ -446,17 +489,6 @@ int pubsubUnsubscribeShardAllChannels(client *c, int notify) { return count; } -/* - * Unsubscribe a client from provided shard subscribed channel(s). - */ -void pubsubUnsubscribeShardChannels(robj **channels, unsigned int count) { - for (unsigned int j = 0; j < count; j++) { - /* Remove the channel from server and from the clients - * subscribed to it as well as notify them. */ - pubsubShardUnsubscribeAllClients(channels[j]); - } -} - /* Unsubscribe from all the patterns. Return the number of patterns the * client was subscribed from. */ int pubsubUnsubscribeAllPatterns(client *c, int notify) { @@ -483,13 +515,19 @@ int pubsubUnsubscribeAllPatterns(client *c, int notify) { */ int pubsubPublishMessageInternal(robj *channel, robj *message, pubsubtype type) { int receivers = 0; + dict *d; dictEntry *de; dictIterator *di; listNode *ln; listIter li; + unsigned int slot = 0; /* Send to clients listening for that channel */ - de = dictFind(*type.serverPubSubChannels, channel); + if (server.cluster_enabled && type.shard) { + slot = keyHashSlot(channel->ptr, sdslen(channel->ptr)); + } + d = *type.serverPubSubChannels(slot); + de = d ? dictFind(d, channel) : NULL; if (de) { list *list = dictGetVal(de); listNode *ln; @@ -658,7 +696,7 @@ NULL { /* PUBSUB CHANNELS [] */ sds pat = (c->argc == 2) ? NULL : c->argv[2]->ptr; - channelList(c, pat, server.pubsub_channels); + channelList(c, pat, &server.pubsub_channels, 0); } else if (!strcasecmp(c->argv[1]->ptr,"numsub") && c->argc >= 2) { /* PUBSUB NUMSUB [Channel_1 ... Channel_N] */ int j; @@ -678,14 +716,15 @@ NULL { /* PUBSUB SHARDCHANNELS */ sds pat = (c->argc == 2) ? NULL : c->argv[2]->ptr; - channelList(c,pat,server.pubsubshard_channels); + channelList(c,pat,server.pubsubshard_channels,server.cluster_enabled); } else if (!strcasecmp(c->argv[1]->ptr,"shardnumsub") && c->argc >= 2) { /* PUBSUB SHARDNUMSUB [ShardChannel_1 ... 
ShardChannel_N] */ int j; - addReplyArrayLen(c, (c->argc-2)*2); for (j = 2; j < c->argc; j++) { - list *l = dictFetchValue(server.pubsubshard_channels, c->argv[j]); + unsigned int slot = calculateKeySlot(c->argv[j]->ptr); + dict *d = server.pubsubshard_channels[slot]; + list *l = d ? dictFetchValue(d, c->argv[j]) : NULL; addReplyBulk(c,c->argv[j]); addReplyLongLong(c,l ? listLength(l) : 0); @@ -695,25 +734,31 @@ NULL } } -void channelList(client *c, sds pat, dict *pubsub_channels) { - dictIterator *di = dictGetIterator(pubsub_channels); - dictEntry *de; +void channelList(client *c, sds pat, dict **pubsub_channels, int is_sharded) { long mblen = 0; void *replylen; + unsigned int slot_cnt = is_sharded ? CLUSTER_SLOTS : 1; replylen = addReplyDeferredLen(c); - while((de = dictNext(di)) != NULL) { - robj *cobj = dictGetKey(de); - sds channel = cobj->ptr; - - if (!pat || stringmatchlen(pat, sdslen(pat), - channel, sdslen(channel),0)) - { - addReplyBulk(c,cobj); - mblen++; + for (unsigned int i = 0; i < slot_cnt; i++) { + if (pubsub_channels[i] == NULL) { + continue; } + dictIterator *di = dictGetIterator(pubsub_channels[i]); + dictEntry *de; + while((de = dictNext(di)) != NULL) { + robj *cobj = dictGetKey(de); + sds channel = cobj->ptr; + + if (!pat || stringmatchlen(pat, sdslen(pat), + channel, sdslen(channel),0)) + { + addReplyBulk(c,cobj); + mblen++; + } + } + dictReleaseIterator(di); } - dictReleaseIterator(di); setDeferredArrayLen(c,replylen,mblen); } @@ -735,14 +780,6 @@ void ssubscribeCommand(client *c) { } for (int j = 1; j < c->argc; j++) { - /* A channel is only considered to be added, if a - * subscriber exists for it. And if a subscriber - * already exists the slotToChannel doesn't needs - * to be incremented. */ - if (server.cluster_enabled & - (dictFind(*pubSubShardType.serverPubSubChannels, c->argv[j]) == NULL)) { - slotToChannelAdd(c->argv[j]->ptr); - } pubsubSubscribeChannel(c, c->argv[j], pubSubShardType); } markClientAsPubSub(c); diff --git a/src/server.c b/src/server.c index 872c327a375..0b45616c338 100644 --- a/src/server.c +++ b/src/server.c @@ -2714,10 +2714,10 @@ void initServer(void) { server.db = zmalloc(sizeof(redisDb)*server.dbnum); /* Create the Redis databases, and initialize other internal state. */ - for (j = 0; j < server.dbnum; j++) { - int slotCount = (server.cluster_enabled) ? CLUSTER_SLOTS : 1; - server.db[j].dict = dictCreateMultiple(&dbDictType, slotCount); - server.db[j].expires = dictCreateMultiple(&dbExpiresDictType,slotCount); + int slot_count = (server.cluster_enabled) ? CLUSTER_SLOTS : 1; + for (j = 0; j < server.dbnum; j++) { + server.db[j].dict = dictCreateMultiple(&dbDictType, slot_count); + server.db[j].expires = dictCreateMultiple(&dbExpiresDictType,slot_count); server.db[j].expires_cursor = 0; server.db[j].blocking_keys = dictCreate(&keylistDictType); server.db[j].blocking_keys_unblock_on_nokey = dictCreate(&objectKeyPointerValueDictType); @@ -2726,7 +2726,7 @@ void initServer(void) { server.db[j].id = j; server.db[j].avg_ttl = 0; server.db[j].defrag_later = listCreate(); - server.db[j].dict_count = slotCount; + server.db[j].dict_count = slot_count; initDbState(&server.db[j]); listSetFreeMethod(server.db[j].defrag_later,(void (*)(void*))sdsfree); } @@ -2734,7 +2734,8 @@ void initServer(void) { evictionPoolAlloc(); /* Initialize the LRU keys pool. 
*/ server.pubsub_channels = dictCreate(&keylistDictType); server.pubsub_patterns = dictCreate(&keylistDictType); - server.pubsubshard_channels = dictCreate(&keylistDictType); + server.pubsubshard_channels = zcalloc(sizeof(dict *) * slot_count); + server.shard_channel_count = 0; server.pubsub_clients = 0; server.cronloops = 0; server.in_exec = 0; @@ -5869,7 +5870,7 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { "keyspace_misses:%lld\r\n", server.stat_keyspace_misses, "pubsub_channels:%ld\r\n", dictSize(server.pubsub_channels), "pubsub_patterns:%lu\r\n", dictSize(server.pubsub_patterns), - "pubsubshard_channels:%lu\r\n", dictSize(server.pubsubshard_channels), + "pubsubshard_channels:%llu\r\n", server.shard_channel_count, "latest_fork_usec:%lld\r\n", server.stat_fork_time, "total_forks:%lld\r\n", server.stat_total_forks, "migrate_cached_sockets:%ld\r\n", dictSize(server.migrate_cached_sockets), diff --git a/src/server.h b/src/server.h index 99bce884a53..a0ffdf7465e 100644 --- a/src/server.h +++ b/src/server.h @@ -1994,7 +1994,8 @@ struct redisServer { dict *pubsub_patterns; /* A dict of pubsub_patterns */ int notify_keyspace_events; /* Events to propagate via Pub/Sub. This is an xor of NOTIFY_... flags. */ - dict *pubsubshard_channels; /* Map shard channels to list of subscribed clients */ + dict **pubsubshard_channels; /* Map shard channels in every slot to list of subscribed clients */ + unsigned long long shard_channel_count; unsigned int pubsub_clients; /* # of clients in Pub/Sub mode */ /* Cluster */ int cluster_enabled; /* Is cluster enabled? */ @@ -2498,6 +2499,7 @@ extern dictType sdsHashDictType; extern dictType dbExpiresDictType; extern dictType modulesDictType; extern dictType sdsReplyDictType; +extern dictType keylistDictType; extern dict *modules; /*----------------------------------------------------------------------------- @@ -3197,7 +3199,7 @@ robj *hashTypeDup(robj *o); /* Pub / Sub */ int pubsubUnsubscribeAllChannels(client *c, int notify); int pubsubUnsubscribeShardAllChannels(client *c, int notify); -void pubsubUnsubscribeShardChannels(robj **channels, unsigned int count); +void pubsubShardUnsubscribeAllChannelsInSlot(unsigned int slot); int pubsubUnsubscribeAllPatterns(client *c, int notify); int pubsubPublishMessage(robj *channel, robj *message, int sharded); int pubsubPublishMessageAndPropagateToCluster(robj *channel, robj *message, int sharded); diff --git a/tests/cluster/tests/26-pubsubshard.tcl b/tests/cluster/tests/26-pubsubshard.tcl index 2619eda0a1f..34939acf7c6 100644 --- a/tests/cluster/tests/26-pubsubshard.tcl +++ b/tests/cluster/tests/26-pubsubshard.tcl @@ -56,6 +56,21 @@ test "client can subscribe to multiple shard channels across different slots in $cluster sunsubscribe ch7 } +test "sunsubscribe without specifying any channel would unsubscribe all shard channels subscribed" { + set publishclient [redis_client_by_addr $publishnode(host) $publishnode(port)] + set subscribeclient [redis_deferring_client_by_addr $publishnode(host) $publishnode(port)] + + set sub_res [ssubscribe $subscribeclient [list "\{channel.0\}1" "\{channel.0\}2" "\{channel.0\}3"]] + assert_equal [list 1 2 3] $sub_res + sunsubscribe $subscribeclient + + assert_equal 0 [$publishclient spublish "\{channel.0\}1" hello] + assert_equal 0 [$publishclient spublish "\{channel.0\}2" hello] + assert_equal 0 [$publishclient spublish "\{channel.0\}3" hello] + + $publishclient close + $subscribeclient close +} test "Verify Pub/Sub and Pub/Sub shard no overlap" { set 
slot [$cluster cluster keyslot "channel.0"] @@ -91,4 +106,25 @@ test "Verify Pub/Sub and Pub/Sub shard no overlap" { $publishclient close $subscribeclient close $subscribeshardclient close -} \ No newline at end of file +} + +test "PUBSUB channels/shardchannels" { + set subscribeclient [redis_deferring_client_by_addr $publishnode(host) $publishnode(port)] + set subscribeclient2 [redis_deferring_client_by_addr $publishnode(host) $publishnode(port)] + set subscribeclient3 [redis_deferring_client_by_addr $publishnode(host) $publishnode(port)] + set publishclient [redis_client_by_addr $publishnode(host) $publishnode(port)] + + ssubscribe $subscribeclient [list "\{channel.0\}1"] + ssubscribe $subscribeclient2 [list "\{channel.0\}2"] + ssubscribe $subscribeclient3 [list "\{channel.0\}3"] + assert_equal {3} [llength [$publishclient pubsub shardchannels]] + + subscribe $subscribeclient [list "\{channel.0\}4"] + assert_equal {3} [llength [$publishclient pubsub shardchannels]] + + sunsubscribe $subscribeclient + set channel_list [$publishclient pubsub shardchannels] + assert_equal {2} [llength $channel_list] + assert {[lsearch -exact $channel_list "\{channel.0\}2"] >= 0} + assert {[lsearch -exact $channel_list "\{channel.0\}3"] >= 0} +} From 5b1fe925f214bdf7eee8dd4f35380559d8a85c5c Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 28 Dec 2023 14:26:23 +0800 Subject: [PATCH 19/58] Adjust redis-cli --cluster create arity from -2 to -1 (#12892) When arity is -2, it allows us to input two nodes, but returns: ``` *** ERROR: Invalid configuration for cluster creation. *** Redis Cluster requires at least 3 master nodes. *** This is not possible with 2 nodes and 0 replicas per node. *** At least 3 nodes are required. ``` When we input one node, it returns: ``` [ERR] Wrong number of arguments for specified --cluster sub command ``` Strictly speaking, an input of two nodes should also be rejected at the arity check, because redis-cli requires at least three nodes. However, the generic arity error message is not very friendly, so we decided to change the arity to -1 instead and let the cluster-creation code report the problem. This closes #12891. --- src/redis-cli.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/redis-cli.c b/src/redis-cli.c index 930582dd957..ed6f59001a1 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -3765,7 +3765,7 @@ typedef struct clusterManagerCommandDef { } clusterManagerCommandDef; clusterManagerCommandDef clusterManagerCommands[] = { - {"create", clusterManagerCommandCreate, -2, "host1:port1 ... hostN:portN", + {"create", clusterManagerCommandCreate, -1, "host1:port1 ... hostN:portN", "replicas "}, {"check", clusterManagerCommandCheck, -1, " or - separated by either colon or space", "search-multiple-owners"}, From 99c468c38c597400f84df2128153e3abebe5e9f0 Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 28 Dec 2023 14:32:51 +0800 Subject: [PATCH 20/58] Fix crash caused by pubsubShardUnsubscribeAllChannelsInSlot not deleting the client (#12896) The code did not delete the corresponding list node when traversing the clients, resulting in an infinite loop that caused the dictDelete() == DICT_OK assertion to fail. In addition, this includes a small cleanup: in the dictCreate scenario we can avoid a dictFind call, since the dict is known to be empty. The issue was introduced in #12804.
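The underlying pitfall generalizes: a loop that is meant to consume a list must actually unlink each node it handles, or it never terminates. A self-contained C sketch of the corrected shape follows; the `node` type and `drain` helper are illustrative stand-ins, not Redis's `adlist` API.

```c
#include <stdio.h>
#include <stdlib.h>

typedef struct node { struct node *next; int value; } node;

/* Consume-the-head loop: each iteration must unlink the node it handled,
 * otherwise `*head` never changes and the loop spins forever, the same
 * shape as the missing listDelNode() call in the fix above. */
static void drain(node **head) {
    node *n;
    while ((n = *head) != NULL) {
        printf("handling %d\n", n->value);
        *head = n->next;   /* the unlink step that must not be forgotten */
        free(n);
    }
}

int main(void) {
    node *head = NULL;
    for (int i = 3; i >= 1; i--) {
        node *n = malloc(sizeof(*n));
        n->value = i; n->next = head; head = n;
    }
    drain(&head);
    return 0;
}
```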
--- src/pubsub.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/pubsub.c b/src/pubsub.c index f8910ee4fb5..6c69431b80f 100644 --- a/src/pubsub.c +++ b/src/pubsub.c @@ -295,8 +295,10 @@ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { d_ptr = type.serverPubSubChannels(slot); if (*d_ptr == NULL) { *d_ptr = dictCreate(&keylistDictType); + de = NULL; + } else { + de = dictFind(*d_ptr, channel); } - de = dictFind(*d_ptr, channel); if (de == NULL) { clients = listCreate(); dictAdd(*d_ptr, channel, clients); @@ -387,6 +389,7 @@ void pubsubShardUnsubscribeAllChannelsInSlot(unsigned int slot) { if (clientTotalPubSubSubscriptionCount(c) == 0) { unmarkClientAsPubSub(c); } + listDelNode(clients, ln); } server.shard_channel_count--; dictDelete(d, channel); From 12b611b374ef5f9ee2472b560a2fc3e5cc6c2dba Mon Sep 17 00:00:00 2001 From: guybe7 Date: Thu, 28 Dec 2023 16:27:58 +0700 Subject: [PATCH 21/58] WAITAOF: Try to wake blocked clients ASAP in the next beforeSleep (#12627) If server.fsynced_reploff changed (e.g. flushAppendOnly set it to server.master_repl_offset because there was nothing to fsync), we want to avoid sleeping before the next beforeSleep so that we can call blockedBeforeSleep ASAP. Without that, if there is no incoming traffic, we could be left waiting for the next cron timer event to wake us up. --- src/server.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/server.c b/src/server.c index 0b45616c338..5a17446dc2c 100644 --- a/src/server.c +++ b/src/server.c @@ -1724,7 +1724,7 @@ void beforeSleep(struct aeEventLoop *eventLoop) { connTypeProcessPendingData(); /* If any connection type(typical TLS) still has pending unread data don't sleep at all. */ - aeSetDontWait(server.el, connTypeHasPendingData()); + int dont_sleep = connTypeHasPendingData(); /* Call the Redis Cluster before sleep function. Note that this function * may change the state of Redis Cluster (from ok to fail or vice versa), @@ -1786,6 +1786,8 @@ void beforeSleep(struct aeEventLoop *eventLoop) { monotime aof_start_time = getMonotonicUs(); /* Record cron time in beforeSleep. This does not include the time consumed by AOF writing and IO writing below. */ monotime duration_before_aof = aof_start_time - cron_start_time_before_aof; + /* Record the fsync'd offset before flushAppendOnly */ + long long prev_fsynced_reploff = server.fsynced_reploff; /* Write the AOF buffer on disk, * must be done before handleClientsWithPendingWritesUsingThreads, @@ -1803,6 +1805,11 @@ void beforeSleep(struct aeEventLoop *eventLoop) { long long fsynced_reploff_pending; atomicGet(server.fsynced_reploff_pending, fsynced_reploff_pending); server.fsynced_reploff = fsynced_reploff_pending; + + /* If we have blocked [WAIT]AOF clients, and fsynced_reploff changed, we want to try to + * wake them up ASAP. */ + if (listLength(server.clients_waiting_acks) && prev_fsynced_reploff != server.fsynced_reploff) + dont_sleep = 1; } /* Handle writes with pending output buffers. */ @@ -1841,6 +1848,10 @@ void beforeSleep(struct aeEventLoop *eventLoop) { } } + /* Don't sleep at all before the next beforeSleep() if needed (e.g. a + * connection has pending data) */ + aeSetDontWait(server.el, dont_sleep); + /* Before we are going to sleep, let the threads access the dataset by * releasing the GIL. Redis main thread will not touch anything at this * time.
*/ From 2c5b51ad26b6129e161c2373fb48571648fc1393 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 28 Dec 2023 11:32:23 +0200 Subject: [PATCH 22/58] Bump github/codeql-action from 2 to 3 (#12869) Bumps [github/codeql-action](https://github.com/github/codeql-action) from 2 to 3.
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codeql-analysis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index dc7413e59c4..fc92dec2182 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -22,12 +22,12 @@ jobs: uses: actions/checkout@v3 - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} - name: Autobuild - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 From 9d0158bf89265daa96e1711478102147117f6b14 Mon Sep 17 00:00:00 2001 From: Chen Tianjie Date: Thu, 28 Dec 2023 19:29:27 +0800 Subject: [PATCH 23/58] Reorder signalModifiedKey in xaddCommand. (#12895) This PR is a supplement to #11144, moving `signalModifiedKey` in `xaddCommand` to after the trimming, to ensure the key's state is consistent when the modification is signaled. Currently this causes no problem in Redis, but it is better to avoid issues in future development. --- src/t_stream.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/t_stream.c b/src/t_stream.c index 733ccfc8c4e..dda3dab2cc6 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -2052,7 +2052,6 @@ void xaddCommand(client *c) { sds replyid = createStreamIDString(&id); addReplyBulkCBuffer(c, replyid, sdslen(replyid)); - signalModifiedKey(c,c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_STREAM,"xadd",c->argv[1],c->db->id); server.dirty++; @@ -2072,6 +2071,8 @@ void xaddCommand(client *c) { } } + signalModifiedKey(c,c->db,c->argv[1]); + /* Let's rewrite the ID argument with the one actually generated for * AOF/replication propagation. */ if (!parsed_args.id_given || !parsed_args.seq_given) { From c3f8b542eecec1a52de88f402e310687c36d03e9 Mon Sep 17 00:00:00 2001 From: AshMosh <127777999+AshMosh@users.noreply.github.com> Date: Wed, 3 Jan 2024 01:15:03 +0200 Subject: [PATCH 24/58] Manage number of new connections per cycle (#12178) There are situations (especially in TLS) in which the engine gets too occupied managing a large number of new connections. Existing connections may time out while the server is processing the new connections' initial TLS handshakes, which may cause further new connections to be established, perpetuating the problem. To better manage the tradeoff between new connection rate and other workloads, this change adds new configs to manage the maximum number of new connections accepted per event-loop cycle, instead of using a predetermined number (currently 1000). This change introduces two new configurations, max-new-connections-per-cycle and max-new-tls-connections-per-cycle. The default is 10 new TCP connections per cycle and 1 new TLS connection per cycle.
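The mechanism is simple enough to sketch in plain POSIX C. This is an illustration of the bounded-accept idea, not the actual `connSocketAcceptHandler`; the `accept_batch` name and its callback parameter are assumptions.

```c
/* Accept at most `max_per_cycle` connections, then return to the event
 * loop, assuming `listen_fd` is a non-blocking listening socket. */
#include <errno.h>
#include <sys/socket.h>

static void accept_batch(int listen_fd, int max_per_cycle,
                         void (*on_accept)(int cfd)) {
    for (int i = 0; i < max_per_cycle; i++) {
        int cfd = accept(listen_fd, NULL, NULL);
        if (cfd == -1) {
            /* EAGAIN/EWOULDBLOCK: the backlog is drained for this cycle;
             * anything else is a real error for the caller to log. */
            return;
        }
        on_accept(cfd); /* hand the new connection to the rest of the server */
    }
    /* Cap reached: remaining pending connections stay in the kernel backlog
     * until the next event-loop cycle, so established clients are not starved. */
}
```

Raising the cap speeds up ramp-up of new connections at the cost of added latency for established ones, which is precisely the tradeoff the two new configs expose.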
--------- Co-authored-by: Madelyn Olson --- redis.conf | 20 ++++++++++++++++++++ src/config.c | 2 ++ src/connection.h | 1 - src/server.h | 2 ++ src/socket.c | 3 ++- src/tls.c | 3 ++- src/unix.c | 3 ++- 7 files changed, 30 insertions(+), 4 deletions(-) diff --git a/redis.conf b/redis.conf index c7499ce1f1f..ab9ba337d95 100644 --- a/redis.conf +++ b/redis.conf @@ -2200,6 +2200,26 @@ rdb-save-incremental-fsync yes # lfu-log-factor 10 # lfu-decay-time 1 + +# The maximum number of new client connections accepted per event-loop cycle. This configuration +# is set independently for TLS connections. +# +# By default, up to 10 new connections will be accepted per event-loop cycle for normal connections +# and up to 1 new connection per event-loop cycle for TLS connections. +# +# Adjusting this to a larger number can slightly improve efficiency for new connections +# at the risk of causing timeouts for regular commands on established connections. It is +# not advised to change this without ensuring that all clients have limited connection +# pools and exponential backoff in the case of command/connection timeouts. +# +# If your application is establishing a large number of new connections per second you should +# also consider tuning the value of tcp-backlog, which allows the kernel to buffer more +# pending connections before dropping or rejecting connections. +# +# max-new-connections-per-cycle 10 +# max-new-tls-connections-per-cycle 1 + + ########################### ACTIVE DEFRAGMENTATION ####################### # # What is active defragmentation? diff --git a/src/config.c b/src/config.c index b4e14eaf1e5..e5108558fcb 100644 --- a/src/config.c +++ b/src/config.c @@ -3198,6 +3198,8 @@ standardConfig static_configs[] = { createUIntConfig("maxclients", NULL, MODIFIABLE_CONFIG, 1, UINT_MAX, server.maxclients, 10000, INTEGER_CONFIG, NULL, updateMaxclients), createUIntConfig("unixsocketperm", NULL, IMMUTABLE_CONFIG, 0, 0777, server.unixsocketperm, 0, OCTAL_CONFIG, NULL, NULL), createUIntConfig("socket-mark-id", NULL, IMMUTABLE_CONFIG, 0, UINT_MAX, server.socket_mark_id, 0, INTEGER_CONFIG, NULL, NULL), + createUIntConfig("max-new-connections-per-cycle", NULL, MODIFIABLE_CONFIG, 1, 1000, server.max_new_conns_per_cycle, 10, INTEGER_CONFIG, NULL, NULL), + createUIntConfig("max-new-tls-connections-per-cycle", NULL, MODIFIABLE_CONFIG, 1, 1000, server.max_new_tls_conns_per_cycle, 1, INTEGER_CONFIG, NULL, NULL), #ifdef LOG_REQ_RES createUIntConfig("client-default-resp", NULL, IMMUTABLE_CONFIG | HIDDEN_CONFIG, 2, 3, server.client_default_resp, 2, INTEGER_CONFIG, NULL, NULL), #endif diff --git a/src/connection.h b/src/connection.h index d0a17ab4dd6..d0340d18f5d 100644 --- a/src/connection.h +++ b/src/connection.h @@ -40,7 +40,6 @@ #define CONN_INFO_LEN 32 #define CONN_ADDR_STR_LEN 128 /* Similar to INET6_ADDRSTRLEN, hoping to handle other protocols. */ -#define MAX_ACCEPTS_PER_CALL 1000 struct aeEventLoop; typedef struct connection connection; diff --git a/src/server.h b/src/server.h index a0ffdf7465e..437793b0e22 100644 --- a/src/server.h +++ b/src/server.h @@ -1774,6 +1774,8 @@ struct redisServer { int latency_tracking_enabled; /* 1 if extended latency tracking is enabled, 0 otherwise. */ double *latency_tracking_info_percentiles; /* Extended latency tracking info output percentile list configuration. */ int latency_tracking_info_percentiles_len; + unsigned int max_new_tls_conns_per_cycle; /* The maximum number of tls connections that will be accepted during each invocation of the event loop.
*/ + unsigned int max_new_conns_per_cycle; /* The maximum number of tcp connections that will be accepted during each invocation of the event loop. */ /* AOF persistence */ int aof_enabled; /* AOF configuration */ int aof_state; /* AOF_(ON|OFF|WAIT_REWRITE) */ diff --git a/src/socket.c b/src/socket.c index dad8e93cca5..61a2b9ea97d 100644 --- a/src/socket.c +++ b/src/socket.c @@ -309,7 +309,8 @@ static void connSocketEventHandler(struct aeEventLoop *el, int fd, void *clientD } static void connSocketAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { - int cport, cfd, max = MAX_ACCEPTS_PER_CALL; + int cport, cfd; + int max = server.max_new_conns_per_cycle; char cip[NET_IP_STR_LEN]; UNUSED(el); UNUSED(mask); diff --git a/src/tls.c b/src/tls.c index 7920b5f4e77..d011c16ea12 100644 --- a/src/tls.c +++ b/src/tls.c @@ -766,7 +766,8 @@ static void tlsEventHandler(struct aeEventLoop *el, int fd, void *clientData, in } static void tlsAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { - int cport, cfd, max = MAX_ACCEPTS_PER_CALL; + int cport, cfd; + int max = server.max_new_tls_conns_per_cycle; char cip[NET_IP_STR_LEN]; UNUSED(el); UNUSED(mask); diff --git a/src/unix.c b/src/unix.c index bd146d0245e..eb5850765a8 100644 --- a/src/unix.c +++ b/src/unix.c @@ -92,7 +92,8 @@ static connection *connCreateAcceptedUnix(int fd, void *priv) { } static void connUnixAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { - int cfd, max = MAX_ACCEPTS_PER_CALL; + int cfd; + int max = server.max_new_conns_per_cycle; UNUSED(el); UNUSED(mask); UNUSED(privdata); From 068051e378ceaea85a7fb011f1271587a17bd0e7 Mon Sep 17 00:00:00 2001 From: Madelyn Olson <34459052+madolson@users.noreply.github.com> Date: Tue, 2 Jan 2024 18:20:22 -0800 Subject: [PATCH 25/58] Handle recursive serverAsserts and provide more information for recursive segfaults (#12857) This change makes two failure modes a bit easier to deep-dive: 1. If a serverPanic or serverAssert occurs during the info (or module) printing, it will recursively panic, which is a lot of fun as it will just keep recursively printing. It will eventually stack overflow, but will generate a lot of text in the process. 2. When a segfault happens during the segfault handler, no information is communicated other than that it happened. This can be problematic because `info` may help diagnose the real issue, but without fixing the recursive crash it might be hard to get at that info.
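The re-entrancy guard is easier to see in miniature. Below is a toy sketch of the idea with illustrative names; stdio is used only for brevity (it is not async-signal-safe), and it makes no claim to match debug.c line for line.

```c
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t in_crash_handler = 0;

static void crash_handler(int sig) {
    if (in_crash_handler) {
        /* Crashed while already handling a crash: emit only a reduced
         * report instead of recursing forever. */
        const char msg[] = "reduced crash report\n";
        ssize_t n = write(STDERR_FILENO, msg, sizeof(msg) - 1);
        (void)n;
        _exit(128 + sig);
    }
    in_crash_handler = 1;
    /* Producing the full report may itself fault; the guard above
     * then downgrades to the reduced report. */
    fprintf(stderr, "full crash report\n"); /* illustrative only */
    _exit(128 + sig);
}

int main(void) {
    struct sigaction act;
    memset(&act, 0, sizeof(act));
    act.sa_handler = crash_handler;
    act.sa_flags = SA_NODEFER; /* let the handler itself be interrupted */
    sigaction(SIGSEGV, &act, NULL);
    raise(SIGSEGV);
    return 0;
}
```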
--- runtest-moduleapi | 1 + src/debug.c | 66 +++++++++++++++++++++------------- tests/modules/Makefile | 7 ++-- tests/modules/crash.c | 39 ++++++++++++++++++++ tests/unit/moduleapi/crash.tcl | 52 +++++++++++++++++++++++++++ 5 files changed, 138 insertions(+), 27 deletions(-) create mode 100644 tests/modules/crash.c create mode 100644 tests/unit/moduleapi/crash.tcl diff --git a/runtest-moduleapi b/runtest-moduleapi index ff685afb66c..910d581f2fc 100755 --- a/runtest-moduleapi +++ b/runtest-moduleapi @@ -55,4 +55,5 @@ $TCLSH tests/test_helper.tcl \ --single unit/moduleapi/async_rm_call \ --single unit/moduleapi/moduleauth \ --single unit/moduleapi/rdbloadsave \ +--single unit/moduleapi/crash \ "${@}" diff --git a/src/debug.c b/src/debug.c index b924d9ed3f0..facbd6104a8 100644 --- a/src/debug.c +++ b/src/debug.c @@ -72,10 +72,10 @@ static pthread_mutex_t signal_handler_lock; static pthread_mutexattr_t signal_handler_lock_attr; static volatile int signal_handler_lock_initialized = 0; /* Forward declarations */ -void bugReportStart(void); +int bugReportStart(void); void printCrashReport(void); void bugReportEnd(int killViaSignal, int sig); -void logStackTrace(void *eip, int uplevel); +void logStackTrace(void *eip, int uplevel, int current_thread); void dbGetStats(char *buf, size_t bufsize, redisDb *db, int full, dbKeyType keyType); void sigalrmSignalHandler(int sig, siginfo_t *info, void *secret); @@ -1031,15 +1031,17 @@ NULL __attribute__ ((noinline)) void _serverAssert(const char *estr, const char *file, int line) { - bugReportStart(); - serverLog(LL_WARNING,"=== ASSERTION FAILED ==="); + int new_report = bugReportStart(); + serverLog(LL_WARNING,"=== %sASSERTION FAILED ===", new_report ? "" : "RECURSIVE "); serverLog(LL_WARNING,"==> %s:%d '%s' is not true",file,line,estr); if (server.crashlog_enabled) { #ifdef HAVE_BACKTRACE - logStackTrace(NULL, 1); + logStackTrace(NULL, 1, 0); #endif - printCrashReport(); + /* If this was a recursive assertion, it was most likely generated + * from printCrashReport. */ + if (new_report) printCrashReport(); } // remove the signal handler so on abort() we will output the crash report. @@ -1127,16 +1129,18 @@ void _serverPanic(const char *file, int line, const char *msg, ...) { vsnprintf(fmtmsg,sizeof(fmtmsg),msg,ap); va_end(ap); - bugReportStart(); + int new_report = bugReportStart(); serverLog(LL_WARNING,"------------------------------------------------"); serverLog(LL_WARNING,"!!! Software Failure. Press left mouse button to continue"); serverLog(LL_WARNING,"Guru Meditation: %s #%s:%d",fmtmsg,file,line); if (server.crashlog_enabled) { #ifdef HAVE_BACKTRACE - logStackTrace(NULL, 1); + logStackTrace(NULL, 1, 0); #endif - printCrashReport(); + /* If this was a recursive panic, it was most likely generated + * from printCrashReport. */ + if (new_report) printCrashReport(); } // remove the signal handler so on abort() we will output the crash report. @@ -1144,14 +1148,18 @@ void _serverPanic(const char *file, int line, const char *msg, ...) { bugReportEnd(0, 0); } -void bugReportStart(void) { +/* Start a bug report, returning 1 if this is the first time this function was called, 0 otherwise.
*/ +int bugReportStart(void) { pthread_mutex_lock(&bug_report_start_mutex); if (bug_report_start == 0) { serverLogRaw(LL_WARNING|LL_RAW, "\n\n=== REDIS BUG REPORT START: Cut & paste starting from here ===\n"); bug_report_start = 1; + pthread_mutex_unlock(&bug_report_start_mutex); + return 1; } pthread_mutex_unlock(&bug_report_start_mutex); + return 0; } #ifdef HAVE_BACKTRACE @@ -1895,9 +1903,9 @@ static void writeStacktraces(int fd, int uplevel) { } -#else /* __linux__*/ +#endif /* __linux__ */ __attribute__ ((noinline)) -static void writeStacktraces(int fd, int uplevel) { +static void writeCurrentThreadsStackTrace(int fd, int uplevel) { void *trace[BACKTRACE_MAX_SIZE]; int trace_size = backtrace(trace, BACKTRACE_MAX_SIZE); @@ -1906,7 +1914,6 @@ static void writeStacktraces(int fd, int uplevel) { if (write(fd,msg,strlen(msg)) == -1) {/* Avoid warning. */}; backtrace_symbols_fd(trace+uplevel, trace_size-uplevel, fd); } -#endif /* __linux__ */ /* Logs the stack trace using the backtrace() call. This function is designed * to be called from signal handlers safely. @@ -1916,7 +1923,7 @@ static void writeStacktraces(int fd, int uplevel) { * __attribute__ ((noinline)) to make sure the compiler won't inline them. */ __attribute__ ((noinline)) -void logStackTrace(void *eip, int uplevel) { +void logStackTrace(void *eip, int uplevel, int current_thread) { int fd = openDirectLogFiledes(); char *msg; uplevel++; /* skip this function */ @@ -1935,7 +1942,17 @@ void logStackTrace(void *eip, int uplevel) { /* Write symbols to log file */ ++uplevel; - writeStacktraces(fd, uplevel); +#ifdef __linux__ + if (current_thread) { + writeCurrentThreadsStackTrace(fd, uplevel); + } else { + writeStacktraces(fd, uplevel); + } +#else + /* Outside of linux, we only support writing the current thread. */ + UNUSED(current_thread); + writeCurrentThreadsStackTrace(fd, uplevel); +#endif msg = "\n------ STACK TRACE DONE ------\n"; if (write(fd,msg,strlen(msg)) == -1) {/* Avoid warning. */}; @@ -2218,15 +2235,14 @@ __attribute__ ((noinline)) static void sigsegvHandler(int sig, siginfo_t *info, void *secret) { UNUSED(secret); UNUSED(info); + int print_full_crash_info = 1; /* Check if it is safe to enter the signal handler. second thread crashing at the same time will deadlock. */ if(pthread_mutex_lock(&signal_handler_lock) == EDEADLK) { - /* If this thread already owns the lock (meaning we crashed during handling a signal) - * log that the crash report can't be generated. */ + /* If this thread already owns the lock (meaning we crashed during handling a signal) switch + * to printing the minimal information about the crash. */ serverLogRawFromHandler(LL_WARNING, - "Crashed running signal handler. Can't continue to generate the crash report"); - /* gracefully exit */ - bugReportEnd(1, sig); - return; + "Crashed running signal handler. Providing reduced version of recursive crash report."); + print_full_crash_info = 0; } bugReportStart(); @@ -2260,7 +2276,9 @@ static void sigsegvHandler(int sig, siginfo_t *info, void *secret) { getAndSetMcontextEip(uc, ptr); } - logStackTrace(eip, 1); + /* When printing the reduced crash info, just print the current thread + * to avoid race conditions with the multi-threaded stack collector. 
+     */
+    logStackTrace(eip, 1, !print_full_crash_info);
 
     if (eip == info->si_addr) {
         /* Restore old eip */
@@ -2270,7 +2288,7 @@ static void sigsegvHandler(int sig, siginfo_t *info, void *secret) {
     logRegisters(uc);
 #endif
 
-    printCrashReport();
+    if (print_full_crash_info) printCrashReport();
 
 #ifdef HAVE_BACKTRACE
     if (eip != NULL)
@@ -2430,7 +2448,7 @@ void sigalrmSignalHandler(int sig, siginfo_t *info, void *secret) {
         serverLogRawFromHandler(LL_WARNING, "\nReceived SIGALRM");
     }
 #ifdef HAVE_BACKTRACE
-    logStackTrace(getAndSetMcontextEip(uc, NULL), 1);
+    logStackTrace(getAndSetMcontextEip(uc, NULL), 1, 0);
 #else
     serverLogRawFromHandler(LL_WARNING,"Sorry: no support for backtrace().");
 #endif
diff --git a/tests/modules/Makefile b/tests/modules/Makefile
index d63c8548d63..586e66e067f 100644
--- a/tests/modules/Makefile
+++ b/tests/modules/Makefile
@@ -4,10 +4,10 @@ uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
 warning_cflags = -W -Wall -Wno-missing-field-initializers
 ifeq ($(uname_S),Darwin)
-    SHOBJ_CFLAGS ?= $(warning_cflags) -dynamic -fno-common -g -ggdb -std=c99 -O2
+    SHOBJ_CFLAGS ?= $(warning_cflags) -dynamic -fno-common -g -ggdb -std=gnu11 -O2
     SHOBJ_LDFLAGS ?= -bundle -undefined dynamic_lookup
 else # Linux, others
-    SHOBJ_CFLAGS ?= $(warning_cflags) -fno-common -g -ggdb -std=c99 -O2
+    SHOBJ_CFLAGS ?= $(warning_cflags) -fno-common -g -ggdb -std=gnu11 -O2
     SHOBJ_LDFLAGS ?= -shared
 endif
 
@@ -62,7 +62,8 @@ TEST_MODULES = \
     usercall.so \
     postnotifications.so \
     moduleauthtwo.so \
-    rdbloadsave.so
+    rdbloadsave.so \
+    crash.so
 
 .PHONY: all
 
diff --git a/tests/modules/crash.c b/tests/modules/crash.c
new file mode 100644
index 00000000000..c7eccda529a
--- /dev/null
+++ b/tests/modules/crash.c
@@ -0,0 +1,39 @@
+#include "redismodule.h"
+
+#include <strings.h>
+#include <sys/mman.h>
+
+#define UNUSED(V) ((void) V)
+
+void assertCrash(RedisModuleInfoCtx *ctx, int for_crash_report) {
+    UNUSED(ctx);
+    UNUSED(for_crash_report);
+    RedisModule_Assert(0);
+}
+
+void segfaultCrash(RedisModuleInfoCtx *ctx, int for_crash_report) {
+    UNUSED(ctx);
+    UNUSED(for_crash_report);
+    /* Compiler gives warnings about writing to a random address
+     * e.g "*((char*)-1) = 'x';". As a workaround, we map a read-only area
+     * and try to write there to trigger segmentation fault. */
+    char *p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    *p = 'x';
+}
+
+int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+    REDISMODULE_NOT_USED(argv);
+    REDISMODULE_NOT_USED(argc);
+    if (RedisModule_Init(ctx,"infocrash",1,REDISMODULE_APIVER_1)
+        == REDISMODULE_ERR) return REDISMODULE_ERR;
+    RedisModule_Assert(argc == 1);
+    if (!strcasecmp(RedisModule_StringPtrLen(argv[0], NULL), "segfault")) {
+        if (RedisModule_RegisterInfoFunc(ctx, segfaultCrash) == REDISMODULE_ERR) return REDISMODULE_ERR;
+    } else if(!strcasecmp(RedisModule_StringPtrLen(argv[0], NULL), "assert")) {
+        if (RedisModule_RegisterInfoFunc(ctx, assertCrash) == REDISMODULE_ERR) return REDISMODULE_ERR;
+    } else {
+        return REDISMODULE_ERR;
+    }
+
+    return REDISMODULE_OK;
+}
diff --git a/tests/unit/moduleapi/crash.tcl b/tests/unit/moduleapi/crash.tcl
new file mode 100644
index 00000000000..82b32f9542d
--- /dev/null
+++ b/tests/unit/moduleapi/crash.tcl
@@ -0,0 +1,52 @@
+# This file is used to test certain crash edge cases to make sure they produce
+# correct stack traces for debugging.
+set testmodule [file normalize tests/modules/crash.so]
+
+# Valgrind will complain that the process terminated by a signal, skip it.
+if {!$::valgrind} { + start_server {tags {"modules"}} { + r module load $testmodule assert + test {Test module crash when info crashes with an assertion } { + catch {r 0 info infocrash} + set res [wait_for_log_messages 0 {"*=== REDIS BUG REPORT START: Cut & paste starting from here ===*"} 0 10 1000] + set loglines [lindex $res 1] + + set res [wait_for_log_messages 0 {"*ASSERTION FAILED*"} $loglines 10 1000] + set loglines [lindex $res 1] + + set res [wait_for_log_messages 0 {"*RECURSIVE ASSERTION FAILED*"} $loglines 10 1000] + set loglines [lindex $res 1] + + wait_for_log_messages 0 {"*=== REDIS BUG REPORT END. Make sure to include from START to END. ===*"} $loglines 10 1000 + assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT END. Make sure to include from START to END. ==="] + assert_equal 2 [count_log_message 0 "ASSERTION FAILED"] + # There will be 3 crash assertions, 1 in the first stack trace and 2 in the second + assert_equal 3 [count_log_message 0 "assertCrash"] + assert_equal 1 [count_log_message 0 "RECURSIVE ASSERTION FAILED"] + assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT START: Cut & paste starting from here ==="] + } + } + + start_server {tags {"modules"}} { + r module load $testmodule segfault + test {Test module crash when info crashes with a segfault} { + catch {r 0 info infocrash} + set res [wait_for_log_messages 0 {"*=== REDIS BUG REPORT START: Cut & paste starting from here ===*"} 0 10 1000] + set loglines [lindex $res 1] + + set res [wait_for_log_messages 0 {"*Crashed running the instruction at*"} $loglines 10 1000] + set loglines [lindex $res 1] + + set res [wait_for_log_messages 0 {"*Crashed running signal handler. Providing reduced version of recursive crash report*"} $loglines 10 1000] + set loglines [lindex $res 1] + set res [wait_for_log_messages 0 {"*Crashed running the instruction at*"} $loglines 10 1000] + set loglines [lindex $res 1] + + wait_for_log_messages 0 {"*=== REDIS BUG REPORT END. Make sure to include from START to END. ===*"} $loglines 10 1000 + assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT END. Make sure to include from START to END. ==="] + assert_equal 1 [count_log_message 0 "Crashed running signal handler. Providing reduced version of recursive crash report"] + assert_equal 2 [count_log_message 0 "Crashed running the instruction at"] + assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT START: Cut & paste starting from here ==="] + } + } +} From 518983835006dc670cc8dd210293fe608fb909a9 Mon Sep 17 00:00:00 2001 From: Lior Kogan Date: Wed, 3 Jan 2024 17:21:19 +0200 Subject: [PATCH 26/58] Update CONTRIBUTING.md (#12907) - Referring to Redis Discord channel instead of the mailing list. - Referring to the licensing instead of repeating it. --- CONTRIBUTING.md | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 56b71834d6b..359020e07e5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,20 +1,19 @@ Note: by contributing code to the Redis project in any form, including sending -a pull request via Github, a code fragment or patch via private email or +a pull request via GitHub, a code fragment or patch via private email or public discussion groups, you agree to release your code under the terms -of the BSD license that you can find in the COPYING file included in the Redis -source distribution. You will include BSD license in the COPYING file within -each source file that you contribute. 
+of the Redis license that you can find in the COPYING file included in the Redis +source distribution. # IMPORTANT: HOW TO USE REDIS GITHUB ISSUES -Github issues SHOULD ONLY BE USED to report bugs, and for DETAILED feature -requests. Everything else belongs to the Redis Google Group: +GitHub issues SHOULD ONLY BE USED to report bugs and for DETAILED feature +requests. Everything else should be asked on Discord: - https://groups.google.com/forum/m/#!forum/Redis-db + https://discord.com/invite/redis PLEASE DO NOT POST GENERAL QUESTIONS that are not about bugs or suspected -bugs in the Github issues system. We'll be very happy to help you and provide -all the support in the mailing list. +bugs in the GitHub issues system. We'll be delighted to help you and provide +all the support on Discord. There is also an active community of Redis users at Stack Overflow: @@ -33,24 +32,24 @@ straight away: if your feature is not a conceptual fit you'll lose a lot of time writing the code without any reason. Start by posting in the mailing list and creating an issue at Github with the description of, exactly, what you want to accomplish and why. Use cases are important for features to be accepted. -Here you'll see if there is consensus about your idea. +Here you can see if there is consensus about your idea. 2. If in step 1 you get an acknowledgment from the project leaders, use the following procedure to submit a patch: - a. Fork Redis on github ( https://docs.github.com/en/github/getting-started-with-github/fork-a-repo ) + a. Fork Redis on GitHub ( https://docs.github.com/en/github/getting-started-with-github/fork-a-repo ) b. Create a topic branch (git checkout -b my_branch) c. Push to your branch (git push origin my_branch) - d. Initiate a pull request on github ( https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request ) + d. Initiate a pull request on GitHub ( https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request ) e. Done :) 3. Keep in mind that we are very overloaded, so issues and PRs sometimes wait -for a *very* long time. However this is not lack of interest, as the project +for a *very* long time. However this is not a lack of interest, as the project gets more and more users, we find ourselves in a constant need to prioritize certain issues/PRs over others. If you think your issue/PR is very important try to popularize it, have other users commenting and sharing their point of -view and so forth. This helps. +view, and so forth. This helps. -4. For minor fixes just open a pull request on Github. +4. For minor fixes - open a pull request on GitHub. Thanks! From 38f02349462d5aefa9b25386a130ebd67db1a4de Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 4 Jan 2024 22:38:33 +0200 Subject: [PATCH 27/58] Bump cross-platform-actions/action from 0.21.1 to 0.22.0 (#12904) Bumps [cross-platform-actions/action](https://github.com/cross-platform-actions/action) from 0.21.1 to 0.22.0.
Release notes

Sourced from cross-platform-actions/action's releases.

Cross Platform Action 0.22.0

Added

  • Added support for using the action in multiple steps in the same job (#26). All the inputs need to be the same for all steps, except for the following inputs: sync_files, shutdown_vm and run.

  • Added support for specifying that the VM should not shutdown after the action has run. This adds a new input parameter: shutdown_vm. When set to false, this will hopefully mitigate very frequent freezing of VM during teardown (#61, #72).

Changed

  • Always terminate VM instead of shutting down. This is more efficient and this will hopefully mitigate very frequent freezing of VM during teardown (#61, #72).

  • Use unsafe as the cache mode for QEMU disks. This should improve performance (#67).

Commits
  • 5800fa0 Release 0.22.0
  • 20ad4b2 Fix #67: Use unsafe as the cache mode disks
  • d918493 Always terminate VM instead of shutting down.
  • 626f1d6 Fix error when terminating the VM
  • d59f08d Print stack trace for uncaught exceptions
  • 7f2fab9 Revert "Run SSH in verbose mode when debug mode is enabled"
  • 0f566c3 [no ci] Update the changelog
  • b7f7744 [no ci] Fix spelling
  • 9894a9b Wrap host module in namespace
  • 87fdd34 Fix broken test-vm-shutdown tests
  • Additional commits viewable in compare view

Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/daily.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index 04b166490cc..8e382ec80a3 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -916,7 +916,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: test - uses: cross-platform-actions/action@v0.21.1 + uses: cross-platform-actions/action@v0.22.0 with: operating_system: freebsd environment_variables: MAKE From 4cae66f5e803c527c4e6141c06b94670162eca2c Mon Sep 17 00:00:00 2001 From: Binbin Date: Sun, 7 Jan 2024 12:24:41 +0800 Subject: [PATCH 28/58] Use shard-id of the master if the replica does not support shard-id (#12805) If there are nodes in the cluster that do not support shard-id, they will gossip shard-id. From the perspective of nodes that support shard-id, their shard-id is meaningless (since shard-id is randomly generated when we create a node.) Nodes that support shard-id will save the shard-id information in nodes.conf. If the node is restarted according to nodes.conf, the server will report a corrupted cluster config file error. Because auxShardIdSetter will reject configurations with inconsistent master-replica shard-ids. A cluster-wide consensus for the node's shard_id is not necessary. The key is maintaining consistency of the shard_id on each individual 7.2 node. As the cluster progressively upgrades to version 7.2, we can expect the shard_ids across all nodes to naturally converge and align. In this PR, when processing the gossip, if sender is a replica and does not support shard-id, set the shard_id to the shard_id of its master. --- src/cluster.h | 1 + src/cluster_legacy.c | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/cluster.h b/src/cluster.h index 0bd1eb6a051..f21f1e9c16e 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -87,6 +87,7 @@ int clusterNodeIsMaster(clusterNode *n); char *clusterNodeIp(clusterNode *node); int clusterNodeIsSlave(clusterNode *node); clusterNode *clusterNodeGetSlaveof(clusterNode *node); +clusterNode *clusterNodeGetMaster(clusterNode *node); char *clusterNodeGetName(clusterNode *node); int clusterNodeTimedOut(clusterNode *node); int clusterNodeIsFailing(clusterNode *node); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index f203a9416f6..08203edd102 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -2597,11 +2597,23 @@ void clusterProcessPingExtensions(clusterMsg *hdr, clusterLink *link) { /* We know this will be valid since we validated it ahead of time */ ext = getNextPingExt(ext); } + /* If the node did not send us a hostname extension, assume * they don't have an announced hostname. Otherwise, we'll * set it now. */ updateAnnouncedHostname(sender, ext_hostname); updateAnnouncedHumanNodename(sender, ext_humannodename); + /* If the node did not send us a shard-id extension, it means the sender + * does not support it (old version), node->shard_id is randomly generated. + * A cluster-wide consensus for the node's shard_id is not necessary. + * The key is maintaining consistency of the shard_id on each individual 7.2 node. + * As the cluster progressively upgrades to version 7.2, we can expect the shard_ids + * across all nodes to naturally converge and align. + * + * If sender is a replica, set the shard_id to the shard_id of its master. 
+     * Otherwise, we'll set it now. */
+    if (ext_shardid == NULL) ext_shardid = clusterNodeGetMaster(sender)->shard_id;
+    updateShardId(sender, ext_shardid);
 }
 
@@ -5544,7 +5556,7 @@ void addShardReplyForClusterShards(client *c, list *nodes) {
     addReplyBulkCString(c, "slots");
 
     /* Use slot_info_pairs from the primary only */
-    while (n->slaveof != NULL) n = n->slaveof;
+    n = clusterNodeGetMaster(n);
 
     if (n->slot_info_pairs != NULL) {
         serverAssert((n->slot_info_pairs_count % 2) == 0);
@@ -5805,6 +5817,11 @@ clusterNode *clusterNodeGetSlaveof(clusterNode *node) {
     return node->slaveof;
 }
 
+clusterNode *clusterNodeGetMaster(clusterNode *node) {
+    while (node->slaveof != NULL) node = node->slaveof;
+    return node;
+}
+
 char *clusterNodeGetName(clusterNode *node) {
     return node->name;
 }

From ca1f67af80434f831e8aa0eaaf8a6573bcb31bcb Mon Sep 17 00:00:00 2001
From: "debing.sun"
Date: Sun, 7 Jan 2024 18:10:29 +0800
Subject: [PATCH 29/58] Make RM_Yield thread-safe (#12905)

## Issues and solutions from #12817

1. Touching ProcessingEventsWhileBlocked and calling moduleCount() without
   the GIL in afterSleep()

   - Introduced: Version: 7.0.0, PR: #9963

   - Harm Level: Very High
     If the module thread calls `RM_Yield()` before the main thread enters
     afterSleep(), and modifies `ProcessingEventsWhileBlocked` (+1), it will
     cause the main thread not to wait for the GIL, which can lead to all
     kinds of unforeseen problems, including memory data corruption.

   - Initial / Abandoned Solution:
     * Added the `__thread` specifier for ProcessingEventsWhileBlocked.
       `ProcessingEventsWhileBlocked` is used to protect against nested event
       processing, but event processing in the main thread and module threads
       should be completely independent and unaffected, so it is safer to use
       TLS.
     * Adding a cached module count to keep track of the current number of
       modules, to avoid having to use `dictSize()`.

   - Related Warnings:
```
WARNING: ThreadSanitizer: data race (pid=1136)
  Write of size 4 at 0x0001045990c0 by thread T4 (mutexes: write M0):
    #0 processEventsWhileBlocked networking.c:4135 (redis-server:arm64+0x10006d124)
    #1 RM_Yield module.c:2410 (redis-server:arm64+0x10018b66c)
    #2 bg_call_worker :83232836 (blockedclient.so:arm64+0x16a8)

  Previous read of size 4 at 0x0001045990c0 by main thread:
    #0 afterSleep server.c:1861 (redis-server:arm64+0x100024f98)
    #1 aeProcessEvents ae.c:408 (redis-server:arm64+0x10000fd64)
    #2 aeMain ae.c:496 (redis-server:arm64+0x100010f0c)
    #3 main server.c:7220 (redis-server:arm64+0x10003f38c)
```

2. aeApiPoll() is not thread-safe
   When using RM_Yield to handle events in a module thread, if the main
   thread has not yet entered `afterSleep()`, both the module thread and the
   main thread may touch `server.el` at the same time.

   - Introduced: Version: 7.0.0, PR: #9963

   - Old / Abandoned Solution:
     Adding a new mutex to protect the window between beforeSleep() and
     afterSleep().
     Defect: If the main thread enters the ae loop without any IO events, it
     will wait until the next timeout or until there is any event again, and
     the module thread will always hang until the main thread leaves the
     event loop.
   - Related Warnings:
```
SUMMARY: ThreadSanitizer: data race ae_kqueue.c:55 in addEventMask
==================
==================
WARNING: ThreadSanitizer: data race (pid=14682)
  Write of size 4 at 0x000100b54000 by thread T9 (mutexes: write M0):
    #0 aeApiPoll ae_kqueue.c:175 (redis-server:arm64+0x100010588)
    #1 aeProcessEvents ae.c:399 (redis-server:arm64+0x10000fb84)
    #2 processEventsWhileBlocked networking.c:4138 (redis-server:arm64+0x10006d3c4)
    #3 RM_Yield module.c:2410 (redis-server:arm64+0x10018b66c)
    #4 bg_call_worker :16042052 (blockedclient.so:arm64+0x169c)

  Previous write of size 4 at 0x000100b54000 by main thread:
    #0 aeApiPoll ae_kqueue.c:175 (redis-server:arm64+0x100010588)
    #1 aeProcessEvents ae.c:399 (redis-server:arm64+0x10000fb84)
    #2 aeMain ae.c:496 (redis-server:arm64+0x100010da8)
    #3 main server.c:7238 (redis-server:arm64+0x10003f51c)
```

## The final fix, as suggested in the comments:
https://github.com/redis/redis/pull/12817#discussion_r1436427232

Optimized solution based on the above comment:

First, we add `module_gil_acquring` to indicate whether the main thread is
currently in the acquiring GIL state.

When the module thread starts to yield, there are two possibilities (we
assume the caller keeps the GIL):

1. The main thread is between beforeSleep() and afterSleep(), that is,
   `module_gil_acquring` is not 1 now. At this point, the module thread will
   wake up the main thread through the pipe and leave the yield, waiting for
   the next yield when the main thread may already be in the acquiring GIL
   state.

2. The main thread is in the acquiring GIL state. The module thread releases
   the GIL, yielding the CPU to give the main thread an opportunity to start
   event processing, and then acquires the GIL again until the main thread
   releases it.

This is the direction mentioned in
https://github.com/redis/redis/pull/12817#discussion_r1436427232.

---------

Co-authored-by: Oran Agra
---
 src/module.c | 29 ++++++++++++++++++++++++++++-
 src/server.c |  2 ++
 src/server.h |  1 +
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/src/module.c b/src/module.c
index b966998c671..a60a345ae38 100644
--- a/src/module.c
+++ b/src/module.c
@@ -2407,7 +2407,33 @@ void RM_Yield(RedisModuleCtx *ctx, int flags, const char *busy_reply) {
         server.busy_module_yield_flags |= BUSY_MODULE_YIELD_CLIENTS;
 
     /* Let redis process events */
-    processEventsWhileBlocked();
+    if (!pthread_equal(server.main_thread_id, pthread_self())) {
+        /* If we are not in the main thread, we defer event loop processing to the main thread
+         * after the main thread enters the acquiring GIL state in order to protect the event
+         * loop (ae.c) and avoid potential race conditions. */
+
+        int acquiring;
+        atomicGet(server.module_gil_acquring, acquiring);
+        if (!acquiring) {
+            /* If the main thread has not yet entered the acquiring GIL state,
+             * we attempt to wake it up and exit without waiting for it to
+             * acquire the GIL. This avoids blocking the caller, allowing them to
+             * continue with unfinished tasks before the next yield.
+             * We assume the caller keeps the GIL locked. */
+            if (write(server.module_pipe[1],"A",1) != 1) {
+                /* Ignore the error, this is best-effort. */
+            }
+        } else {
+            /* Release the GIL, yielding CPU to give the main thread an opportunity to start
+             * event processing, and then acquire the GIL again until the main thread releases it. */
+            moduleReleaseGIL();
+            sched_yield();
+            moduleAcquireGIL();
+        }
+    } else {
+        /* If we are in the main thread, we can safely process events. */
+        processEventsWhileBlocked();
+    }
 
     server.busy_module_yield_reply = prev_busy_module_yield_reply;
     /* Possibly restore the previous flags in case of two nested contexts
@@ -11888,6 +11914,7 @@ void moduleInitModulesSystem(void) {
     moduleUnblockedClients = listCreate();
     server.loadmodule_queue = listCreate();
     server.module_configs_queue = dictCreate(&sdsKeyValueHashDictType);
+    server.module_gil_acquring = 0;
     modules = dictCreate(&modulesDictType);
     moduleAuthCallbacks = listCreate();
 
diff --git a/src/server.c b/src/server.c
index 5a17446dc2c..4fd4a993c3e 100644
--- a/src/server.c
+++ b/src/server.c
@@ -1875,7 +1875,9 @@ void afterSleep(struct aeEventLoop *eventLoop) {
             mstime_t latency;
             latencyStartMonitor(latency);
 
+            atomicSet(server.module_gil_acquring, 1);
             moduleAcquireGIL();
+            atomicSet(server.module_gil_acquring, 0);
             moduleFireServerEvent(REDISMODULE_EVENT_EVENTLOOP,
                                   REDISMODULE_SUBEVENT_EVENTLOOP_AFTER_SLEEP,
                                   NULL);
diff --git a/src/server.h b/src/server.h
index 437793b0e22..b398d8ae93e 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1608,6 +1608,7 @@ struct redisServer {
     int module_pipe[2];       /* Pipe used to awake the event loop by module threads. */
     pid_t child_pid;          /* PID of current child */
     int child_type;           /* Type of current child */
+    redisAtomic int module_gil_acquring; /* Indicates whether the GIL is being acquired by the main thread. */
     /* Networking */
     int port;                 /* TCP listening port */
     int tls_port;             /* TLS listening port */

From 5b0c6a8255af2d0e4921fa60d631bb3857724cb6 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Mon, 8 Jan 2024 12:54:41 +0800
Subject: [PATCH 30/58] Fix CLUSTER SHARDS crash in 7.0/7.2 mixed clusters
 where shard ids are not sync (#12832)

Crash reported in #12695. In the process of upgrading the cluster from 7.0
to 7.2, because the 7.0 nodes will not gossip shard id, in 7.2 we will rely
on shard id to build the server.cluster->shards dict.

In some cases, for example with a 7.0 master node and a 7.2 replica node,
the 7.2 replica's cluster->shards dictionary does not contain its master
node. In this case calling CLUSTER SHARDS on the 7.2 replica node may
crash.

We should fix the underlying assumption of updateShardId, which is that
the shard dict should always be in sync with the node's shard_id. The fix
was suggested by PingXie, see more details in #12695.
---
 src/cluster_legacy.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 08203edd102..e09ec6d3add 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -1609,6 +1609,7 @@ void clusterRenameNode(clusterNode *node, char *newname) {
     serverAssert(retval == DICT_OK);
     memcpy(node->name, newname, CLUSTER_NAMELEN);
     clusterAddNode(node);
+    clusterAddNodeToShard(node->shard_id, node);
 }
 
 void clusterAddNodeToShard(const char *shard_id, clusterNode *node) {
@@ -2156,6 +2157,7 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
                 node->tls_port = msg_tls_port;
                 node->cport = ntohs(g->cport);
                 clusterAddNode(node);
+                clusterAddNodeToShard(node->shard_id, node);
             }
         }
 
@@ -2957,6 +2959,10 @@ int clusterProcessPacket(clusterLink *link) {
                     clusterNodeAddSlave(master,sender);
                     sender->slaveof = master;
 
+                    /* Update the shard_id when a replica is connected to its
+                     * primary for the very first time. */
+                    updateShardId(sender, master->shard_id);
+
                     /* Update config. */
                    clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG);
                }

From 4730563e93152cac9b840d1768b35d8ce14b2f1f Mon Sep 17 00:00:00 2001
From: "debing.sun"
Date: Mon, 8 Jan 2024 16:17:13 +0800
Subject: [PATCH 31/58] Change destination key's key-spec flag from RW to OW
 for SINTERSTORE command (#12917)

In #10122, we set the destination key's flag of SINTERSTORE to `RW`;
however, this command doesn't actually read or modify the destination key,
it just overwrites it. Therefore, we change it to `OW`, similarly to all
other *STORE commands.
---
 src/commands.def              | 2 +-
 src/commands/sinterstore.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/commands.def b/src/commands.def
index deba47feabb..3338ee20a06 100644
--- a/src/commands.def
+++ b/src/commands.def
@@ -7816,7 +7816,7 @@ struct COMMAND_ARG SINTERCARD_Args[] = {
 #ifndef SKIP_CMD_KEY_SPECS_TABLE
 /* SINTERSTORE key specs */
 keySpec SINTERSTORE_Keyspecs[2] = {
-{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}},{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={2},KSPEC_FK_RANGE,.fk.range={-1,1,0}}
+{NULL,CMD_KEY_OW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}},{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={2},KSPEC_FK_RANGE,.fk.range={-1,1,0}}
 };
 #endif

diff --git a/src/commands/sinterstore.json b/src/commands/sinterstore.json
index 28ccfff691e..e8e4bb44746 100644
--- a/src/commands/sinterstore.json
+++ b/src/commands/sinterstore.json
@@ -16,7 +16,7 @@
     "key_specs": [
         {
             "flags": [
-                "RW",
+                "OW",
                 "UPDATE"
             ],
             "begin_search": {

From c452e414a800372a33aa2a112d563cea560600f5 Mon Sep 17 00:00:00 2001
From: Yanqi Lv
Date: Mon, 8 Jan 2024 16:32:31 +0800
Subject: [PATCH 32/58] Optimize performance when many clients
 [p|s]unsubscribe simultaneously (#12838)

I've been testing the performance of the Pub/Sub commands recently. I find
that if many clients unsubscribe or are killed simultaneously, Redis needs
a long time to deal with it.

In my experiment, I set 5000 clients and each client subscribes to 100
channels. Then I call `client kill type pubsub` to simulate the situation
where clients unsubscribe all channels at the same time and calculate the
execution time. The result shows that it takes about 23s. I use _perf_ and
find that `listSearchKey` in `pubsubUnsubscribeChannel` costs more than
90% of the CPU time. I think we can optimize this situation.

In this PR, I replace the list with a dict to track the clients subscribed
to the channel more efficiently. It changes O(N) to O(1) in the search
phase. Then I repeat the experiment as above. The results are as follows.

|                   | Execution Time(s) | used_memory(MB) |
| :---------------- | :------: | :----: |
| unstable(1bd0b54) | 23.734 | 65.41 |
| optimize-pubsub   | 0.288 | 67.66 |

Thanks to #11595, I use a no-value dict and the results show that the
performance improves significantly but the memory usage only increases
slightly.

Notice:
- This PR will cause a performance degradation of about 20% in the
  `[p|s]subscribe` commands but won't freeze Redis.
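[Editor's note] The core idea of the diff below is easiest to see in isolation: keying each channel's subscriber set by the client's unique id turns the unsubscribe path into a hash lookup instead of the linear scan `listSearchKey` performs. The following is a minimal standalone sketch of that idea; it is an illustration only, not Redis code. The names `clientSet`, `setAdd`, `setRemove`, and `bucketFor` are invented for this sketch, while the actual patch reuses Redis's dict with the no-value `clientDictType`, hashed and compared on `client->id` (see the diff).

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct client { uint64_t id; } client;

#define NBUCKETS 1024 /* fixed table size for brevity; Redis's dict rehashes */

typedef struct node { client *c; struct node *next; } node;
typedef struct { node *buckets[NBUCKETS]; } clientSet;

static size_t bucketFor(uint64_t id) { return (size_t)(id % NBUCKETS); }

/* O(1) expected: hash straight to the bucket, no full scan. */
static void setAdd(clientSet *s, client *c) {
    size_t b = bucketFor(c->id);
    node *n = malloc(sizeof(*n));
    if (!n) exit(1);
    n->c = c;
    n->next = s->buckets[b];
    s->buckets[b] = n;
}

/* O(1) expected: only the short bucket chain is walked, unlike a
 * list search, which walks every subscriber of the channel. */
static int setRemove(clientSet *s, client *c) {
    node **p = &s->buckets[bucketFor(c->id)];
    while (*p) {
        if ((*p)->c->id == c->id) {
            node *dead = *p;
            *p = dead->next;
            free(dead);
            return 1;
        }
        p = &(*p)->next;
    }
    return 0; /* client was not subscribed */
}

int main(void) {
    static clientSet subscribers; /* zero-initialized */
    client c1 = { .id = 1 }, c2 = { .id = 2 };
    setAdd(&subscribers, &c1);
    setAdd(&subscribers, &c2);
    /* Simulate "client kill type pubsub": each removal is O(1). */
    printf("removed c1: %d\n", setRemove(&subscribers, &c1));
    printf("removed c2: %d\n", setRemove(&subscribers, &c2));
    return 0;
}
```

Under this scheme, tearing down N subscribers of a channel costs O(N) overall instead of O(N^2), which is consistent with the 23.7s to 0.29s improvement reported above.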
--- src/pubsub.c | 82 +++++++++++++++++++++++++--------------------------- src/server.c | 42 +++++++++++++++++++++++++-- src/server.h | 2 ++ 3 files changed, 81 insertions(+), 45 deletions(-) diff --git a/src/pubsub.c b/src/pubsub.c index 6c69431b80f..9e3958b363b 100644 --- a/src/pubsub.c +++ b/src/pubsub.c @@ -280,7 +280,7 @@ void unmarkClientAsPubSub(client *c) { int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { dict **d_ptr; dictEntry *de; - list *clients = NULL; + dict *clients = NULL; int retval = 0; unsigned int slot = 0; @@ -294,13 +294,13 @@ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { } d_ptr = type.serverPubSubChannels(slot); if (*d_ptr == NULL) { - *d_ptr = dictCreate(&keylistDictType); + *d_ptr = dictCreate(&objToDictDictType); de = NULL; } else { de = dictFind(*d_ptr, channel); } if (de == NULL) { - clients = listCreate(); + clients = dictCreate(&clientDictType); dictAdd(*d_ptr, channel, clients); incrRefCount(channel); if (type.shard) { @@ -309,7 +309,7 @@ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { } else { clients = dictGetVal(de); } - listAddNodeTail(clients,c); + serverAssert(dictAdd(clients, c, NULL) != DICT_ERR); } /* Notify the client */ addReplyPubsubSubscribed(c,channel,type); @@ -321,8 +321,7 @@ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype type) { dict *d; dictEntry *de; - list *clients; - listNode *ln; + dict *clients; int retval = 0; int slot = 0; @@ -340,11 +339,9 @@ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype ty de = dictFind(d, channel); serverAssertWithInfo(c,NULL,de != NULL); clients = dictGetVal(de); - ln = listSearchKey(clients,c); - serverAssertWithInfo(c,NULL,ln != NULL); - listDelNode(clients,ln); - if (listLength(clients) == 0) { - /* Free the list and associated hash entry at all if this was + serverAssertWithInfo(c, NULL, dictDelete(clients, c) == DICT_OK); + if (dictSize(clients) == 0) { + /* Free the dict and associated hash entry at all if this was * the latest client, so that it will be possible to abuse * Redis PUBSUB creating millions of channels. */ dictDelete(d, channel); @@ -376,11 +373,13 @@ void pubsubShardUnsubscribeAllChannelsInSlot(unsigned int slot) { dictEntry *de; while ((de = dictNext(di)) != NULL) { robj *channel = dictGetKey(de); - list *clients = dictGetVal(de); + dict *clients = dictGetVal(de); + if (dictSize(clients) == 0) goto cleanup; /* For each client subscribed to the channel, unsubscribe it. */ - listNode *ln; - while ((ln = listFirst(clients)) != NULL) { - client *c = listNodeValue(ln); + dictIterator *iter = dictGetSafeIterator(clients); + dictEntry *entry; + while ((entry = dictNext(iter)) != NULL) { + client *c = dictGetKey(entry); int retval = dictDelete(c->pubsubshard_channels, channel); serverAssertWithInfo(c,channel,retval == DICT_OK); addReplyPubsubUnsubscribed(c, channel, pubSubShardType); @@ -389,8 +388,9 @@ void pubsubShardUnsubscribeAllChannelsInSlot(unsigned int slot) { if (clientTotalPubSubSubscriptionCount(c) == 0) { unmarkClientAsPubSub(c); } - listDelNode(clients, ln); } + dictReleaseIterator(iter); +cleanup: server.shard_channel_count--; dictDelete(d, channel); } @@ -402,7 +402,7 @@ void pubsubShardUnsubscribeAllChannelsInSlot(unsigned int slot) { /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the client was already subscribed to that pattern. 
*/ int pubsubSubscribePattern(client *c, robj *pattern) { dictEntry *de; - list *clients; + dict *clients; int retval = 0; if (dictAdd(c->pubsub_patterns, pattern, NULL) == DICT_OK) { @@ -411,13 +411,13 @@ int pubsubSubscribePattern(client *c, robj *pattern) { /* Add the client to the pattern -> list of clients hash table */ de = dictFind(server.pubsub_patterns,pattern); if (de == NULL) { - clients = listCreate(); + clients = dictCreate(&clientDictType); dictAdd(server.pubsub_patterns,pattern,clients); incrRefCount(pattern); } else { clients = dictGetVal(de); } - listAddNodeTail(clients,c); + serverAssert(dictAdd(clients, c, NULL) != DICT_ERR); } /* Notify the client */ addReplyPubsubPatSubscribed(c,pattern); @@ -428,8 +428,7 @@ int pubsubSubscribePattern(client *c, robj *pattern) { * 0 if the client was not subscribed to the specified channel. */ int pubsubUnsubscribePattern(client *c, robj *pattern, int notify) { dictEntry *de; - list *clients; - listNode *ln; + dict *clients; int retval = 0; incrRefCount(pattern); /* Protect the object. May be the same we remove */ @@ -439,11 +438,9 @@ int pubsubUnsubscribePattern(client *c, robj *pattern, int notify) { de = dictFind(server.pubsub_patterns,pattern); serverAssertWithInfo(c,NULL,de != NULL); clients = dictGetVal(de); - ln = listSearchKey(clients,c); - serverAssertWithInfo(c,NULL,ln != NULL); - listDelNode(clients,ln); - if (listLength(clients) == 0) { - /* Free the list and associated hash entry at all if this was + serverAssertWithInfo(c, NULL, dictDelete(clients, c) == DICT_OK); + if (dictSize(clients) == 0) { + /* Free the dict and associated hash entry at all if this was * the latest client. */ dictDelete(server.pubsub_patterns,pattern); } @@ -521,8 +518,6 @@ int pubsubPublishMessageInternal(robj *channel, robj *message, pubsubtype type) dict *d; dictEntry *de; dictIterator *di; - listNode *ln; - listIter li; unsigned int slot = 0; /* Send to clients listening for that channel */ @@ -532,17 +527,16 @@ int pubsubPublishMessageInternal(robj *channel, robj *message, pubsubtype type) d = *type.serverPubSubChannels(slot); de = d ? 
dictFind(d, channel) : NULL; if (de) { - list *list = dictGetVal(de); - listNode *ln; - listIter li; - - listRewind(list,&li); - while ((ln = listNext(&li)) != NULL) { - client *c = ln->value; + dict *clients = dictGetVal(de); + dictEntry *entry; + dictIterator *iter = dictGetSafeIterator(clients); + while ((entry = dictNext(iter)) != NULL) { + client *c = dictGetKey(entry); addReplyPubsubMessage(c,channel,message,*type.messageBulk); updateClientMemUsageAndBucket(c); receivers++; } + dictReleaseIterator(iter); } if (type.shard) { @@ -556,19 +550,21 @@ int pubsubPublishMessageInternal(robj *channel, robj *message, pubsubtype type) channel = getDecodedObject(channel); while((de = dictNext(di)) != NULL) { robj *pattern = dictGetKey(de); - list *clients = dictGetVal(de); + dict *clients = dictGetVal(de); if (!stringmatchlen((char*)pattern->ptr, sdslen(pattern->ptr), (char*)channel->ptr, sdslen(channel->ptr),0)) continue; - listRewind(clients,&li); - while ((ln = listNext(&li)) != NULL) { - client *c = listNodeValue(ln); + dictEntry *entry; + dictIterator *iter = dictGetSafeIterator(clients); + while ((entry = dictNext(iter)) != NULL) { + client *c = dictGetKey(entry); addReplyPubsubPatMessage(c,pattern,channel,message); updateClientMemUsageAndBucket(c); receivers++; } + dictReleaseIterator(iter); } decrRefCount(channel); dictReleaseIterator(di); @@ -706,10 +702,10 @@ NULL addReplyArrayLen(c,(c->argc-2)*2); for (j = 2; j < c->argc; j++) { - list *l = dictFetchValue(server.pubsub_channels,c->argv[j]); + dict *d = dictFetchValue(server.pubsub_channels, c->argv[j]); addReplyBulk(c,c->argv[j]); - addReplyLongLong(c,l ? listLength(l) : 0); + addReplyLongLong(c, d ? dictSize(d) : 0); } } else if (!strcasecmp(c->argv[1]->ptr,"numpat") && c->argc == 2) { /* PUBSUB NUMPAT */ @@ -727,10 +723,10 @@ NULL for (j = 2; j < c->argc; j++) { unsigned int slot = calculateKeySlot(c->argv[j]->ptr); dict *d = server.pubsubshard_channels[slot]; - list *l = d ? dictFetchValue(d, c->argv[j]) : NULL; + dict *clients = d ? dictFetchValue(d, c->argv[j]) : NULL; addReplyBulk(c,c->argv[j]); - addReplyLongLong(c,l ? listLength(l) : 0); + addReplyLongLong(c, d ? dictSize(clients) : 0); } } else { addReplySubcommandSyntaxError(c); diff --git a/src/server.c b/src/server.c index 4fd4a993c3e..280644e7ff4 100644 --- a/src/server.c +++ b/src/server.c @@ -282,6 +282,12 @@ void dictListDestructor(dict *d, void *val) listRelease((list*)val); } +void dictDictDestructor(dict *d, void *val) +{ + UNUSED(d); + dictRelease((dict*)val); +} + int dictSdsKeyCompare(dict *d, const void *key1, const void *key2) { @@ -351,6 +357,17 @@ uint64_t dictCStrCaseHash(const void *key) { return dictGenCaseHashFunction((unsigned char*)key, strlen((char*)key)); } +/* Dict hash function for client */ +uint64_t dictClientHash(const void *key) { + return ((client *)key)->id; +} + +/* Dict compare function for client */ +int dictClientKeyCompare(dict *d, const void *key1, const void *key2) { + UNUSED(d); + return ((client *)key1)->id == ((client *)key2)->id; +} + /* Dict compare function for null terminated string */ int dictCStrKeyCompare(dict *d, const void *key1, const void *key2) { int l1,l2; @@ -596,6 +613,18 @@ dictType keylistDictType = { NULL /* allow to expand */ }; +/* KeyDict hash table type has unencoded redis objects as keys and + * dicts as values. It's used for PUBSUB command to track clients subscribing the channels. 
*/ +dictType objToDictDictType = { + dictObjHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictObjKeyCompare, /* key compare */ + dictObjectDestructor, /* key destructor */ + dictDictDestructor, /* val destructor */ + NULL /* allow to expand */ +}; + /* Modules system dictionary type. Keys are module name, * values are pointer to RedisModule struct. */ dictType modulesDictType = { @@ -655,6 +684,15 @@ dictType sdsHashDictType = { NULL /* allow to expand */ }; +/* Client Set dictionary type. Keys are client, values are not used. */ +dictType clientDictType = { + dictClientHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictClientKeyCompare, /* key compare */ + .no_value = 1 /* no values in this dict */ +}; + int htNeedsResize(dict *dict) { long long size, used; @@ -2745,8 +2783,8 @@ void initServer(void) { } server.rehashing = listCreate(); evictionPoolAlloc(); /* Initialize the LRU keys pool. */ - server.pubsub_channels = dictCreate(&keylistDictType); - server.pubsub_patterns = dictCreate(&keylistDictType); + server.pubsub_channels = dictCreate(&objToDictDictType); + server.pubsub_patterns = dictCreate(&objToDictDictType); server.pubsubshard_channels = zcalloc(sizeof(dict *) * slot_count); server.shard_channel_count = 0; server.pubsub_clients = 0; diff --git a/src/server.h b/src/server.h index b398d8ae93e..be33bf8039c 100644 --- a/src/server.h +++ b/src/server.h @@ -2499,6 +2499,8 @@ extern dictType hashDictType; extern dictType stringSetDictType; extern dictType externalStringType; extern dictType sdsHashDictType; +extern dictType clientDictType; +extern dictType objToDictDictType; extern dictType dbExpiresDictType; extern dictType modulesDictType; extern dictType sdsReplyDictType; From 50b8b997636a334e4a9a0f2e502afefdab36f0a8 Mon Sep 17 00:00:00 2001 From: Andy Pan Date: Mon, 8 Jan 2024 17:12:24 +0800 Subject: [PATCH 33/58] Re-indent code and reduce code being complied on Solaris for anetKeepAlive (#12914) This is a follow-up PR for #12782, in which we introduced nested preprocessor directives for TCP keep-alive on Solaris and added redundant indentation for code. Besides, it could result in unreachable code due to the lack of `#else` on the latest Solaris 11.4 where `TCP_KEEPIDLE`, `TCP_KEEPINTVL`, and `TCP_KEEPCNT` are available. As a result, this PR does three main things: - To eliminate the redundant indention for C code in nested preprocessor directives - To add `#else` directives and move `TCP_KEEPALIVE_THRESHOLD` + `TCP_KEEPALIVE_ABORT_THRESHOLD` settings under it, avoid unreachable code and compiler warnings when `#if defined(TCP_KEEPIDLE) && defined(TCP_KEEPINTVL) && defined(TCP_KEEPCNT)` is met on Solaris 11.4 - To remove a few trailing whitespace in comments --- src/anet.c | 97 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 47 deletions(-) diff --git a/src/anet.c b/src/anet.c index 6ed40b32ef3..e4f9ecf37a0 100644 --- a/src/anet.c +++ b/src/anet.c @@ -82,7 +82,7 @@ int anetSetBlock(char *err, int fd, int non_block) { return ANET_ERR; } - /* Check if this flag has been set or unset, if so, + /* Check if this flag has been set or unset, if so, * then there is no need to call fcntl to set/unset it again. */ if (!!(flags & O_NONBLOCK) == !!non_block) return ANET_OK; @@ -107,8 +107,8 @@ int anetBlock(char *err, int fd) { return anetSetBlock(err,fd,0); } -/* Enable the FD_CLOEXEC on the given fd to avoid fd leaks. 
- * This function should be invoked for fd's on specific places +/* Enable the FD_CLOEXEC on the given fd to avoid fd leaks. + * This function should be invoked for fd's on specific places * where fork + execve system calls are called. */ int anetCloexec(int fd) { int r; @@ -145,10 +145,10 @@ int anetKeepAlive(char *err, int fd, int interval) int intvl; int cnt; -/* There are platforms that are expected to support the full mechanism of TCP keep-alive, - * we want the compiler to emit warnings of unused variables if the preprocessor directives - * somehow fail, and other than those platforms, just omit these warnings if they happen. - */ + /* There are platforms that are expected to support the full mechanism of TCP keep-alive, + * we want the compiler to emit warnings of unused variables if the preprocessor directives + * somehow fail, and other than those platforms, just omit these warnings if they happen. + */ #if !(defined(_AIX) || defined(__APPLE__) || defined(__DragonFly__) || \ defined(__FreeBSD__) || defined(__illumos__) || defined(__linux__) || \ defined(__NetBSD__) || defined(__sun)) @@ -158,62 +158,63 @@ int anetKeepAlive(char *err, int fd, int interval) UNUSED(cnt); #endif -/* The implementation of TCP keep-alive on Solaris/SmartOS is a bit unusual - * compared to other Unix-like systems. - * Thus, we need to specialize it on Solaris. */ -#ifdef __sun - /* There are two keep-alive mechanisms on Solaris: - * - By default, the first keep-alive probe is sent out after a TCP connection is idle for two hours. - * If the peer does not respond to the probe within eight minutes, the TCP connection is aborted. - * You can alter the interval for sending out the first probe using the socket option TCP_KEEPALIVE_THRESHOLD +#ifdef __sun + /* The implementation of TCP keep-alive on Solaris/SmartOS is a bit unusual + * compared to other Unix-like systems. + * Thus, we need to specialize it on Solaris. + * + * There are two keep-alive mechanisms on Solaris: + * - By default, the first keep-alive probe is sent out after a TCP connection is idle for two hours. + * If the peer does not respond to the probe within eight minutes, the TCP connection is aborted. + * You can alter the interval for sending out the first probe using the socket option TCP_KEEPALIVE_THRESHOLD * in milliseconds or TCP_KEEPIDLE in seconds. - * The system default is controlled by the TCP ndd parameter tcp_keepalive_interval. The minimum value is ten seconds. - * The maximum is ten days, while the default is two hours. If you receive no response to the probe, + * The system default is controlled by the TCP ndd parameter tcp_keepalive_interval. The minimum value is ten seconds. + * The maximum is ten days, while the default is two hours. If you receive no response to the probe, * you can use the TCP_KEEPALIVE_ABORT_THRESHOLD socket option to change the time threshold for aborting a TCP connection. - * The option value is an unsigned integer in milliseconds. The value zero indicates that TCP should never time out and - * abort the connection when probing. The system default is controlled by the TCP ndd parameter tcp_keepalive_abort_interval. + * The option value is an unsigned integer in milliseconds. The value zero indicates that TCP should never time out and + * abort the connection when probing. The system default is controlled by the TCP ndd parameter tcp_keepalive_abort_interval. * The default is eight minutes. - - * - The second implementation is activated if socket option TCP_KEEPINTVL and/or TCP_KEEPCNT are set. 
- * The time between each consequent probes is set by TCP_KEEPINTVL in seconds. - * The minimum value is ten seconds. The maximum is ten days, while the default is two hours. + * + * - The second implementation is activated if socket option TCP_KEEPINTVL and/or TCP_KEEPCNT are set. + * The time between each consequent probes is set by TCP_KEEPINTVL in seconds. + * The minimum value is ten seconds. The maximum is ten days, while the default is two hours. * The TCP connection will be aborted after certain amount of probes, which is set by TCP_KEEPCNT, without receiving response. */ idle = interval; if (idle < 10) idle = 10; // kernel expects at least 10 seconds if (idle > 10*24*60*60) idle = 10*24*60*60; // kernel expects at most 10 days - - /* `TCP_KEEPIDLE`, `TCP_KEEPINTVL`, and `TCP_KEEPCNT` were not available on Solaris + + /* `TCP_KEEPIDLE`, `TCP_KEEPINTVL`, and `TCP_KEEPCNT` were not available on Solaris * until version 11.4, but let's take a chance here. */ - #if defined(TCP_KEEPIDLE) && defined(TCP_KEEPINTVL) && defined(TCP_KEEPCNT) - if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle))) { - anetSetError(err, "setsockopt TCP_KEEPIDLE: %s\n", strerror(errno)); - return ANET_ERR; - } - intvl = idle/3; - if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl))) { - anetSetError(err, "setsockopt TCP_KEEPINTVL: %s\n", strerror(errno)); - return ANET_ERR; - } - cnt = 3; - if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt))) { - anetSetError(err, "setsockopt TCP_KEEPCNT: %s\n", strerror(errno)); - return ANET_ERR; - } - return ANET_OK; - #endif +#if defined(TCP_KEEPIDLE) && defined(TCP_KEEPINTVL) && defined(TCP_KEEPCNT) + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle))) { + anetSetError(err, "setsockopt TCP_KEEPIDLE: %s\n", strerror(errno)); + return ANET_ERR; + } - /* Fall back to the first implementation of tcp-alive mechanism for older Solaris, - * simulate the tcp-alive mechanism on other platforms via `TCP_KEEPALIVE_THRESHOLD` + `TCP_KEEPALIVE_ABORT_THRESHOLD`. - */ + intvl = idle/3; + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl))) { + anetSetError(err, "setsockopt TCP_KEEPINTVL: %s\n", strerror(errno)); + return ANET_ERR; + } + + cnt = 3; + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt))) { + anetSetError(err, "setsockopt TCP_KEEPCNT: %s\n", strerror(errno)); + return ANET_ERR; + } +#else + /* Fall back to the first implementation of tcp-alive mechanism for older Solaris, + * simulate the tcp-alive mechanism on other platforms via `TCP_KEEPALIVE_THRESHOLD` + `TCP_KEEPALIVE_ABORT_THRESHOLD`. + */ idle *= 1000; // kernel expects milliseconds if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE_THRESHOLD, &idle, sizeof(idle))) { anetSetError(err, "setsockopt TCP_KEEPINTVL: %s\n", strerror(errno)); return ANET_ERR; } - /* Note that the consequent probes will not be sent at equal intervals on Solaris, + /* Note that the consequent probes will not be sent at equal intervals on Solaris, * but will be sent using the exponential backoff algorithm. */ intvl = idle/3; cnt = 3; @@ -222,13 +223,15 @@ int anetKeepAlive(char *err, int fd, int interval) anetSetError(err, "setsockopt TCP_KEEPCNT: %s\n", strerror(errno)); return ANET_ERR; } +#endif return ANET_OK; + #endif #ifdef TCP_KEEPIDLE /* Default settings are more or less garbage, with the keepalive time - * set to 7200 by default on Linux and other Unix-like systems. + * set to 7200 by default on Linux and other Unix-like systems. 
* Modify settings to make the feature actually useful. */ /* Send first probe after interval. */ From 14e4a9835af2bb7df8c42cfb850bd19708cd73d0 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 8 Jan 2024 23:36:34 +0800 Subject: [PATCH 34/58] Fix minor fd leak in rdbSaveToSlavesSockets (#12919) We should close server.rdb_child_exit_pipe when redisFork fails, otherwise the pipe fd will be leaked. Just a cleanup. --- src/rdb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rdb.c b/src/rdb.c index f6b0054cc03..ac88c7be0cc 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3636,6 +3636,7 @@ int rdbSaveToSlavesSockets(int req, rdbSaveInfo *rsi) { } close(rdb_pipe_write); close(server.rdb_pipe_read); + close(server.rdb_child_exit_pipe); zfree(server.rdb_pipe_conns); server.rdb_pipe_conns = NULL; server.rdb_pipe_numconns = 0; From 8bb9a2895ee344ddea737f4df173c944b9a6cbf0 Mon Sep 17 00:00:00 2001 From: Madelyn Olson <34459052+madolson@users.noreply.github.com> Date: Mon, 8 Jan 2024 17:56:06 -0800 Subject: [PATCH 35/58] Address some failures with new tests for improving debug report (#12915) Fix a daily test failure because alpine doesn't support stack traces and add in an extra assertion related to making sure the stack trace was printed twice. --- tests/integration/logging.tcl | 16 ++-------------- tests/support/util.tcl | 19 +++++++++++++++++++ tests/unit/moduleapi/crash.tcl | 29 ++++++++++++++++++++--------- 3 files changed, 41 insertions(+), 23 deletions(-) diff --git a/tests/integration/logging.tcl b/tests/integration/logging.tcl index 1ed3cc4d83e..b547cd8fab1 100644 --- a/tests/integration/logging.tcl +++ b/tests/integration/logging.tcl @@ -1,22 +1,10 @@ tags {"external:skip"} { set system_name [string tolower [exec uname -s]] -set backtrace_supported 0 +set backtrace_supported [system_backtrace_supported] set threads_mngr_supported 0 ;# Do we support printing stack trace from all threads, not just the one that got the signal? - -# We only support darwin or Linux with glibc -if {$system_name eq {darwin}} { - set backtrace_supported 1 -} elseif {$system_name eq {linux}} { +if {$system_name eq {linux}} { set threads_mngr_supported 1 - # Avoid the test on libmusl, which does not support backtrace - # and on static binaries (ldd exit code 1) where we can't detect libmusl - catch { - set ldd [exec ldd src/redis-server] - if {![string match {*libc.*musl*} $ldd]} { - set backtrace_supported 1 - } - } } # look for the DEBUG command in the backtrace, used when we triggered diff --git a/tests/support/util.tcl b/tests/support/util.tcl index 4136fb17e08..9b9ea0ac1c1 100644 --- a/tests/support/util.tcl +++ b/tests/support/util.tcl @@ -1123,3 +1123,22 @@ proc format_command {args} { set _ $cmd } +# Returns whether or not the system supports stack traces +proc system_backtrace_supported {} { + set system_name [string tolower [exec uname -s]] + if {$system_name eq {darwin}} { + return 1 + } elseif {$system_name ne {linux}} { + return 0 + } + + # libmusl does not support backtrace. Also return 0 on + # static binaries (ldd exit code 1) where we can't detect libmusl + catch { + set ldd [exec ldd src/redis-server] + if {![string match {*libc.*musl*} $ldd]} { + return 1 + } + } + return 0 +} diff --git a/tests/unit/moduleapi/crash.tcl b/tests/unit/moduleapi/crash.tcl index 82b32f9542d..dedbb1a1eb2 100644 --- a/tests/unit/moduleapi/crash.tcl +++ b/tests/unit/moduleapi/crash.tcl @@ -1,6 +1,7 @@ # This file is used to test certain crash edge cases to make sure they produce # correct stack traces for debugging. 
 set testmodule [file normalize tests/modules/crash.so]
+set backtrace_supported [system_backtrace_supported]
 
 # Valgrind will complain that the process terminated by a signal, skip it.
 if {!$::valgrind} {
     start_server {tags {"modules"}} {
         r module load $testmodule assert
         test {Test module crash when info crashes with an assertion } {
             catch {r 0 info infocrash}
             set res [wait_for_log_messages 0 {"*=== REDIS BUG REPORT START: Cut & paste starting from here ===*"} 0 10 1000]
             set loglines [lindex $res 1]
@@ -20,8 +21,11 @@ if {!$::valgrind} {
             wait_for_log_messages 0 {"*=== REDIS BUG REPORT END. Make sure to include from START to END. ===*"} $loglines 10 1000
             assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT END. Make sure to include from START to END. ==="]
             assert_equal 2 [count_log_message 0 "ASSERTION FAILED"]
-            # There will be 3 crash assertions, 1 in the first stack trace and 2 in the second
-            assert_equal 3 [count_log_message 0 "assertCrash"]
+            if {$backtrace_supported} {
+                # Make sure the crash trace is printed twice. There will be 3 instances of
+                # assertCrash, 1 in the first stack trace and 2 in the second.
+                assert_equal 3 [count_log_message 0 "assertCrash"]
+            }
             assert_equal 1 [count_log_message 0 "RECURSIVE ASSERTION FAILED"]
             assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT START: Cut & paste starting from here ==="]
         }
@@ -34,18 +38,25 @@ if {!$::valgrind} {
             set res [wait_for_log_messages 0 {"*=== REDIS BUG REPORT START: Cut & paste starting from here ===*"} 0 10 1000]
             set loglines [lindex $res 1]
 
-            set res [wait_for_log_messages 0 {"*Crashed running the instruction at*"} $loglines 10 1000]
-            set loglines [lindex $res 1]
+            if {$backtrace_supported} {
+                set res [wait_for_log_messages 0 {"*Crashed running the instruction at*"} $loglines 10 1000]
+                set loglines [lindex $res 1]
 
-            set res [wait_for_log_messages 0 {"*Crashed running signal handler. Providing reduced version of recursive crash report*"} $loglines 10 1000]
-            set loglines [lindex $res 1]
-            set res [wait_for_log_messages 0 {"*Crashed running the instruction at*"} $loglines 10 1000]
-            set loglines [lindex $res 1]
+                set res [wait_for_log_messages 0 {"*Crashed running signal handler. Providing reduced version of recursive crash report*"} $loglines 10 1000]
+                set loglines [lindex $res 1]
+                set res [wait_for_log_messages 0 {"*Crashed running the instruction at*"} $loglines 10 1000]
+                set loglines [lindex $res 1]
+            }
 
             wait_for_log_messages 0 {"*=== REDIS BUG REPORT END. Make sure to include from START to END. ===*"} $loglines 10 1000
             assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT END. Make sure to include from START to END. ==="]
             assert_equal 1 [count_log_message 0 "Crashed running signal handler. Providing reduced version of recursive crash report"]
-            assert_equal 2 [count_log_message 0 "Crashed running the instruction at"]
+            if {$backtrace_supported} {
+                assert_equal 2 [count_log_message 0 "Crashed running the instruction at"]
+                # Make sure the crash trace is printed twice. There will be 3 instances of
+                # modulesCollectInfo, 1 in the first stack trace and 2 in the second.
+                assert_equal 3 [count_log_message 0 "modulesCollectInfo"]
+            }
             assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT START: Cut & paste starting from here ==="]
         }
     }
 }

From f7b1d0287d62ec9fac72bf14cf789e350d14e52b Mon Sep 17 00:00:00 2001
From: Oran Agra
Date: Tue, 9 Jan 2024 13:51:56 +0200
Subject: [PATCH 36/58] Fix possible corruption in sdsResize (CVE-2023-41056)
 (#12924)

#11766 introduced a bug in sdsResize where it could forget to update the
sds type in the sds header and then cause an overflow in sdsalloc.
it looks like the only implication of that is a possible assertion in HLL, but it's hard to rule out possible heap corruption issues with clientsCronResizeQueryBuffer --- src/sds.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/sds.c b/src/sds.c index f5383f90e6c..4ed792e5cdd 100644 --- a/src/sds.c +++ b/src/sds.c @@ -349,20 +349,22 @@ sds sdsResize(sds s, size_t size, int would_regrow) { * type. */ int use_realloc = (oldtype==type || (type < oldtype && type > SDS_TYPE_8)); size_t newlen = use_realloc ? oldhdrlen+size+1 : hdrlen+size+1; - int alloc_already_optimal = 0; - #if defined(USE_JEMALLOC) - /* je_nallocx returns the expected allocation size for the newlen. - * We aim to avoid calling realloc() when using Jemalloc if there is no - * change in the allocation size, as it incurs a cost even if the - * allocation size stays the same. */ - alloc_already_optimal = (je_nallocx(newlen, 0) == zmalloc_size(sh)); - #endif - - if (use_realloc && !alloc_already_optimal) { - newsh = s_realloc(sh, newlen); - if (newsh == NULL) return NULL; - s = (char*)newsh+oldhdrlen; - } else if (!alloc_already_optimal) { + + if (use_realloc) { + int alloc_already_optimal = 0; + #if defined(USE_JEMALLOC) + /* je_nallocx returns the expected allocation size for the newlen. + * We aim to avoid calling realloc() when using Jemalloc if there is no + * change in the allocation size, as it incurs a cost even if the + * allocation size stays the same. */ + alloc_already_optimal = (je_nallocx(newlen, 0) == zmalloc_size(sh)); + #endif + if (!alloc_already_optimal) { + newsh = s_realloc(sh, newlen); + if (newsh == NULL) return NULL; + s = (char*)newsh+oldhdrlen; + } + } else { newsh = s_malloc(newlen); if (newsh == NULL) return NULL; memcpy((char*)newsh+hdrlen, s, len); From b351a04b1ef5aa409b0fbe2b31fcadca07421400 Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 10 Jan 2024 10:18:59 +0800 Subject: [PATCH 37/58] Add announced-endpoints test to all_tests and fix tls related tests (#12927) The test was introduced in #10745, but we forgot to add it to the test_helper.tcl, so our CI did not actually run it. This PR adds it and ensures it passes CI tests. 
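As an aside on the TLS part of the fix: when the suite runs with TLS, the client port lives in `tls-port` (plain `port` may be 0), so any base port fed to `find_available_port` must come from the right config key. A minimal sketch of that pattern, using the suite's own `R` and `$::tls` conventions (the helper name `cluster_base_port` is hypothetical, not part of the patch):

```tcl
# Hypothetical helper: pick the base port from tls-port when the suite
# is running in TLS mode, from port otherwise.
proc cluster_base_port {node} {
    if {$::tls} {
        return [lindex [R $node config get tls-port] 1]
    }
    return [lindex [R $node config get port] 1]
}
```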
--- tests/test_helper.tcl | 1 + tests/unit/cluster/announced-endpoints.tcl | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 6c3714e9ad0..6909d14342b 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -94,6 +94,7 @@ set ::all_tests { unit/client-eviction unit/violations unit/replybufsize + unit/cluster/announced-endpoints unit/cluster/misc unit/cluster/cli unit/cluster/scripting diff --git a/tests/unit/cluster/announced-endpoints.tcl b/tests/unit/cluster/announced-endpoints.tcl index 941a8e0a396..becba2270e3 100644 --- a/tests/unit/cluster/announced-endpoints.tcl +++ b/tests/unit/cluster/announced-endpoints.tcl @@ -1,8 +1,12 @@ start_cluster 2 2 {tags {external:skip cluster}} { test "Test change cluster-announce-port and cluster-announce-tls-port at runtime" { - set baseport [lindex [R 0 config get port] 1] - set count [expr [llength $::servers] +1 ] + if {$::tls} { + set baseport [lindex [R 0 config get tls-port] 1] + } else { + set baseport [lindex [R 0 config get port] 1] + } + set count [expr [llength $::servers] + 1] set used_port [find_available_port $baseport $count] R 0 config set cluster-announce-tls-port $used_port @@ -17,12 +21,16 @@ start_cluster 2 2 {tags {external:skip cluster}} { R 0 config set cluster-announce-tls-port 0 R 0 config set cluster-announce-port 0 - assert_match "*:$baseport@*" [R 0 CLUSTER NODES] + assert_match "*:$baseport@*" [R 0 CLUSTER NODES] } test "Test change cluster-announce-bus-port at runtime" { - set baseport [lindex [R 0 config get port] 1] - set count [expr [llength $::servers] +1 ] + if {$::tls} { + set baseport [lindex [R 0 config get tls-port] 1] + } else { + set baseport [lindex [R 0 config get port] 1] + } + set count [expr [llength $::servers] + 1] set used_port [find_available_port $baseport $count] # Verify config set cluster-announce-bus-port From b3aaa0a1362d229ba1ecd44629655f76c77304ec Mon Sep 17 00:00:00 2001 From: bentotten <59932872+bentotten@users.noreply.github.com> Date: Thu, 11 Jan 2024 15:48:19 -0800 Subject: [PATCH 38/58] When one shard, sole primary node marks potentially failed replica as FAIL instead of PFAIL (#12824) Fixes issue where a single primary cannot mark a replica as failed in a single-shard cluster. --- src/cluster_legacy.c | 9 +++-- tests/support/cluster_util.tcl | 27 +++++++++++++ tests/test_helper.tcl | 1 + tests/unit/cluster/failure-marking.tcl | 53 ++++++++++++++++++++++++++ 4 files changed, 87 insertions(+), 3 deletions(-) create mode 100644 tests/unit/cluster/failure-marking.tcl diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index e09ec6d3add..db25461f5c0 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -959,7 +959,7 @@ void clusterInit(void) { server.cluster->myself = NULL; server.cluster->currentEpoch = 0; server.cluster->state = CLUSTER_FAIL; - server.cluster->size = 1; + server.cluster->size = 0; server.cluster->todo_before_sleep = 0; server.cluster->nodes = dictCreate(&clusterNodesDictType); server.cluster->shards = dictCreate(&clusterSdsToListType); @@ -4691,10 +4691,13 @@ void clusterCron(void) { /* Timeout reached. Set the node as possibly failing if it is * not already in this state. 
*/ if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) { - serverLog(LL_DEBUG,"*** NODE %.40s possibly failing", - node->name); node->flags |= CLUSTER_NODE_PFAIL; update_state = 1; + if (clusterNodeIsMaster(myself) && server.cluster->size == 1) { + markNodeAsFailingIfNeeded(node); + } else { + serverLog(LL_DEBUG,"*** NODE %.40s possibly failing", node->name); + } } } } diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index 2e3611e1ee2..51604664746 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -199,3 +199,30 @@ proc are_hostnames_propagated {match_string} { } return 1 } + +proc wait_node_marked_fail {ref_node_index instance_id_to_check} { + wait_for_condition 1000 50 { + [check_cluster_node_mark fail $ref_node_index $instance_id_to_check] + } else { + fail "Replica node never marked as FAIL ('fail')" + } +} + +proc wait_node_marked_pfail {ref_node_index instance_id_to_check} { + wait_for_condition 1000 50 { + [check_cluster_node_mark fail\? $ref_node_index $instance_id_to_check] + } else { + fail "Replica node never marked as PFAIL ('fail?')" + } +} + +proc check_cluster_node_mark {flag ref_node_index instance_id_to_check} { + set nodes [get_cluster_nodes $ref_node_index] + + foreach n $nodes { + if {[dict get $n id] eq $instance_id_to_check} { + return [cluster_has_flag $n $flag] + } + } + fail "Unable to find instance id in cluster nodes. ID: $instance_id_to_check" +} diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 6909d14342b..6623d059ee5 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -104,6 +104,7 @@ set ::all_tests { unit/cluster/slot-ownership unit/cluster/links unit/cluster/cluster-response-tls + unit/cluster/failure-marking } # Index to the next test to run in the ::all_tests list. 
 set ::next_test 0

diff --git a/tests/unit/cluster/failure-marking.tcl b/tests/unit/cluster/failure-marking.tcl
new file mode 100644
index 00000000000..c4746c82647
--- /dev/null
+++ b/tests/unit/cluster/failure-marking.tcl
@@ -0,0 +1,53 @@
+# Test that a single primary can mark a replica as `fail`
+start_cluster 1 1 {tags {external:skip cluster}} {
+
+    test "Verify that single primary marks replica as failed" {
+        set primary [srv -0 client]
+
+        set replica1 [srv -1 client]
+        set replica1_pid [srv -1 pid]
+        set replica1_instance_id [dict get [cluster_get_myself 1] id]
+
+        assert {[lindex [$primary role] 0] eq {master}}
+        assert {[lindex [$replica1 role] 0] eq {slave}}
+
+        wait_for_sync $replica1
+
+        pause_process $replica1_pid
+
+        wait_node_marked_fail 0 $replica1_instance_id
+    }
+}
+
+# Test that multiple primaries wait for a quorum and then mark a replica as `fail`
+start_cluster 2 1 {tags {external:skip cluster}} {
+
+    test "Verify that multiple primaries mark replica as failed" {
+        set primary1 [srv -0 client]
+
+        set primary2 [srv -1 client]
+        set primary2_pid [srv -1 pid]
+
+        set replica1 [srv -2 client]
+        set replica1_pid [srv -2 pid]
+        set replica1_instance_id [dict get [cluster_get_myself 2] id]
+
+        assert {[lindex [$primary1 role] 0] eq {master}}
+        assert {[lindex [$primary2 role] 0] eq {master}}
+        assert {[lindex [$replica1 role] 0] eq {slave}}
+
+        wait_for_sync $replica1
+
+        pause_process $replica1_pid
+
+        # Pause the other primary to allow time for the pfail flag to appear
+        pause_process $primary2_pid
+
+        wait_node_marked_pfail 0 $replica1_instance_id
+
+        # Resume the other primary and wait for it to mark the replica as failed
+        resume_process $primary2_pid
+
+        wait_node_marked_fail 0 $replica1_instance_id
+    }
+}

From 964f4a457633cecf98d9501d626d6b6b56a8475b Mon Sep 17 00:00:00 2001
From: Harkrishn Patro
Date: Thu, 11 Jan 2024 15:59:22 -0800
Subject: [PATCH 39/58] Avoid double free of cluster link (#12930)

Avoid a crash while performing `DEBUG CLUSTERLINK KILL` multiple times
(the cluster link might not be created/valid).

---
 src/cluster_legacy.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index db25461f5c0..8dee109df69 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -5796,12 +5796,12 @@ int handleDebugClusterCommand(client *c) {
 
     /* Terminate the link based on the direction or all. */
     if (!strcasecmp(c->argv[3]->ptr, "from")) {
-        freeClusterLink(n->inbound_link);
+        if (n->inbound_link) freeClusterLink(n->inbound_link);
     } else if (!strcasecmp(c->argv[3]->ptr, "to")) {
-        freeClusterLink(n->link);
+        if (n->link) freeClusterLink(n->link);
     } else if (!strcasecmp(c->argv[3]->ptr, "all")) {
-        freeClusterLink(n->link);
-        freeClusterLink(n->inbound_link);
+        if (n->link) freeClusterLink(n->link);
+        if (n->inbound_link) freeClusterLink(n->inbound_link);
     } else {
         addReplyErrorFormat(c, "Unknown direction %s", (char *) c->argv[3]->ptr);
     }

From 87786342a525ddef1cee37fb156a328d6b2b28ec Mon Sep 17 00:00:00 2001
From: Chen Tianjie
Date: Fri, 12 Jan 2024 11:58:53 +0800
Subject: [PATCH 40/58] Correct bytes_per_key computing. (#12897)

Change the calculation method of bytes_per_key to make it closer to
the true average key size. The calculation method is as follows:

mh->bytes_per_key = mh->total_keys ?
(mh->dataset / mh->total_keys) : 0; --- src/object.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/object.c b/src/object.c index bf85c7cc133..48f4820b1e4 100644 --- a/src/object.c +++ b/src/object.c @@ -1274,7 +1274,7 @@ struct redisMemOverhead *getMemoryOverheadData(void) { if (zmalloc_used > mh->startup_allocated) net_usage = zmalloc_used - mh->startup_allocated; mh->dataset_perc = (float)mh->dataset*100/net_usage; - mh->bytes_per_key = mh->total_keys ? (net_usage / mh->total_keys) : 0; + mh->bytes_per_key = mh->total_keys ? (mh->dataset / mh->total_keys) : 0; return mh; } From 284ef21ea000f18d1eaac374679e0d0561f7bef8 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sun, 14 Jan 2024 17:18:17 +0800 Subject: [PATCH 41/58] Fix fd check in memtest_test_linux_anonymous_maps (#12943) The open function returns a fd on success or -1 on failure, here we should check fd != -1, otherwise -1 will be judged as success. This closes #12938. --- src/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/debug.c b/src/debug.c index facbd6104a8..192b69d9c83 100644 --- a/src/debug.c +++ b/src/debug.c @@ -2076,7 +2076,7 @@ int memtest_test_linux_anonymous_maps(void) { int regions = 0, j; int fd = openDirectLogFiledes(); - if (!fd) return 0; + if (fd == -1) return 0; fp = fopen("/proc/self/maps","r"); if (!fp) { From bb2b6e29273c3dd2bf5f09ff625b33df1d5536b2 Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Mon, 15 Jan 2024 09:57:12 +0800 Subject: [PATCH 42/58] fix scripts access wrong slot if they disagree with pre-declared keys (#12906) Regarding how to obtain the hash slot of a key, there is an optimization in `getKeySlot()`, it is used to avoid redundant hash calculations for keys: when the current client is in the process of executing a command, it can directly use the slot of the current client because the slot to access has already been calculated in advance in `processCommand()`. However, scripts are a special case where, in default mode or with `allow-cross-slot-keys` enabled, they are allowed to access keys beyond the pre-declared range. This means that the keys they operate on may not belong to the slot of the pre-declared keys. Currently, when the commands in a script are executed, the slot of the original client (i.e., the current client) is not correctly updated, leading to subsequent access to the wrong slot. This PR fixes the above issue. When checking the cluster constraints in a script, the slot to be accessed by the current command is set for the original client (i.e., the current client). This ensures that `getKeySlot()` gets the correct slot cache. Additionally, the following modifications are made: 1. The 'sort' and 'sort_ro' commands use `getKeySlot()` instead of `c->slot` because the client could be an engine client in a script and can lead to potential bug. 2. `getKeySlot()` is also used in pubsub to obtain the slot for the channel, standardizing the way slots are retrieved. 
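To make the failure mode concrete, a minimal sketch (the slot numbers, `foo` -> 12182 and `bar` -> 5061, are the ones the new test below also asserts on):

```tcl
# The script pre-declares bar (slot 5061) as its only key but then
# writes foo (slot 12182). Before this fix, the slot cached for the
# pre-declared key could be reused when the command inside the script
# looks up foo, so the write could land in the wrong slot's dictionary.
r eval {redis.call('set', 'foo', 'bar')} 1 bar
```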
--- src/pubsub.c | 4 ++-- src/script.c | 13 ++++++++++--- src/script.h | 1 + src/sort.c | 4 ++-- tests/unit/cluster/scripting.tcl | 9 +++++++++ 5 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/pubsub.c b/src/pubsub.c index 9e3958b363b..1a151b96c44 100644 --- a/src/pubsub.c +++ b/src/pubsub.c @@ -290,7 +290,7 @@ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { incrRefCount(channel); /* Add the client to the channel -> list of clients hash table */ if (server.cluster_enabled && type.shard) { - slot = c->slot; + slot = getKeySlot(channel->ptr); } d_ptr = type.serverPubSubChannels(slot); if (*d_ptr == NULL) { @@ -332,7 +332,7 @@ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype ty retval = 1; /* Remove the client from the channel -> clients list hash table */ if (server.cluster_enabled && type.shard) { - slot = c->slot != -1 ? c->slot : (int)keyHashSlot(channel->ptr, sdslen(channel->ptr)); + slot = getKeySlot(channel->ptr); } d = *type.serverPubSubChannels(slot); serverAssertWithInfo(c,NULL,d != NULL); diff --git a/src/script.c b/src/script.c index 678773d9680..4a6461c0b4a 100644 --- a/src/script.c +++ b/src/script.c @@ -209,6 +209,7 @@ int scriptPrepareForRun(scriptRunCtx *run_ctx, client *engine_client, client *ca run_ctx->c = engine_client; run_ctx->original_client = caller; run_ctx->funcname = funcname; + run_ctx->slot = caller->slot; client *script_client = run_ctx->c; client *curr_client = run_ctx->original_client; @@ -262,6 +263,8 @@ void scriptResetRun(scriptRunCtx *run_ctx) { unprotectClient(run_ctx->original_client); } + run_ctx->slot = -1; + preventCommandPropagation(run_ctx->original_client); /* unset curr_run_ctx so we will know there is no running script */ @@ -463,14 +466,18 @@ static int scriptVerifyClusterState(scriptRunCtx *run_ctx, client *c, client *or * already been thrown. This is only checking for cross slot keys being accessed * that weren't pre-declared. */ if (hashslot != -1 && !(run_ctx->flags & SCRIPT_ALLOW_CROSS_SLOT)) { - if (original_c->slot == -1) { - original_c->slot = hashslot; - } else if (original_c->slot != hashslot) { + if (run_ctx->slot == -1) { + run_ctx->slot = hashslot; + } else if (run_ctx->slot != hashslot) { *err = sdsnew("Script attempted to access keys that do not hash to " "the same slot"); return C_ERR; } } + + c->slot = hashslot; + original_c->slot = hashslot; + return C_OK; } diff --git a/src/script.h b/src/script.h index c487165d66c..caf95ef9591 100644 --- a/src/script.h +++ b/src/script.h @@ -74,6 +74,7 @@ struct scriptRunCtx { int flags; int repl_flags; monotime start_time; + int slot; }; /* Scripts flags */ diff --git a/src/sort.c b/src/sort.c index a8b9391b117..bef260555a5 100644 --- a/src/sort.c +++ b/src/sort.c @@ -239,7 +239,7 @@ void sortCommandGeneric(client *c, int readonly) { /* If BY is specified with a real pattern, we can't accept it in cluster mode, * unless we can make sure the keys formed by the pattern are in the same slot * as the key to sort. 
 */
-        if (server.cluster_enabled && patternHashSlot(sortby->ptr, sdslen(sortby->ptr)) != c->slot) {
+        if (server.cluster_enabled && patternHashSlot(sortby->ptr, sdslen(sortby->ptr)) != getKeySlot(c->argv[1]->ptr)) {
             addReplyError(c, "BY option of SORT denied in Cluster mode when "
                              "keys formed by the pattern may be in different slots.");
             syntax_error++;
@@ -258,7 +258,7 @@ void sortCommandGeneric(client *c, int readonly) {
             /* If GET is specified with a real pattern, we can't accept it in cluster mode,
              * unless we can make sure the keys formed by the pattern are in the same slot
              * as the key to sort. */
-            if (server.cluster_enabled && patternHashSlot(c->argv[j+1]->ptr, sdslen(c->argv[j+1]->ptr)) != c->slot) {
+            if (server.cluster_enabled && patternHashSlot(c->argv[j+1]->ptr, sdslen(c->argv[j+1]->ptr)) != getKeySlot(c->argv[1]->ptr)) {
                 addReplyError(c, "GET option of SORT denied in Cluster mode when "
                                  "keys formed by the pattern may be in different slots.");
                 syntax_error++;

diff --git a/tests/unit/cluster/scripting.tcl b/tests/unit/cluster/scripting.tcl
index b60c1255b4c..76aa882e83a 100644
--- a/tests/unit/cluster/scripting.tcl
+++ b/tests/unit/cluster/scripting.tcl
@@ -62,6 +62,15 @@ start_cluster 1 0 {tags {external:skip cluster}} {
         } 1 bar}
     }
 
+    test {Cross slot commands are allowed by default if they disagree with pre-declared keys} {
+        r 0 flushall
+        r 0 eval "redis.call('set', 'foo', 'bar')" 1 bar
+
+        # Make sure the script writes to the right slot
+        assert_equal 1 [r 0 cluster COUNTKEYSINSLOT 12182] ;# foo slot
+        assert_equal 0 [r 0 cluster COUNTKEYSINSLOT 5061]  ;# bar slot
+    }
+
     test "Function no-cluster flag" {
         R 0 function load {#!lua name=test
             redis.register_function{function_name='f1', callback=function() return 'hello' end, flags={'no-cluster'}}

From e2b7932b347d475dbbd7b6cc45008a8666015b6c Mon Sep 17 00:00:00 2001
From: Yanqi Lv
Date: Mon, 15 Jan 2024 14:20:53 +0800
Subject: [PATCH 43/58] Shrink dict when deleting dictEntry (#12850)

When we insert entries into a dict, it may autonomously expand if needed.
However, when we delete entries from a dict, it doesn't shrink to the
proper size. If there are few entries in a very large dict, it can cause
a huge waste of memory and inefficiency when iterating.

The main keyspace dicts (keys and expires) are shrunk by cron
(`tryResizeHashTables` calls `htNeedsResize` and `dictResize`), and some
data structures such as zset and hash also do that (call `htNeedsResize`)
right after a loop of calls to `dictDelete`, but many other dicts are
completely missing that call (they can only expand).

In this PR, we provide the ability to automatically shrink the dict when
deleting. The conditions triggering the shrinking are the same as
`htNeedsResize` used to have, i.e. we expand when we're over 100%
utilization and shrink when we're below 10% utilization.

Additionally:
* Add `dictPauseAutoResize` so that flows that do mass deletions will
  only trigger shrinkage at the end.
* Rename `dictResize` to `dictShrinkToFit` (same logic as it used to have, but better name describing it) * Rename `_dictExpand` to `_dictResize` (same logic as it used to have, but better name describing it) related to discussion https://github.com/redis/redis/pull/12819#discussion_r1409293878 --------- Co-authored-by: Oran Agra Co-authored-by: zhaozhao.zz --- src/dict.c | 95 +++++++++++++++++++++++++++++++---------- src/dict.h | 11 ++++- src/server.c | 12 +++--- src/server.h | 3 +- src/t_hash.c | 3 -- src/t_set.c | 1 - src/t_zset.c | 9 ++-- tests/unit/type/set.tcl | 5 +++ 8 files changed, 99 insertions(+), 40 deletions(-) diff --git a/src/dict.c b/src/dict.c index 328c2dc8105..1b7b2138fc3 100644 --- a/src/dict.c +++ b/src/dict.c @@ -78,7 +78,8 @@ typedef struct { /* -------------------------- private prototypes ---------------------------- */ -static int _dictExpandIfNeeded(dict *d); +static void _dictExpandIfNeeded(dict *d); +static void _dictShrinkIfNeeded(dict *d); static signed char _dictNextExp(unsigned long size); static int _dictInit(dict *d, dictType *type); static dictEntry *dictGetNext(const dictEntry *de); @@ -208,12 +209,13 @@ int _dictInit(dict *d, dictType *type) d->type = type; d->rehashidx = -1; d->pauserehash = 0; + d->pauseAutoResize = 0; return DICT_OK; } /* Resize the table to the minimal size that contains all the elements, * but with the invariant of a USED/BUCKETS ratio near to <= 1 */ -int dictResize(dict *d) +int dictShrinkToFit(dict *d) { unsigned long minimal; @@ -221,20 +223,18 @@ int dictResize(dict *d) minimal = d->ht_used[0]; if (minimal < DICT_HT_INITIAL_SIZE) minimal = DICT_HT_INITIAL_SIZE; - return dictExpand(d, minimal); + return dictShrink(d, minimal); } -/* Expand or create the hash table, +/* Resize or create the hash table, * when malloc_failed is non-NULL, it'll avoid panic if malloc fails (in which case it'll be set to 1). - * Returns DICT_OK if expand was performed, and DICT_ERR if skipped. */ -int _dictExpand(dict *d, unsigned long size, int* malloc_failed) + * Returns DICT_OK if resize was performed, and DICT_ERR if skipped. */ +int _dictResize(dict *d, unsigned long size, int* malloc_failed) { if (malloc_failed) *malloc_failed = 0; - /* the size is invalid if it is smaller than the number of - * elements already inside the hash table */ - if (dictIsRehashing(d) || d->ht_used[0] > size) - return DICT_ERR; + /* We can't rehash twice if rehashing is ongoing. */ + assert(!dictIsRehashing(d)); /* the new hash table */ dictEntry **new_ht_table; @@ -286,6 +286,14 @@ int _dictExpand(dict *d, unsigned long size, int* malloc_failed) return DICT_OK; } +int _dictExpand(dict *d, unsigned long size, int* malloc_failed) { + /* the size is invalid if it is smaller than the size of the hash table + * or smaller than the number of elements already inside the hash table */ + if (dictIsRehashing(d) || d->ht_used[0] > size || DICTHT_SIZE(d->ht_size_exp[0]) >= size) + return DICT_ERR; + return _dictResize(d, size, malloc_failed); +} + /* return DICT_ERR if expand was not performed */ int dictExpand(dict *d, unsigned long size) { return _dictExpand(d, size, NULL); @@ -293,11 +301,20 @@ int dictExpand(dict *d, unsigned long size) { /* return DICT_ERR if expand failed due to memory allocation failure */ int dictTryExpand(dict *d, unsigned long size) { - int malloc_failed; + int malloc_failed = 0; _dictExpand(d, size, &malloc_failed); return malloc_failed? 
DICT_ERR : DICT_OK; } +/* return DICT_ERR if shrink was not performed */ +int dictShrink(dict *d, unsigned long size) { + /* the size is invalid if it is bigger than the size of the hash table + * or smaller than the number of elements already inside the hash table */ + if (dictIsRehashing(d) || d->ht_used[0] > size || DICTHT_SIZE(d->ht_size_exp[0]) <= size) + return DICT_ERR; + return _dictResize(d, size, NULL); +} + /* Performs N steps of incremental rehashing. Returns 1 if there are still * keys to move from the old to the new hash table, otherwise 0 is returned. * @@ -588,6 +605,7 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { dictFreeUnlinkedEntry(d, he); } d->ht_used[table]--; + _dictShrinkIfNeeded(d); return he; } prevHe = he; @@ -752,6 +770,7 @@ void dictTwoPhaseUnlinkFree(dict *d, dictEntry *he, dictEntry **plink, int table dictFreeKey(d, he); dictFreeVal(d, he); if (!entryIsKey(he)) zfree(decodeMaskedPtr(he)); + _dictShrinkIfNeeded(d); dictResumeRehashing(d); } @@ -1401,21 +1420,27 @@ unsigned long dictScanDefrag(dict *d, /* Because we may need to allocate huge memory chunk at once when dict * expands, we will check this allocation is allowed or not if the dict * type has expandAllowed member function. */ -static int dictTypeExpandAllowed(dict *d) { - if (d->type->expandAllowed == NULL) return 1; - return d->type->expandAllowed( +static int dictTypeResizeAllowed(dict *d) { + if (d->type->resizeAllowed == NULL) return 1; + return d->type->resizeAllowed( DICTHT_SIZE(_dictNextExp(d->ht_used[0] + 1)) * sizeof(dictEntry*), (double)d->ht_used[0] / DICTHT_SIZE(d->ht_size_exp[0])); } /* Expand the hash table if needed */ -static int _dictExpandIfNeeded(dict *d) +static void _dictExpandIfNeeded(dict *d) { + /* Automatic resizing is disallowed. Return */ + if (d->pauseAutoResize > 0) return; + /* Incremental rehashing already in progress. Return. */ - if (dictIsRehashing(d)) return DICT_OK; + if (dictIsRehashing(d)) return; /* If the hash table is empty expand it to the initial size. */ - if (DICTHT_SIZE(d->ht_size_exp[0]) == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE); + if (DICTHT_SIZE(d->ht_size_exp[0]) == 0) { + dictExpand(d, DICT_HT_INITIAL_SIZE); + return; + } /* If we reached the 1:1 ratio, and we are allowed to resize the hash * table (global setting) or we should avoid it but the ratio between @@ -1426,11 +1451,35 @@ static int _dictExpandIfNeeded(dict *d) (dict_can_resize != DICT_RESIZE_FORBID && d->ht_used[0] / DICTHT_SIZE(d->ht_size_exp[0]) > dict_force_resize_ratio)) { - if (!dictTypeExpandAllowed(d)) - return DICT_OK; - return dictExpand(d, d->ht_used[0] + 1); + if (!dictTypeResizeAllowed(d)) + return; + dictExpand(d, d->ht_used[0] + 1); + } +} + +static void _dictShrinkIfNeeded(dict *d) +{ + /* Automatic resizing is disallowed. Return */ + if (d->pauseAutoResize > 0) return; + + /* Incremental rehashing already in progress. Return. */ + if (dictIsRehashing(d)) return; + + /* If the size of hash table is DICT_HT_INITIAL_SIZE, don't shrink it. */ + if (DICTHT_SIZE(d->ht_size_exp[0]) == DICT_HT_INITIAL_SIZE) return; + + /* If we reached below 1:10 elements/buckets ratio, and we are allowed to resize + * the hash table (global setting) or we should avoid it but the ratio is below 1:50, + * we'll trigger a resize of the hash table. 
 */
+    if ((dict_can_resize == DICT_RESIZE_ENABLE &&
+         d->ht_used[0] * 100 / DICTHT_SIZE(d->ht_size_exp[0]) < HASHTABLE_MIN_FILL) ||
+        (dict_can_resize != DICT_RESIZE_FORBID &&
+         d->ht_used[0] * 100 / DICTHT_SIZE(d->ht_size_exp[0]) < HASHTABLE_MIN_FILL / dict_force_resize_ratio))
+    {
+        if (!dictTypeResizeAllowed(d))
+            return;
+        dictShrink(d, d->ht_used[0]);
+    }
-    return DICT_OK;
 }
 
 /* Our hash table capability is a power of two */
@@ -1454,8 +1503,7 @@ void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing)
     if (dictIsRehashing(d)) _dictRehashStep(d);
 
     /* Expand the hash table if needed */
-    if (_dictExpandIfNeeded(d) == DICT_ERR)
-        return NULL;
+    _dictExpandIfNeeded(d);
     for (table = 0; table <= 1; table++) {
         idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]);
         if (table == 0 && (long)idx < d->rehashidx) continue;
@@ -1483,6 +1531,7 @@ void dictEmpty(dict *d, void(callback)(dict*)) {
     _dictClear(d,1,callback);
     d->rehashidx = -1;
     d->pauserehash = 0;
+    d->pauseAutoResize = 0;
 }
 
 void dictSetResizeEnabled(dictResizeEnable enable) {

diff --git a/src/dict.h b/src/dict.h
index 3d4de3be253..cebbe14985c 100644
--- a/src/dict.h
+++ b/src/dict.h
@@ -44,6 +44,9 @@
 #define DICT_OK 0
 #define DICT_ERR 1
 
+/* Hash table parameters */
+#define HASHTABLE_MIN_FILL 10 /* Minimal hash table fill 10% */
+
 typedef struct dictEntry dictEntry; /* opaque */
 typedef struct dict dict;
 
@@ -54,7 +57,7 @@ typedef struct dictType {
     int (*keyCompare)(dict *d, const void *key1, const void *key2);
     void (*keyDestructor)(dict *d, void *key);
     void (*valDestructor)(dict *d, void *obj);
-    int (*expandAllowed)(size_t moreMem, double usedRatio);
+    int (*resizeAllowed)(size_t moreMem, double usedRatio);
     /* Invoked at the start of dict initialization/rehashing (old and new ht are already created) */
     void (*rehashingStarted)(dict *d);
     /* Invoked at the end of dict initialization/rehashing of all the entries from old to new ht. Both ht still exists
@@ -91,6 +94,7 @@ struct dict {
     /* Keep small vars at end for optimal (minimal) struct padding */
     int16_t pauserehash; /* If >0 rehashing is paused (<0 indicates coding error) */
     signed char ht_size_exp[2]; /* exponent of size. (size = 1<<(exp)) */
+    int16_t pauseAutoResize; /* If >0 automatic resizing is disallowed (<0 indicates coding error) */
     void *metadata[];
 };
 
@@ -155,6 +159,8 @@ typedef struct {
 #define dictIsRehashing(d) ((d)->rehashidx != -1)
 #define dictPauseRehashing(d) ((d)->pauserehash++)
 #define dictResumeRehashing(d) ((d)->pauserehash--)
+#define dictPauseAutoResize(d) ((d)->pauseAutoResize++)
+#define dictResumeAutoResize(d) ((d)->pauseAutoResize--)
 
 /* If our unsigned long type can store a 64 bit number, use a 64 bit PRNG.
*/ #if ULONG_MAX >= 0xffffffffffffffff @@ -174,6 +180,7 @@ dict *dictCreate(dictType *type); dict **dictCreateMultiple(dictType *type, int count); int dictExpand(dict *d, unsigned long size); int dictTryExpand(dict *d, unsigned long size); +int dictShrink(dict *d, unsigned long size); int dictAdd(dict *d, void *key, void *val); dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing); void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing); @@ -188,7 +195,7 @@ void dictTwoPhaseUnlinkFree(dict *d, dictEntry *he, dictEntry **plink, int table void dictRelease(dict *d); dictEntry * dictFind(dict *d, const void *key); void *dictFetchValue(dict *d, const void *key); -int dictResize(dict *d); +int dictShrinkToFit(dict *d); void dictSetKey(dict *d, dictEntry* de, void *key); void dictSetVal(dict *d, dictEntry *de, void *val); void dictSetSignedIntegerVal(dictEntry *de, int64_t val); diff --git a/src/server.c b/src/server.c index 280644e7ff4..30ec199c010 100644 --- a/src/server.c +++ b/src/server.c @@ -428,7 +428,7 @@ uint64_t dictEncObjHash(const void *key) { * provisionally if used memory will be over maxmemory after dict expands, * but to guarantee the performance of redis, we still allow dict to expand * if dict load factor exceeds HASHTABLE_MAX_LOAD_FACTOR. */ -int dictExpandAllowed(size_t moreMem, double usedRatio) { +int dictResizeAllowed(size_t moreMem, double usedRatio) { if (usedRatio <= HASHTABLE_MAX_LOAD_FACTOR) { return !overMaxmemoryAfterAlloc(moreMem); } else { @@ -547,7 +547,7 @@ dictType dbDictType = { dictSdsKeyCompare, /* key compare */ dictSdsDestructor, /* key destructor */ dictObjectDestructor, /* val destructor */ - dictExpandAllowed, /* allow to expand */ + dictResizeAllowed, /* allow to resize */ dbDictRehashingStarted, dbDictRehashingCompleted, dbDictMetadataSize, @@ -561,7 +561,7 @@ dictType dbExpiresDictType = { dictSdsKeyCompare, /* key compare */ NULL, /* key destructor */ NULL, /* val destructor */ - dictExpandAllowed, /* allow to expand */ + dictResizeAllowed, /* allow to resize */ dbExpiresRehashingStarted, dbExpiresRehashingCompleted, dbDictMetadataSize, @@ -693,7 +693,7 @@ dictType clientDictType = { .no_value = 1 /* no values in this dict */ }; -int htNeedsResize(dict *dict) { +int htNeedsShrink(dict *dict) { long long size, used; size = dictBuckets(dict); @@ -718,8 +718,8 @@ void tryResizeHashTables(int dbid) { for (int i = 0; i < CRON_DBS_PER_CALL && db->sub_dict[subdict].resize_cursor != -1; i++) { int slot = db->sub_dict[subdict].resize_cursor; dict *d = (subdict == DB_MAIN ? db->dict[slot] : db->expires[slot]); - if (htNeedsResize(d)) - dictResize(d); + if (htNeedsShrink(d)) + dictShrinkToFit(d); db->sub_dict[subdict].resize_cursor = dbGetNextNonEmptySlot(db, slot, subdict); } } diff --git a/src/server.h b/src/server.h index be33bf8039c..a913a1c8b4f 100644 --- a/src/server.h +++ b/src/server.h @@ -198,7 +198,6 @@ struct hdr_histogram; extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; /* Hash table parameters */ -#define HASHTABLE_MIN_FILL 10 /* Minimal hash table fill 10% */ #define HASHTABLE_MAX_LOAD_FACTOR 1.618 /* Maximum hash table load factor. */ /* Command flags. 
Please check the definition of struct redisCommand in this file @@ -3112,7 +3111,7 @@ void serverLogRaw(int level, const char *msg); void serverLogRawFromHandler(int level, const char *msg); void usage(void); void updateDictResizePolicy(void); -int htNeedsResize(dict *dict); +int htNeedsShrink(dict *dict); void populateCommandTable(void); void resetCommandTableStats(dict* commands); void resetErrorTableStats(void); diff --git a/src/t_hash.c b/src/t_hash.c index 9242d27cc93..ff8746384c4 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -292,9 +292,6 @@ int hashTypeDelete(robj *o, sds field) { } else if (o->encoding == OBJ_ENCODING_HT) { if (dictDelete((dict*)o->ptr, field) == C_OK) { deleted = 1; - - /* Always check if the dictionary needs a resize after a delete. */ - if (htNeedsResize(o->ptr)) dictResize(o->ptr); } } else { diff --git a/src/t_set.c b/src/t_set.c index c2729105dba..24e7b0e7d25 100644 --- a/src/t_set.c +++ b/src/t_set.c @@ -256,7 +256,6 @@ int setTypeRemoveAux(robj *setobj, char *str, size_t len, int64_t llval, int str if (setobj->encoding == OBJ_ENCODING_HT) { sds sdsval = str_is_sds ? (sds)str : sdsnewlen(str, len); int deleted = (dictDelete(setobj->ptr, sdsval) == DICT_OK); - if (deleted && htNeedsResize(setobj->ptr)) dictResize(setobj->ptr); if (sdsval != str) sdsfree(sdsval); /* free temp copy */ return deleted; } else if (setobj->encoding == OBJ_ENCODING_LISTPACK) { diff --git a/src/t_zset.c b/src/t_zset.c index 762f4aee77f..4bff5eb782d 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -1596,7 +1596,6 @@ int zsetDel(robj *zobj, sds ele) { } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = zobj->ptr; if (zsetRemoveFromSkiplist(zs, ele)) { - if (htNeedsResize(zs->dict)) dictResize(zs->dict); return 1; } } else { @@ -2011,6 +2010,7 @@ void zremrangeGenericCommand(client *c, zrange_type rangetype) { } } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = zobj->ptr; + dictPauseAutoResize(zs->dict); switch(rangetype) { case ZRANGE_AUTO: case ZRANGE_RANK: @@ -2023,7 +2023,8 @@ void zremrangeGenericCommand(client *c, zrange_type rangetype) { deleted = zslDeleteRangeByLex(zs->zsl,&lexrange,zs->dict); break; } - if (htNeedsResize(zs->dict)) dictResize(zs->dict); + dictResumeAutoResize(zs->dict); + if (htNeedsShrink(zs->dict)) dictShrinkToFit(zs->dict); if (dictSize(zs->dict) == 0) { dbDelete(c->db,key); keyremoved = 1; @@ -2535,10 +2536,12 @@ static void zdiffAlgorithm2(zsetopsrc *src, long setnum, zset *dstzset, size_t * dictAdd(dstzset->dict,tmp,&znode->score); cardinality++; } else { + dictPauseAutoResize(dstzset->dict); tmp = zuiSdsFromValue(&zval); if (zsetRemoveFromSkiplist(dstzset, tmp)) { cardinality--; } + dictResumeAutoResize(dstzset->dict); } /* Exit if result set is empty as any additional removal @@ -2551,7 +2554,7 @@ static void zdiffAlgorithm2(zsetopsrc *src, long setnum, zset *dstzset, size_t * } /* Resize dict if needed after removing multiple elements */ - if (htNeedsResize(dstzset->dict)) dictResize(dstzset->dict); + if (htNeedsShrink(dstzset->dict)) dictShrinkToFit(dstzset->dict); /* Using this algorithm, we can't calculate the max element as we go, * we have to iterate through all elements to find the max one after. 
 */

diff --git a/tests/unit/type/set.tcl b/tests/unit/type/set.tcl
index 29275622d13..4ffc892816c 100644
--- a/tests/unit/type/set.tcl
+++ b/tests/unit/type/set.tcl
@@ -1101,6 +1101,11 @@ foreach type {single multiple single_multiple} {
         assert_equal [r scard myset] 30
         assert {[is_rehashing myset]}
 
+        # Wait for the hash set rehashing to finish.
+        while {[is_rehashing myset]} {
+            r srandmember myset 10
+        }
+
         # Now that we have a hash set with only one long chain bucket.
         set htstats [r debug HTSTATS-KEY myset full]
         assert {[regexp {different slots: ([0-9]+)} $htstats - different_slots]}

From ecc31bc6973830c8aa9747471943f1bdd46f3257 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Mon, 15 Jan 2024 16:28:24 +0800
Subject: [PATCH 44/58] Updated comments on dictResizeEnable for new dict shrink (#12946)

The new shrink was added in #12850. Also updated outdated comments,
see #11692.

---
 src/dict.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/dict.c b/src/dict.c
index 1b7b2138fc3..7d8913761b7 100644
--- a/src/dict.c
+++ b/src/dict.c
@@ -48,14 +48,17 @@
 #include "redisassert.h"
 #include "monotonic.h"
 
-/* Using dictEnableResize() / dictDisableResize() we make possible to disable
+/* Using dictSetResizeEnabled() we make possible to disable
  * resizing and rehashing of the hash table as needed. This is very important
  * for Redis, as we use copy-on-write and don't want to move too much memory
  * around when there is a child performing saving operations.
  *
  * Note that even when dict_can_resize is set to DICT_RESIZE_AVOID, not all
- * resizes are prevented: a hash table is still allowed to grow if the ratio
- * between the number of elements and the buckets > dict_force_resize_ratio. */
+ * resizes are prevented:
+ * - A hash table is still allowed to expand if the ratio between the number
+ *   of elements and the buckets > dict_force_resize_ratio.
+ * - A hash table is still allowed to shrink if the ratio between the number
+ *   of elements and the buckets < HASHTABLE_MIN_FILL / dict_force_resize_ratio. */
 static dictResizeEnable dict_can_resize = DICT_RESIZE_ENABLE;
 static unsigned int dict_force_resize_ratio = 5;

From 131d95f203351b19f307072e6582fda91e149580 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Wed, 17 Jan 2024 14:46:09 +0800
Subject: [PATCH 45/58] Fix race in slot dict resize test (#12942)

The test has a race:
```
*** [err]: Redis can rewind and trigger smaller slot resizing in tests/unit/other.tcl
Expected '[Dictionary HT]
Hash table 0 stats (main hash table):
 table size: 12
 number of elements: 2
[Expires HT]
Hash table 0 stats (main hash table):
No stats available for empty dictionaries
' to match '*table size: 8*' (context: type eval line 12 cmd {assert_match "*table size: 8*" [r debug HTSTATS 0]} proc ::test)
```

When `r del "{alice}$j"` is executed in the loop, once the number of
remaining keys drops into [9, 12] the load factor meets
HASHTABLE_MIN_FILL, and if serverCron happens to trigger a slot dict
resize at that point, the test will fail, because there is no way to
meet HASHTABLE_MIN_FILL again in the subsequent dels.

The solution is to avoid triggering the resize in advance. We can use
multi to delete them at once, or we can disable the resize. Since we
disabled resize in the previous test, the fix also uses the method of
disabling resize.

The test is introduced in #12802.
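The fix leans on a property documented in the dict.c comment above: while a child process is saving, resizing is avoided to limit copy-on-write. A condensed sketch of the pause/resume trick, mirroring the diff below:

```tcl
# Keep a bgsave child alive so resizing is avoided while the keys are
# deleted, then kill the child to re-enable resizing before checking
# the table size.
r config set rdb-key-save-delay 10000000  ;# make the forked child hang
r bgsave                                  ;# a live child => resize avoided
for {set j 1} {$j <= 127} {incr j} { r del "{alice}$j" }
r config set rdb-key-save-delay 0
catch {exec kill -9 [get_child_pid 0]}    ;# child gone => resize allowed again
```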
--- tests/unit/other.tcl | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/unit/other.tcl b/tests/unit/other.tcl index 1c9966387fc..a09a6e6449e 100644 --- a/tests/unit/other.tcl +++ b/tests/unit/other.tcl @@ -459,15 +459,30 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} { } {} {needs:debug} test "Redis can rewind and trigger smaller slot resizing" { + # hashslot(foo) is 12182 # hashslot(alice) is 749, smaller than hashslot(foo), # attempt to trigger a resize on it, see details in #12802. for {set j 1} {$j <= 128} {incr j} { r set "{alice}$j" a } + + # disable resizing + r config set rdb-key-save-delay 10000000 + r bgsave + for {set j 1} {$j <= 127} {incr j} { r del "{alice}$j" } + # enable resizing + r config set rdb-key-save-delay 0 + catch {exec kill -9 [get_child_pid 0]} + wait_for_condition 1000 10 { + [s rdb_bgsave_in_progress] eq 0 + } else { + fail "bgsave did not stop in time." + } + after 200;# waiting for serverCron assert_match "*table size: 8*" [r debug HTSTATS 0] } {} {needs:debug} From 14b1edfd994991d22c2b766031f77c045b95c995 Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 18 Jan 2024 17:16:50 +0800 Subject: [PATCH 46/58] Fix dict resize ratio checks, avoid precision loss from integer division (#12952) In the past we used integers to compare ratios, let us assume that we have the following data in expanding: ``` used / size > 5 `80 / 16 > 5` is false `81 / 16 > 5` is false `95 / 16 > 5` is false `96 / 16 > 5` is true ``` Because the integer result is rounded, our resize breaks the ratio constraint, this has existed since the beginning, which resulted in us not strictly following the ratio (shrink also has the same issue). This PR change it to multiplication to avoid floating point calculations. --- src/dict.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++--- src/server.c | 2 +- 2 files changed, 127 insertions(+), 9 deletions(-) diff --git a/src/dict.c b/src/dict.c index 7d8913761b7..03fc4d7493e 100644 --- a/src/dict.c +++ b/src/dict.c @@ -333,8 +333,8 @@ int dictRehash(dict *d, int n) { unsigned long s1 = DICTHT_SIZE(d->ht_size_exp[1]); if (dict_can_resize == DICT_RESIZE_FORBID || !dictIsRehashing(d)) return 0; if (dict_can_resize == DICT_RESIZE_AVOID && - ((s1 > s0 && s1 / s0 < dict_force_resize_ratio) || - (s1 < s0 && s0 / s1 < dict_force_resize_ratio))) + ((s1 > s0 && s1 < dict_force_resize_ratio * s0) || + (s1 < s0 && s0 < dict_force_resize_ratio * s1))) { return 0; } @@ -1452,7 +1452,7 @@ static void _dictExpandIfNeeded(dict *d) if ((dict_can_resize == DICT_RESIZE_ENABLE && d->ht_used[0] >= DICTHT_SIZE(d->ht_size_exp[0])) || (dict_can_resize != DICT_RESIZE_FORBID && - d->ht_used[0] / DICTHT_SIZE(d->ht_size_exp[0]) > dict_force_resize_ratio)) + d->ht_used[0] >= dict_force_resize_ratio * DICTHT_SIZE(d->ht_size_exp[0]))) { if (!dictTypeResizeAllowed(d)) return; @@ -1474,10 +1474,10 @@ static void _dictShrinkIfNeeded(dict *d) /* If we reached below 1:10 elements/buckets ratio, and we are allowed to resize * the hash table (global setting) or we should avoid it but the ratio is below 1:50, * we'll trigger a resize of the hash table. 
*/ - if ((dict_can_resize == DICT_RESIZE_ENABLE && - d->ht_used[0] * 100 / DICTHT_SIZE(d->ht_size_exp[0]) < HASHTABLE_MIN_FILL) || + if ((dict_can_resize == DICT_RESIZE_ENABLE && + d->ht_used[0] * 100 <= HASHTABLE_MIN_FILL * DICTHT_SIZE(d->ht_size_exp[0])) || (dict_can_resize != DICT_RESIZE_FORBID && - d->ht_used[0] * 100 / DICTHT_SIZE(d->ht_size_exp[0]) < HASHTABLE_MIN_FILL / dict_force_resize_ratio)) + d->ht_used[0] * 100 * dict_force_resize_ratio <= HASHTABLE_MIN_FILL * DICTHT_SIZE(d->ht_size_exp[0]))) { if (!dictTypeResizeAllowed(d)) return; @@ -1693,6 +1693,7 @@ void dictGetStats(char *buf, size_t bufsize, dict *d, int full) { #include "testhelp.h" #define UNUSED(V) ((void) V) +#define TEST(name) printf("test — %s\n", name); uint64_t hashCallback(const void *key) { return dictGenHashFunction((unsigned char*)key, strlen((char*)key)); @@ -1746,6 +1747,7 @@ dictType BenchmarkDictType = { int dictTest(int argc, char **argv, int flags) { long j; long long start, elapsed; + int retval; dict *dict = dictCreate(&BenchmarkDictType); long count = 0; int accurate = (flags & REDIS_TEST_ACCURATE); @@ -1760,9 +1762,125 @@ int dictTest(int argc, char **argv, int flags) { count = 5000; } + TEST("Add 16 keys and verify dict resize is ok") { + dictSetResizeEnabled(DICT_RESIZE_ENABLE); + for (j = 0; j < 16; j++) { + retval = dictAdd(dict,stringFromLongLong(j),(void*)j); + assert(retval == DICT_OK); + } + while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000); + assert(dictSize(dict) == 16); + assert(dictBuckets(dict) == 16); + } + + TEST("Use DICT_RESIZE_AVOID to disable the dict resize and pad to 80") { + /* Use DICT_RESIZE_AVOID to disable the dict resize, and pad + * the number of keys to 80, now is 16:80, so we can satisfy + * dict_force_resize_ratio. */ + dictSetResizeEnabled(DICT_RESIZE_AVOID); + for (j = 16; j < 80; j++) { + retval = dictAdd(dict,stringFromLongLong(j),(void*)j); + assert(retval == DICT_OK); + } + assert(dictSize(dict) == 80); + assert(dictBuckets(dict) == 16); + } + + TEST("Add one more key, trigger the dict resize") { + retval = dictAdd(dict,stringFromLongLong(80),(void*)80); + assert(retval == DICT_OK); + assert(dictSize(dict) == 81); + assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 16); + assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 128); + assert(dictBuckets(dict) == 144); + + /* Wait for rehashing. */ + dictSetResizeEnabled(DICT_RESIZE_ENABLE); + while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000); + assert(dictSize(dict) == 81); + assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 128); + assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 0); + assert(dictBuckets(dict) == 128); + } + + TEST("Delete keys until 13 keys remain") { + /* Delete keys until 13 keys remain, now is 13:128, so we can + * satisfy HASHTABLE_MIN_FILL in the next test. */ + for (j = 0; j < 68; j++) { + retval = dictDelete(dict,stringFromLongLong(j)); + assert(retval == DICT_OK); + } + assert(dictSize(dict) == 13); + assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 128); + assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 0); + assert(dictBuckets(dict) == 128); + } + + TEST("Delete one more key, trigger the dict resize") { + retval = dictDelete(dict,stringFromLongLong(68)); + assert(retval == DICT_OK); + assert(dictSize(dict) == 12); + assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 128); + assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 16); + assert(dictBuckets(dict) == 144); + + /* Wait for rehashing. 
 */
+        while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000);
+        assert(dictSize(dict) == 12);
+        assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 16);
+        assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 0);
+        assert(dictBuckets(dict) == 16);
+    }
+
+    TEST("Empty the dictionary and add 128 keys") {
+        dictEmpty(dict, NULL);
+        for (j = 0; j < 128; j++) {
+            retval = dictAdd(dict,stringFromLongLong(j),(void*)j);
+            assert(retval == DICT_OK);
+        }
+        while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000);
+        assert(dictSize(dict) == 128);
+        assert(dictBuckets(dict) == 128);
+    }
+
+    TEST("Use DICT_RESIZE_AVOID to disable the dict resize and reduce to 3") {
+        /* Use DICT_RESIZE_AVOID to disable the dict resize, and reduce
+         * the number of keys to 3, now is 3:128, so we can satisfy
+         * HASHTABLE_MIN_FILL / dict_force_resize_ratio. */
+        dictSetResizeEnabled(DICT_RESIZE_AVOID);
+        for (j = 0; j < 125; j++) {
+            retval = dictDelete(dict,stringFromLongLong(j));
+            assert(retval == DICT_OK);
+        }
+        assert(dictSize(dict) == 3);
+        assert(dictBuckets(dict) == 128);
+    }
+
+    TEST("Delete one more key, trigger the dict resize") {
+        retval = dictDelete(dict,stringFromLongLong(125));
+        assert(retval == DICT_OK);
+        assert(dictSize(dict) == 2);
+        assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 128);
+        assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 4);
+        assert(dictBuckets(dict) == 132);
+
+        /* Wait for rehashing. */
+        dictSetResizeEnabled(DICT_RESIZE_ENABLE);
+        while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000);
+        assert(dictSize(dict) == 2);
+        assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 4);
+        assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 0);
+        assert(dictBuckets(dict) == 4);
+    }
+
+    TEST("Restore to original state") {
+        dictEmpty(dict, NULL);
+        dictSetResizeEnabled(DICT_RESIZE_ENABLE);
+    }
+
     start_benchmark();
     for (j = 0; j < count; j++) {
-        int retval = dictAdd(dict,stringFromLongLong(j),(void*)j);
+        retval = dictAdd(dict,stringFromLongLong(j),(void*)j);
         assert(retval == DICT_OK);
     }
     end_benchmark("Inserting");
@@ -1820,7 +1938,7 @@ int dictTest(int argc, char **argv, int flags) {
     start_benchmark();
     for (j = 0; j < count; j++) {
         char *key = stringFromLongLong(j);
-        int retval = dictDelete(dict,key);
+        retval = dictDelete(dict,key);
         assert(retval == DICT_OK);
         key[0] += 17; /* Change first number to letter. */
         retval = dictAdd(dict,key,(void*)j);

diff --git a/src/server.c b/src/server.c
index 30ec199c010..d7707bb5a20 100644
--- a/src/server.c
+++ b/src/server.c
@@ -699,7 +699,7 @@ int htNeedsShrink(dict *dict) {
     size = dictBuckets(dict);
     used = dictSize(dict);
     return (size > DICT_HT_INITIAL_SIZE &&
-            (used*100/size < HASHTABLE_MIN_FILL));
+            (used*100 <= HASHTABLE_MIN_FILL*size));
 }
 
 /* In cluster-enabled setup, this method traverses through all main/expires dictionaries (CLUSTER_SLOTS)

From 29e6245a05f414159932f71d971793df34566dfe Mon Sep 17 00:00:00 2001
From: Binbin
Date: Thu, 18 Jan 2024 17:19:29 +0800
Subject: [PATCH 47/58] Fix unexpected resize causing test failure (#12960)

Before #12850, we would only try to shrink the dict in serverCron,
which we can control by using a child process, but now the shrink check
is called every time we delete a key.

In these tests (added in #12802), we meant to disable the resizing, but
during the deletes the dict will meet the force shrink, like
2 / 128 ≈ 0.016 < 0.02, so the delete will trigger a force resize and
will cause the test to fail.

In this commit, we try to keep the load factor at 3 / 128 ≈ 0.023, that
is, do not meet the force shrink.
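A quick back-of-envelope check of the bound involved (assuming `HASHTABLE_MIN_FILL` is 10 and `dict_force_resize_ratio` is 5, per the dict.c/dict.h hunks above, the force-shrink threshold is 10% / 5 = 2%):

```tcl
# Load factors from the commit message vs. the 2% force-shrink bound,
# which applies even under DICT_RESIZE_AVOID.
puts [expr {2.0 / 128}]  ;# 0.015625 < 0.02 -> force shrink fires
puts [expr {3.0 / 128}]  ;# 0.0234   > 0.02 -> no force shrink
```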
---
 tests/unit/other.tcl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/unit/other.tcl b/tests/unit/other.tcl
index a09a6e6449e..08364f44056 100644
--- a/tests/unit/other.tcl
+++ b/tests/unit/other.tcl
@@ -439,8 +439,8 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} {
         r config set rdb-key-save-delay 10000000
         r bgsave
 
-        # delete data to have lot's (99%) of empty buckets
-        for {set j 1} {$j <= 127} {incr j} {
+        # delete data to have lot's (98%) of empty buckets
+        for {set j 1} {$j <= 125} {incr j} {
             r del "{foo}$j"
         }
         assert_match "*table size: 128*" [r debug HTSTATS 0]
@@ -470,7 +470,7 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} {
         r config set rdb-key-save-delay 10000000
         r bgsave
 
-        for {set j 1} {$j <= 127} {incr j} {
+        for {set j 1} {$j <= 125} {incr j} {
             r del "{alice}$j"
         }

From 0e5a4a27ea49da6423513ff749093ccf1582171d Mon Sep 17 00:00:00 2001
From: Binbin
Date: Thu, 18 Jan 2024 22:28:52 +0800
Subject: [PATCH 48/58] Call emptyData when disk-based sync rdbLoad fails (#12510)

We already do this in diskless on-empty-db mode: when diskless loading
fails, we call emptyData to remove the half-loaded data in case we
started with an empty replica.

Now when a disk-based sync rdbLoad fails, we call emptyData too, in
case it loaded partial, incomplete data. When the replica attempts
another re-sync, it'll empty the dataset again anyway, so this affects
two things:
1. memory consumption in the time gap until the next rdb loading begins
2. if the unsynced replica is for some reason promoted, it would have
   kept the partial dataset instead of being empty.

---
 src/replication.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/replication.c b/src/replication.c
index d4a77b57ea3..07e88c15164 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -2240,6 +2240,10 @@ void readSyncBulkPayload(connection *conn) {
                 "disabled");
             bg_unlink(server.rdb_filename);
         }
+
+        /* If disk-based RDB loading fails, remove the half-loaded dataset. */
+        emptyData(-1, empty_db_flags, replicationEmptyDbCallback);
+
         /* Note that there's no point in restarting the AOF on sync failure,
            it'll be restarted when sync succeeds or replica promoted. */
         return;

From 1c7eb0ad373a21a0610ba484cc0d5054e1f1aab0 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Thu, 18 Jan 2024 22:32:04 +0800
Subject: [PATCH 49/58] Fix minor memory leaks in dictTest (#12962)

Introduced in #12952, reported by valgrind.

---
 src/dict.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/dict.c b/src/dict.c
index 03fc4d7493e..de6c21767c9 100644
--- a/src/dict.c
+++ b/src/dict.c
@@ -1807,7 +1807,9 @@ int dictTest(int argc, char **argv, int flags) {
         /* Delete keys until 13 keys remain, now is 13:128, so we can
          * satisfy HASHTABLE_MIN_FILL in the next test.
*/ for (j = 0; j < 68; j++) { - retval = dictDelete(dict,stringFromLongLong(j)); + char *key = stringFromLongLong(j); + retval = dictDelete(dict, key); + zfree(key); assert(retval == DICT_OK); } assert(dictSize(dict) == 13); @@ -1817,7 +1819,9 @@ int dictTest(int argc, char **argv, int flags) { } TEST("Delete one more key, trigger the dict resize") { - retval = dictDelete(dict,stringFromLongLong(68)); + char *key = stringFromLongLong(68); + retval = dictDelete(dict, key); + zfree(key); assert(retval == DICT_OK); assert(dictSize(dict) == 12); assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 128); @@ -1849,7 +1853,9 @@ int dictTest(int argc, char **argv, int flags) { * HASHTABLE_MIN_FILL / dict_force_resize_ratio. */ dictSetResizeEnabled(DICT_RESIZE_AVOID); for (j = 0; j < 125; j++) { - retval = dictDelete(dict,stringFromLongLong(j)); + char *key = stringFromLongLong(j); + retval = dictDelete(dict, key); + zfree(key); assert(retval == DICT_OK); } assert(dictSize(dict) == 3); @@ -1857,7 +1863,9 @@ int dictTest(int argc, char **argv, int flags) { } TEST("Delete one more key, trigger the dict resize") { - retval = dictDelete(dict,stringFromLongLong(125)); + char *key = stringFromLongLong(125); + retval = dictDelete(dict, key); + zfree(key); assert(retval == DICT_OK); assert(dictSize(dict) == 2); assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 128); From f81c3fd89ed998f211e60e76b8f55a2df27000fe Mon Sep 17 00:00:00 2001 From: Chen Tianjie Date: Thu, 18 Jan 2024 22:35:12 +0800 Subject: [PATCH 50/58] Optimize dictTypeResizeAllowed to avoid mistaken OOM judgement. (#12950) When doing dict resizing, dictTypeResizeAllowed is used to judge whether the new allocated memory for rehashing would cause OOM. However when shrinking, we alloc `_dictNextExp(d->ht_used[0])` bytes of memory, while in `dictTypeResizeAllowed` we still use `_dictNextExp(d->ht_used[0]+1)` as the new allocated memory size. This will overestimate the memory used by shrinking at special conditions, causing a false OOM judgement. --- src/dict.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/dict.c b/src/dict.c index de6c21767c9..bb50a6a3231 100644 --- a/src/dict.c +++ b/src/dict.c @@ -1421,12 +1421,12 @@ unsigned long dictScanDefrag(dict *d, /* ------------------------- private functions ------------------------------ */ /* Because we may need to allocate huge memory chunk at once when dict - * expands, we will check this allocation is allowed or not if the dict - * type has expandAllowed member function. */ -static int dictTypeResizeAllowed(dict *d) { + * resizes, we will check this allocation is allowed or not if the dict + * type has resizeAllowed member function. 
*/ +static int dictTypeResizeAllowed(dict *d, size_t size) { if (d->type->resizeAllowed == NULL) return 1; return d->type->resizeAllowed( - DICTHT_SIZE(_dictNextExp(d->ht_used[0] + 1)) * sizeof(dictEntry*), + DICTHT_SIZE(_dictNextExp(size)) * sizeof(dictEntry*), (double)d->ht_used[0] / DICTHT_SIZE(d->ht_size_exp[0])); } @@ -1454,7 +1454,7 @@ static void _dictExpandIfNeeded(dict *d) (dict_can_resize != DICT_RESIZE_FORBID && d->ht_used[0] >= dict_force_resize_ratio * DICTHT_SIZE(d->ht_size_exp[0]))) { - if (!dictTypeResizeAllowed(d)) + if (!dictTypeResizeAllowed(d, d->ht_used[0] + 1)) return; dictExpand(d, d->ht_used[0] + 1); } @@ -1479,7 +1479,7 @@ static void _dictShrinkIfNeeded(dict *d) (dict_can_resize != DICT_RESIZE_FORBID && d->ht_used[0] * 100 * dict_force_resize_ratio <= HASHTABLE_MIN_FILL * DICTHT_SIZE(d->ht_size_exp[0]))) { - if (!dictTypeResizeAllowed(d)) + if (!dictTypeResizeAllowed(d, d->ht_used[0])) return; dictShrink(d, d->ht_used[0]); } From d0640029dcf8d9157b9ac78ca0b99fb48f97e380 Mon Sep 17 00:00:00 2001 From: "debing.sun" Date: Fri, 19 Jan 2024 21:12:49 +0800 Subject: [PATCH 51/58] Fix race condition issues between the main thread and module threads (#12817) Fix #12785 and other race condition issues. See the following isolated comments. The following report was obtained using SANITIZER thread. ```sh make SANITIZER=thread ./runtest-moduleapi --config io-threads 4 --config io-threads-do-reads yes --accurate ``` 1. Fixed thread-safe issue in RM_UnblockClient() Related discussion: https://github.com/redis/redis/pull/12817#issuecomment-1831181220 * When blocking a client in a module using `RM_BlockClientOnKeys()` or `RM_BlockClientOnKeysWithFlags()` with a timeout_callback, calling RM_UnblockClient() in module threads can lead to race conditions in `updateStatsOnUnblock()`. - Introduced: Version: 6.2 PR: #7491 - Touch: `server.stat_numcommands`, `cmd->latency_histogram`, `server.slowlog`, and `server.latency_events` - Harm Level: High Potentially corrupts the memory data of `cmd->latency_histogram`, `server.slowlog`, and `server.latency_events` - Solution: Differentiate whether the call to moduleBlockedClientTimedOut() comes from the module or the main thread. Since we can't know if RM_UnblockClient() comes from module threads, we always assume it does and let `updateStatsOnUnblock()` asynchronously update the unblock status. * When error reply is called in timeout_callback(), ctx is not thread-safe, eventually lead to race conditions in `afterErrorReply`. - Introduced: Version: 6.2 PR: #8217 - Touch `server.stat_total_error_replies`, `server.errors`, - Harm Level: High Potentially corrupts the memory data of `server.errors` - Solution: Make the ctx in `timeout_callback()` with `REDISMODULE_CTX_THREAD_SAFE`, and asynchronously reply errors to the client. 2. Made RM_Reply*() family API thread-safe Related discussion: https://github.com/redis/redis/pull/12817#discussion_r1408707239 Call chain: `RM_Reply*()` -> `_addReplyToBufferOrList()` -> touch server.current_client - Introduced: Version: 7.2.0 PR: #12326 - Harm Level: None Since the module fake client won't have the `CLIENT_PUSHING` flag, even if we touch server.current_client, we can still exit after `c->flags & CLIENT_PUSHING`. - Solution Checking `c->flags & CLIENT_PUSHING` earlier. 3. 
### About backporting to 7.2

1. The implementation of (1) is not entirely satisfying; more eyes on it
   would be welcome.
2. (2) and (3) can be safely backported.
3. (4) and (6) only modify the module tests and update the documentation, so
   no backport is needed.
4. (5) is harmless; no backport is needed.
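For item 6 above, the caller-owned lock that module authors are expected to
add looks roughly like the following. This is an illustrative sketch only,
condensed from the pattern the updated blockonbackground.c test module uses;
`MeasureGuard` and the helper names are hypothetical.

```c
#include <pthread.h>
#include "redismodule.h"

typedef struct {
    pthread_mutex_t lock;
    int done; /* set once the timeout callback ended the measurement */
} MeasureGuard;

void guarded_measure_start(RedisModuleBlockedClient *bc, MeasureGuard *g) {
    pthread_mutex_lock(&g->lock);
    RedisModule_BlockedClientMeasureTimeStart(bc);
    pthread_mutex_unlock(&g->lock);
}

void guarded_measure_end(RedisModuleBlockedClient *bc, MeasureGuard *g, int final) {
    pthread_mutex_lock(&g->lock);
    if (!g->done) {
        RedisModule_BlockedClientMeasureTimeEnd(bc);
        if (final) g->done = 1;
    }
    pthread_mutex_unlock(&g->lock);
}
```

The `done` flag plays the same role as the test module's
measuretime_completed field: once the timeout callback has ended the
measurement on the main thread, a late call from the background thread
becomes a no-op.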
---------

Co-authored-by: Oran Agra
---
 src/blocked.c                     |  2 +-
 src/module.c                      | 61 +++++++++++++++++++++-------
 src/networking.c                  | 21 +++++-----
 src/server.c                      |  5 ++-
 src/server.h                      |  2 +-
 tests/modules/blockedclient.c     | 15 +++----
 tests/modules/blockonbackground.c | 67 +++++++++++++++++++++++--------
 tests/modules/usercall.c          | 15 +++----
 8 files changed, 130 insertions(+), 58 deletions(-)

diff --git a/src/blocked.c b/src/blocked.c
index 6ad4667dba5..3108cb67750 100644
--- a/src/blocked.c
+++ b/src/blocked.c
@@ -239,7 +239,7 @@ void replyToBlockedClientTimedOut(client *c) {
         addReplyLongLong(c,server.fsynced_reploff >= c->bstate.reploffset);
         addReplyLongLong(c,replicationCountAOFAcksByOffset(c->bstate.reploffset));
     } else if (c->bstate.btype == BLOCKED_MODULE) {
-        moduleBlockedClientTimedOut(c);
+        moduleBlockedClientTimedOut(c, 0);
     } else {
         serverPanic("Unknown btype in replyToBlockedClientTimedOut().");
     }

diff --git a/src/module.c b/src/module.c
index a60a345ae38..c54d164b76a 100644
--- a/src/module.c
+++ b/src/module.c
@@ -306,7 +306,6 @@ static size_t moduleTempClientMinCount = 0; /* Min client count in pool since
  * allow thread safe contexts to execute commands at a safe moment. */
 static pthread_mutex_t moduleGIL = PTHREAD_MUTEX_INITIALIZER;
 
-
 /* Function pointer type for keyspace event notification subscriptions from modules. */
 typedef int (*RedisModuleNotificationFunc) (RedisModuleCtx *ctx, int type, const char *event, RedisModuleString *key);
 
@@ -2338,7 +2337,10 @@ ustime_t RM_CachedMicroseconds(void) {
  * Within the same command, you can call multiple times
  * RM_BlockedClientMeasureTimeStart() and RM_BlockedClientMeasureTimeEnd()
  * to accumulate independent time intervals to the background duration.
- * This method always return REDISMODULE_OK. */
+ * This method always return REDISMODULE_OK.
+ *
+ * This function is not thread safe; if used in a module thread and the blocked callback (possibly the main thread)
+ * simultaneously, it's recommended to protect them with a lock owned by the caller instead of the GIL. */
 int RM_BlockedClientMeasureTimeStart(RedisModuleBlockedClient *bc) {
     elapsedStart(&(bc->background_timer));
     return REDISMODULE_OK;
@@ -2348,7 +2350,10 @@ int RM_BlockedClientMeasureTimeStart(RedisModuleBlockedClient *bc) {
  * to calculate the elapsed execution time.
  * On success REDISMODULE_OK is returned.
  * This method only returns REDISMODULE_ERR if no start time was
- * previously defined ( meaning RM_BlockedClientMeasureTimeStart was not called ). */
+ * previously defined ( meaning RM_BlockedClientMeasureTimeStart was not called ).
+ *
+ * This function is not thread safe; if used in a module thread and the blocked callback (possibly the main thread)
+ * simultaneously, it's recommended to protect them with a lock owned by the caller instead of the GIL. */
 int RM_BlockedClientMeasureTimeEnd(RedisModuleBlockedClient *bc) {
     // If the counter is 0 then we haven't called RM_BlockedClientMeasureTimeStart
     if (!bc->background_timer)
@@ -2717,7 +2722,10 @@ RedisModuleString *RM_CreateStringFromStreamID(RedisModuleCtx *ctx, const RedisM
 * pass ctx as NULL when releasing the string (but passing a context will not
 * create any issue).
Strings created with a context should be freed also passing * the context, so if you want to free a string out of context later, make sure - * to create it using a NULL context. */ + * to create it using a NULL context. + * + * This API is not thread safe, access to these retained strings (if they originated + * from a client command arguments) must be done with GIL locked. */ void RM_FreeString(RedisModuleCtx *ctx, RedisModuleString *str) { decrRefCount(str); if (ctx != NULL) autoMemoryFreed(ctx,REDISMODULE_AM_STRING,str); @@ -2754,7 +2762,10 @@ void RM_FreeString(RedisModuleCtx *ctx, RedisModuleString *str) { * * Threaded modules that reference retained strings from other threads *must* * explicitly trim the allocation as soon as the string is retained. Not doing - * so may result with automatic trimming which is not thread safe. */ + * so may result with automatic trimming which is not thread safe. + * + * This API is not thread safe, access to these retained strings (if they originated + * from a client command arguments) must be done with GIL locked. */ void RM_RetainString(RedisModuleCtx *ctx, RedisModuleString *str) { if (ctx == NULL || !autoMemoryFreed(ctx,REDISMODULE_AM_STRING,str)) { /* Increment the string reference counting only if we can't @@ -2796,7 +2807,10 @@ void RM_RetainString(RedisModuleCtx *ctx, RedisModuleString *str) { * * Threaded modules that reference held strings from other threads *must* * explicitly trim the allocation as soon as the string is held. Not doing - * so may result with automatic trimming which is not thread safe. */ + * so may result with automatic trimming which is not thread safe. + * + * This API is not thread safe, access to these retained strings (if they originated + * from a client command arguments) must be done with GIL locked. */ RedisModuleString* RM_HoldString(RedisModuleCtx *ctx, RedisModuleString *str) { if (str->refcount == OBJ_STATIC_REFCOUNT) { return RM_CreateStringFromString(ctx, str); @@ -8228,7 +8242,7 @@ int RM_UnblockClient(RedisModuleBlockedClient *bc, void *privdata) { * argument, but better to be safe than sorry. */ if (bc->timeout_callback == NULL) return REDISMODULE_ERR; if (bc->unblocked) return REDISMODULE_OK; - if (bc->client) moduleBlockedClientTimedOut(bc->client); + if (bc->client) moduleBlockedClientTimedOut(bc->client, 1); } moduleUnblockClientByHandle(bc,privdata); return REDISMODULE_OK; @@ -8327,8 +8341,10 @@ void moduleHandleBlockedClients(void) { * This needs to be out of the reply callback above given that a * module might not define any callback and still do blocking ops. */ - if (c && !clientHasModuleAuthInProgress(c) && !bc->blocked_on_keys) { - updateStatsOnUnblock(c, bc->background_duration, reply_us, server.stat_total_error_replies != prev_error_replies); + if (c && !clientHasModuleAuthInProgress(c)) { + int had_errors = c->deferred_reply_errors ? !!listLength(c->deferred_reply_errors) : + (server.stat_total_error_replies != prev_error_replies); + updateStatsOnUnblock(c, bc->background_duration, reply_us, had_errors); } if (c != NULL) { @@ -8346,7 +8362,7 @@ void moduleHandleBlockedClients(void) { * if there are pending replies here. This is needed since * during a non blocking command the client may receive output. 
*/ if (!clientHasModuleAuthInProgress(c) && clientHasPendingReplies(c) && - !(c->flags & CLIENT_PENDING_WRITE)) + !(c->flags & CLIENT_PENDING_WRITE) && c->conn) { c->flags |= CLIENT_PENDING_WRITE; listLinkNodeHead(server.clients_pending_write, &c->clients_pending_write_node); @@ -8381,8 +8397,15 @@ int moduleBlockedClientMayTimeout(client *c) { /* Called when our client timed out. After this function unblockClient() * is called, and it will invalidate the blocked client. So this function * does not need to do any cleanup. Eventually the module will call the - * API to unblock the client and the memory will be released. */ -void moduleBlockedClientTimedOut(client *c) { + * API to unblock the client and the memory will be released. + * + * If this function is called from a module, we handle the timeout callback + * and the update of the unblock status in a thread-safe manner to avoid race + * conditions with the main thread. + * If this function is called from the main thread, we must handle the unblocking + * of the client synchronously. This ensures that we can reply to the client before + * resetClient() is called. */ +void moduleBlockedClientTimedOut(client *c, int from_module) { RedisModuleBlockedClient *bc = c->bstate.module_blocked_handle; /* Protect against re-processing: don't serve clients that are already @@ -8391,14 +8414,22 @@ void moduleBlockedClientTimedOut(client *c) { if (bc->unblocked) return; RedisModuleCtx ctx; - moduleCreateContext(&ctx, bc->module, REDISMODULE_CTX_BLOCKED_TIMEOUT); + int flags = REDISMODULE_CTX_BLOCKED_TIMEOUT; + if (from_module) flags |= REDISMODULE_CTX_THREAD_SAFE; + moduleCreateContext(&ctx, bc->module, flags); ctx.client = bc->client; ctx.blocked_client = bc; ctx.blocked_privdata = bc->privdata; - long long prev_error_replies = server.stat_total_error_replies; + + long long prev_error_replies; + if (!from_module) + prev_error_replies = server.stat_total_error_replies; + bc->timeout_callback(&ctx,(void**)c->argv,c->argc); moduleFreeContext(&ctx); - updateStatsOnUnblock(c, bc->background_duration, 0, server.stat_total_error_replies != prev_error_replies); + + if (!from_module) + updateStatsOnUnblock(c, bc->background_duration, 0, server.stat_total_error_replies != prev_error_replies); /* For timeout events, we do not want to call the disconnect callback, * because the blocked client will be automatically disconnected in diff --git a/src/networking.c b/src/networking.c index c020faf897f..0390092e4d8 100644 --- a/src/networking.c +++ b/src/networking.c @@ -414,8 +414,9 @@ void _addReplyToBufferOrList(client *c, const char *s, size_t len) { * to a channel which we are subscribed to, then we wanna postpone that message to be added * after the command's reply (specifically important during multi-exec). the exception is * the SUBSCRIBE command family, which (currently) have a push message instead of a proper reply. - * The check for executing_client also avoids affecting push messages that are part of eviction. */ - if (c == server.current_client && (c->flags & CLIENT_PUSHING) && + * The check for executing_client also avoids affecting push messages that are part of eviction. + * Check CLIENT_PUSHING first to avoid race conditions, as it's absent in module's fake client. 
*/ + if ((c->flags & CLIENT_PUSHING) && c == server.current_client && server.executing_client && !cmdHasPushAsReply(server.executing_client->cmd)) { _addReplyProtoToList(c,server.pending_push_messages,s,len); @@ -1450,7 +1451,7 @@ void unlinkClient(client *c) { listNode *ln; /* If this is marked as current client unset it. */ - if (server.current_client == c) server.current_client = NULL; + if (c->conn && server.current_client == c) server.current_client = NULL; /* Certain operations must be done only if the client has an active connection. * If the client was already unlinked or if it's a "fake client" the @@ -1494,7 +1495,7 @@ void unlinkClient(client *c) { } /* Remove from the list of pending reads if needed. */ - serverAssert(io_threads_op == IO_THREADS_OP_IDLE); + serverAssert(!c->conn || io_threads_op == IO_THREADS_OP_IDLE); if (c->pending_read_list_node != NULL) { listDelNode(server.clients_pending_read,c->pending_read_list_node); c->pending_read_list_node = NULL; @@ -1649,6 +1650,12 @@ void freeClient(client *c) { reqresReset(c, 1); #endif + /* Remove the contribution that this client gave to our + * incrementally computed memory usage. */ + if (c->conn) + server.stat_clients_type_memory[c->last_memory_type] -= + c->last_memory_usage; + /* Unlink the client: this will close the socket, remove the I/O * handlers, and remove references of the client from different * places where active clients may be referenced. */ @@ -1697,10 +1704,6 @@ void freeClient(client *c) { * we lost the connection with the master. */ if (c->flags & CLIENT_MASTER) replicationHandleMasterDisconnection(); - /* Remove the contribution that this client gave to our - * incrementally computed memory usage. */ - server.stat_clients_type_memory[c->last_memory_type] -= - c->last_memory_usage; /* Remove client from memory usage buckets */ if (c->mem_usage_bucket) { c->mem_usage_bucket->mem_usage_sum -= c->last_memory_usage; @@ -2487,7 +2490,7 @@ int processCommandAndResetClient(client *c) { commandProcessed(c); /* Update the client's memory to include output buffer growth following the * processed command. */ - updateClientMemUsageAndBucket(c); + if (c->conn) updateClientMemUsageAndBucket(c); } if (server.current_client == NULL) deadclient = 1; diff --git a/src/server.c b/src/server.c index d7707bb5a20..4209db42d72 100644 --- a/src/server.c +++ b/src/server.c @@ -994,6 +994,7 @@ static inline clientMemUsageBucket *getMemUsageBucket(size_t mem) { * usage bucket. */ void updateClientMemoryUsage(client *c) { + serverAssert(c->conn); size_t mem = getClientMemoryUsage(c, NULL); int type = getClientType(c); /* Now that we have the memory used by the client, remove the old @@ -1006,7 +1007,7 @@ void updateClientMemoryUsage(client *c) { } int clientEvictionAllowed(client *c) { - if (server.maxmemory_clients == 0 || c->flags & CLIENT_NO_EVICT) { + if (server.maxmemory_clients == 0 || c->flags & CLIENT_NO_EVICT || !c->conn) { return 0; } int type = getClientType(c); @@ -1046,7 +1047,7 @@ void removeClientFromMemUsageBucket(client *c, int allow_eviction) { * returns 1 if client eviction for this client is allowed, 0 otherwise. 
 */
 int updateClientMemUsageAndBucket(client *c) {
-    serverAssert(io_threads_op == IO_THREADS_OP_IDLE);
+    serverAssert(io_threads_op == IO_THREADS_OP_IDLE && c->conn);
     int allow_eviction = clientEvictionAllowed(c);
     removeClientFromMemUsageBucket(c, allow_eviction);

diff --git a/src/server.h b/src/server.h
index a913a1c8b4f..ba55e3dee54 100644
--- a/src/server.h
+++ b/src/server.h
@@ -2532,7 +2532,7 @@ void moduleFreeContext(struct RedisModuleCtx *ctx);
 void moduleCallCommandUnblockedHandler(client *c);
 void unblockClientFromModule(client *c);
 void moduleHandleBlockedClients(void);
-void moduleBlockedClientTimedOut(client *c);
+void moduleBlockedClientTimedOut(client *c, int from_module);
 void modulePipeReadable(aeEventLoop *el, int fd, void *privdata, int mask);
 size_t moduleCount(void);
 void moduleAcquireGIL(void);

diff --git a/tests/modules/blockedclient.c b/tests/modules/blockedclient.c
index 92060fd338e..23030cef471 100644
--- a/tests/modules/blockedclient.c
+++ b/tests/modules/blockedclient.c
@@ -102,6 +102,7 @@ typedef struct {
 void *bg_call_worker(void *arg) {
     bg_call_data *bg = arg;
+    RedisModuleBlockedClient *bc = bg->bc;
 
     // Get Redis module context
     RedisModuleCtx *ctx = RedisModule_GetThreadSafeContext(bg->bc);
@@ -135,6 +136,12 @@
     RedisModuleCallReply *rep = RedisModule_Call(ctx, cmd, format, bg->argv + cmd_pos + 1, bg->argc - cmd_pos - 1);
     RedisModule_FreeString(NULL, format_redis_str);
 
+    /* Free the arguments within GIL to prevent simultaneous freeing in main thread. */
+    for (int i=0; i<bg->argc; i++)
+        RedisModule_FreeString(ctx, bg->argv[i]);
+    RedisModule_Free(bg->argv);
+    RedisModule_Free(bg);
+
     // Release GIL
     RedisModule_ThreadSafeContextUnlock(ctx);
@@ -147,13 +154,7 @@
     }
 
     // Unblock client
-    RedisModule_UnblockClient(bg->bc, NULL);
-
-    /* Free the arguments */
-    for (int i=0; i<bg->argc; i++)
-        RedisModule_FreeString(ctx, bg->argv[i]);
-    RedisModule_Free(bg->argv);
-    RedisModule_Free(bg);
+    RedisModule_UnblockClient(bc, NULL);
 
     // Free the Redis module context
     RedisModule_FreeThreadSafeContext(ctx);

diff --git a/tests/modules/blockonbackground.c b/tests/modules/blockonbackground.c
index 2e3b1a55733..e068e20d94e 100644
--- a/tests/modules/blockonbackground.c
+++ b/tests/modules/blockonbackground.c
@@ -7,12 +7,41 @@
 
 #define UNUSED(x) (void)(x)
 
+typedef struct {
+    /* Mutex for protecting RedisModule_BlockedClientMeasureTime*() API from race
+     * conditions due to timeout callback triggered in the main thread.
*/ + pthread_mutex_t measuretime_mutex; + int measuretime_completed; /* Indicates that time measure has ended and will not continue further */ + int myint; /* Used for replying */ +} BlockPrivdata; + +void blockClientPrivdataInit(RedisModuleBlockedClient *bc) { + BlockPrivdata *block_privdata = RedisModule_Calloc(1, sizeof(*block_privdata)); + block_privdata->measuretime_mutex = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; + RedisModule_BlockClientSetPrivateData(bc, block_privdata); +} + +void blockClientMeasureTimeStart(RedisModuleBlockedClient *bc, BlockPrivdata *block_privdata) { + pthread_mutex_lock(&block_privdata->measuretime_mutex); + RedisModule_BlockedClientMeasureTimeStart(bc); + pthread_mutex_unlock(&block_privdata->measuretime_mutex); +} + +void blockClientMeasureTimeEnd(RedisModuleBlockedClient *bc, BlockPrivdata *block_privdata, int completed) { + pthread_mutex_lock(&block_privdata->measuretime_mutex); + if (!block_privdata->measuretime_completed) { + RedisModule_BlockedClientMeasureTimeEnd(bc); + if (completed) block_privdata->measuretime_completed = 1; + } + pthread_mutex_unlock(&block_privdata->measuretime_mutex); +} + /* Reply callback for blocking command BLOCK.DEBUG */ int HelloBlock_Reply(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) { UNUSED(argv); UNUSED(argc); - int *myint = RedisModule_GetBlockedClientPrivateData(ctx); - return RedisModule_ReplyWithLongLong(ctx,*myint); + BlockPrivdata *block_privdata = RedisModule_GetBlockedClientPrivateData(ctx); + return RedisModule_ReplyWithLongLong(ctx,block_privdata->myint); } /* Timeout callback for blocking command BLOCK.DEBUG */ @@ -20,13 +49,16 @@ int HelloBlock_Timeout(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) UNUSED(argv); UNUSED(argc); RedisModuleBlockedClient *bc = RedisModule_GetBlockedClientHandle(ctx); - RedisModule_BlockedClientMeasureTimeEnd(bc); + BlockPrivdata *block_privdata = RedisModule_GetBlockedClientPrivateData(ctx); + blockClientMeasureTimeEnd(bc, block_privdata, 1); return RedisModule_ReplyWithSimpleString(ctx,"Request timedout"); } /* Private data freeing callback for BLOCK.DEBUG command. 
*/ void HelloBlock_FreeData(RedisModuleCtx *ctx, void *privdata) { UNUSED(ctx); + BlockPrivdata *block_privdata = privdata; + pthread_mutex_destroy(&block_privdata->measuretime_mutex); RedisModule_Free(privdata); } @@ -42,19 +74,20 @@ void *BlockDebug_ThreadMain(void *arg) { RedisModuleBlockedClient *bc = targ[0]; long long delay = (unsigned long)targ[1]; long long enable_time_track = (unsigned long)targ[2]; + BlockPrivdata *block_privdata = RedisModule_BlockClientGetPrivateData(bc); + if (enable_time_track) - RedisModule_BlockedClientMeasureTimeStart(bc); + blockClientMeasureTimeStart(bc, block_privdata); RedisModule_Free(targ); struct timespec ts; ts.tv_sec = delay / 1000; ts.tv_nsec = (delay % 1000) * 1000000; nanosleep(&ts, NULL); - int *r = RedisModule_Alloc(sizeof(int)); - *r = rand(); if (enable_time_track) - RedisModule_BlockedClientMeasureTimeEnd(bc); - RedisModule_UnblockClient(bc,r); + blockClientMeasureTimeEnd(bc, block_privdata, 0); + block_privdata->myint = rand(); + RedisModule_UnblockClient(bc,block_privdata); return NULL; } @@ -64,23 +97,22 @@ void *DoubleBlock_ThreadMain(void *arg) { void **targ = arg; RedisModuleBlockedClient *bc = targ[0]; long long delay = (unsigned long)targ[1]; - RedisModule_BlockedClientMeasureTimeStart(bc); + BlockPrivdata *block_privdata = RedisModule_BlockClientGetPrivateData(bc); + blockClientMeasureTimeStart(bc, block_privdata); RedisModule_Free(targ); struct timespec ts; ts.tv_sec = delay / 1000; ts.tv_nsec = (delay % 1000) * 1000000; nanosleep(&ts, NULL); - int *r = RedisModule_Alloc(sizeof(int)); - *r = rand(); - RedisModule_BlockedClientMeasureTimeEnd(bc); + blockClientMeasureTimeEnd(bc, block_privdata, 0); /* call again RedisModule_BlockedClientMeasureTimeStart() and * RedisModule_BlockedClientMeasureTimeEnd and ensure that the * total execution time is 2x the delay. */ - RedisModule_BlockedClientMeasureTimeStart(bc); + blockClientMeasureTimeStart(bc, block_privdata); nanosleep(&ts, NULL); - RedisModule_BlockedClientMeasureTimeEnd(bc); - - RedisModule_UnblockClient(bc,r); + blockClientMeasureTimeEnd(bc, block_privdata, 0); + block_privdata->myint = rand(); + RedisModule_UnblockClient(bc,block_privdata); return NULL; } @@ -107,6 +139,7 @@ int HelloBlock_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int a pthread_t tid; RedisModuleBlockedClient *bc = RedisModule_BlockClient(ctx,HelloBlock_Reply,HelloBlock_Timeout,HelloBlock_FreeData,timeout); + blockClientPrivdataInit(bc); /* Here we set a disconnection handler, however since this module will * block in sleep() in a thread, there is not much we can do in the @@ -148,6 +181,7 @@ int HelloBlockNoTracking_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **a pthread_t tid; RedisModuleBlockedClient *bc = RedisModule_BlockClient(ctx,HelloBlock_Reply,HelloBlock_Timeout,HelloBlock_FreeData,timeout); + blockClientPrivdataInit(bc); /* Here we set a disconnection handler, however since this module will * block in sleep() in a thread, there is not much we can do in the @@ -184,6 +218,7 @@ int HelloDoubleBlock_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, pthread_t tid; RedisModuleBlockedClient *bc = RedisModule_BlockClient(ctx,HelloBlock_Reply,HelloBlock_Timeout,HelloBlock_FreeData,0); + blockClientPrivdataInit(bc); /* Now that we setup a blocking client, we need to pass the control * to the thread. 
However we need to pass arguments to the thread:

diff --git a/tests/modules/usercall.c b/tests/modules/usercall.c
index 6b23974d4f0..316de1eea00 100644
--- a/tests/modules/usercall.c
+++ b/tests/modules/usercall.c
@@ -115,6 +115,7 @@ typedef struct {
 void *bg_call_worker(void *arg) {
     bg_call_data *bg = arg;
+    RedisModuleBlockedClient *bc = bg->bc;
 
     // Get Redis module context
     RedisModuleCtx *ctx = RedisModule_GetThreadSafeContext(bg->bc);
@@ -136,6 +137,12 @@
     RedisModuleCallReply *rep = RedisModule_Call(ctx, cmd, format, bg->argv + 3, bg->argc - 3);
     RedisModule_FreeString(NULL, format_redis_str);
 
+    /* Free the arguments within GIL to prevent simultaneous freeing in main thread. */
+    for (int i=0; i<bg->argc; i++)
+        RedisModule_FreeString(ctx, bg->argv[i]);
+    RedisModule_Free(bg->argv);
+    RedisModule_Free(bg);
+
     // Release GIL
     RedisModule_ThreadSafeContextUnlock(ctx);
@@ -148,13 +155,7 @@
     }
 
     // Unblock client
-    RedisModule_UnblockClient(bg->bc, NULL);
-
-    /* Free the arguments */
-    for (int i=0; i<bg->argc; i++)
-        RedisModule_FreeString(ctx, bg->argv[i]);
-    RedisModule_Free(bg->argv);
-    RedisModule_Free(bg);
+    RedisModule_UnblockClient(bc, NULL);
 
     // Free the Redis module context
     RedisModule_FreeThreadSafeContext(ctx);

From b07174afc2aa8519dae68ac89eb66b8104f918e5 Mon Sep 17 00:00:00 2001
From: Yanqi Lv
Date: Fri, 19 Jan 2024 23:00:43 +0800
Subject: [PATCH 52/58] Change the threshold of dict expand, shrink and rehash
 (#12948)

Before this change (most recently modified in
https://github.com/redis/redis/pull/12850#discussion_r1421406393), the
trigger for the normal expand threshold was 100% utilization and the trigger
for the normal shrink threshold was 10% (HASHTABLE_MIN_FILL). During fork
(DICT_RESIZE_AVOID), when we want to avoid rehashing, the trigger thresholds
were multiplied by 5 (`dict_force_resize_ratio`), meaning 500% for expand and
2% (100/10/5) for shrink.

However, in `dictRehash` (the incremental rehashing), the rehashing threshold
for shrinking during fork (DICT_RESIZE_AVOID) was 20% by mistake. This meant
that if shrinking was triggered while `dict_can_resize` is
`DICT_RESIZE_ENABLE`, where the threshold is 10%, the rehashing could
continue while `dict_can_resize` is `DICT_RESIZE_AVOID`. This would cause
unwanted copy-on-write damage.

It would make sense to make the thresholds of the rehash trigger and of the
incremental rehashing the same; however, in one we compare the size of the
hash table to the number of records, and in the other we compare the size of
ht[0] to the size of ht[1], so the formula is not exactly the same.

To make things easier we change all the thresholds to powers of 2, so the
normal shrinking threshold is changed from 100/10 (i.e. 10%) to 100/8 (i.e.
12.5%), and we change the threshold during forks from 5 to 4, i.e. from 500%
to 400% for expand, and from 2% (100/10/5) to 3.125% (100/8/4).
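A quick worked example of the new numbers, for a table of 128 buckets. This
program is illustrative only and not part of the patch; the constants match
the values introduced here.

```c
#include <stdio.h>

int main(void) {
    unsigned long buckets = 128;    /* current hash table size */
    unsigned long min_fill = 8;     /* HASHTABLE_MIN_FILL after this patch */
    unsigned long force_ratio = 4;  /* dict_force_resize_ratio after this patch */

    /* Normal shrink: used/buckets <= 1/8 (12.5%), i.e. used <= 16 here. */
    printf("shrink when used <= %lu\n", buckets / min_fill);

    /* Shrink during fork (DICT_RESIZE_AVOID): used/buckets <= 1/32
     * (3.125%), i.e. used <= 4 here. */
    printf("shrink (fork) when used <= %lu\n", buckets / (min_fill * force_ratio));

    /* Expand during fork: used >= 4 * buckets (400%), i.e. used >= 512 here. */
    printf("expand (fork) when used >= %lu\n", force_ratio * buckets);
    return 0;
}
```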
---
 src/dict.c           | 98 +++++++++++++++++++++++---------------------
 src/dict.h           |  2 +-
 src/server.c         |  4 +-
 tests/unit/other.tcl | 10 ++---
 4 files changed, 60 insertions(+), 54 deletions(-)

diff --git a/src/dict.c b/src/dict.c
index bb50a6a3231..b61e740e855 100644
--- a/src/dict.c
+++ b/src/dict.c
@@ -56,11 +56,11 @@
  * Note that even when dict_can_resize is set to DICT_RESIZE_AVOID, not all
  * resizes are prevented:
  * - A hash table is still allowed to expand if the ratio between the number
- *   of elements and the buckets > dict_force_resize_ratio.
+ *   of elements and the buckets >= dict_force_resize_ratio.
  * - A hash table is still allowed to shrink if the ratio between the number
- *   of elements and the buckets < HASHTABLE_MIN_FILL / dict_force_resize_ratio. */
+ *   of elements and the buckets <= 1 / (HASHTABLE_MIN_FILL * dict_force_resize_ratio). */
 static dictResizeEnable dict_can_resize = DICT_RESIZE_ENABLE;
-static unsigned int dict_force_resize_ratio = 5;
+static unsigned int dict_force_resize_ratio = 4;
 
 /* -------------------------- types ----------------------------------------- */
 struct dictEntry {
@@ -332,9 +332,12 @@ int dictRehash(dict *d, int n) {
     unsigned long s0 = DICTHT_SIZE(d->ht_size_exp[0]);
     unsigned long s1 = DICTHT_SIZE(d->ht_size_exp[1]);
     if (dict_can_resize == DICT_RESIZE_FORBID || !dictIsRehashing(d)) return 0;
+    /* If dict_can_resize is DICT_RESIZE_AVOID, we want to avoid rehashing.
+     * - If expanding, the threshold is dict_force_resize_ratio which is 4.
+     * - If shrinking, the threshold is 1 / (HASHTABLE_MIN_FILL * dict_force_resize_ratio) which is 1/32. */
     if (dict_can_resize == DICT_RESIZE_AVOID &&
         ((s1 > s0 && s1 < dict_force_resize_ratio * s0) ||
-         (s1 < s0 && s0 < dict_force_resize_ratio * s1)))
+         (s1 < s0 && s0 < HASHTABLE_MIN_FILL * dict_force_resize_ratio * s1)))
     {
         return 0;
     }
@@ -1471,13 +1474,13 @@ static void _dictShrinkIfNeeded(dict *d)
     /* If the size of hash table is DICT_HT_INITIAL_SIZE, don't shrink it. */
     if (DICTHT_SIZE(d->ht_size_exp[0]) == DICT_HT_INITIAL_SIZE) return;
 
-    /* If we reached below 1:10 elements/buckets ratio, and we are allowed to resize
-     * the hash table (global setting) or we should avoid it but the ratio is below 1:50,
+    /* If we reached below 1:8 elements/buckets ratio, and we are allowed to resize
+     * the hash table (global setting) or we should avoid it but the ratio is below 1:32,
      * we'll trigger a resize of the hash table. */
     if ((dict_can_resize == DICT_RESIZE_ENABLE &&
-         d->ht_used[0] * 100 <= HASHTABLE_MIN_FILL * DICTHT_SIZE(d->ht_size_exp[0])) ||
+         d->ht_used[0] * HASHTABLE_MIN_FILL <= DICTHT_SIZE(d->ht_size_exp[0])) ||
         (dict_can_resize != DICT_RESIZE_FORBID &&
-         d->ht_used[0] * 100 * dict_force_resize_ratio <= HASHTABLE_MIN_FILL * DICTHT_SIZE(d->ht_size_exp[0])))
+         d->ht_used[0] * HASHTABLE_MIN_FILL * dict_force_resize_ratio <= DICTHT_SIZE(d->ht_size_exp[0])))
     {
         if (!dictTypeResizeAllowed(d, d->ht_used[0]))
             return;
         dictShrink(d, d->ht_used[0]);
     }
@@ -1750,6 +1753,7 @@ int dictTest(int argc, char **argv, int flags) {
     int retval;
     dict *dict = dictCreate(&BenchmarkDictType);
     long count = 0;
+    unsigned long new_dict_size, current_dict_used, remain_keys;
     int accurate = (flags & REDIS_TEST_ACCURATE);
 
     if (argc == 4) {
@@ -1773,67 +1777,68 @@ int dictTest(int argc, char **argv, int flags) {
         assert(dictBuckets(dict) == 16);
     }
 
-    TEST("Use DICT_RESIZE_AVOID to disable the dict resize and pad to 80") {
+    TEST("Use DICT_RESIZE_AVOID to disable the dict resize and pad to (dict_force_resize_ratio * 16)") {
         /* Use DICT_RESIZE_AVOID to disable the dict resize, and pad
-         * the number of keys to 80, now is 16:80, so we can satisfy
-         * dict_force_resize_ratio. */
+         * the number of keys to (dict_force_resize_ratio * 16), so we can satisfy
+         * dict_force_resize_ratio in next test.
*/ dictSetResizeEnabled(DICT_RESIZE_AVOID); - for (j = 16; j < 80; j++) { + for (j = 16; j < (long)dict_force_resize_ratio * 16; j++) { retval = dictAdd(dict,stringFromLongLong(j),(void*)j); assert(retval == DICT_OK); } - assert(dictSize(dict) == 80); + current_dict_used = dict_force_resize_ratio * 16; + assert(dictSize(dict) == current_dict_used); assert(dictBuckets(dict) == 16); } TEST("Add one more key, trigger the dict resize") { - retval = dictAdd(dict,stringFromLongLong(80),(void*)80); + retval = dictAdd(dict,stringFromLongLong(current_dict_used),(void*)(current_dict_used)); assert(retval == DICT_OK); - assert(dictSize(dict) == 81); + current_dict_used++; + new_dict_size = 1UL << _dictNextExp(current_dict_used); + assert(dictSize(dict) == current_dict_used); assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 16); - assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 128); - assert(dictBuckets(dict) == 144); + assert(DICTHT_SIZE(dict->ht_size_exp[1]) == new_dict_size); /* Wait for rehashing. */ dictSetResizeEnabled(DICT_RESIZE_ENABLE); while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000); - assert(dictSize(dict) == 81); - assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 128); + assert(dictSize(dict) == current_dict_used); + assert(DICTHT_SIZE(dict->ht_size_exp[0]) == new_dict_size); assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 0); - assert(dictBuckets(dict) == 128); } - TEST("Delete keys until 13 keys remain") { - /* Delete keys until 13 keys remain, now is 13:128, so we can - * satisfy HASHTABLE_MIN_FILL in the next test. */ - for (j = 0; j < 68; j++) { + TEST("Delete keys until we can trigger shrink in next test") { + /* Delete keys until we can satisfy (1 / HASHTABLE_MIN_FILL) in the next test. */ + for (j = new_dict_size / HASHTABLE_MIN_FILL + 1; j < (long)current_dict_used; j++) { char *key = stringFromLongLong(j); retval = dictDelete(dict, key); zfree(key); assert(retval == DICT_OK); } - assert(dictSize(dict) == 13); - assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 128); + current_dict_used = new_dict_size / HASHTABLE_MIN_FILL + 1; + assert(dictSize(dict) == current_dict_used); + assert(DICTHT_SIZE(dict->ht_size_exp[0]) == new_dict_size); assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 0); - assert(dictBuckets(dict) == 128); } TEST("Delete one more key, trigger the dict resize") { - char *key = stringFromLongLong(68); + current_dict_used--; + char *key = stringFromLongLong(current_dict_used); retval = dictDelete(dict, key); zfree(key); + unsigned long oldDictSize = new_dict_size; + new_dict_size = 1UL << _dictNextExp(current_dict_used); assert(retval == DICT_OK); - assert(dictSize(dict) == 12); - assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 128); - assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 16); - assert(dictBuckets(dict) == 144); + assert(dictSize(dict) == current_dict_used); + assert(DICTHT_SIZE(dict->ht_size_exp[0]) == oldDictSize); + assert(DICTHT_SIZE(dict->ht_size_exp[1]) == new_dict_size); /* Wait for rehashing. 
*/ while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000); - assert(dictSize(dict) == 12); - assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 16); + assert(dictSize(dict) == current_dict_used); + assert(DICTHT_SIZE(dict->ht_size_exp[0]) == new_dict_size); assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 0); - assert(dictBuckets(dict) == 16); } TEST("Empty the dictionary and add 128 keys") { @@ -1849,36 +1854,37 @@ int dictTest(int argc, char **argv, int flags) { TEST("Use DICT_RESIZE_AVOID to disable the dict resize and reduce to 3") { /* Use DICT_RESIZE_AVOID to disable the dict reset, and reduce - * the number of keys to 3, now is 3:128, so we can satisfy - * HASHTABLE_MIN_FILL / dict_force_resize_ratio. */ + * the number of keys until we can trigger shrinking in next test. */ dictSetResizeEnabled(DICT_RESIZE_AVOID); - for (j = 0; j < 125; j++) { + remain_keys = DICTHT_SIZE(dict->ht_size_exp[0]) / (HASHTABLE_MIN_FILL * dict_force_resize_ratio) + 1; + for (j = remain_keys; j < 128; j++) { char *key = stringFromLongLong(j); retval = dictDelete(dict, key); zfree(key); assert(retval == DICT_OK); } - assert(dictSize(dict) == 3); + current_dict_used = remain_keys; + assert(dictSize(dict) == remain_keys); assert(dictBuckets(dict) == 128); } TEST("Delete one more key, trigger the dict resize") { - char *key = stringFromLongLong(125); + current_dict_used--; + char *key = stringFromLongLong(current_dict_used); retval = dictDelete(dict, key); zfree(key); + new_dict_size = 1UL << _dictNextExp(current_dict_used); assert(retval == DICT_OK); - assert(dictSize(dict) == 2); + assert(dictSize(dict) == current_dict_used); assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 128); - assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 4); - assert(dictBuckets(dict) == 132); + assert(DICTHT_SIZE(dict->ht_size_exp[1]) == new_dict_size); /* Wait for rehashing. */ dictSetResizeEnabled(DICT_RESIZE_ENABLE); while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000); - assert(dictSize(dict) == 2); - assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 4); + assert(dictSize(dict) == current_dict_used); + assert(DICTHT_SIZE(dict->ht_size_exp[0]) == new_dict_size); assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 0); - assert(dictBuckets(dict) == 4); } TEST("Restore to original state") { diff --git a/src/dict.h b/src/dict.h index cebbe14985c..5b08319062d 100644 --- a/src/dict.h +++ b/src/dict.h @@ -45,7 +45,7 @@ #define DICT_ERR 1 /* Hash table parameters */ -#define HASHTABLE_MIN_FILL 10 /* Minimal hash table fill 10% */ +#define HASHTABLE_MIN_FILL 8 /* Minimal hash table fill 12.5%(100/8) */ typedef struct dictEntry dictEntry; /* opaque */ typedef struct dict dict; diff --git a/src/server.c b/src/server.c index 4209db42d72..95da296058f 100644 --- a/src/server.c +++ b/src/server.c @@ -699,11 +699,11 @@ int htNeedsShrink(dict *dict) { size = dictBuckets(dict); used = dictSize(dict); return (size > DICT_HT_INITIAL_SIZE && - (used*100 <= HASHTABLE_MIN_FILL*size)); + (used * HASHTABLE_MIN_FILL <= size)); } /* In cluster-enabled setup, this method traverses through all main/expires dictionaries (CLUSTER_SLOTS) - * and triggers a resize if the percentage of used buckets in the HT reaches HASHTABLE_MIN_FILL + * and triggers a resize if the percentage of used buckets in the HT reaches (100 / HASHTABLE_MIN_FILL) * we resize the hash table to save memory. * * In non cluster-enabled setup, it resize main/expires dictionary based on the same condition described above. 
*/ diff --git a/tests/unit/other.tcl b/tests/unit/other.tcl index 08364f44056..ab19be19f2f 100644 --- a/tests/unit/other.tcl +++ b/tests/unit/other.tcl @@ -439,8 +439,8 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} { r config set rdb-key-save-delay 10000000 r bgsave - # delete data to have lot's (98%) of empty buckets - for {set j 1} {$j <= 125} {incr j} { + # delete data to have lot's (96%) of empty buckets + for {set j 1} {$j <= 123} {incr j} { r del "{foo}$j" } assert_match "*table size: 128*" [r debug HTSTATS 0] @@ -455,7 +455,7 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} { } after 200;# waiting for serverCron - assert_match "*table size: 4*" [r debug HTSTATS 0] + assert_match "*table size: 8*" [r debug HTSTATS 0] } {} {needs:debug} test "Redis can rewind and trigger smaller slot resizing" { @@ -470,7 +470,7 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} { r config set rdb-key-save-delay 10000000 r bgsave - for {set j 1} {$j <= 125} {incr j} { + for {set j 1} {$j <= 123} {incr j} { r del "{alice}$j" } @@ -484,6 +484,6 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} { } after 200;# waiting for serverCron - assert_match "*table size: 8*" [r debug HTSTATS 0] + assert_match "*table size: 16*" [r debug HTSTATS 0] } {} {needs:debug} } From 85a239b363591a7f35f66eb8370cabead8545ffa Mon Sep 17 00:00:00 2001 From: Yanqi Lv Date: Fri, 19 Jan 2024 23:03:20 +0800 Subject: [PATCH 53/58] Change dictGetSafeIterator to dictGetIterator in pubsub (#12931) In #12838, we misuse the safe iterator of the client dict, so we can't catch the synchronous release of the client if there is a bug. Since we realize that clients (even subscribers) are released with async free, we change the safe iterators of the client dict into unsafe iterators in `pubsub.c`. And I also remove redundant code. --- src/pubsub.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/pubsub.c b/src/pubsub.c index 1a151b96c44..afaf0832fca 100644 --- a/src/pubsub.c +++ b/src/pubsub.c @@ -374,9 +374,8 @@ void pubsubShardUnsubscribeAllChannelsInSlot(unsigned int slot) { while ((de = dictNext(di)) != NULL) { robj *channel = dictGetKey(de); dict *clients = dictGetVal(de); - if (dictSize(clients) == 0) goto cleanup; /* For each client subscribed to the channel, unsubscribe it. 
 */
-        dictIterator *iter = dictGetSafeIterator(clients);
+        dictIterator *iter = dictGetIterator(clients);
         dictEntry *entry;
         while ((entry = dictNext(iter)) != NULL) {
             client *c = dictGetKey(entry);
@@ -390,7 +389,6 @@
             }
         }
         dictReleaseIterator(iter);
-cleanup:
         server.shard_channel_count--;
         dictDelete(d, channel);
     }
@@ -529,7 +527,7 @@ int pubsubPublishMessageInternal(robj *channel, robj *message, pubsubtype type)
     if (de) {
         dict *clients = dictGetVal(de);
         dictEntry *entry;
-        dictIterator *iter = dictGetSafeIterator(clients);
+        dictIterator *iter = dictGetIterator(clients);
         while ((entry = dictNext(iter)) != NULL) {
             client *c = dictGetKey(entry);
             addReplyPubsubMessage(c,channel,message,*type.messageBulk);
@@ -557,7 +555,7 @@
                 sdslen(channel->ptr),0)) continue;
             dictEntry *entry;
-            dictIterator *iter = dictGetSafeIterator(clients);
+            dictIterator *iter = dictGetIterator(clients);
             while ((entry = dictNext(iter)) != NULL) {
                 client *c = dictGetKey(entry);
                 addReplyPubsubPatMessage(c,pattern,channel,message);

From 8d0156eb186e60a32be3f0210d79f7289fcc6e66 Mon Sep 17 00:00:00 2001
From: "zhaozhao.zz"
Date: Mon, 22 Jan 2024 11:47:51 +0800
Subject: [PATCH 54/58] Set the correct id for tempDb (#12947)

Background: some modules need to know the `dbid` information, such as the
function used during RDB loading:

```
robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) {
    ....
        moduleInitIOContext(io,mt,rdb,&keyobj,dbid);
```

However, during replication, the "tempDb" created for diskless RDB loading is
not correctly set with the dbid. This leads to passing the wrong dbid to the
`rdbLoadObject` function (as tempDb uses zcalloc, all ids are 0).

```
disklessLoadInitTempDb()->rdbLoadRioWithLoadingCtx()->
        /* Read value */
        val = rdbLoadObject(type,rdb,key,db->id,&error);
```

To fix it, set the correct ID (relative index) for the tempdb.
---
 src/db.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/db.c b/src/db.c
index 50d6bd46030..d40b1599ab6 100644
--- a/src/db.c
+++ b/src/db.c
@@ -763,6 +763,7 @@ long long emptyData(int dbnum, int flags, void(callback)(dict*)) {
 redisDb *initTempDb(void) {
     redisDb *tempDb = zcalloc(sizeof(redisDb)*server.dbnum);
     for (int i=0; i<server.dbnum; i++) {
+        tempDb[i].id = i;

Date: Mon, 22 Jan 2024 11:25:43 -0800
Subject: [PATCH 55/58] Prevent nodes with invalid IDs from being propagated
 through gossip (#12921)

There have been occasional instances of memory corruption (through code bugs
or bit flips) leading to invalid node information being gossiped around. To
prevent this invalid information spreading, we verify the node IDs in
received gossip are in an acceptable format, and disregard any gossiped nodes
with invalid IDs. This PR uses the existing verifyClusterNodeId function to
check the validity of the gossiped node IDs and if an invalid one is
encountered, logs raw byte information to help debug the corruption.

---------

Co-authored-by: Madelyn Olson
---
 src/cluster_legacy.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 8dee109df69..45e88efdda5 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -2043,6 +2043,41 @@ static void getClientPortFromGossip(clusterMsgDataGossip *g, int *tls_port, int
     }
 }
 
+/* Returns a string with the byte representation of the node ID (i.e. nodename)
+ * along with 8 trailing bytes for debugging purposes.
*/ +char *getCorruptedNodeIdByteString(clusterMsgDataGossip *gossip_msg) { + const int num_bytes = CLUSTER_NAMELEN + 8; + /* Allocate enough room for 4 chars per byte + null terminator */ + char *byte_string = (char*) zmalloc((num_bytes*4) + 1); + const char *name_ptr = gossip_msg->nodename; + + /* Ensure we won't print beyond the bounds of the message */ + serverAssert(name_ptr + num_bytes <= (char*)gossip_msg + sizeof(clusterMsgDataGossip)); + + for (int i = 0; i < num_bytes; i++) { + snprintf(byte_string + 4*i, 5, "\\x%02hhX", name_ptr[i]); + } + return byte_string; +} + +/* Returns the number of nodes in the gossip with invalid IDs. */ +int verifyGossipSectionNodeIds(clusterMsgDataGossip *g, uint16_t count) { + int invalid_ids = 0; + for (int i = 0; i < count; i++) { + const char *nodename = g[i].nodename; + if (verifyClusterNodeId(nodename, CLUSTER_NAMELEN) != C_OK) { + invalid_ids++; + char *raw_node_id = getCorruptedNodeIdByteString(g); + serverLog(LL_WARNING, + "Received gossip about a node with invalid ID %.40s. For debugging purposes, " + "the 48 bytes including the invalid ID and 8 trailing bytes are: %s", + nodename, raw_node_id); + zfree(raw_node_id); + } + } + return invalid_ids; +} + /* Process the gossip section of PING or PONG packets. * Note that this function assumes that the packet is already sanity-checked * by the caller, not in the content of the gossip section, but in the @@ -2052,6 +2087,14 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { clusterMsgDataGossip *g = (clusterMsgDataGossip*) hdr->data.ping.gossip; clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender, CLUSTER_NAMELEN); + /* Abort if the gossip contains invalid node IDs to avoid adding incorrect information to + * the nodes dictionary. An invalid ID indicates memory corruption on the sender side. */ + int invalid_ids = verifyGossipSectionNodeIds(g, count); + if (invalid_ids) { + serverLog(LL_WARNING, "Node %.40s (%s) gossiped %d nodes with invalid IDs.", sender->name, sender->human_nodename, invalid_ids); + return; + } + while(count--) { uint16_t flags = ntohs(g->flags); clusterNode *node; From 2bce71b5ff98766a3dd64c36af8f3c1bdb54670b Mon Sep 17 00:00:00 2001 From: Harkrishn Patro Date: Mon, 22 Jan 2024 16:01:04 -0800 Subject: [PATCH 56/58] Exit early if slowlog/acllog max len set to zero (#12965) Currently slowlog gets disabled if slowlog-log-slower-than is set to less than zero. I think we should also disable it if slowlog-max-len is set to zero. We apply the same logic to acllog-max-len. --- src/acl.c | 21 +++++++++++++++------ src/latency.c | 2 +- src/slowlog.c | 2 +- tests/unit/acl.tcl | 15 +++++++++++++++ tests/unit/slowlog.tcl | 12 +++++++++++- 5 files changed, 43 insertions(+), 9 deletions(-) diff --git a/src/acl.c b/src/acl.c index b7e43cffa51..e2ec11e5507 100644 --- a/src/acl.c +++ b/src/acl.c @@ -2661,6 +2661,15 @@ void ACLUpdateInfoMetrics(int reason){ } } +static void trimACLLogEntriesToMaxLen(void) { + while(listLength(ACLLog) > server.acllog_max_len) { + listNode *ln = listLast(ACLLog); + ACLLogEntry *le = listNodeValue(ln); + ACLFreeLogEntry(le); + listDelNode(ACLLog,ln); + } +} + /* Adds a new entry in the ACL log, making sure to delete the old entry * if we reach the maximum length allowed for the log. 
This function attempts * to find similar entries in the current log in order to bump the counter of @@ -2680,6 +2689,11 @@ void addACLLogEntry(client *c, int reason, int context, int argpos, sds username /* Update ACL info metrics */ ACLUpdateInfoMetrics(reason); + if (server.acllog_max_len == 0) { + trimACLLogEntriesToMaxLen(); + return; + } + /* Create a new entry. */ struct ACLLogEntry *le = zmalloc(sizeof(*le)); le->count = 1; @@ -2742,12 +2756,7 @@ void addACLLogEntry(client *c, int reason, int context, int argpos, sds username * to its maximum size. */ ACLLogEntryCount++; /* Incrementing the entry_id count to make each record in the log unique. */ listAddNodeHead(ACLLog, le); - while(listLength(ACLLog) > server.acllog_max_len) { - listNode *ln = listLast(ACLLog); - ACLLogEntry *le = listNodeValue(ln); - ACLFreeLogEntry(le); - listDelNode(ACLLog,ln); - } + trimACLLogEntriesToMaxLen(); } } diff --git a/src/latency.c b/src/latency.c index d46890e826f..4805508e75d 100644 --- a/src/latency.c +++ b/src/latency.c @@ -279,7 +279,7 @@ sds createLatencyReport(void) { /* Potentially commands. */ if (!strcasecmp(event,"command")) { - if (server.slowlog_log_slower_than < 0) { + if (server.slowlog_log_slower_than < 0 || server.slowlog_max_len == 0) { advise_slowlog_enabled = 1; advices++; } else if (server.slowlog_log_slower_than/1000 > diff --git a/src/slowlog.c b/src/slowlog.c index 4c31917bb3b..a68064af2d3 100644 --- a/src/slowlog.c +++ b/src/slowlog.c @@ -121,7 +121,7 @@ void slowlogInit(void) { * This function will make sure to trim the slow log accordingly to the * configured max length. */ void slowlogPushEntryIfNeeded(client *c, robj **argv, int argc, long long duration) { - if (server.slowlog_log_slower_than < 0) return; /* Slowlog disabled */ + if (server.slowlog_log_slower_than < 0 || server.slowlog_max_len == 0) return; /* Slowlog disabled */ if (duration >= server.slowlog_log_slower_than) listAddNodeHead(server.slowlog, slowlogCreateEntry(c,argv,argc,duration)); diff --git a/tests/unit/acl.tcl b/tests/unit/acl.tcl index e1e610f3c78..b40000e519f 100644 --- a/tests/unit/acl.tcl +++ b/tests/unit/acl.tcl @@ -802,6 +802,16 @@ start_server {tags {"acl external:skip"}} { assert {[dict get $entry username] eq {antirez}} } + test {ACLLOG - zero max length is correctly handled} { + r ACL LOG RESET + r CONFIG SET acllog-max-len 0 + for {set j 0} {$j < 10} {incr j} { + catch {r SET obj:$j 123} + } + r AUTH default "" + assert {[llength [r ACL LOG]] == 0} + } + test {ACL LOG entries are limited to a maximum amount} { r ACL LOG RESET r CONFIG SET acllog-max-len 5 @@ -813,6 +823,11 @@ start_server {tags {"acl external:skip"}} { assert {[llength [r ACL LOG]] == 5} } + test {ACL LOG entries are still present on update of max len config} { + r CONFIG SET acllog-max-len 0 + assert {[llength [r ACL LOG]] == 5} + } + test {When default user is off, new connections are not authenticated} { r ACL setuser default off catch {set rd1 [redis_deferring_client]} e diff --git a/tests/unit/slowlog.tcl b/tests/unit/slowlog.tcl index a5e8862d7b7..e7f82ce7f90 100644 --- a/tests/unit/slowlog.tcl +++ b/tests/unit/slowlog.tcl @@ -14,6 +14,16 @@ start_server {tags {"slowlog"} overrides {slowlog-log-slower-than 1000000}} { assert_equal [r slowlog len] 1 } {} {needs:debug} + test {SLOWLOG - zero max length is correctly handled} { + r SLOWLOG reset + r config set slowlog-max-len 0 + r config set slowlog-log-slower-than 0 + for {set i 0} {$i < 100} {incr i} { + r ping + } + r slowlog len + } {0} + test {SLOWLOG - max entries 
is correctly handled} { r config set slowlog-log-slower-than 0 r config set slowlog-max-len 10 @@ -42,7 +52,7 @@ start_server {tags {"slowlog"} overrides {slowlog-log-slower-than 1000000}} { set e [lindex [r slowlog get] 0] assert_equal [llength $e] 6 if {!$::external} { - assert_equal [lindex $e 0] 107 + assert_equal [lindex $e 0] 106 } assert_equal [expr {[lindex $e 2] > 100000}] 1 assert_equal [lindex $e 3] {debug sleep 0.2} From 12fd7524435885b99790e5d945bc9db7f2b2b6fd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 23 Jan 2024 11:09:49 +0200 Subject: [PATCH 57/58] Bump actions/cache from 3 to 4 (#12978) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [actions/cache](https://github.com/actions/cache) from 3 to 4.
Release notes

Sourced from actions/cache's releases. Full changelogs:

- v4.0.0: https://github.com/actions/cache/compare/v3...v4.0.0
- v3.3.3: https://github.com/actions/cache/compare/v3...v3.3.3
- v3.3.2: https://github.com/actions/cache/compare/v3...v3.3.2
- v3.3.1: https://github.com/actions/cache/compare/v3...v3.3.1
- v3.3.0

... (truncated)
Changelog

Sourced from actions/cache's changelog.

Releases

3.0.0

  • Updated minimum runner version support from node 12 -> node 16

3.0.1

  • Added support for caching from GHES 3.5.
  • Fixed download issue for files > 2GB during restore.

3.0.2

  • Added support for dynamic cache size cap on GHES.

3.0.3

  • Fixed avoiding empty cache save when no files are available for caching. (issue)

3.0.4

  • Fixed tar creation error while trying to create tar with path as ~/ home folder on ubuntu-latest. (issue)

3.0.5

  • Removed error handling by consuming actions/cache 3.0 toolkit, Now cache server error handling will be done by toolkit. (PR)

3.0.6

  • Fixed #809 - zstd -d: no such file or directory error
  • Fixed #833 - cache doesn't work with github workspace directory

3.0.7

  • Fixed #810 - download stuck issue. A new timeout is introduced in the download process to abort the download if it gets stuck and doesn't finish within an hour.

3.0.8

  • Fix zstd not working for windows on gnu tar in issues #888 and #891.
  • Allowing users to provide a custom timeout as input for aborting download of a cache segment using an environment variable SEGMENT_DOWNLOAD_TIMEOUT_MINS. Default is 60 minutes.

3.0.9

  • Enhanced the warning message for cache unavailablity in case of GHES.

3.0.10

  • Fix a bug with sorting inputs.
  • Update definition for restore-keys in README.md

... (truncated)

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=actions/cache&package-manager=github_actions&previous-version=3&new-version=4)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

---
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/spell-check.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/spell-check.yml b/.github/workflows/spell-check.yml
index 53360741291..77f5437ca2b 100644
--- a/.github/workflows/spell-check.yml
+++ b/.github/workflows/spell-check.yml
@@ -19,7 +19,7 @@ jobs:
         uses: actions/checkout@v3
 
       - name: pip cache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cache/pip
           key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}

From f9a0eb60f7bde2c2ef561afa36c1004bf7f41b93 Mon Sep 17 00:00:00 2001
From: Oran Agra
Date: Tue, 23 Jan 2024 11:48:02 +0200
Subject: [PATCH 58/58] update redis-check-rdb types (#12969)

It seems we forgot to update the rdb_type_string[] array in redis-check-rdb.
---
 src/rdb.h             | 5 +----
 src/redis-check-rdb.c | 2 ++
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/rdb.h b/src/rdb.h
index cf94444ebcf..f0cca977ac3 100644
--- a/src/rdb.h
+++ b/src/rdb.h
@@ -81,9 +81,6 @@
 #define RDB_TYPE_MODULE_PRE_GA 6 /* Used in 4.0 release candidates */
 #define RDB_TYPE_MODULE_2 7 /* Module value with annotations for parsing without
                                the generating module being loaded. */
-/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType() BELOW */
-
-/* Object types for encoded objects. */
 #define RDB_TYPE_HASH_ZIPMAP 9
 #define RDB_TYPE_LIST_ZIPLIST 10
 #define RDB_TYPE_SET_INTSET 11
@@ -97,7 +94,7 @@
 #define RDB_TYPE_STREAM_LISTPACKS_2 19
 #define RDB_TYPE_SET_LISTPACK 20
 #define RDB_TYPE_STREAM_LISTPACKS_3 21
-/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType() BELOW */
+/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType(), and rdb_type_string[] */
 
 /* Test if a type is an object type. */
 #define rdbIsObjectType(t) (((t) >= 0 && (t) <= 7) || ((t) >= 9 && (t) <= 21))
diff --git a/src/redis-check-rdb.c b/src/redis-check-rdb.c
index 71cff0247c7..ffc201cb24f 100644
--- a/src/redis-check-rdb.c
+++ b/src/redis-check-rdb.c
@@ -98,7 +98,9 @@ char *rdb_type_string[] = {
     "hash-listpack",
     "zset-listpack",
     "quicklist-v2",
+    "stream-v2",
     "set-listpack",
+    "stream-v3",
 };
 
 /* Show a few stats collected into 'rdbstate' */
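The rdb.h note now names both places that must change together. One way to
have the compiler enforce that is sketched below, with stand-in type names
rather than Redis's actual list; this uses C11 `static_assert` and is not
something the patch itself adds.

```c
#include <assert.h>

/* Stand-in for the RDB type defines; TYPE_MAX tracks the highest value. */
enum { TYPE_STRING, TYPE_LIST, TYPE_SET, TYPE_MAX = TYPE_SET };

static const char *type_string[] = {
    "string",
    "list",
    "set",
};

/* Fails to compile if a new type value is added without a matching name. */
static_assert(sizeof(type_string) / sizeof(type_string[0]) == TYPE_MAX + 1,
              "update type_string[] when adding a new type");
```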