diff --git a/src/broker/content-cache.c b/src/broker/content-cache.c index d2ef184bcd36..87510606167f 100644 --- a/src/broker/content-cache.c +++ b/src/broker/content-cache.c @@ -524,6 +524,18 @@ static void cache_store_continuation (flux_future_t *f, void *arg) cache_resume_flush (cache); } +/* Issue #4482, there is a small chance a dirty entry could be added + * to the flush list twice which can lead to list corruption. As an + * extra measure, perform a delete from the list first. If the node + * is not on the list, the delete is a no-op. + */ +static void flush_list_append (struct content_cache *cache, + struct cache_entry *e) +{ + list_del (&e->list); + list_add_tail (&cache->flush, &e->list); +} + static int cache_store (struct content_cache *cache, struct cache_entry *e) { flux_future_t *f; @@ -535,7 +547,7 @@ static int cache_store (struct content_cache *cache, struct cache_entry *e) return 0; if (cache->rank == 0) { if (cache->flush_batch_count >= cache->flush_batch_limit) { - list_add_tail (&cache->flush, &e->list); + flush_list_append (cache, e); return 0; } flags = CONTENT_FLAG_CACHE_BYPASS; @@ -595,7 +607,7 @@ static void content_store_request (flux_t *h, flux_msg_handler_t *mh, * during purge or dropcache, so this does not alter * behavior */ if (cache->rank == 0 && !cache->backing) - list_add_tail (&cache->flush, &e->list); + flush_list_append (cache, e); } if (flux_respond_raw (h, msg, hash, hash_size) < 0) flux_log_error (h, "content store: flux_respond_raw"); diff --git a/t/Makefile.am b/t/Makefile.am index 87e073c36a95..dda1cd80a073 100644 --- a/t/Makefile.am +++ b/t/Makefile.am @@ -260,9 +260,11 @@ EXTRA_DIST= \ rc/rc1-kvs \ rc/rc1-testenv \ rc/rc1-job \ + rc/rc1-issue4482 \ rc/rc3-kvs \ rc/rc3-testenv \ rc/rc3-job \ + rc/rc3-issue4482 \ shell/input \ shell/output \ shell/initrc/tests \ @@ -307,6 +309,7 @@ dist_check_SCRIPTS = \ issues/t4379-dirty-cache-entries-flush.sh \ issues/t4413-empty-eventlog.sh \ issues/t4465-job-list-use-after-free.sh \ + issues/t4482-flush-list-corruption.sh \ python/__init__.py \ python/subflux.py \ python/tap \ diff --git a/t/issues/t4482-flush-list-corruption.sh b/t/issues/t4482-flush-list-corruption.sh new file mode 100755 index 000000000000..0ca63bba5e20 --- /dev/null +++ b/t/issues/t4482-flush-list-corruption.sh @@ -0,0 +1,52 @@ +#!/bin/sh -e + +# How this test works +# +# use rc1 script that does not load content-sqlite +# +# add some unique data to the kvs, we do multiple puts to build up +# a decent length internal list of flushable cache entries. +# +# drop kvs cache, this will force future KVS puts of identical data to +# be sent to the content-cache +# +# write the same data again, if error present internal flush list will +# be messed up and length of flush list < number of dirty entries +# (acct_dirty). +# +# before fix, flux content flush will hang b/c number of dirty entries +# (acct_dirty) never reaches zero. + +cat <<-EOF >t4482.sh +#!/bin/sh -e + +flux kvs put issue4482A.a="abcdefghijk" +flux kvs put issue4482A.b="lmnopqrstuv" +flux kvs put issue4482A.c="wxyz0123456" +flux kvs put issue4482A.d="7890ABCDEFG" +flux kvs put issue4482A.e="HIJKLMNOPQR" +flux kvs put issue4482A.f="STUVWXYZ!!!" +flux kvs put issue4482A.g="<<<<<:>>>>>" + +flux kvs dropcache + +flux kvs put issue4482B.a="abcdefghijk" +flux kvs put issue4482B.b="lmnopqrstuv" +flux kvs put issue4482B.c="wxyz0123456" +flux kvs put issue4482B.d="7890ABCDEFG" +flux kvs put issue4482B.e="HIJKLMNOPQR" +flux kvs put issue4482B.f="STUVWXYZ!!!" +flux kvs put issue4482B.g="<<<<<:>>>>>" + +flux module load content-sqlite + +flux content flush + +EOF + +chmod +x t4482.sh + +flux start -s 1 \ + -o,--setattr=broker.rc1_path=${FLUX_SOURCE_DIR}/t/rc/rc1-issue4482 \ + -o,--setattr=broker.rc3_path=${FLUX_SOURCE_DIR}/t/rc/rc3-issue4482 \ + ./t4482.sh diff --git a/t/rc/rc1-issue4482 b/t/rc/rc1-issue4482 new file mode 100755 index 000000000000..9d28bc6f222d --- /dev/null +++ b/t/rc/rc1-issue4482 @@ -0,0 +1,6 @@ +#!/bin/bash -e + +# For issue4482 - assumes test size of 1 +flux module load kvs +flux module load kvs-watch +flux module load heartbeat diff --git a/t/rc/rc3-issue4482 b/t/rc/rc3-issue4482 new file mode 100755 index 000000000000..798c81d6014a --- /dev/null +++ b/t/rc/rc3-issue4482 @@ -0,0 +1,8 @@ +#!/bin/bash -e + +# For issue4482 - assumes test size of 1 +flux module remove -f heartbeat +flux module remove -f kvs-watch +flux module remove -f kvs +# content-sqlite loaded in test +flux module remove -f content-sqlite