Skip to content

Commit

Permalink
Merge pull request #4484 from chu11/issue4482_flush_list_corruption
Browse files Browse the repository at this point in the history
broker: fix content-cache flush list corruption
  • Loading branch information
mergify[bot] authored Aug 11, 2022
2 parents 8e53bf5 + 8c51d6f commit 56a8c5e
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 2 deletions.
16 changes: 14 additions & 2 deletions src/broker/content-cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,18 @@ static void cache_store_continuation (flux_future_t *f, void *arg)
cache_resume_flush (cache);
}

/* Issue #4482, there is a small chance a dirty entry could be added
* to the flush list twice which can lead to list corruption. As an
* extra measure, perform a delete from the list first. If the node
* is not on the list, the delete is a no-op.
*/
static void flush_list_append (struct content_cache *cache,
struct cache_entry *e)
{
list_del (&e->list);
list_add_tail (&cache->flush, &e->list);
}

static int cache_store (struct content_cache *cache, struct cache_entry *e)
{
flux_future_t *f;
Expand All @@ -535,7 +547,7 @@ static int cache_store (struct content_cache *cache, struct cache_entry *e)
return 0;
if (cache->rank == 0) {
if (cache->flush_batch_count >= cache->flush_batch_limit) {
list_add_tail (&cache->flush, &e->list);
flush_list_append (cache, e);
return 0;
}
flags = CONTENT_FLAG_CACHE_BYPASS;
Expand Down Expand Up @@ -595,7 +607,7 @@ static void content_store_request (flux_t *h, flux_msg_handler_t *mh,
* during purge or dropcache, so this does not alter
* behavior */
if (cache->rank == 0 && !cache->backing)
list_add_tail (&cache->flush, &e->list);
flush_list_append (cache, e);
}
if (flux_respond_raw (h, msg, hash, hash_size) < 0)
flux_log_error (h, "content store: flux_respond_raw");
Expand Down
1 change: 1 addition & 0 deletions t/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ dist_check_SCRIPTS = \
issues/t4379-dirty-cache-entries-flush.sh \
issues/t4413-empty-eventlog.sh \
issues/t4465-job-list-use-after-free.sh \
issues/t4482-flush-list-corruption.sh \
python/__init__.py \
python/subflux.py \
python/tap \
Expand Down
40 changes: 40 additions & 0 deletions t/issues/t4482-flush-list-corruption.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/sh -e

# How this test works
#
# add some unique data to the content cache, we do several stores to
# build up a decent length internal list of flushable cache entries.
#
# write some of the same data again, if error present internal flush
# list will be messed up and length of flush list < number of dirty
# entries (acct_dirty).
#
# before fix, flux content flush will hang b/c number of dirty entries
# (acct_dirty) never reaches zero.

cat <<-EOF >t4482.sh
#!/bin/sh -e
echo "abcde" | flux content store
echo "fghij" | flux content store
echo "klmno" | flux content store
echo "pqrst" | flux content store
echo "tuvwx" | flux content store
echo "fghij" | flux content store
echo "klmno" | flux content store
flux module load content-sqlite
flux content flush
flux module remove content-sqlite
EOF

chmod +x t4482.sh

flux start -s 1 \
-o,--setattr=broker.rc1_path= \
-o,--setattr=broker.rc3_path= \
./t4482.sh

0 comments on commit 56a8c5e

Please sign in to comment.