Skip to content

Commit

Permalink
(A) Address issue #2 and issue #4 of the group test failures.
Browse files Browse the repository at this point in the history
    See Kent's documentation "Designed to Fail Tests and Issues".
    (a) Fix for issue #2:
    --Print out meaningful message about max_lag when there is
    checksum error in loading an entry via H5C__load_entry().
    --H5C.c: H5C_protect()
    (b) Fix for issue #4:
    --Allocate more space when the copy of the index read from the metadata file is
      bigger than the existing size
    --H5Fvfd_swmr.c: H5F_vfd_swmr_reader_end_of_tick()
(B) When putting the old index into the delayed free list, use the old
    writer_index_offset instead of the current writer_index_offset
    --H5Fvfd_swmr.c: vfd_swmr_enlarge_shadow_index()
(C) When there is error form calling H5F_vfd_swmr_process_eot_queue() in
    VFD_SWMR_ENTER(err) and VFD_SWMR_LEAVE(err), should report
    FAIL instead of "err"
    --H5private.h: VFD_SWMR_ENTER and VFD_SWMR_LEAVE macros
(D) Add tests to verify issue #2 and issue #4 are fixed.
  • Loading branch information
vchoi committed Jan 4, 2022
1 parent cdc93ea commit b12565b
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 10 deletions.
15 changes: 14 additions & 1 deletion src/H5C.c
Original file line number Diff line number Diff line change
Expand Up @@ -2801,8 +2801,21 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign
#ifdef H5_HAVE_PARALLEL
coll_access,
#endif /* H5_HAVE_PARALLEL */
type, addr, udata)))
type, addr, udata))) {
/* Print out meaningful message for VFD SWMR reader */
if(f->shared->vfd_swmr && !f->shared->vfd_swmr_writer) {
uint64_t tmp_tick_num = 0;

if (H5FD_vfd_swmr_get_tick_and_idx(f->shared->lf, TRUE, &tmp_tick_num, NULL, NULL) < 0)
HGOTO_ERROR(H5E_ARGS, H5E_CANTGET, NULL, "error in retrieving tick_num from driver");

if (tmp_tick_num >= f->shared->tick_num + f->shared->vfd_swmr_config.max_lag)
HDONE_ERROR(H5E_FILE, H5E_SYSTEM, NULL,
"Reader's API time exceeds max_lag ticks, suggest to increase the value of max_lag.");
}

HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't load entry")
}

entry_ptr = (H5C_cache_entry_t *)thing;
cache_ptr->entries_loaded_counter++;
Expand Down
41 changes: 38 additions & 3 deletions src/H5Fvfd_swmr.c
Original file line number Diff line number Diff line change
Expand Up @@ -1136,6 +1136,7 @@ H5F_vfd_swmr_reader_end_of_tick(H5F_t *f, hbool_t entering_api)
uint32_t tmp_mdf_idx_len;
uint32_t tmp_mdf_idx_entries_used;
uint32_t mdf_idx_entries_used;
uint32_t vfd_entries;
H5F_shared_t * shared = f->shared;
struct {
uint64_t pgno;
Expand All @@ -1157,8 +1158,12 @@ H5F_vfd_swmr_reader_end_of_tick(H5F_t *f, hbool_t entering_api)
*
* If the tick reported has not increased since the last
* call, do nothing and exit.
*
* Also retrieve the current number of entries in the index
* so as to detect the need to allocate more space for the
* index.
*/
if (H5FD_vfd_swmr_get_tick_and_idx(file, TRUE, &tmp_tick_num, NULL, NULL) < 0)
if (H5FD_vfd_swmr_get_tick_and_idx(file, TRUE, &tmp_tick_num, &vfd_entries, NULL) < 0)

HGOTO_ERROR(H5E_ARGS, H5E_CANTGET, FAIL, "error in retrieving tick_num from driver")

Expand All @@ -1172,7 +1177,7 @@ H5F_vfd_swmr_reader_end_of_tick(H5F_t *f, hbool_t entering_api)
* */
if (!entering_api && tmp_tick_num >= shared->tick_num + shared->vfd_swmr_config.max_lag) {
HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL,
"Reader's API time exceeds max_lag of ticks, may increase the value of max_lag.");
"Reader's API time exceeds max_lag ticks, suggest to increase the value of max_lag.");
}
#if 0 /* Kent */
/* The original code */
Expand Down Expand Up @@ -1207,6 +1212,33 @@ H5F_vfd_swmr_reader_end_of_tick(H5F_t *f, hbool_t entering_api)
if (shared->mdf_idx == NULL && H5F__vfd_swmr_create_index(shared) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL, "unable to allocate metadata file index");

/* Check if more space is needed for the index */
if (shared->mdf_idx_len < vfd_entries) {
uint32_t inc_mdf_idx_len;
H5FD_vfd_swmr_idx_entry_t *new_idx;
H5FD_vfd_swmr_idx_entry_t *old_idx;

HDassert(shared->old_mdf_idx_len <= shared->mdf_idx_len);

/* Determine the size needed for the index */
inc_mdf_idx_len = shared->mdf_idx_len * 2;
while (vfd_entries > inc_mdf_idx_len)
inc_mdf_idx_len *= 2;

/* Allocate more space for shared->mdf_idx and shared->old_mdf_idx */
new_idx = H5MM_realloc(shared->mdf_idx, inc_mdf_idx_len * sizeof(shared->mdf_idx[0]));
if(new_idx == NULL)
HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL, "new index null after H5MM_realloc()")
shared->mdf_idx = new_idx;
shared->mdf_idx_len = inc_mdf_idx_len;

old_idx = H5MM_realloc(shared->old_mdf_idx, inc_mdf_idx_len * sizeof(shared->old_mdf_idx[0]));
if(old_idx == NULL)
HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL, "old index null after H5MM_realloc()")
shared->old_mdf_idx = old_idx;
shared->old_mdf_idx_len = inc_mdf_idx_len;
}

mdf_idx_entries_used = shared->mdf_idx_len;

#if 0 /* JRM */
Expand Down Expand Up @@ -1887,6 +1919,7 @@ vfd_swmr_enlarge_shadow_index(H5F_t *f)
H5F_shared_t * shared = f->shared;
H5FD_vfd_swmr_idx_entry_t *ret_value = NULL;
haddr_t idx_addr;
haddr_t old_writer_index_offset;
hsize_t idx_size;
H5FD_vfd_swmr_idx_entry_t *new_mdf_idx = NULL, *old_mdf_idx;
uint32_t new_mdf_idx_len, old_mdf_idx_len;
Expand Down Expand Up @@ -1924,6 +1957,7 @@ vfd_swmr_enlarge_shadow_index(H5F_t *f)
*/
H5MM_memcpy(new_mdf_idx, old_mdf_idx, sizeof(new_mdf_idx[0]) * old_mdf_idx_len);

old_writer_index_offset = shared->writer_index_offset;
shared->writer_index_offset = idx_addr;
ret_value = shared->mdf_idx = new_mdf_idx;
shared->mdf_idx_len = new_mdf_idx_len;
Expand All @@ -1939,7 +1973,8 @@ vfd_swmr_enlarge_shadow_index(H5F_t *f)
* past what is strictly necessary, but it seems like a reasonable
* trade-off for simplicity.
*/
if (shadow_range_defer_free(shared, shared->writer_index_offset, H5FD_MD_INDEX_SIZE(old_mdf_idx_len)) ==
/* Fix: use the saved old_writer_index_offset not the current one */
if (shadow_range_defer_free(shared, old_writer_index_offset, H5FD_MD_INDEX_SIZE(old_mdf_idx_len)) ==
-1) {
HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "could not schedule index reclamation");
}
Expand Down
6 changes: 4 additions & 2 deletions src/H5private.h
Original file line number Diff line number Diff line change
Expand Up @@ -2145,7 +2145,8 @@ H5_DLL herr_t H5CX_pop(hbool_t update_dxpl_props);
else if (TAILQ_EMPTY(&eot_queue_g)) \
; /* Nothing to do. */ \
else if (H5F_vfd_swmr_process_eot_queue(true) < 0) { \
HGOTO_ERROR(H5E_FUNC, H5E_CANTSET, err, "error processing EOT queue") \
/* Report error instead of "err" */ \
HGOTO_ERROR(H5E_FUNC, H5E_CANTSET, FALSE, "error processing EOT queue") \
} \
} while (0)

Expand All @@ -2160,7 +2161,8 @@ H5_DLL herr_t H5CX_pop(hbool_t update_dxpl_props);
else if (TAILQ_EMPTY(&eot_queue_g)) \
; /* Nothing to do. */ \
else if (H5F_vfd_swmr_process_eot_queue(false) < 0) { \
HDONE_ERROR(H5E_FUNC, H5E_CANTSET, err, "error processing EOT queue") \
/* Report error instead of "err" */ \
HDONE_ERROR(H5E_FUNC, H5E_CANTSET, FALSE, "error processing EOT queue") \
} \
} while (0)

Expand Down
132 changes: 128 additions & 4 deletions test/testvfdswmr.sh.in
Original file line number Diff line number Diff line change
Expand Up @@ -149,12 +149,16 @@ if [ $rc -ne 0 ] ; then
fi

all_tests="generator expand shrink expand_shrink sparse vlstr_null vlstr_oob zoo"
all_tests="${all_tests} groups groups_attrs groups_ops few_big many_small attrdset"
all_tests="${all_tests} groups groups_attrs groups_ops few_big many_small"

# For exhaustive run, add: os_groups_attrs, os_groups_ops, os_groups_seg, dsetops, dsetchks,independ_wr
# For exhaustive run, add:
# attrdset, dsetops, dsetchks,
# os_groups_attrs, os_groups_ops, os_groups_seg, independ_wr,
# gfail_entry_length, gfail_checksum, gfail_page_size, gfail_index_space
if [[ "$HDF5TestExpress" -eq 0 ]] ; then # exhaustive run
all_tests="${all_tests} os_groups_attrs os_groups_ops os_groups_seg dsetops dsetchks independ_wr"
all_tests="${all_tests} gfail_entry_length gfail_page_size"
all_tests="${all_tests} attrdset dsetops dsetchks"
all_tests="${all_tests} os_groups_attrs os_groups_ops os_groups_seg independ_wr"
all_tests="${all_tests} gfail_entry_length gfail_checksum gfail_page_size gfail_index_space"
fi

tests=${all_tests}
Expand Down Expand Up @@ -1451,6 +1455,70 @@ if [ ${do_gfail_entry_length:-no} = yes ]; then
rm -f vfd_swmr_gfail_entry_length_reader.*.{out,rc}
fi

###############################################################################
#
# "gfail_checksum" test
#
# Only for exhaustive run
#
# Verify that meaningful message about increasing max_lag is printed when
# incorrect metadata checksum error is enountered when loading the cache entry.
#
# Note that there will be messages from the error stack printed out for this test.
# This is expected.
#
# (See issue #2 in Kent's documentation "Designed to Fail Tests and Issues".
#
###############################################################################
#
#
if [ ${do_gfail_checksum:-no} = yes ]; then
#
# Clean up any existing fifo files from previous runs
if [ -e ./$GFAIL_FIFO_WRITER_TO_READER ]; then # If writer fifo file is found
rm -f ./$GFAIL_FIFO_WRITER_TO_READER
fi
if [ -e ./$GFAIL_FIFO_READER_TO_WRITER ]; then # If reader fifo file is found
rm -f ./$GFAIL_FIFO_READER_TO_WRITER
fi
#
echo launch vfd_swmr_gfail_writer -q -n 420000 ......may take some time......
catch_out_err_and_rc vfd_swmr_gfail_checksum_writer \
../vfd_swmr_gfail_writer -q -m 50 -t 10 -n 4000000 &
pid_writer=$!

echo launch vfd_swmr_gfail_reader -n 420000 ......may take some time......
catch_out_err_and_rc vfd_swmr_gfail_checksum_reader \
../vfd_swmr_gfail_reader -q -m 50 -t 10 -n 4000000 &
pid_reader=$!

# Wait for the reader to finish before signaling the
# writer to quit: the writer holds the file open so that the
# reader will find the shadow file when it opens
# the .h5 file.
wait $pid_reader
wait $pid_writer

# Collect exit code of the reader
if [ $(cat vfd_swmr_gfail_checksum_reader.rc) -ne 0 ]; then
grep "suggest to increase the value of max_lag" vfd_swmr_gfail_checksum_reader.out >/dev/null 2>&1
if test $? -ne 0; then
echo reader had error
nerrors=$((nerrors + 1))
fi
fi

# Collect exit code of the writer
if [ $(cat vfd_swmr_gfail_checksum_writer.rc) -ne 0 ]; then
echo writer had error
nerrors=$((nerrors + 1))
fi

# Clean up output files
rm -f vfd_swmr_gfail_checksum_writer.{out,rc}
rm -f vfd_swmr_gfail_checksum_reader.*.{out,rc}
fi

###############################################################################
#
# "gfail_page_size" test
Expand Down Expand Up @@ -1507,6 +1575,62 @@ if [ ${do_gfail_page_size:-no} = yes ]; then
rm -f vfd_swmr_gfail_page_size_reader.*.{out,rc}
fi

###############################################################################
#
# "gfail_index_space" test
#
# Only for exhaustive run
#
# Verify that the failure is fixed when there is not enough space to copy the index.
# (See issue #4 in Kent's documentation "Designed to Fail Tests and Issues".
#
###############################################################################
#
#
if [ ${do_gfail_index_space:-no} = yes ]; then
#
# Clean up any existing fifo files from previous runs
if [ -e ./$GFAIL_FIFO_WRITER_TO_READER ]; then # If writer fifo file is found
rm -f ./$GFAIL_FIFO_WRITER_TO_READER
fi
if [ -e ./$GFAIL_FIFO_READER_TO_WRITER ]; then # If reader fifo file is found
rm -f ./$GFAIL_FIFO_READER_TO_WRITER
fi
#
echo launch vfd_swmr_gfail_writer -q -m 50 -t 10 -n 1000000 ......may take some time......
catch_out_err_and_rc vfd_swmr_gfail_index_space_writer \
../vfd_swmr_gfail_writer -q -m 50 -t 10 -n 1000000 &
pid_writer=$!

echo launch vfd_swmr_gfail_reader -m 50 -t 10 -n 1000000 ......may take some time......
catch_out_err_and_rc vfd_swmr_gfail_index_space_reader \
../vfd_swmr_gfail_reader -m 50 -t 10 -n 1000000 &
pid_reader=$!

# Wait for the reader to finish before signaling the
# writer to quit: the writer holds the file open so that the
# reader will find the shadow file when it opens
# the .h5 file.
wait $pid_reader
wait $pid_writer

# Collect exit code of the reader
if [ $(cat vfd_swmr_gfail_index_space_reader.rc) -ne 0 ]; then
echo reader had error
nerrors=$((nerrors + 1))
fi

# Collect exit code of the writer
if [ $(cat vfd_swmr_gfail_index_space_writer.rc) -ne 0 ]; then
echo writer had error
nerrors=$((nerrors + 1))
fi

# Clean up output files
rm -f vfd_swmr_gfail_index_space_writer.{out,rc}
rm -f vfd_swmr_gfail_index_space_reader.*.{out,rc}
fi

###############################################################################
## Report and exit
###############################################################################
Expand Down

0 comments on commit b12565b

Please sign in to comment.