Skip to content

Commit

Permalink
Fix hangs during collective I/O with independent metadata writes
Browse files Browse the repository at this point in the history
  • Loading branch information
jhendersonHDF committed Oct 17, 2023
1 parent 1b62827 commit 86961fe
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 4 deletions.
19 changes: 19 additions & 0 deletions release_docs/RELEASE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,25 @@ Bug Fixes since HDF5-1.14.0 release
===================================
Library
-------
- Fixed potential hangs in parallel library during collective I/O with
independent metadata writes

When performing collective parallel writes to a dataset where metadata
writes are requested as (or left as the default setting of) independent,
hangs could potentially occur during metadata cache sync points. This
was due to incorrect management of the internal state tracking whether
an I/O operation should be collective or not, causing the library to
attempt collective writes of metadata when they were meant to be
independent writes. During the metadata cache sync points, if the number
of cache entries being flushed was a multiple of the number of MPI ranks
in the MPI communicator used to access the HDF5 file, each rank made an
equal number of collective MPI I/O calls and the dataset write call would be
successful. However, when the number of cache entries being flushed was
NOT a multiple of the number of MPI ranks, the ranks with more entries
than others would get stuck in an MPI_File_set_view call, while other
ranks would get stuck in a post-write MPI_Barrier call. This issue has
been fixed by correctly switching to independent I/O temporarily when
writing metadata independently during collective dataset I/O.

- Dropped support for MPI-2

Expand Down
36 changes: 33 additions & 3 deletions src/H5Cmpio.c
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,9 @@ herr_t
H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, haddr_t *candidates_list_ptr,
int mpi_rank, int mpi_size)
{
unsigned first_entry_to_flush;
unsigned last_entry_to_flush;
H5FD_mpio_xfer_t orig_xfer_mode;
unsigned first_entry_to_flush;
unsigned last_entry_to_flush;
#ifndef NDEBUG
unsigned total_entries_to_clear = 0;
unsigned total_entries_to_flush = 0;
Expand All @@ -173,7 +174,8 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha
#endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */
unsigned m, n;
unsigned u; /* Local index variable */
herr_t ret_value = SUCCEED; /* Return value */
bool restore_io_mode = false;
herr_t ret_value = SUCCEED; /* Return value */

FUNC_ENTER_NOAPI(FAIL)

Expand All @@ -185,6 +187,10 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha
assert(0 <= mpi_rank);
assert(mpi_rank < mpi_size);

/* Get I/O transfer mode */
if (H5CX_get_io_xfer_mode(&orig_xfer_mode) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode");

/* Initialize the entries_to_flush and entries_to_clear arrays */
memset(entries_to_flush, 0, sizeof(entries_to_flush));
memset(entries_to_clear, 0, sizeof(entries_to_clear));
Expand Down Expand Up @@ -354,6 +360,19 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha
num_candidates, total_entries_to_clear, total_entries_to_flush);
#endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */

/*
* If collective I/O was requested, but collective metadata
* writes were not requested, temporarily disable collective
* I/O while flushing candidate entries so that we don't cause
* a hang in the case where the number of candidate entries
* to flush isn't a multiple of mpi_size.
*/
if ((orig_xfer_mode == H5FD_MPIO_COLLECTIVE) && !f->shared->coll_md_write) {
if (H5CX_set_io_xfer_mode(H5FD_MPIO_INDEPENDENT) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O transfer mode");
restore_io_mode = true;
}

/* We have now marked all the entries on the candidate list for
* either flush or clear -- now scan the LRU and the pinned list
* for these entries and do the deed. Do this via a call to
Expand All @@ -367,6 +386,13 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha
if (H5C__flush_candidate_entries(f, entries_to_flush, entries_to_clear) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "flush candidates failed");

/* Restore collective I/O if we temporarily disabled it */
if (restore_io_mode) {
if (H5CX_set_io_xfer_mode(orig_xfer_mode) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O transfer mode");
restore_io_mode = false;
}

/* If we've deferred writing to do it collectively, take care of that now */
if (f->shared->coll_md_write) {
/* Sanity check */
Expand All @@ -378,6 +404,10 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha
} /* end if */

done:
/* Restore collective I/O if we temporarily disabled it */
if (restore_io_mode && (H5CX_set_io_xfer_mode(orig_xfer_mode) < 0))
HDONE_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O transfer mode");

if (candidate_assignment_table != NULL)
candidate_assignment_table = (unsigned *)H5MM_xfree((void *)candidate_assignment_table);
if (cache_ptr->coll_write_list) {
Expand Down
2 changes: 1 addition & 1 deletion src/H5Pfapl.c
Original file line number Diff line number Diff line change
Expand Up @@ -5174,7 +5174,7 @@ H5Pget_all_coll_metadata_ops(hid_t plist_id, hbool_t *is_collective /*out*/)
* Function: H5Pset_coll_metadata_write
*
* Purpose: Tell the library whether the metadata write operations will
* be done collectively (1) or not (0). Default is collective.
* be done collectively (1) or not (0). Default is independent.
*
* Return: Non-negative on success/Negative on failure
*
Expand Down
107 changes: 107 additions & 0 deletions testpar/t_coll_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@
#define COLL_GHEAP_WRITE_ATTR_NAME "coll_gheap_write_attr"
#define COLL_GHEAP_WRITE_ATTR_DIMS 1

/* Parameters for test_coll_io_ind_md_write() */
/* Rank of the dataset, chunk, selection and offset arrays */
#define COLL_IO_IND_MD_WRITE_NDIMS 2
/* Chunk size along dimension 0; also the number of rows each MPI rank selects */
#define COLL_IO_IND_MD_WRITE_CHUNK0 4
/* Chunk size along dimension 1 */
#define COLL_IO_IND_MD_WRITE_CHUNK1 256
/* Number of chunks along dimension 1 (dimension 1 extent = CHUNK1 * NCHUNK1) */
#define COLL_IO_IND_MD_WRITE_NCHUNK1 16384

/*
* A test for issue HDFFV-10501. A parallel hang was reported which occurred
* in linked-chunk I/O when collective metadata reads are enabled and some ranks
Expand Down Expand Up @@ -569,3 +574,105 @@ test_collective_global_heap_write(void)
VRFY((H5Pclose(fapl_id) >= 0), "H5Pclose succeeded");
VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded");
}

/*
 * A test to ensure that hangs don't occur when collective I/O
 * is requested at the interface level (by a call to
 * H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE)), while
 * collective metadata writes are NOT requested.
 *
 * The file is created with both collective metadata reads and
 * collective metadata writes explicitly disabled. Two chunked
 * datasets with the shuffle filter enabled are then created and
 * written collectively; each MPI rank writes its own band of
 * COLL_IO_IND_MD_WRITE_CHUNK0 rows spanning the full extent of
 * dimension 1.
 *
 * NOTE(review): the large number of chunks per rank is presumably
 * what forces metadata cache sync points during the writes --
 * confirm against the original issue report.
 */
void
test_coll_io_ind_md_write(void)
{
    const char *filename;
    long long  *data = NULL;
    hsize_t     dset_dims[COLL_IO_IND_MD_WRITE_NDIMS];
    hsize_t     chunk_dims[COLL_IO_IND_MD_WRITE_NDIMS];
    hsize_t     sel_dims[COLL_IO_IND_MD_WRITE_NDIMS];
    hsize_t     offset[COLL_IO_IND_MD_WRITE_NDIMS];
    size_t      nelmts;
    hid_t       file_id   = H5I_INVALID_HID;
    hid_t       fapl_id   = H5I_INVALID_HID;
    hid_t       dset_id   = H5I_INVALID_HID;
    hid_t       dset_id2  = H5I_INVALID_HID;
    hid_t       dcpl_id   = H5I_INVALID_HID;
    hid_t       dxpl_id   = H5I_INVALID_HID;
    hid_t       fspace_id = H5I_INVALID_HID;
    int         mpi_rank, mpi_size;

    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);

    filename = GetTestParameters();

    fapl_id = create_faccess_plist(MPI_COMM_WORLD, MPI_INFO_NULL, facc_type);
    VRFY((fapl_id >= 0), "create_faccess_plist succeeded");

    /* Explicitly request independent metadata reads and writes */
    VRFY((H5Pset_all_coll_metadata_ops(fapl_id, false) >= 0), "Unset collective metadata reads succeeded");
    VRFY((H5Pset_coll_metadata_write(fapl_id, false) >= 0), "Unset collective metadata writes succeeded");

    file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
    VRFY((file_id >= 0), "H5Fcreate succeeded");

    /* One band of COLL_IO_IND_MD_WRITE_CHUNK0 rows per MPI rank */
    dset_dims[0] = (hsize_t)(mpi_size * COLL_IO_IND_MD_WRITE_CHUNK0);
    dset_dims[1] = (hsize_t)(COLL_IO_IND_MD_WRITE_CHUNK1 * COLL_IO_IND_MD_WRITE_NCHUNK1);

    fspace_id = H5Screate_simple(COLL_IO_IND_MD_WRITE_NDIMS, dset_dims, NULL);
    VRFY((fspace_id >= 0), "H5Screate_simple succeeded");

    dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
    VRFY((dcpl_id >= 0), "H5Pcreate succeeded");

    chunk_dims[0] = (hsize_t)(COLL_IO_IND_MD_WRITE_CHUNK0);
    chunk_dims[1] = (hsize_t)(COLL_IO_IND_MD_WRITE_CHUNK1);

    VRFY((H5Pset_chunk(dcpl_id, COLL_IO_IND_MD_WRITE_NDIMS, chunk_dims) >= 0), "H5Pset_chunk succeeded");

    VRFY((H5Pset_shuffle(dcpl_id) >= 0), "H5Pset_shuffle succeeded");

    dset_id = H5Dcreate2(file_id, "dset1", H5T_NATIVE_LLONG, fspace_id, H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
    VRFY((dset_id >= 0), "H5Dcreate2 succeeded");

    /* Each rank selects its own band of rows across all of dimension 1 */
    sel_dims[0] = (hsize_t)(COLL_IO_IND_MD_WRITE_CHUNK0);
    sel_dims[1] = (hsize_t)(COLL_IO_IND_MD_WRITE_CHUNK1 * COLL_IO_IND_MD_WRITE_NCHUNK1);

    offset[0] = (hsize_t)mpi_rank * sel_dims[0];
    offset[1] = 0;

    VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, offset, NULL, sel_dims, NULL) >= 0),
         "H5Sselect_hyperslab succeeded");

    dxpl_id = H5Pcreate(H5P_DATASET_XFER);
    VRFY((dxpl_id >= 0), "H5Pcreate succeeded");

    VRFY((H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE) >= 0), "H5Pset_dxpl_mpio succeeded");

    /*
     * The write buffer holds this rank's whole selection. Verify the
     * allocation before filling it so that a failed malloc is reported
     * as a test failure rather than crashing on a NULL dereference.
     */
    nelmts = (size_t)(sel_dims[0] * sel_dims[1]);
    data   = malloc(nelmts * sizeof(*data));
    VRFY((data != NULL), "malloc succeeded");

    for (size_t i = 0; i < nelmts; i++)
        data[i] = rand();

    VRFY((H5Dwrite(dset_id, H5T_NATIVE_LLONG, H5S_BLOCK, fspace_id, dxpl_id, data) >= 0),
         "H5Dwrite succeeded");

    /* Repeat the collective write on a second, identically laid-out dataset */
    dset_id2 = H5Dcreate2(file_id, "dset2", H5T_NATIVE_LLONG, fspace_id, H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
    VRFY((dset_id2 >= 0), "H5Dcreate2 succeeded");

    for (size_t i = 0; i < nelmts; i++)
        data[i] = rand();

    VRFY((H5Dwrite(dset_id2, H5T_NATIVE_LLONG, H5S_BLOCK, fspace_id, dxpl_id, data) >= 0),
         "H5Dwrite succeeded");

    free(data);

    VRFY((H5Sclose(fspace_id) >= 0), "H5Sclose succeeded");
    VRFY((H5Dclose(dset_id) >= 0), "H5Dclose succeeded");
    VRFY((H5Dclose(dset_id2) >= 0), "H5Dclose succeeded");
    VRFY((H5Pclose(dcpl_id) >= 0), "H5Pclose succeeded");
    VRFY((H5Pclose(dxpl_id) >= 0), "H5Pclose succeeded");
    VRFY((H5Pclose(fapl_id) >= 0), "H5Pclose succeeded");
    VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded");
}
2 changes: 2 additions & 0 deletions testpar/testphdf5.c
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,8 @@ main(int argc, char **argv)
"Collective MD read with link chunk I/O (H5D__sort_chunk)", PARATESTFILE);
AddTest("GH_coll_MD_wr", test_collective_global_heap_write, NULL,
"Collective MD write of global heap data", PARATESTFILE);
AddTest("COLLIO_INDMDWR", test_coll_io_ind_md_write, NULL,
"Collective I/O with Independent metadata writes", PARATESTFILE);

/* Display testing information */
TestInfo(argv[0]);
Expand Down
1 change: 1 addition & 0 deletions testpar/testphdf5.h
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ void test_partial_no_selection_coll_md_read(void);
void test_multi_chunk_io_addrmap_issue(void);
void test_link_chunk_io_sort_chunk_issue(void);
void test_collective_global_heap_write(void);
void test_coll_io_ind_md_write(void);
void test_oflush(void);

/* commonly used prototypes */
Expand Down

0 comments on commit 86961fe

Please sign in to comment.