From 59bac69939119575196ea3e053631d45a78064af Mon Sep 17 00:00:00 2001 From: jhendersonHDF Date: Thu, 31 Aug 2023 23:02:19 -0500 Subject: [PATCH] [1.12 Merge] Fix assertion failure during file close on error (#3461) --- release_docs/RELEASE.txt | 11 +++++++ src/H5Cimage.c | 5 +++ src/H5Fint.c | 10 +++--- src/H5Fsuper.c | 20 ++++++++++-- src/H5MF.c | 68 +++++++++++++++++++++------------------- testpar/t_file.c | 51 ++++++++++++++++++++++++++++++ testpar/testphdf5.c | 3 ++ testpar/testphdf5.h | 1 + 8 files changed, 128 insertions(+), 41 deletions(-) diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index d6f3a570079..358a9e00abb 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -241,6 +241,17 @@ Bug Fixes since HDF5-1.12.2 release =================================== Library ------- + - Fixed an assertion failure in Parallel HDF5 when a file can't be created + due to an invalid library version bounds setting + + An assertion failure could occur in H5MF_settle_raw_data_fsm when a file + can't be created with Parallel HDF5 due to specifying the use of a paged, + persistent file free space manager + (H5Pset_file_space_strategy(..., H5F_FSPACE_STRATEGY_PAGE, 1, ...)) with + an invalid library version bounds combination + (H5Pset_libver_bounds(..., H5F_LIBVER_EARLIEST, H5F_LIBVER_V18)). This + has now been fixed. + - Fixed a bug in H5Ocopy that could generate invalid HDF5 files H5Ocopy was missing a check to determine whether the new object's diff --git a/src/H5Cimage.c b/src/H5Cimage.c index a25f144c7be..446dc8801fe 100644 --- a/src/H5Cimage.c +++ b/src/H5Cimage.c @@ -1113,6 +1113,11 @@ H5C__load_cache_image(H5F_t *f) } /* end if */ done: + if (ret_value < 0) { + if (H5F_addr_defined(cache_ptr->image_addr)) + cache_ptr->image_buffer = H5MM_xfree(cache_ptr->image_buffer); + } + FUNC_LEAVE_NOAPI(ret_value) } /* H5C__load_cache_image() */ diff --git a/src/H5Fint.c b/src/H5Fint.c index 372537678d7..e897d4f6509 100644 --- a/src/H5Fint.c +++ b/src/H5Fint.c @@ -80,7 +80,7 @@ static herr_t H5F__build_name(const char *prefix, const char *file_name, char ** static char *H5F__getenv_prefix_name(char **env_prefix /*in,out*/); static H5F_t *H5F__new(H5F_shared_t *shared, unsigned flags, hid_t fcpl_id, hid_t fapl_id, H5FD_t *lf); static herr_t H5F__check_if_using_file_locks(H5P_genplist_t *fapl, hbool_t *use_file_locking); -static herr_t H5F__dest(H5F_t *f, hbool_t flush); +static herr_t H5F__dest(H5F_t *f, hbool_t flush, hbool_t free_on_failure); static herr_t H5F__build_actual_name(const H5F_t *f, const H5P_genplist_t *fapl, const char *name, char ** /*out*/ actual_name); static herr_t H5F__flush_phase1(H5F_t *f); @@ -1381,7 +1381,7 @@ H5F__new(H5F_shared_t *shared, unsigned flags, hid_t fcpl_id, hid_t fapl_id, H5F *------------------------------------------------------------------------- */ static herr_t -H5F__dest(H5F_t *f, hbool_t flush) +H5F__dest(H5F_t *f, hbool_t flush, hbool_t free_on_failure) { herr_t ret_value = SUCCEED; /* Return value */ @@ -1648,7 +1648,7 @@ H5F__dest(H5F_t *f, hbool_t flush) HDONE_ERROR(H5E_FILE, H5E_CANTINIT, FAIL, "problems closing file") f->shared = NULL; - if (ret_value >= 0) + if ((ret_value >= 0) || free_on_failure) f = H5FL_FREE(H5F_t, f); FUNC_LEAVE_NOAPI(ret_value) @@ -2145,7 +2145,7 @@ H5F_open(const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id) done: if ((NULL == ret_value) && file) - if (H5F__dest(file, FALSE) < 0) + if (H5F__dest(file, FALSE, TRUE) < 0) HDONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, NULL, "problems closing file") FUNC_LEAVE_NOAPI(ret_value) @@ -2555,7 +2555,7 @@ H5F_try_close(H5F_t *f, hbool_t *was_closed /*out*/) * shared H5F_shared_t struct. If the reference count for the H5F_shared_t * struct reaches zero then destroy it also. */ - if (H5F__dest(f, TRUE) < 0) + if (H5F__dest(f, TRUE, FALSE) < 0) HGOTO_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "problems closing file") /* Since we closed the file, this should be set to TRUE */ diff --git a/src/H5Fsuper.c b/src/H5Fsuper.c index b56c482bdd4..020c378d862 100644 --- a/src/H5Fsuper.c +++ b/src/H5Fsuper.c @@ -1087,8 +1087,9 @@ H5F__super_init(H5F_t *f) FALSE; /* Whether the driver info block has been inserted into the metadata cache */ H5P_genplist_t *plist; /* File creation property list */ H5AC_ring_t orig_ring = H5AC_RING_INV; - hsize_t userblock_size; /* Size of userblock, in bytes */ - hsize_t superblock_size; /* Size of superblock, in bytes */ + hsize_t userblock_size; /* Size of userblock, in bytes */ + hsize_t superblock_size = 0; /* Size of superblock, in bytes */ + haddr_t superblock_addr = HADDR_UNDEF; size_t driver_size; /* Size of driver info block (bytes) */ unsigned super_vers = HDF5_SUPERBLOCK_VERSION_DEF; /* Superblock version for file */ H5O_loc_t ext_loc; /* Superblock extension object location */ @@ -1288,7 +1289,7 @@ H5F__super_init(H5F_t *f) f->shared->sblock = sblock; /* Allocate space for the superblock */ - if (HADDR_UNDEF == H5MF_alloc(f, H5FD_MEM_SUPER, superblock_size)) + if (HADDR_UNDEF == (superblock_addr = H5MF_alloc(f, H5FD_MEM_SUPER, superblock_size))) HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "file allocation failed for superblock") /* set the drvinfo filed to NULL -- will overwrite this later if needed */ @@ -1479,6 +1480,19 @@ H5F__super_init(H5F_t *f) /* Check if the superblock has been allocated yet */ if (sblock) { + if (non_default_fs_settings && H5F_addr_defined(superblock_addr)) { + /* + * For non-default free-space settings, the allocation of + * space in the file for the superblock may have have allocated + * memory for the free-space manager and inserted it into the + * metadata cache. Clean that up before returning or we may fail + * to close the file later due to the metadata cache's metadata + * free space manager ring (H5AC_RING_MDFSM) not being clean. + */ + if (H5MF_try_close(f) < 0) + HDONE_ERROR(H5E_FILE, H5E_CANTFREE, FAIL, "can't close file free space manager"); + } + /* Check if we've cached it already */ if (sblock_in_cache) { /* Unpin superblock in cache */ diff --git a/src/H5MF.c b/src/H5MF.c index b75afaebf86..bd9f74d593f 100644 --- a/src/H5MF.c +++ b/src/H5MF.c @@ -2654,16 +2654,14 @@ H5MF_settle_raw_data_fsm(H5F_t *f, hbool_t *fsm_settled) hbool_t fsm_opened[H5F_MEM_PAGE_NTYPES]; /* State of FSM */ hbool_t fsm_visited[H5F_MEM_PAGE_NTYPES]; /* State of FSM */ - /* Sanity check */ - HDassert(f->shared->sblock); - /* should only be called if file is opened R/W */ HDassert(H5F_INTENT(f) & H5F_ACC_RDWR); /* shouldn't be called unless we have a superblock supporting the * superblock extension. */ - HDassert(f->shared->sblock->super_vers >= HDF5_SUPERBLOCK_VERSION_2); + if (f->shared->sblock) + HDassert(f->shared->sblock->super_vers >= HDF5_SUPERBLOCK_VERSION_2); /* Initialize fsm_opened and fsm_visited */ HDmemset(fsm_opened, 0, sizeof(fsm_opened)); @@ -2811,40 +2809,44 @@ H5MF_settle_raw_data_fsm(H5F_t *f, hbool_t *fsm_settled) * file space manager info message is guaranteed to exist. * Leave it in for now, but consider removing it. */ - if (H5F_addr_defined(f->shared->sblock->ext_addr)) - if (H5F__super_ext_remove_msg(f, H5O_FSINFO_ID) < 0) - HGOTO_ERROR(H5E_RESOURCE, H5E_CANTRELEASE, FAIL, - "error in removing message from superblock extension") + if (f->shared->sblock) { + if (H5F_addr_defined(f->shared->sblock->ext_addr)) + if (H5F__super_ext_remove_msg(f, H5O_FSINFO_ID) < 0) + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTRELEASE, FAIL, + "error in removing message from superblock extension") + } /* As the final element in 1), shrink the EOA for the file */ if (H5MF__close_shrink_eoa(f) < 0) HGOTO_ERROR(H5E_RESOURCE, H5E_CANTSHRINK, FAIL, "can't shrink eoa") - /* 2) Ensure that space is allocated for the free space manager superblock - * extension message. Must do this now, before reallocating file space - * for free space managers, as it is possible that this allocation may - * grab the last section in a FSM -- making it unnecessary to - * re-allocate file space for it. - * - * Do this by writing a free space manager superblock extension message. - * - * Since no free space manager has file space allocated for it, this - * message must be invalid since we can't save addresses of FSMs when - * those addresses are unknown. This is OK -- we will write the correct - * values to the message at free space manager shutdown. - */ - for (fsm_type = H5F_MEM_PAGE_SUPER; fsm_type < H5F_MEM_PAGE_NTYPES; fsm_type++) - fsinfo.fs_addr[fsm_type - 1] = HADDR_UNDEF; - fsinfo.strategy = f->shared->fs_strategy; - fsinfo.persist = f->shared->fs_persist; - fsinfo.threshold = f->shared->fs_threshold; - fsinfo.page_size = f->shared->fs_page_size; - fsinfo.pgend_meta_thres = f->shared->pgend_meta_thres; - fsinfo.eoa_pre_fsm_fsalloc = HADDR_UNDEF; - - if (H5F__super_ext_write_msg(f, H5O_FSINFO_ID, &fsinfo, TRUE, H5O_MSG_FLAG_MARK_IF_UNKNOWN) < 0) - HGOTO_ERROR(H5E_RESOURCE, H5E_WRITEERROR, FAIL, - "error in writing fsinfo message to superblock extension") + if (f->shared->sblock) { + /* 2) Ensure that space is allocated for the free space manager superblock + * extension message. Must do this now, before reallocating file space + * for free space managers, as it is possible that this allocation may + * grab the last section in a FSM -- making it unnecessary to + * re-allocate file space for it. + * + * Do this by writing a free space manager superblock extension message. + * + * Since no free space manager has file space allocated for it, this + * message must be invalid since we can't save addresses of FSMs when + * those addresses are unknown. This is OK -- we will write the correct + * values to the message at free space manager shutdown. + */ + for (fsm_type = H5F_MEM_PAGE_SUPER; fsm_type < H5F_MEM_PAGE_NTYPES; fsm_type++) + fsinfo.fs_addr[fsm_type - 1] = HADDR_UNDEF; + fsinfo.strategy = f->shared->fs_strategy; + fsinfo.persist = f->shared->fs_persist; + fsinfo.threshold = f->shared->fs_threshold; + fsinfo.page_size = f->shared->fs_page_size; + fsinfo.pgend_meta_thres = f->shared->pgend_meta_thres; + fsinfo.eoa_pre_fsm_fsalloc = HADDR_UNDEF; + + if (H5F__super_ext_write_msg(f, H5O_FSINFO_ID, &fsinfo, TRUE, H5O_MSG_FLAG_MARK_IF_UNKNOWN) < 0) + HGOTO_ERROR(H5E_RESOURCE, H5E_WRITEERROR, FAIL, + "error in writing fsinfo message to superblock extension") + } /* 3) Scan all free space managers not involved in allocating * space for free space managers. For each such free space diff --git a/testpar/t_file.c b/testpar/t_file.c index d8189b99d41..4a3665f5573 100644 --- a/testpar/t_file.c +++ b/testpar/t_file.c @@ -947,3 +947,54 @@ test_file_properties(void) VRFY((mpi_ret >= 0), "MPI_Info_free succeeded"); } /* end test_file_properties() */ + +/* + * Tests for an assertion failure during file close that used + * to occur when the library fails to create a file in parallel + * due to an invalid library version bounds setting + */ +void +test_invalid_libver_bounds_file_close_assert(void) +{ + const char *filename = NULL; + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Info info = MPI_INFO_NULL; + herr_t ret; + hid_t fid = H5I_INVALID_HID; + hid_t fapl_id = H5I_INVALID_HID; + hid_t fcpl_id = H5I_INVALID_HID; + + filename = (const char *)GetTestParameters(); + + /* set up MPI parameters */ + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + /* setup file access plist */ + fapl_id = H5Pcreate(H5P_FILE_ACCESS); + VRFY((fapl_id != H5I_INVALID_HID), "H5Pcreate"); + ret = H5Pset_fapl_mpio(fapl_id, comm, info); + VRFY((SUCCEED == ret), "H5Pset_fapl_mpio"); + ret = H5Pset_libver_bounds(fapl_id, H5F_LIBVER_EARLIEST, H5F_LIBVER_V18); + VRFY((SUCCEED == ret), "H5Pset_libver_bounds"); + + /* setup file creation plist */ + fcpl_id = H5Pcreate(H5P_FILE_CREATE); + VRFY((fcpl_id != H5I_INVALID_HID), "H5Pcreate"); + + ret = H5Pset_file_space_strategy(fcpl_id, H5F_FSPACE_STRATEGY_PAGE, TRUE, 1); + VRFY((SUCCEED == ret), "H5Pset_file_space_strategy"); + + /* create the file */ + H5E_BEGIN_TRY + { + fid = H5Fcreate(filename, H5F_ACC_TRUNC, fcpl_id, fapl_id); + } + H5E_END_TRY + VRFY((fid == H5I_INVALID_HID), "H5Fcreate"); + + ret = H5Pclose(fapl_id); + VRFY((SUCCEED == ret), "H5Pclose"); + ret = H5Pclose(fcpl_id); + VRFY((SUCCEED == ret), "H5Pclose"); +} diff --git a/testpar/testphdf5.c b/testpar/testphdf5.c index 780a22e18bc..678db05d4f3 100644 --- a/testpar/testphdf5.c +++ b/testpar/testphdf5.c @@ -357,6 +357,9 @@ main(int argc, char **argv) AddTest("props", test_file_properties, NULL, "Coll Metadata file property settings", PARATESTFILE); + AddTest("invlibverassert", test_invalid_libver_bounds_file_close_assert, NULL, + "Invalid libver bounds assertion failure", PARATESTFILE); + AddTest("idsetw", dataset_writeInd, NULL, "dataset independent write", PARATESTFILE); AddTest("idsetr", dataset_readInd, NULL, "dataset independent read", PARATESTFILE); diff --git a/testpar/testphdf5.h b/testpar/testphdf5.h index 9cdd7168d31..d4f0162ff4c 100644 --- a/testpar/testphdf5.h +++ b/testpar/testphdf5.h @@ -231,6 +231,7 @@ void test_plist_ed(void); void external_links(void); void zero_dim_dset(void); void test_file_properties(void); +void test_invalid_libver_bounds_file_close_assert(void); void multiple_dset_write(void); void multiple_group_write(void); void multiple_group_read(void);