Skip to content

Commit

Permalink
[1.12 Merge] Fix assertion failure during file close on error (#3461)
Browse files Browse the repository at this point in the history
  • Loading branch information
jhendersonHDF authored Sep 1, 2023
1 parent f9511f4 commit 59bac69
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 41 deletions.
11 changes: 11 additions & 0 deletions release_docs/RELEASE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,17 @@ Bug Fixes since HDF5-1.12.2 release
===================================
Library
-------
- Fixed an assertion failure in Parallel HDF5 when a file can't be created
due to an invalid library version bounds setting

An assertion failure could occur in H5MF_settle_raw_data_fsm when a file
can't be created with Parallel HDF5 due to specifying the use of a paged,
persistent file free space manager
(H5Pset_file_space_strategy(..., H5F_FSPACE_STRATEGY_PAGE, 1, ...)) with
an invalid library version bounds combination
(H5Pset_libver_bounds(..., H5F_LIBVER_EARLIEST, H5F_LIBVER_V18)). This
has now been fixed.

- Fixed a bug in H5Ocopy that could generate invalid HDF5 files

H5Ocopy was missing a check to determine whether the new object's
Expand Down
5 changes: 5 additions & 0 deletions src/H5Cimage.c
Original file line number Diff line number Diff line change
Expand Up @@ -1113,6 +1113,11 @@ H5C__load_cache_image(H5F_t *f)
} /* end if */

done:
if (ret_value < 0) {
if (H5F_addr_defined(cache_ptr->image_addr))
cache_ptr->image_buffer = H5MM_xfree(cache_ptr->image_buffer);
}

FUNC_LEAVE_NOAPI(ret_value)
} /* H5C__load_cache_image() */

Expand Down
10 changes: 5 additions & 5 deletions src/H5Fint.c
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ static herr_t H5F__build_name(const char *prefix, const char *file_name, char **
static char *H5F__getenv_prefix_name(char **env_prefix /*in,out*/);
static H5F_t *H5F__new(H5F_shared_t *shared, unsigned flags, hid_t fcpl_id, hid_t fapl_id, H5FD_t *lf);
static herr_t H5F__check_if_using_file_locks(H5P_genplist_t *fapl, hbool_t *use_file_locking);
static herr_t H5F__dest(H5F_t *f, hbool_t flush);
static herr_t H5F__dest(H5F_t *f, hbool_t flush, hbool_t free_on_failure);
static herr_t H5F__build_actual_name(const H5F_t *f, const H5P_genplist_t *fapl, const char *name,
char ** /*out*/ actual_name);
static herr_t H5F__flush_phase1(H5F_t *f);
Expand Down Expand Up @@ -1381,7 +1381,7 @@ H5F__new(H5F_shared_t *shared, unsigned flags, hid_t fcpl_id, hid_t fapl_id, H5F
*-------------------------------------------------------------------------
*/
static herr_t
H5F__dest(H5F_t *f, hbool_t flush)
H5F__dest(H5F_t *f, hbool_t flush, hbool_t free_on_failure)
{
herr_t ret_value = SUCCEED; /* Return value */

Expand Down Expand Up @@ -1648,7 +1648,7 @@ H5F__dest(H5F_t *f, hbool_t flush)
HDONE_ERROR(H5E_FILE, H5E_CANTINIT, FAIL, "problems closing file")
f->shared = NULL;

if (ret_value >= 0)
if ((ret_value >= 0) || free_on_failure)
f = H5FL_FREE(H5F_t, f);

FUNC_LEAVE_NOAPI(ret_value)
Expand Down Expand Up @@ -2145,7 +2145,7 @@ H5F_open(const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id)

done:
if ((NULL == ret_value) && file)
if (H5F__dest(file, FALSE) < 0)
if (H5F__dest(file, FALSE, TRUE) < 0)
HDONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, NULL, "problems closing file")

FUNC_LEAVE_NOAPI(ret_value)
Expand Down Expand Up @@ -2555,7 +2555,7 @@ H5F_try_close(H5F_t *f, hbool_t *was_closed /*out*/)
* shared H5F_shared_t struct. If the reference count for the H5F_shared_t
* struct reaches zero then destroy it also.
*/
if (H5F__dest(f, TRUE) < 0)
if (H5F__dest(f, TRUE, FALSE) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "problems closing file")

/* Since we closed the file, this should be set to TRUE */
Expand Down
20 changes: 17 additions & 3 deletions src/H5Fsuper.c
Original file line number Diff line number Diff line change
Expand Up @@ -1087,8 +1087,9 @@ H5F__super_init(H5F_t *f)
FALSE; /* Whether the driver info block has been inserted into the metadata cache */
H5P_genplist_t *plist; /* File creation property list */
H5AC_ring_t orig_ring = H5AC_RING_INV;
hsize_t userblock_size; /* Size of userblock, in bytes */
hsize_t superblock_size; /* Size of superblock, in bytes */
hsize_t userblock_size; /* Size of userblock, in bytes */
hsize_t superblock_size = 0; /* Size of superblock, in bytes */
haddr_t superblock_addr = HADDR_UNDEF;
size_t driver_size; /* Size of driver info block (bytes) */
unsigned super_vers = HDF5_SUPERBLOCK_VERSION_DEF; /* Superblock version for file */
H5O_loc_t ext_loc; /* Superblock extension object location */
Expand Down Expand Up @@ -1288,7 +1289,7 @@ H5F__super_init(H5F_t *f)
f->shared->sblock = sblock;

/* Allocate space for the superblock */
if (HADDR_UNDEF == H5MF_alloc(f, H5FD_MEM_SUPER, superblock_size))
if (HADDR_UNDEF == (superblock_addr = H5MF_alloc(f, H5FD_MEM_SUPER, superblock_size)))
HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "file allocation failed for superblock")

/* set the drvinfo filed to NULL -- will overwrite this later if needed */
Expand Down Expand Up @@ -1479,6 +1480,19 @@ H5F__super_init(H5F_t *f)

/* Check if the superblock has been allocated yet */
if (sblock) {
if (non_default_fs_settings && H5F_addr_defined(superblock_addr)) {
/*
* For non-default free-space settings, the allocation of
* space in the file for the superblock may have have allocated
* memory for the free-space manager and inserted it into the
* metadata cache. Clean that up before returning or we may fail
* to close the file later due to the metadata cache's metadata
* free space manager ring (H5AC_RING_MDFSM) not being clean.
*/
if (H5MF_try_close(f) < 0)
HDONE_ERROR(H5E_FILE, H5E_CANTFREE, FAIL, "can't close file free space manager");
}

/* Check if we've cached it already */
if (sblock_in_cache) {
/* Unpin superblock in cache */
Expand Down
68 changes: 35 additions & 33 deletions src/H5MF.c
Original file line number Diff line number Diff line change
Expand Up @@ -2654,16 +2654,14 @@ H5MF_settle_raw_data_fsm(H5F_t *f, hbool_t *fsm_settled)
hbool_t fsm_opened[H5F_MEM_PAGE_NTYPES]; /* State of FSM */
hbool_t fsm_visited[H5F_MEM_PAGE_NTYPES]; /* State of FSM */

/* Sanity check */
HDassert(f->shared->sblock);

/* should only be called if file is opened R/W */
HDassert(H5F_INTENT(f) & H5F_ACC_RDWR);

/* shouldn't be called unless we have a superblock supporting the
* superblock extension.
*/
HDassert(f->shared->sblock->super_vers >= HDF5_SUPERBLOCK_VERSION_2);
if (f->shared->sblock)
HDassert(f->shared->sblock->super_vers >= HDF5_SUPERBLOCK_VERSION_2);

/* Initialize fsm_opened and fsm_visited */
HDmemset(fsm_opened, 0, sizeof(fsm_opened));
Expand Down Expand Up @@ -2811,40 +2809,44 @@ H5MF_settle_raw_data_fsm(H5F_t *f, hbool_t *fsm_settled)
* file space manager info message is guaranteed to exist.
* Leave it in for now, but consider removing it.
*/
if (H5F_addr_defined(f->shared->sblock->ext_addr))
if (H5F__super_ext_remove_msg(f, H5O_FSINFO_ID) < 0)
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTRELEASE, FAIL,
"error in removing message from superblock extension")
if (f->shared->sblock) {
if (H5F_addr_defined(f->shared->sblock->ext_addr))
if (H5F__super_ext_remove_msg(f, H5O_FSINFO_ID) < 0)
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTRELEASE, FAIL,
"error in removing message from superblock extension")
}

/* As the final element in 1), shrink the EOA for the file */
if (H5MF__close_shrink_eoa(f) < 0)
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTSHRINK, FAIL, "can't shrink eoa")

/* 2) Ensure that space is allocated for the free space manager superblock
* extension message. Must do this now, before reallocating file space
* for free space managers, as it is possible that this allocation may
* grab the last section in a FSM -- making it unnecessary to
* re-allocate file space for it.
*
* Do this by writing a free space manager superblock extension message.
*
* Since no free space manager has file space allocated for it, this
* message must be invalid since we can't save addresses of FSMs when
* those addresses are unknown. This is OK -- we will write the correct
* values to the message at free space manager shutdown.
*/
for (fsm_type = H5F_MEM_PAGE_SUPER; fsm_type < H5F_MEM_PAGE_NTYPES; fsm_type++)
fsinfo.fs_addr[fsm_type - 1] = HADDR_UNDEF;
fsinfo.strategy = f->shared->fs_strategy;
fsinfo.persist = f->shared->fs_persist;
fsinfo.threshold = f->shared->fs_threshold;
fsinfo.page_size = f->shared->fs_page_size;
fsinfo.pgend_meta_thres = f->shared->pgend_meta_thres;
fsinfo.eoa_pre_fsm_fsalloc = HADDR_UNDEF;

if (H5F__super_ext_write_msg(f, H5O_FSINFO_ID, &fsinfo, TRUE, H5O_MSG_FLAG_MARK_IF_UNKNOWN) < 0)
HGOTO_ERROR(H5E_RESOURCE, H5E_WRITEERROR, FAIL,
"error in writing fsinfo message to superblock extension")
if (f->shared->sblock) {
/* 2) Ensure that space is allocated for the free space manager superblock
* extension message. Must do this now, before reallocating file space
* for free space managers, as it is possible that this allocation may
* grab the last section in a FSM -- making it unnecessary to
* re-allocate file space for it.
*
* Do this by writing a free space manager superblock extension message.
*
* Since no free space manager has file space allocated for it, this
* message must be invalid since we can't save addresses of FSMs when
* those addresses are unknown. This is OK -- we will write the correct
* values to the message at free space manager shutdown.
*/
for (fsm_type = H5F_MEM_PAGE_SUPER; fsm_type < H5F_MEM_PAGE_NTYPES; fsm_type++)
fsinfo.fs_addr[fsm_type - 1] = HADDR_UNDEF;
fsinfo.strategy = f->shared->fs_strategy;
fsinfo.persist = f->shared->fs_persist;
fsinfo.threshold = f->shared->fs_threshold;
fsinfo.page_size = f->shared->fs_page_size;
fsinfo.pgend_meta_thres = f->shared->pgend_meta_thres;
fsinfo.eoa_pre_fsm_fsalloc = HADDR_UNDEF;

if (H5F__super_ext_write_msg(f, H5O_FSINFO_ID, &fsinfo, TRUE, H5O_MSG_FLAG_MARK_IF_UNKNOWN) < 0)
HGOTO_ERROR(H5E_RESOURCE, H5E_WRITEERROR, FAIL,
"error in writing fsinfo message to superblock extension")
}

/* 3) Scan all free space managers not involved in allocating
* space for free space managers. For each such free space
Expand Down
51 changes: 51 additions & 0 deletions testpar/t_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -947,3 +947,54 @@ test_file_properties(void)
VRFY((mpi_ret >= 0), "MPI_Info_free succeeded");

} /* end test_file_properties() */

/*
* Tests for an assertion failure during file close that used
* to occur when the library fails to create a file in parallel
* due to an invalid library version bounds setting
*/
void
test_invalid_libver_bounds_file_close_assert(void)
{
const char *filename = NULL;
MPI_Comm comm = MPI_COMM_WORLD;
MPI_Info info = MPI_INFO_NULL;
herr_t ret;
hid_t fid = H5I_INVALID_HID;
hid_t fapl_id = H5I_INVALID_HID;
hid_t fcpl_id = H5I_INVALID_HID;

filename = (const char *)GetTestParameters();

/* set up MPI parameters */
MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);

/* setup file access plist */
fapl_id = H5Pcreate(H5P_FILE_ACCESS);
VRFY((fapl_id != H5I_INVALID_HID), "H5Pcreate");
ret = H5Pset_fapl_mpio(fapl_id, comm, info);
VRFY((SUCCEED == ret), "H5Pset_fapl_mpio");
ret = H5Pset_libver_bounds(fapl_id, H5F_LIBVER_EARLIEST, H5F_LIBVER_V18);
VRFY((SUCCEED == ret), "H5Pset_libver_bounds");

/* setup file creation plist */
fcpl_id = H5Pcreate(H5P_FILE_CREATE);
VRFY((fcpl_id != H5I_INVALID_HID), "H5Pcreate");

ret = H5Pset_file_space_strategy(fcpl_id, H5F_FSPACE_STRATEGY_PAGE, TRUE, 1);
VRFY((SUCCEED == ret), "H5Pset_file_space_strategy");

/* create the file */
H5E_BEGIN_TRY
{
fid = H5Fcreate(filename, H5F_ACC_TRUNC, fcpl_id, fapl_id);
}
H5E_END_TRY
VRFY((fid == H5I_INVALID_HID), "H5Fcreate");

ret = H5Pclose(fapl_id);
VRFY((SUCCEED == ret), "H5Pclose");
ret = H5Pclose(fcpl_id);
VRFY((SUCCEED == ret), "H5Pclose");
}
3 changes: 3 additions & 0 deletions testpar/testphdf5.c
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,9 @@ main(int argc, char **argv)

AddTest("props", test_file_properties, NULL, "Coll Metadata file property settings", PARATESTFILE);

AddTest("invlibverassert", test_invalid_libver_bounds_file_close_assert, NULL,
"Invalid libver bounds assertion failure", PARATESTFILE);

AddTest("idsetw", dataset_writeInd, NULL, "dataset independent write", PARATESTFILE);
AddTest("idsetr", dataset_readInd, NULL, "dataset independent read", PARATESTFILE);

Expand Down
1 change: 1 addition & 0 deletions testpar/testphdf5.h
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@ void test_plist_ed(void);
void external_links(void);
void zero_dim_dset(void);
void test_file_properties(void);
void test_invalid_libver_bounds_file_close_assert(void);
void multiple_dset_write(void);
void multiple_group_write(void);
void multiple_group_read(void);
Expand Down

0 comments on commit 59bac69

Please sign in to comment.