Implement optimized support for vector I/O in Subfiling VFD (HDFGroup#3896)

Vector I/O requests are now processed as a single set of batched
I/O calls, rather than each I/O vector entry (a tuple drawn from
the types, addrs, sizes, and bufs arrays) being processed
individually. This allows I/O to be parallelized more efficiently
among the I/O concentrator processes during large I/O requests.
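
For illustration, the hypothetical snippet below (not taken from this commit) shows what one such vector request looks like at the VFD layer: a single call carrying parallel types/addrs/sizes/bufs arrays, where a 0 entry in sizes means that all remaining entries reuse the previous size (the convention the new extend_sizes handling in H5FDioc.c follows). It assumes an H5FD_t handle already opened on the Subfiling VFD and the public H5FDwrite_vector API; error checking is omitted.

#include <hdf5.h>

/* Hypothetical sketch: write three 4 KiB blocks with one vector call.
 * `file` is assumed to be an H5FD_t * opened on the Subfiling VFD. */
static herr_t
write_three_blocks(H5FD_t *file, const void *buf_a, const void *buf_b, const void *buf_c)
{
    H5FD_mem_t  types[3] = {H5FD_MEM_DRAW, H5FD_MEM_DRAW, H5FD_MEM_DRAW};
    haddr_t     addrs[3] = {0, 4096, 8192};
    size_t      sizes[3] = {4096, 0, 0}; /* 0 => repeat the previous size */
    const void *bufs[3]  = {buf_a, buf_b, buf_c};

    /* With this change, the three entries reach the I/O concentrators as
     * one batched request instead of three separate ones. */
    return H5FDwrite_vector(file, H5P_DATASET_XFER_DEFAULT, 3, types, addrs, sizes, bufs);
}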

* Fixed some calculations and added test cases for issues spotted during review

* Removed a variable that was compensating for previous miscalculations
jhendersonHDF authored Dec 27, 2023
1 parent 695efa9 commit 6ffc55c
Showing 8 changed files with 2,190 additions and 1,060 deletions.
10 changes: 10 additions & 0 deletions release_docs/RELEASE.txt
@@ -278,6 +278,16 @@ New Features

Library:
--------
- Implemented optimized support for vector I/O in the Subfiling VFD

Previously, the Subfiling VFD would handle vector I/O requests by
breaking them down into individual I/O requests, one for each entry
in the I/O vectors provided. This could result in poor I/O performance
for features in HDF5 that utilize vector I/O, such as parallel I/O
to filtered datasets. The Subfiling VFD now properly handles vector
I/O requests in their entirety, resulting in fewer I/O calls, improved
vector I/O performance and improved vector I/O memory efficiency.

- Added a simple cache to the read-only S3 (ros3) VFD

The read-only S3 VFD now caches the first N bytes of a file stored
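For context alongside the release note above, the sketch below shows a typical way an application enables the Subfiling VFD so that its I/O goes through this path. It is an illustrative example rather than part of this change set; it assumes HDF5 was built with the Subfiling VFD enabled, that passing NULL to H5Pset_fapl_subfiling selects the default configuration, and that MPI_THREAD_MULTIPLE support is available, which the Subfiling VFD generally requires.

#include <mpi.h>
#include "hdf5.h"
#include "H5FDsubfiling.h"

int
main(int argc, char **argv)
{
    int mpi_provided = 0;

    /* The Subfiling VFD runs I/O concentrator threads, so request
     * MPI_THREAD_MULTIPLE support. */
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_provided);

    hid_t fapl = H5Pcreate(H5P_FILE_ACCESS);
    H5Pset_mpi_params(fapl, MPI_COMM_WORLD, MPI_INFO_NULL);
    H5Pset_fapl_subfiling(fapl, NULL); /* NULL: use the default subfiling configuration */

    hid_t file = H5Fcreate("subfiling_example.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl);

    /* ... create datasets and perform (vector) I/O here ... */

    H5Fclose(file);
    H5Pclose(fapl);
    MPI_Finalize();
    return 0;
}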
50 changes: 37 additions & 13 deletions src/H5FDsubfiling/H5FDioc.c
@@ -1610,12 +1610,14 @@ H5FD__ioc_write_vector_internal(H5FD_t *_file, uint32_t count, H5FD_mem_t H5_ATT
H5FD_ioc_t *file_ptr = (H5FD_ioc_t *)_file;
io_req_t **sf_io_reqs = NULL;
int64_t sf_context_id = -1;
size_t io_size = 0;
bool extend_sizes = false;
herr_t ret_value = SUCCEED;

assert(_file);
assert(addrs);
assert(sizes);
assert(bufs);
assert((addrs) || (count == 0));
assert((sizes) || (count == 0));
assert((bufs) || (count == 0));

if (count == 0)
H5_SUBFILING_GOTO_DONE(SUCCEED);
@@ -1648,12 +1650,22 @@ H5FD__ioc_write_vector_internal(H5FD_t *_file, uint32_t count, H5FD_mem_t H5_ATT
for (size_t i = 0; i < (size_t)count; i++) {
herr_t write_status;

if (sizes[i] == 0)
if (!extend_sizes) {
if ((i > 0) && (sizes[i] == 0)) {
extend_sizes = true;
io_size = sizes[i - 1];
}
else {
io_size = sizes[i];
}
}

if (io_size == 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "invalid size argument of 0");

H5_CHECK_OVERFLOW(addrs[i], haddr_t, int64_t);
H5_CHECK_OVERFLOW(sizes[i], size_t, int64_t);
write_status = ioc__write_independent_async(sf_context_id, (int64_t)addrs[i], (int64_t)sizes[i],
H5_CHECK_OVERFLOW(io_size, size_t, int64_t);
write_status = ioc__write_independent_async(sf_context_id, (int64_t)addrs[i], (int64_t)io_size,
bufs[i], &sf_io_reqs[i]);

if (write_status < 0)
@@ -1691,12 +1703,14 @@ H5FD__ioc_read_vector_internal(H5FD_t *_file, uint32_t count, haddr_t addrs[], s
H5FD_ioc_t *file_ptr = (H5FD_ioc_t *)_file;
io_req_t **sf_io_reqs = NULL;
int64_t sf_context_id = -1;
size_t io_size = 0;
bool extend_sizes = false;
herr_t ret_value = SUCCEED;

assert(_file);
assert(addrs);
assert(sizes);
assert(bufs);
assert((addrs) || (count == 0));
assert((sizes) || (count == 0));
assert((bufs) || (count == 0));

if (count == 0)
H5_SUBFILING_GOTO_DONE(SUCCEED);
@@ -1720,12 +1734,22 @@ H5FD__ioc_read_vector_internal(H5FD_t *_file, uint32_t count, haddr_t addrs[], s
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate MPI request array");

for (size_t i = 0; i < (size_t)count; i++) {
int read_status;
herr_t read_status;

if (!extend_sizes) {
if ((i > 0) && (sizes[i] == 0)) {
extend_sizes = true;
io_size = sizes[i - 1];
}
else {
io_size = sizes[i];
}
}

H5_CHECK_OVERFLOW(addrs[i], haddr_t, int64_t);
H5_CHECK_OVERFLOW(sizes[i], size_t, int64_t);
read_status = ioc__read_independent_async(sf_context_id, (int64_t)addrs[i], (int64_t)sizes[i],
bufs[i], &sf_io_reqs[i]);
H5_CHECK_OVERFLOW(io_size, size_t, int64_t);
read_status = ioc__read_independent_async(sf_context_id, (int64_t)addrs[i], (int64_t)io_size, bufs[i],
&sf_io_reqs[i]);

if (read_status < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "couldn't queue read operation");
8 changes: 6 additions & 2 deletions src/H5FDsubfiling/H5FDioc_int.c
@@ -297,9 +297,13 @@ ioc__read_independent_async(int64_t context_id, int64_t offset, int64_t elements
* unpredictable order. However, if some IOCs own more than
* 1 subfile, we need to associate each read with a unique
* message tag to make sure the data is received in the
* correct order.
* correct order. We also need a unique message tag in the
* case where only 1 subfile is used in total. In this case,
* vector I/O calls are passed directly down to this VFD without
* being split up into multiple I/O requests, so we need the
* tag to distinguish each I/O request.
*/
need_data_tag = num_subfiles != num_io_concentrators;
need_data_tag = (num_subfiles == 1) || (num_subfiles != num_io_concentrators);
if (!need_data_tag)
data_tag = READ_INDEP_DATA;

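The comment added in this hunk explains why each outstanding read needs its own MPI message tag; the same reasoning is repeated in the H5FDioc_threads.c hunk below. As a general illustration of that reasoning (not the VFD's actual tag scheme), posting two receives from the same rank with distinct tags guarantees each reply lands in its own buffer even if the replies arrive out of order:

#include <mpi.h>

/* Illustration only: two asynchronous replies from the same IOC rank may
 * arrive in either order; a unique tag per request keeps them separated.
 * With a single shared tag, the reply sent first would match the receive
 * posted first, regardless of which request it answers. */
static void
post_two_reads(MPI_Comm comm, int ioc_rank, void *buf0, void *buf1, int nbytes,
               int base_tag, MPI_Request reqs[2])
{
    MPI_Irecv(buf0, nbytes, MPI_BYTE, ioc_rank, base_tag + 0, comm, &reqs[0]);
    MPI_Irecv(buf1, nbytes, MPI_BYTE, ioc_rank, base_tag + 1, comm, &reqs[1]);
}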
12 changes: 9 additions & 3 deletions src/H5FDsubfiling/H5FDioc_threads.c
@@ -456,8 +456,9 @@ translate_opcode(io_op_t op)
case LOGGING_OP:
return "LOGGING_OP";
break;
default:
return "unknown";
}
return "unknown";
}
#endif

@@ -873,9 +874,14 @@ ioc_file_queue_read_indep(sf_work_request_t *msg, int ioc_idx, int source, MPI_C
* unpredictable order. However, if some IOCs own more than
* 1 subfile, we need to associate each read with a unique
* message tag to make sure the data is received in the
* correct order.
* correct order. We also need a unique message tag in the
* case where only 1 subfile is used in total. In this case,
* vector I/O calls are passed directly down to this VFD without
* being split up into multiple I/O requests, so we need the
* tag to distinguish each I/O request.
*/
need_data_tag = sf_context->sf_num_subfiles != sf_context->topology->n_io_concentrators;
need_data_tag = (sf_context->sf_num_subfiles == 1) ||
(sf_context->sf_num_subfiles != sf_context->topology->n_io_concentrators);
if (!need_data_tag)
send_tag = READ_INDEP_DATA;
